In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 載入資料集

In [2]:
import os
df_train = pd.core.frame.DataFrame()
for dirname, _, filenames in os.walk('/project/xt121-group5/scene2_data/'):
    for filename in filenames:
        if dirname.split('/')[-1] != 'test_data':
            print(os.path.join(dirname, filename))
            df_train2 = pd.read_csv(os.path.join(dirname, filename))
#             df_train = df_train.append(df_train2, ignore_index=True)
            df_train = pd.concat([df_train, df_train2])
        
df_train = df_train.drop(labels=['RecordTime'], axis=1) # 移除 RecordTime

/project/xt121-group5/scene2_data/alice/2023-05-01 232058.csv
/project/xt121-group5/scene2_data/alice/2023-05-01 231925.csv
/project/xt121-group5/scene2_data/larry/2023-05-01 232448.csv
/project/xt121-group5/scene2_data/larry/2023-05-01 232329.csv
/project/xt121-group5/scene2_data/charlie/2023-05-05 103150_c.csv
/project/xt121-group5/scene2_data/charlie/2023-05-05 103233_nc.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 151434.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 151410.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 145529.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 145621.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 145600.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 145633.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 145640.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 151354.csv
/project/xt121-group5/scene2_data/jason/2023-05-05 151333.csv


In [3]:
print(df_train.shape)

(6681, 103)


## 檢查缺失值

In [4]:
# checked missing data
print("Before data clean(NAN mount):", len(np.where(np.isnan(df_train)==1)[0]))

Before data clean(NAN mount): 0


In [5]:
# Find columns with only unique value.
# unique_col = []
# for i in df_train.columns:
#     if np.unique(df_train[i]).shape[0]==1:
#         print(f'Get column {i} with only unique value.')
#         unique_col.append(i) 

In [6]:
# print(unique_col)

In [7]:
# df_train.describe()[unique_col]

In [8]:
# Drop columns with only unique value.
# df_train = df_train.drop(unique_col, axis=1)

## 資料前處理

In [9]:
# 列出不重要的特徵。
# 先假設除了gaze vector以外的特徵都不重要，之後再調整。
del_col = ['FaceBoundingBox_X', 'FaceBoundingBox_Y','FaceBoundingBox_W', 'FaceBoundingBox_H', 'FaceLandmarks_1_X', 
           'FaceLandmarks_1_Y', 'FaceLandmarks_2_X','FaceLandmarks_2_Y', 'FaceLandmarks_3_X', 'FaceLandmarks_3_Y', 
           'FaceLandmarks_4_X', 'FaceLandmarks_4_Y','FaceLandmarks_5_X', 'FaceLandmarks_5_Y', 'FaceLandmarks_6_X', 
           'FaceLandmarks_6_Y', 'FaceLandmarks_7_X', 'FaceLandmarks_7_Y', 'FaceLandmarks_8_X', 'FaceLandmarks_8_Y', 
           'FaceLandmarks_9_X', 'FaceLandmarks_9_Y', 'FaceLandmarks_10_X', 'FaceLandmarks_10_Y', 'FaceLandmarks_11_X', 
           'FaceLandmarks_11_Y', 'FaceLandmarks_12_X', 'FaceLandmarks_12_Y', 'FaceLandmarks_13_X', 'FaceLandmarks_13_Y', 
           'FaceLandmarks_14_X', 'FaceLandmarks_14_Y', 'FaceLandmarks_15_X', 'FaceLandmarks_15_Y', 'FaceLandmarks_16_X', 
           'FaceLandmarks_16_Y', 'FaceLandmarks_17_X', 'FaceLandmarks_17_Y', 'FaceLandmarks_18_X', 'FaceLandmarks_18_Y', 
           'FaceLandmarks_19_X', 'FaceLandmarks_19_Y', 'FaceLandmarks_20_X', 'FaceLandmarks_20_Y', 'FaceLandmarks_21_X', 
           'FaceLandmarks_21_Y', 'FaceLandmarks_22_X', 'FaceLandmarks_22_Y', 'FaceLandmarks_23_X', 'FaceLandmarks_23_Y', 
           'FaceLandmarks_24_X', 'FaceLandmarks_24_Y', 'FaceLandmarks_25_X', 'FaceLandmarks_25_Y', 'FaceLandmarks_26_X', 
           'FaceLandmarks_26_Y', 'FaceLandmarks_27_X', 'FaceLandmarks_27_Y', 'FaceLandmarks_28_X', 'FaceLandmarks_28_Y', 
           'FaceLandmarks_29_X', 'FaceLandmarks_29_Y', 'FaceLandmarks_30_X', 'FaceLandmarks_30_Y', 'FaceLandmarks_31_X', 
           'FaceLandmarks_31_Y', 'FaceLandmarks_32_X', 'FaceLandmarks_32_Y', 'FaceLandmarks_33_X', 'FaceLandmarks_33_Y', 
           'FaceLandmarks_34_X', 'FaceLandmarks_34_Y', 'FaceLandmarks_35_X', 'FaceLandmarks_35_Y', 'HeadPoseAngles_X', 
           'HeadPoseAngles_Y','HeadPoseAngles_Z', 'LeftEyeBoundingBox_X', 'LeftEyeBoundingBox_Y', 'LeftEyeBoundingBox_W', 
           'LeftEyeBoundingBox_H', 'RightEyeBoundingBox_X', 'RightEyeBoundingBox_Y', 'RightEyeBoundingBox_W', 'RightEyeBoundingBox_H', 
           'EyeLandmarks_1_X', 'EyeLandmarks_1_Y', 'EyeLandmarks_2_X', 'EyeLandmarks_2_Y', 'EyeLandmarks_3_X', 
           'EyeLandmarks_3_Y', 'EyeLandmarks_4_X', 'EyeLandmarks_4_Y', 'LeftEyeMidPoint_X', 'LeftEyeMidPoint_Y', 
           'RightEyeMidPoint_X','RightEyeMidPoint_Y', 'EyeState_Left', 'EyeState_Right']

In [10]:
df_train = df_train.drop(del_col, axis = 1)

In [11]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import MinMaxScaler

In [12]:
# def data_preprocessing(df_input, train=True, sc=None):
#     # numeric feature standardization
#     if train:
#         sc = StandardScaler()
# #         sc = MinMaxScaler()
#         df = sc.fit_transform(df_input.iloc[:, 0:-1])
#     else:
#         df = sc.transform(df_input)
#     return df, sc

In [13]:
# X, train_sc = data_preprocessing(df_train)

In [14]:
# print(X.shape)

In [15]:
# print(train_sc.mean_)

In [16]:
# print(train_sc.var_)

In [17]:
y = df_train['RecordType'].values

In [18]:
print(y.shape)

(6681,)


In [19]:
np.unique(y)
print(y)

[0 0 0 ... 0 0 0]


In [20]:
df_train = df_train.drop('RecordType', axis = 1)

## 切割訓練集

In [21]:
npa_train = df_train.to_numpy()

In [22]:
random_seed = 5  # set seed for same train test data split
x_train, x_test, y_train, y_test = train_test_split(npa_train, 
                                                    y, 
                                                    test_size=0.25, 
                                                    random_state=random_seed)

In [23]:
print("shape of x_train: ", x_train.shape)
print("shape of x_test: ", x_test.shape)

shape of x_train:  (5010, 3)
shape of x_test:  (1671, 3)


In [24]:
clf = GradientBoostingClassifier()

In [25]:
clf.fit(x_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [26]:
y_pred = clf.predict(x_test)

In [27]:
print('Test data: ', y_test)
print('Predicted data:', y_pred)

Test data:  [0 0 1 ... 0 0 0]
Predicted data: [0 0 1 ... 1 0 0]


In [28]:
accuracy_score(y_pred, y_test)

0.9042489527229204

In [29]:
clf.feature_importances_

array([0.05847689, 0.09475463, 0.84676848])

## 模型測試

In [30]:
# Test
# 測試一個答案全部為1的資料。
df_test = pd.read_csv('/project/xt121-group5/scene2_data/test_data/2023-05-06 135830_c.csv')
# df_test = df_test.drop(unique_col, axis=1)
df_test = df_test.drop(del_col, axis=1)
df_test = df_test.drop(labels=['RecordTime'], axis=1)
df_test = df_test.drop(labels=['RecordType'], axis=1)

In [31]:
print(df_test.columns)
npa_test = df_test.to_numpy()
pred = clf.predict(npa_test)
print(pred.shape)
print(type(pred))

Index(['GazeVector_X', 'GazeVector_Y', 'GazeVector_Z'], dtype='object')
(280,)
<class 'numpy.ndarray'>


In [32]:
print(pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [33]:
# Test
# 測試一個答案全部為0的資料。
df_test = pd.read_csv('/project/xt121-group5/scene2_data/test_data/2023-05-06 135847_nc.csv')
# df_test = df_test.drop(unique_col, axis=1)
df_test = df_test.drop(del_col, axis=1)
df_test = df_test.drop(labels=['RecordTime'], axis=1)
df_test = df_test.drop(labels=['RecordType'], axis=1)
# test, _ = data_preprocessing(df_test, train=False, sc=train_sc)

In [34]:
npa_test = df_test.to_numpy()
pred = clf.predict(npa_test)
print(pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## 儲存模型

### 方法1. joblib

In [35]:
# Save model
import joblib
joblib.dump(clf, './GB_joblib_model')

# Load model
loaded_gb_model = joblib.load('./GB_joblib_model')

### 方法2. pickle

In [36]:
import pickle

# save
with open('GB_pickle_model.pkl','wb') as f:
    pickle.dump(clf,f)

# load
with open('GB_pickle_model.pkl', 'rb') as f:
    loaded_gb_model2 = pickle.load(f)
    
pred = loaded_gb_model2.predict(npa_test)
print(pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
