In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 去除雜訊

In [2]:
def RemoveNoise(dataset, remove_condition):
    # Remove data with gaze vector X, Y, Z have value 0.
    print(f'Total count: {len(dataset)}, gaze vector zero count: {len(dataset[remove_condition])}')

    dataset = dataset.drop(dataset[remove_condition].index)

    print(f'Total count: {len(dataset)}')
    
    return dataset

## 載入資料集

In [3]:
import os
df_train = pd.core.frame.DataFrame()
for dirname, _, filenames in os.walk('/project/xt121-group5/scene2_data/'):
    for filename in filenames:
        if dirname.split('/')[-1] != 'test_data':
            print(os.path.join(dirname, filename))
            df_train2 = pd.read_csv(os.path.join(dirname, filename))
            df_train2 = RemoveNoise(df_train2, 
                                    (df_train2['GazeVector_X'] == 0) & 
                                    (df_train2['GazeVector_Y'] == 0) & 
                                    (df_train2['GazeVector_Z'] == 0))
            df_train = pd.concat([df_train, df_train2])
        
df_train = df_train.drop(labels = ['RecordTime'], axis = 1) # 移除 RecordTime

/project/xt121-group5/scene2_data/test_data/.ipynb_checkpoints/2023-05-06 135830_c-checkpoint.csv
Total count: 280, gaze vector zero count: 0
Total count: 280
/project/xt121-group5/scene2_data/alice/2023-05-01 232058.csv
Total count: 420, gaze vector zero count: 311
Total count: 109
/project/xt121-group5/scene2_data/alice/2023-05-01 231925.csv
Total count: 412, gaze vector zero count: 16
Total count: 396
/project/xt121-group5/scene2_data/alice/.ipynb_checkpoints/2023-05-01 232058-checkpoint.csv
Total count: 420, gaze vector zero count: 311
Total count: 109
/project/xt121-group5/scene2_data/alice/.ipynb_checkpoints/2023-05-01 231925-checkpoint.csv
Total count: 412, gaze vector zero count: 16
Total count: 396
/project/xt121-group5/scene2_data/pinsian/2023-05-13 142553_c.csv
Total count: 280, gaze vector zero count: 0
Total count: 280
/project/xt121-group5/scene2_data/pinsian/2023-05-13 142611 nc.csv
Total count: 280, gaze vector zero count: 28
Total count: 252
/project/xt121-group5/scene

In [4]:
# check the data shape.
print(df_train.shape)
print(type(df_train))
print(df_train.info)

(16445, 103)
<class 'pandas.core.frame.DataFrame'>
<bound method DataFrame.info of      FaceBoundingBox_X  FaceBoundingBox_Y  FaceBoundingBox_W  \
0                  531                175                369   
1                  532                175                367   
2                  532                175                367   
3                  532                175                366   
4                  532                175                367   
..                 ...                ...                ...   
198                429                164                522   
199                431                164                519   
200                428                164                522   
201                431                164                516   
202                432                164                515   

     FaceBoundingBox_H  FaceLandmarks_1_X  FaceLandmarks_1_Y  \
0                  369                684                322   
1                  3

## 資料前處理

### 前處理－臉部特徵座標平移

In [5]:
def FaceLandmarksPreprocessing(df_data, verbose = False):
    for col_name in df_data.columns.to_list():
        if col_name.find('FaceLandmarks_') == 0 or \
        col_name.find('EyeLandmarks_') == 0 or \
        col_name.find('LeftEyeBoundingBox_') == 0 or \
        col_name.find('RightEyeBoundingBox_') == 0 or \
        col_name.find('LeftEyeMidPoint_') == 0 or \
        col_name.find('RightEyeMidPoint_') == 0:
            if col_name.find('_X') == len(col_name) - 2:
                df_data[col_name] = df_data[col_name] - df_data['FaceBoundingBox_X']
                if verbose:
                    print(f'Column {col_name} is subtracted by FaceBoundingBox_X.')
            elif col_name.find('_Y') == len(col_name) - 2:
                df_data[col_name] = df_data[col_name] - df_data['FaceBoundingBox_Y']
                if verbose:
                    print(f'Column {col_name} is subtracted by FaceBoundingBox_Y.')
            else:
#                 print(f'Ignore column {col_name}.')
                pass
        else:
#             print(f'Ignore column {col_name}')
            pass
    return df_data

In [6]:
print(df_train.head())

   FaceBoundingBox_X  FaceBoundingBox_Y  FaceBoundingBox_W  FaceBoundingBox_H  \
0                531                175                369                369   
1                532                175                367                367   
2                532                175                367                367   
3                532                175                366                366   
4                532                175                367                367   

   FaceLandmarks_1_X  FaceLandmarks_1_Y  FaceLandmarks_2_X  FaceLandmarks_2_Y  \
0                684                322                641                326   
1                683                321                640                325   
2                683                321                640                325   
3                682                322                640                325   
4                683                321                640                325   

   FaceLandmarks_3_X  Face

In [7]:
df_train = FaceLandmarksPreprocessing(df_train, True)

Column FaceLandmarks_1_X is subtracted by FaceBoundingBox_X.
Column FaceLandmarks_1_Y is subtracted by FaceBoundingBox_Y.
Column FaceLandmarks_2_X is subtracted by FaceBoundingBox_X.
Column FaceLandmarks_2_Y is subtracted by FaceBoundingBox_Y.
Column FaceLandmarks_3_X is subtracted by FaceBoundingBox_X.
Column FaceLandmarks_3_Y is subtracted by FaceBoundingBox_Y.
Column FaceLandmarks_4_X is subtracted by FaceBoundingBox_X.
Column FaceLandmarks_4_Y is subtracted by FaceBoundingBox_Y.
Column FaceLandmarks_5_X is subtracted by FaceBoundingBox_X.
Column FaceLandmarks_5_Y is subtracted by FaceBoundingBox_Y.
Column FaceLandmarks_6_X is subtracted by FaceBoundingBox_X.
Column FaceLandmarks_6_Y is subtracted by FaceBoundingBox_Y.
Column FaceLandmarks_7_X is subtracted by FaceBoundingBox_X.
Column FaceLandmarks_7_Y is subtracted by FaceBoundingBox_Y.
Column FaceLandmarks_8_X is subtracted by FaceBoundingBox_X.
Column FaceLandmarks_8_Y is subtracted by FaceBoundingBox_Y.
Column FaceLandmarks_9_X

In [8]:
print(df_train.head())

   FaceBoundingBox_X  FaceBoundingBox_Y  FaceBoundingBox_W  FaceBoundingBox_H  \
0                531                175                369                369   
1                532                175                367                367   
2                532                175                367                367   
3                532                175                366                366   
4                532                175                367                367   

   FaceLandmarks_1_X  FaceLandmarks_1_Y  FaceLandmarks_2_X  FaceLandmarks_2_Y  \
0                153                147                110                151   
1                151                146                108                150   
2                151                146                108                150   
3                150                147                108                150   
4                151                146                108                150   

   FaceLandmarks_3_X  Face

### 前處理－檢查缺失值

In [9]:
# checked missing data
print("Before data clean(NAN mount):", len(np.where(np.isnan(df_train)==1)[0]))

Before data clean(NAN mount): 0


In [10]:
# Find columns with only unique value.
# unique_col = []
# for i in df_train.columns:
#     if np.unique(df_train[i]).shape[0]==1:
#         print(f'Get column {i} with only unique value.')
#         unique_col.append(i) 

In [11]:
# print(unique_col)

In [12]:
# df_train.describe()[unique_col]

In [13]:
# Drop columns with only unique value.
# df_train = df_train.drop(unique_col, axis=1)

### 前處理－保留重要特徵

In [14]:
# 列出不重要的特徵。
# 先假設除了gaze vector以外的特徵都不重要，之後再調整。
# del_col = ['FaceBoundingBox_X', 'FaceBoundingBox_Y','FaceBoundingBox_W', 'FaceBoundingBox_H', 'FaceLandmarks_1_X', 
#            'FaceLandmarks_1_Y', 'FaceLandmarks_2_X','FaceLandmarks_2_Y', 'FaceLandmarks_3_X', 'FaceLandmarks_3_Y', 
#            'FaceLandmarks_4_X', 'FaceLandmarks_4_Y','FaceLandmarks_5_X', 'FaceLandmarks_5_Y', 'FaceLandmarks_6_X', 
#            'FaceLandmarks_6_Y', 'FaceLandmarks_7_X', 'FaceLandmarks_7_Y', 'FaceLandmarks_8_X', 'FaceLandmarks_8_Y', 
#            'FaceLandmarks_9_X', 'FaceLandmarks_9_Y', 'FaceLandmarks_10_X', 'FaceLandmarks_10_Y', 'FaceLandmarks_11_X', 
#            'FaceLandmarks_11_Y', 'FaceLandmarks_12_X', 'FaceLandmarks_12_Y', 'FaceLandmarks_13_X', 'FaceLandmarks_13_Y', 
#            'FaceLandmarks_14_X', 'FaceLandmarks_14_Y', 'FaceLandmarks_15_X', 'FaceLandmarks_15_Y', 'FaceLandmarks_16_X', 
#            'FaceLandmarks_16_Y', 'FaceLandmarks_17_X', 'FaceLandmarks_17_Y', 'FaceLandmarks_18_X', 'FaceLandmarks_18_Y', 
#            'FaceLandmarks_19_X', 'FaceLandmarks_19_Y', 'FaceLandmarks_20_X', 'FaceLandmarks_20_Y', 'FaceLandmarks_21_X', 
#            'FaceLandmarks_21_Y', 'FaceLandmarks_22_X', 'FaceLandmarks_22_Y', 'FaceLandmarks_23_X', 'FaceLandmarks_23_Y', 
#            'FaceLandmarks_24_X', 'FaceLandmarks_24_Y', 'FaceLandmarks_25_X', 'FaceLandmarks_25_Y', 'FaceLandmarks_26_X', 
#            'FaceLandmarks_26_Y', 'FaceLandmarks_27_X', 'FaceLandmarks_27_Y', 'FaceLandmarks_28_X', 'FaceLandmarks_28_Y', 
#            'FaceLandmarks_29_X', 'FaceLandmarks_29_Y', 'FaceLandmarks_30_X', 'FaceLandmarks_30_Y', 'FaceLandmarks_31_X', 
#            'FaceLandmarks_31_Y', 'FaceLandmarks_32_X', 'FaceLandmarks_32_Y', 'FaceLandmarks_33_X', 'FaceLandmarks_33_Y', 
#            'FaceLandmarks_34_X', 'FaceLandmarks_34_Y', 'FaceLandmarks_35_X', 'FaceLandmarks_35_Y', 'HeadPoseAngles_X', 
#            'HeadPoseAngles_Y','HeadPoseAngles_Z', 'LeftEyeBoundingBox_X', 'LeftEyeBoundingBox_Y', 'LeftEyeBoundingBox_W', 
#            'LeftEyeBoundingBox_H', 'RightEyeBoundingBox_X', 'RightEyeBoundingBox_Y', 'RightEyeBoundingBox_W', 'RightEyeBoundingBox_H', 
#            'EyeLandmarks_1_X', 'EyeLandmarks_1_Y', 'EyeLandmarks_2_X', 'EyeLandmarks_2_Y', 'EyeLandmarks_3_X', 
#            'EyeLandmarks_3_Y', 'EyeLandmarks_4_X', 'EyeLandmarks_4_Y', 'LeftEyeMidPoint_X', 'LeftEyeMidPoint_Y', 
#            'RightEyeMidPoint_X','RightEyeMidPoint_Y', 'EyeState_Left', 'EyeState_Right']
# del_col = ['FaceBoundingBox_X', 'FaceBoundingBox_Y','FaceBoundingBox_W', 'FaceBoundingBox_H', 'HeadPoseAngles_X', 
#            'HeadPoseAngles_Y','HeadPoseAngles_Z', 'LeftEyeBoundingBox_X', 'LeftEyeBoundingBox_Y', 'LeftEyeBoundingBox_W', 
#            'LeftEyeBoundingBox_H', 'RightEyeBoundingBox_X', 'RightEyeBoundingBox_Y', 'RightEyeBoundingBox_W', 'RightEyeBoundingBox_H', 
#            'EyeLandmarks_1_X', 'EyeLandmarks_1_Y', 'EyeLandmarks_2_X', 'EyeLandmarks_2_Y', 'EyeLandmarks_3_X', 
#            'EyeLandmarks_3_Y', 'EyeLandmarks_4_X', 'EyeLandmarks_4_Y', 'LeftEyeMidPoint_X', 'LeftEyeMidPoint_Y', 
#            'RightEyeMidPoint_X','RightEyeMidPoint_Y', 'EyeState_Left', 'EyeState_Right']

In [15]:
# df_train = df_train.drop(del_col, axis = 1)

In [16]:
# 列出重要的特徵。重要特徵選取，係由XGBoost的Feature importance圖選取前幾名。
col_high_rel = ['HeadPoseAngles_Y', 'GazeVector_X', 'GazeVector_Z', 'GazeVector_Y', 'HeadPoseAngles_Z']
col_deleted = ['FaceBoundingBox_X', 'FaceBoundingBox_Y']

In [17]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import MinMaxScaler

In [18]:
# def data_preprocessing(df_input, train=True, sc=None):
#     # numeric feature standardization
#     if train:
#         sc = StandardScaler()
# #         sc = MinMaxScaler()
#         df = sc.fit_transform(df_input.iloc[:, 0:-1])
#     else:
#         df = sc.transform(df_input)
#     return df, sc

In [19]:
# X, train_sc = data_preprocessing(df_train)

In [20]:
# X.shape

In [21]:
# train_sc.mean_

In [22]:
# train_sc.var_

In [23]:
y = df_train['RecordType'].values
print(y.shape)

(16445,)


In [24]:
# 刪除沒有被列在重要特徵列表的特徵。
def ReserveImportColumns(df_data, col_reserved):
    for col_name in df_data.columns.to_list():
        if col_name not in col_reserved:
            df_data = df_data.drop(col_name, axis = 1)
    return df_data

In [25]:
df_train.drop(col_deleted, axis = 1)
df_train = ReserveImportColumns(df_train, col_high_rel)

print(df_train.shape)
print(df_train.columns)

(16445, 5)
Index(['HeadPoseAngles_Y', 'HeadPoseAngles_Z', 'GazeVector_X', 'GazeVector_Y',
       'GazeVector_Z'],
      dtype='object')


In [26]:
np.unique(y)
print(y)

[1 1 1 ... 1 1 1]


## 切割訓練集

In [27]:
# npa_train = df_train.to_numpy()

In [28]:
from sklearn.model_selection import train_test_split
x_train , x_valid , y_train , y_valid = train_test_split(df_train, y, test_size=0.25, random_state=17, stratify=y)
#print(y_train)
#print(y_valid)

In [29]:
# unique, counts = np.unique(y.argmax(-1), return_counts=True)
# plt.bar(unique, counts)

In [30]:
# unique, counts = np.unique(y_train.argmax(-1), return_counts=True)
# plt.bar(unique, counts)

In [31]:
print("shape of x_train: ", x_train.shape)

shape of x_train:  (12333, 5)


In [32]:
print("shape of x_valid: ", x_valid.shape)

shape of x_valid:  (4112, 5)


In [33]:
clf = RandomForestClassifier()

In [34]:
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
y_pred = clf.predict(x_valid)

In [36]:
accuracy_score(y_valid, y_pred)

0.9832198443579766

In [37]:
clf.feature_importances_

array([0.18090774, 0.09030788, 0.11557388, 0.16881452, 0.44439597])

## 模型測試

In [38]:
def GetScene2AccuractFromTest(arrTestAns, correctAns):
    accurate_rate = np.count_nonzero(arrTestAns == correctAns) / len(arrTestAns)
#     print(accurate_rate)
    return accurate_rate

In [39]:
# Test
# 測試一個答案全部為1的資料。
df_test = pd.read_csv('/project/xt121-group5/scene2_data/test_data/2023-05-17 194144 c.csv')
df_test = RemoveNoise(df_test, 
                      (df_test['GazeVector_X'] == 0) & 
                      (df_test['GazeVector_Y'] == 0) & 
                      (df_test['GazeVector_Z'] == 0))
df_test = FaceLandmarksPreprocessing(df_test)
df_test = df_test.drop(col_deleted, axis = 1)
df_test = ReserveImportColumns(df_test, col_high_rel)
# test, _ = data_preprocessing(df_test, train=False, sc=train_sc)

Total count: 1082, gaze vector zero count: 0
Total count: 1082


In [40]:
print(df_test.columns)
# npa_test = df_test.to_numpy()
pred = clf.predict(df_test)
print(pred.shape)
print(type(pred))

Index(['HeadPoseAngles_Y', 'HeadPoseAngles_Z', 'GazeVector_X', 'GazeVector_Y',
       'GazeVector_Z'],
      dtype='object')
(1082,)
<class 'numpy.ndarray'>


In [41]:
print(pred)
print(f'Test accurate rate: {GetScene2AccuractFromTest(pred, 1)}')

[1 1 1 ... 1 1 1]
Test accurate rate: 0.8872458410351202


In [42]:
# Test
# 測試一個答案全部為0的資料。
df_test = pd.read_csv('/project/xt121-group5/scene2_data/test_data/2023-05-17 194223 nc.csv')
df_test = RemoveNoise(df_test, 
                      (df_test['GazeVector_X'] == 0) & 
                      (df_test['GazeVector_Y'] == 0) & 
                      (df_test['GazeVector_Z'] == 0))
df_test = FaceLandmarksPreprocessing(df_test)
df_test = df_test.drop(col_deleted, axis = 1)
df_test = ReserveImportColumns(df_test, col_high_rel)

Total count: 956, gaze vector zero count: 70
Total count: 886


In [43]:
# npa_test = df_test.to_numpy()
pred = clf.predict(df_test)
print(pred)
print(f'Test accurate rate: {GetScene2AccuractFromTest(pred, 0)}')

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

## 儲存模型

### 方法1. joblib

In [44]:
# Save model
import joblib
joblib.dump(clf, './RF_model')

# Load model
# loaded_rf_model = joblib.load('./RF_model')

['./RF_model']

### 方法2. pickle

In [45]:
import pickle

# save
with open('RF_pickle_model.pkl','wb') as f:
    pickle.dump(clf,f)

# load
with open('RF_pickle_model.pkl', 'rb') as f:
    clf2 = pickle.load(f)
    
pred = clf2.predict(df_test)
print(pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 