In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 載入資料集

In [41]:
import os
df_train = pd.core.frame.DataFrame()
for dirname, _, filenames in os.walk('/project/xt121-group5/scene2_data/charlie'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        df_train2 = pd.read_csv(os.path.join(dirname, filename))
        df_train = pd.concat([df_train, df_train2])

df_train = df_train.drop(labels=['RecordTime'], axis=1) # 移除 RecordTime

/project/xt121-group5/scene2_data/charlie/2023-05-05 103150_c.csv
/project/xt121-group5/scene2_data/charlie/2023-05-05 103233_nc.csv


In [42]:
# check the data shape
print(df_train.shape)

(1752, 103)


## 檢查缺失值

In [43]:
# checked missing data
print("Before data clean(NAN mount):", len(np.where(np.isnan(df_train)==1)[0]))

Before data clean(NAN mount): 0


In [44]:
# Find columns with only unique value.
# unique_col = []
# for i in df_train.columns:
#     if np.unique(df_train[i]).shape[0]==1:
#         print(f'Get column {i} with only unique value.')
#         unique_col.append(i) 

In [45]:
# print(unique_col)

In [46]:
# df_train.describe()[unique_col]

In [47]:
# Drop columns with only unique value.
# df_train = df_train.drop(unique_col, axis=1)

## 資料前處理

In [48]:
# 列出不重要的特徵。
# 先假設除了gaze vector以外的特徵都不重要，之後再調整。
del_col = ['FaceBoundingBox_X', 'FaceBoundingBox_Y', 'FaceBoundingBox_X', 'FaceBoundingBox_Y','FaceBoundingBox_W', 
           'FaceBoundingBox_H', 'FaceLandmarks_1_X', 'FaceLandmarks_1_Y', 'FaceLandmarks_2_X','FaceLandmarks_2_Y', 
           'FaceLandmarks_3_X', 'FaceLandmarks_3_Y', 'FaceLandmarks_4_X', 'FaceLandmarks_4_Y','FaceLandmarks_5_X', 
           'FaceLandmarks_5_Y', 'FaceLandmarks_6_X', 'FaceLandmarks_6_Y', 'FaceLandmarks_7_X', 'FaceLandmarks_7_Y',
           'FaceLandmarks_8_X', 'FaceLandmarks_8_Y', 'FaceLandmarks_9_X', 'FaceLandmarks_9_Y', 'FaceLandmarks_10_X', 
           'FaceLandmarks_10_Y', 'FaceLandmarks_11_X', 'FaceLandmarks_11_Y', 'FaceLandmarks_12_X', 'FaceLandmarks_12_Y', 
           'FaceLandmarks_13_X', 'FaceLandmarks_13_Y', 'FaceLandmarks_14_X', 'FaceLandmarks_14_Y', 'FaceLandmarks_15_X', 
           'FaceLandmarks_15_Y', 'FaceLandmarks_16_X', 'FaceLandmarks_16_Y', 'FaceLandmarks_17_X', 'FaceLandmarks_17_Y', 
           'FaceLandmarks_18_X', 'FaceLandmarks_18_Y', 'FaceLandmarks_19_X', 'FaceLandmarks_19_Y', 'FaceLandmarks_20_X', 
           'FaceLandmarks_20_Y', 'FaceLandmarks_21_X', 'FaceLandmarks_21_Y', 'FaceLandmarks_22_X', 'FaceLandmarks_22_Y', 
           'FaceLandmarks_23_X', 'FaceLandmarks_23_Y', 'FaceLandmarks_24_X', 'FaceLandmarks_24_Y', 'FaceLandmarks_25_X', 
           'FaceLandmarks_25_Y', 'FaceLandmarks_26_X', 'FaceLandmarks_26_Y', 'FaceLandmarks_27_X', 'FaceLandmarks_27_Y', 
           'FaceLandmarks_28_X', 'FaceLandmarks_28_Y', 'FaceLandmarks_29_X', 'FaceLandmarks_29_Y', 'FaceLandmarks_30_X', 
           'FaceLandmarks_30_Y', 'FaceLandmarks_31_X', 'FaceLandmarks_31_Y', 'FaceLandmarks_32_X', 'FaceLandmarks_32_Y', 
           'FaceLandmarks_33_X', 'FaceLandmarks_33_Y', 'FaceLandmarks_34_X', 'FaceLandmarks_34_Y', 'FaceLandmarks_35_X', 
           'FaceLandmarks_35_Y', 'HeadPoseAngles_X','HeadPoseAngles_Y','HeadPoseAngles_Z','LeftEyeBoundingBox_X', 
           'LeftEyeBoundingBox_Y', 'LeftEyeBoundingBox_W', 'LeftEyeBoundingBox_H', 'RightEyeBoundingBox_X', 'RightEyeBoundingBox_Y', 
           'RightEyeBoundingBox_W', 'RightEyeBoundingBox_H', 'EyeLandmarks_1_X', 'EyeLandmarks_1_Y', 'EyeLandmarks_2_X', 
           'EyeLandmarks_2_Y', 'EyeLandmarks_3_X', 'EyeLandmarks_3_Y', 'EyeLandmarks_4_X', 'EyeLandmarks_4_Y', 
           'LeftEyeMidPoint_X','LeftEyeMidPoint_Y','RightEyeMidPoint_X','RightEyeMidPoint_Y', 'EyeState_Left', 
           'EyeState_Right']

In [49]:
df_train = df_train.drop(del_col, axis = 1)

In [50]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [51]:
# def data_preprocessing(df_input, train=True, sc=None):
#     # numeric feature standardization
#     if train:
#         sc = StandardScaler()
# #         sc = MinMaxScaler()
#         df = sc.fit_transform(df_input.iloc[:, 0:-1])
#     else:
#         df = sc.transform(df_input)
#     return df, sc

In [52]:
# X, train_sc = data_preprocessing(df_train)

In [53]:
# print(X.shape)

In [54]:
# print(train_sc.mean_)

In [55]:
# print(train_sc.var_)

In [56]:
y = df_train['RecordType'].values

In [57]:
print(y.shape)

(1752,)


In [58]:
np.unique(y)
print(y)

[1 1 1 ... 0 0 0]


In [59]:
df_train = df_train.drop('RecordType', axis = 1)

## 切割訓練集

In [60]:
random_seed = 5  # set seed for same train test data split
x_train, x_test, y_train, y_test = train_test_split(df_train, 
                                                    y, 
                                                    random_state=random_seed)

In [61]:
print("shape of x_train: ", x_train.shape)

shape of x_train:  (1314, 3)


In [62]:
print("shape of x_test: ", x_test.shape)

shape of x_test:  (438, 3)


In [63]:
clf = DecisionTreeClassifier()

In [64]:
clf.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [65]:
y_pred = clf.predict(x_test)

In [66]:
accuracy_score(y_test, y_pred)

0.9657534246575342

In [67]:
clf.feature_importances_

array([0.12071461, 0.18401415, 0.69527124])

## 模型測試

In [68]:
# Test
# 測試一個答案全部為1的資料。
df_test = pd.read_csv('/project/xt121-group5/scene2_data/larry/2023-05-01 232329.csv')
# df_test = df_test.drop(unique_col, axis=1)
df_test = df_test.drop(del_col, axis=1)
df_test = df_test.drop(labels=['RecordTime'], axis=1)
df_test = df_test.drop(labels=['RecordType'], axis=1)
# test, _ = data_preprocessing(df_test, train=False, sc=train_sc)

In [69]:
pred = clf.predict(df_test)
print(pred.shape)
print(type(pred))

(305,)
<class 'numpy.ndarray'>


In [70]:
print(pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1]


In [71]:
# Test
# 測試一個答案全部為0的資料。
df_test = pd.read_csv('/project/xt121-group5/scene2_data/larry/2023-05-01 232448.csv')
# df_test = df_test.drop(unique_col, axis=1)
df_test = df_test.drop(del_col, axis=1)
df_test = df_test.drop(labels=['RecordTime'], axis=1)
df_test = df_test.drop(labels=['RecordType'], axis=1)
# test, _ = data_preprocessing(df_test, train=False, sc=train_sc)

In [72]:
pred = clf.predict(df_test)
print(pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [73]:
# Test
# 測試一個Charlie眼睛明顯看螢幕外面的資料。
df_test = pd.read_csv('/project/xt121-group5/scene2_data/charlie_old/2023-05-02 234906_nc.csv')
# df_test = df_test.drop(unique_col, axis=1)
df_test = df_test.drop(del_col, axis=1)
df_test = df_test.drop(labels=['RecordTime'], axis=1)
df_test = df_test.drop(labels=['RecordType'], axis=1)
# test, _ = data_preprocessing(df_test, train=False, sc=train_sc)

In [74]:
pred = clf.predict(df_test)
print(pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [75]:
# Test
# 測試一個Charlie眼睛專注電腦螢幕的資料。
df_test = pd.read_csv('/project/xt121-group5/scene2_data/charlie_old/2023-05-05 100605_c.csv')
# df_test = df_test.drop(unique_col, axis=1)
df_test = df_test.drop(del_col, axis=1)
df_test = df_test.drop(labels=['RecordTime'], axis=1)
df_test = df_test.drop(labels=['RecordType'], axis=1)
# test, _ = data_preprocessing(df_test, train=False, sc=train_sc)

In [76]:
pred = clf.predict(df_test)
print(pred)

[1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1]


In [77]:
# Save model
import joblib
joblib.dump(clf, './DT_model')

# Load model
# loaded_dt_model = joblib.load('./DT_model')

['./DT_model']