In [1]:
import mediapipe as mp
import cv2
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Drawing helpers
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

objc[58216]: Class CaptureDelegate is implemented in both /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/mediapipe/.dylibs/libopencv_videoio.3.4.16.dylib (0x10ae08860) and /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/cv2/cv2.abi3.so (0x17ce56480). One of the two will be used. Which one is undefined.
objc[58216]: Class CVWindow is implemented in both /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/mediapipe/.dylibs/libopencv_highgui.3.4.16.dylib (0x105bd0a68) and /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/cv2/cv2.abi3.so (0x17ce564d0). One of the two will be used. Which one is undefined.
objc[58216]: Class CVView is implemented in both /Users/fuixlabsdev1/Programming/PP/graduation-thesis/env/lib/python3.8/site-packages/mediapipe/.dylibs/libopencv_highgui.3.4.16.dylib (0x105bd0a90) and /Users/fuixlabsdev1/Programming/PP/graduation-th

## 1. Set up important functions

In [2]:
def rescale_frame(frame, percent=50):
    '''
    Rescale a frame to a certain percentage compare to its original frame
    '''
    width = int(frame.shape[1] * percent/ 100)
    height = int(frame.shape[0] * percent/ 100)
    dim = (width, height)
    return cv2.resize(frame, dim, interpolation = cv2.INTER_AREA)


def describe_dataset(dataset_path: str):
    '''
    Describe dataset
    '''

    data = pd.read_csv(dataset_path)
    print(f"Headers: {list(data.columns.values)}")
    print(f'Number of rows: {data.shape[0]} \nNumber of columns: {data.shape[1]}\n')
    print(f"Labels: \n{data['label'].value_counts()}\n")
    print(f"Missing values: {data.isnull().values.any()}\n")
    
    duplicate = data[data.duplicated()]
    print(f"Duplicate Rows : {len(duplicate.sum(axis=1))}")

    return data

## 2. Describe and process data

In [3]:
TRAIN_SET_PATH  = "./err.train.csv"
TEST_SET_PATH  = "./err.test.csv"

In [4]:
df = describe_dataset(TRAIN_SET_PATH)

df.tail(3)

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v', 'left_heel_x', 'left_heel_y', 'left_heel_z', 'left_heel_v', 'right_heel_x', 'right_heel_y', 'right_heel_z', 'right_heel_v', 'left_foot_index_x', 'left_foot_index_y', 'left_foot_index_z', 'left_foot_index_v', 'right_foot_index_x', 'right_foot_index_y', 'right_foot_index_z', 'right_foot_index_v']
Number of rows: 17907 
Number of columns: 53

Labels: 
L    9114
C    8793
Name: label, dtype: int64

Missing values:

Unnamed: 0,label,nose_x,nose_y,nose_z,nose_v,left_shoulder_x,left_shoulder_y,left_shoulder_z,left_shoulder_v,right_shoulder_x,...,right_heel_z,right_heel_v,left_foot_index_x,left_foot_index_y,left_foot_index_z,left_foot_index_v,right_foot_index_x,right_foot_index_y,right_foot_index_z,right_foot_index_v
17904,C,0.647438,0.442268,0.004114,0.999985,0.615798,0.51717,0.151706,0.999579,0.631354,...,-0.034228,0.979719,0.701826,0.880516,0.134222,0.979319,0.50488,0.881748,-0.027911,0.986165
17905,C,0.649652,0.419057,0.008783,0.999983,0.617577,0.503514,0.158545,0.999529,0.631972,...,-0.061176,0.980431,0.704606,0.880248,0.071476,0.979932,0.504513,0.881766,-0.088832,0.986975
17906,C,0.653556,0.400394,0.014852,0.99998,0.620734,0.486522,0.169807,0.999556,0.631171,...,-0.138678,0.979078,0.705475,0.878981,0.00369,0.979199,0.504067,0.882642,-0.183304,0.986824


In [5]:
# Extract features and class
X = df.drop("label", axis=1)
y = df["label"]

sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
y_test.head(3)

10827    L
11395    L
3742     C
Name: label, dtype: object

## 3. Train & Evaluate Model

### 3.1. Train and evaluate model with train set

In [7]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score

In [8]:
algorithms =[("LR", LogisticRegression()),
         ("SVC", SVC()),
         ('KNN',KNeighborsClassifier()),
         ("DTC", DecisionTreeClassifier()),
         ("SGDC", SGDClassifier()),
         ("Ridge", RidgeClassifier()),
         ('RF', RandomForestClassifier()),]

models = {}
final_results = []

for name, model in algorithms:
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate model
    model_results = model.predict(X_test)

    p_score = precision_score(y_test, model_results, average="macro")
    a_score = accuracy_score(y_test, model_results)
    r_score = recall_score(y_test, model_results, average="micro")
    f1_score_result = f1_score(y_test, model_results, average=None, labels=["C", "L"])
    final_results.append(( name, p_score, a_score, r_score, f1_score_result ))


In [9]:
# Sort results by F1 score
final_results.sort(key=lambda k: k[4][0] + k[4][1], reverse=True)

pd.DataFrame(final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score
0,KNN,0.999732,0.999721,0.999721,"[0.9997081995914795, 0.9997324056729997]"
1,RF,0.999173,0.999162,0.999162,"[0.9991245987744383, 0.9991972170189992]"
2,SVC,0.999198,0.999162,0.999162,"[0.9991240875912409, 0.9991976464295266]"
3,DTC,0.996039,0.996092,0.996092,"[0.995920745920746, 0.9962486602357985]"
4,SGDC,0.994456,0.994417,0.994417,"[0.9941588785046729, 0.9946524064171123]"
5,LR,0.993923,0.993858,0.993858,"[0.9935710111046172, 0.9941207910208444]"
6,Ridge,0.985028,0.985204,0.985204,"[0.9846153846153847, 0.9857488572196828]"


### 3.2. Test set evaluation

In [10]:
test_df = describe_dataset(TEST_SET_PATH)
test_df = test_df.sample(frac=1).reset_index(drop=True)

test_x = test_df.drop("label", axis=1)
test_y = test_df["label"]

test_x = pd.DataFrame(sc.transform(test_x))

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v', 'left_heel_x', 'left_heel_y', 'left_heel_z', 'left_heel_v', 'right_heel_x', 'right_heel_y', 'right_heel_z', 'right_heel_v', 'left_foot_index_x', 'left_foot_index_y', 'left_foot_index_z', 'left_foot_index_v', 'right_foot_index_x', 'right_foot_index_y', 'right_foot_index_z', 'right_foot_index_v']
Number of rows: 958 
Number of columns: 53

Labels: 
C    497
L    461
Name: label, dtype: int64

Missing values: Fal

In [11]:
testset_final_results = []

for name, model in models.items():
    # Evaluate model
    model_results = model.predict(test_x)

    p_score = precision_score(test_y, model_results, average="macro")
    a_score = accuracy_score(test_y, model_results)
    r_score = recall_score(test_y, model_results, average="macro")
    f1_score_result = f1_score(test_y, model_results, average=None, labels=["C", "L"])
    testset_final_results.append(( name, p_score, a_score, r_score, f1_score_result ))


testset_final_results.sort(key=lambda k: k[4][0] + k[4][1], reverse=True)
pd.DataFrame(testset_final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score
0,DTC,0.982268,0.982255,0.98219,"[0.9829145728643216, 0.9815418023887079]"
1,KNN,0.974601,0.97286,0.971958,"[0.9744094488188977, 0.9711111111111111]"
2,Ridge,0.965356,0.961378,0.95987,"[0.9641125121241514, 0.9581920903954801]"
3,LR,0.961039,0.956159,0.954447,"[0.9594594594594594, 0.9522727272727273]"
4,SGDC,0.948556,0.940501,0.938178,"[0.9457659372026642, 0.9341040462427745]"
5,RF,0.854673,0.820459,0.814549,"[0.8488576449912126, 0.7789203084832904]"
6,SVC,0.777929,0.76618,0.761999,"[0.7948717948717949, 0.7281553398058254]"


## 4. Dump Models 

According to the evaluation above, DTC and KNN models would be chosen for more eval.

In [14]:
with open("./model/err_DTC_model.pkl", "wb") as f:
    pickle.dump(models["DTC"], f)

In [15]:
with open("./model/err_KNN_model.pkl", "wb") as f:
    pickle.dump(models["KNN"], f)