In [1]:
import mediapipe as mp
import cv2
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# Drawing helpers
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

In [2]:
def rescale_frame(frame, percent=50):
    '''
    Rescale a frame to a certain percentage compare to its original frame
    '''
    width = int(frame.shape[1] * percent/ 100)
    height = int(frame.shape[0] * percent/ 100)
    dim = (width, height)
    return cv2.resize(frame, dim, interpolation = cv2.INTER_AREA)


def describe_dataset(dataset_path: str):
    '''
    Describe dataset
    '''

    data = pd.read_csv(dataset_path)
    print(f"Headers: {list(data.columns.values)}")
    print(f'Number of rows: {data.shape[0]} \nNumber of columns: {data.shape[1]}\n')
    print(f"Labels: \n{data['label'].value_counts()}\n")
    print(f"Missing values: {data.isnull().values.any()}\n")
    
    duplicate = data[data.duplicated()]
    print(f"Duplicate Rows : {len(duplicate.sum(axis=1))}")

    return data


def round_up_metric_results(results) -> list:
    '''Round up metrics results such as precision score, recall score, ...'''
    return list(map(lambda el: round(el, 3), results))

In [4]:
df = describe_dataset("./train.csv")
df.loc[df["label"] == "no_error", "label"] = 0
df.loc[df["label"] == "knees_inward_error", "label"] = 1
df.tail(3)

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v']
Number of rows: 8660 
Number of columns: 37

Labels: 
no_error              6935
knees_inward_error    1725
Name: label, dtype: int64

Missing values: False

Duplicate Rows : 1


Unnamed: 0,label,nose_x,nose_y,nose_z,nose_v,left_shoulder_x,left_shoulder_y,left_shoulder_z,left_shoulder_v,right_shoulder_x,...,right_knee_z,right_knee_v,left_ankle_x,left_ankle_y,left_ankle_z,left_ankle_v,right_ankle_x,right_ankle_y,right_ankle_z,right_ankle_v
8657,1,0.472204,0.349444,0.706158,0.999721,0.432603,0.420305,0.289829,0.999627,0.570341,...,0.141294,0.946599,0.417689,0.847398,-0.203331,0.971113,0.582417,0.837936,0.005637,0.933843
8658,1,0.469109,0.324893,0.658966,0.999851,0.42756,0.396529,0.245558,0.999749,0.567193,...,0.171627,0.972071,0.419049,0.847843,-0.15489,0.978985,0.583015,0.836856,0.092888,0.952454
8659,1,0.461074,0.264189,0.595965,0.999841,0.422867,0.33673,0.179004,0.999717,0.562014,...,0.121482,0.987988,0.41902,0.847231,-0.186937,0.986825,0.584438,0.834376,-0.032386,0.967409


In [5]:
# Extract features and class
X = df.drop("label", axis=1)
y = df["label"].astype("int")

In [6]:
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
y_test.head(3)

6297    0
8075    1
6247    0
Name: label, dtype: int32

In [8]:
algorithms =[("LR", LogisticRegression()),
         ("SVC", SVC(probability=True)),
         ('KNN',KNeighborsClassifier()),
         ("DTC", DecisionTreeClassifier()),
         ("SGDC", CalibratedClassifierCV(SGDClassifier())),
         ("NB", GaussianNB()),
         ('RF', RandomForestClassifier()),]

models = {}
final_results = []

for name, model in algorithms:
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate model
    model_results = model.predict(X_test)

    p_score = precision_score(y_test, model_results, average=None, labels=[0, 1, 2])
    a_score = accuracy_score(y_test, model_results)
    r_score = recall_score(y_test, model_results, average=None, labels=[0, 1, 2])
    f1_score_result = f1_score(y_test, model_results, average=None, labels=[0, 1, 2])
    cm = confusion_matrix(y_test, model_results, labels=[0, 1, 2])
    final_results.append(( name,  round_up_metric_results(p_score), a_score, round_up_metric_results(r_score), round_up_metric_results(f1_score_result), cm))

In [9]:
# Sort results by F1 score
final_results.sort(key=lambda k: sum(k[4]), reverse=True)

pd.DataFrame(final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,KNN,"[0.977, 0.87, 0.0]",0.954965,"[0.967, 0.906, 0.0]","[0.972, 0.887, 0.0]","[[1347, 46, 0], [32, 307, 0], [0, 0, 0]]"
1,RF,"[0.933, 0.984, 0.0]",0.940531,"[0.997, 0.708, 0.0]","[0.964, 0.823, 0.0]","[[1389, 4, 0], [99, 240, 0], [0, 0, 0]]"
2,DTC,"[0.92, 0.651, 0.0]",0.865473,"[0.912, 0.676, 0.0]","[0.916, 0.663, 0.0]","[[1270, 123, 0], [110, 229, 0], [0, 0, 0]]"
3,SVC,"[0.88, 0.884, 0.0]",0.880485,"[0.986, 0.448, 0.0]","[0.93, 0.595, 0.0]","[[1373, 20, 0], [187, 152, 0], [0, 0, 0]]"
4,LR,"[0.866, 0.732, 0.0]",0.852194,"[0.966, 0.386, 0.0]","[0.913, 0.506, 0.0]","[[1345, 48, 0], [208, 131, 0], [0, 0, 0]]"
5,SGDC,"[0.837, 0.854, 0.0]",0.83776,"[0.991, 0.206, 0.0]","[0.908, 0.333, 0.0]","[[1381, 12, 0], [269, 70, 0], [0, 0, 0]]"
6,NB,"[0.876, 0.345, 0.0]",0.704388,"[0.737, 0.569, 0.0]","[0.8, 0.43, 0.0]","[[1027, 366, 0], [146, 193, 0], [0, 0, 0]]"


In [11]:
test_df = describe_dataset("./test.csv")
test_df = test_df.sample(frac=1).reset_index(drop=True)

test_df.loc[test_df["label"] == "no_error", "label"] = 0
test_df.loc[test_df["label"] == "knees_inward_error", "label"] = 1

test_x = test_df.drop("label", axis=1)
test_y = test_df["label"].astype("int")

test_x = pd.DataFrame(sc.transform(test_x))

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v']
Number of rows: 3362 
Number of columns: 37

Labels: 
no_error              2905
knees_inward_error     457
Name: label, dtype: int64

Missing values: False

Duplicate Rows : 0


In [12]:
testset_final_results = []

for name, model in models.items():
    # Evaluate model
    model_results = model.predict(test_x)

    p_score = precision_score(test_y, model_results, average=None, labels=[0, 1, 2])
    a_score = accuracy_score(test_y, model_results)
    r_score = recall_score(test_y, model_results, average=None, labels=[0, 1, 2])
    f1_score_result = f1_score(test_y, model_results, average=None, labels=[0, 1, 2])
    cm = confusion_matrix(test_y, model_results, labels=[0, 1, 2])
    testset_final_results.append(( name,  round_up_metric_results(p_score), a_score, round_up_metric_results(r_score), round_up_metric_results(f1_score_result), cm ))


testset_final_results.sort(key=lambda k: sum(k[4]), reverse=True)
pd.DataFrame(testset_final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,LR,"[0.9, 0.377, 0.0]",0.831945,"[0.906, 0.363, 0.0]","[0.903, 0.37, 0.0]","[[2631, 274, 0], [291, 166, 0], [0, 0, 0]]"
1,SVC,"[0.89, 0.392, 0.0]",0.844438,"[0.936, 0.263, 0.0]","[0.912, 0.315, 0.0]","[[2719, 186, 0], [337, 120, 0], [0, 0, 0]]"
2,RF,"[0.881, 0.397, 0.0]",0.851576,"[0.958, 0.177, 0.0]","[0.918, 0.245, 0.0]","[[2782, 123, 0], [376, 81, 0], [0, 0, 0]]"
3,SGDC,"[0.881, 0.425, 0.0]",0.856038,"[0.964, 0.168, 0.0]","[0.92, 0.241, 0.0]","[[2801, 104, 0], [380, 77, 0], [0, 0, 0]]"
4,KNN,"[0.887, 0.233, 0.0]",0.762641,"[0.831, 0.326, 0.0]","[0.858, 0.272, 0.0]","[[2415, 490, 0], [308, 149, 0], [0, 0, 0]]"
5,DTC,"[0.884, 0.213, 0.0]",0.743605,"[0.809, 0.328, 0.0]","[0.845, 0.258, 0.0]","[[2350, 555, 0], [307, 150, 0], [0, 0, 0]]"
6,NB,"[0.897, 0.214, 0.0]",0.69304,"[0.728, 0.47, 0.0]","[0.804, 0.294, 0.0]","[[2115, 790, 0], [242, 215, 0], [0, 0, 0]]"


In [13]:
with open("./model/all_sklearn.pkl", "wb") as f:
    pickle.dump(models, f)

In [14]:
with open("./model/LR_model.pkl", "wb") as f:
    pickle.dump(models["LR"], f)

In [15]:
with open("./model/RF_model.pkl", "wb") as f:
    pickle.dump(models["RF"], f)

In [16]:
with open("./model/SVC_model.pkl", "wb") as f:
    pickle.dump(models["SVC"], f)

In [17]:
# Dump input scaler
with open("./model/ki_input_scaler.pkl", "wb") as f:
    pickle.dump(sc, f)