In [1]:
import mediapipe as mp
import cv2
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# Drawing helpers
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

In [2]:
def rescale_frame(frame, percent=50):
    '''
    Rescale a frame to a certain percentage compare to its original frame
    '''
    width = int(frame.shape[1] * percent/ 100)
    height = int(frame.shape[0] * percent/ 100)
    dim = (width, height)
    return cv2.resize(frame, dim, interpolation = cv2.INTER_AREA)


def describe_dataset(dataset_path: str):
    '''
    Describe dataset
    '''

    data = pd.read_csv(dataset_path)
    print(f"Headers: {list(data.columns.values)}")
    print(f'Number of rows: {data.shape[0]} \nNumber of columns: {data.shape[1]}\n')
    print(f"Labels: \n{data['label'].value_counts()}\n")
    print(f"Missing values: {data.isnull().values.any()}\n")
    
    duplicate = data[data.duplicated()]
    print(f"Duplicate Rows : {len(duplicate.sum(axis=1))}")

    return data


def round_up_metric_results(results) -> list:
    '''Round up metrics results such as precision score, recall score, ...'''
    return list(map(lambda el: round(el, 3), results))

In [3]:
df = describe_dataset("./train.csv")
df.loc[df["label"] == "no_error", "label"] = 0
df.loc[df["label"] == "knees_forward_error", "label"] = 1
df.tail(3)

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v']
Number of rows: 5235 
Number of columns: 37

Labels: 
knees_forward_error    3931
no_error               1304
Name: label, dtype: int64

Missing values: False

Duplicate Rows : 0


Unnamed: 0,label,nose_x,nose_y,nose_z,nose_v,left_shoulder_x,left_shoulder_y,left_shoulder_z,left_shoulder_v,right_shoulder_x,...,right_knee_z,right_knee_v,left_ankle_x,left_ankle_y,left_ankle_z,left_ankle_v,right_ankle_x,right_ankle_y,right_ankle_z,right_ankle_v
5232,1,0.520244,0.426567,0.020885,0.999743,0.521043,0.511935,-0.262651,0.999787,0.6931,...,0.538038,0.272252,0.452226,0.892653,0.363293,0.782153,0.673285,0.838465,0.734128,0.338819
5233,1,0.491178,0.437701,0.018811,0.999677,0.501993,0.522092,-0.236658,0.999755,0.67884,...,0.503969,0.230165,0.449949,0.897297,0.395586,0.692538,0.699518,0.853737,0.708981,0.296915
5234,1,0.490406,0.428352,0.057923,0.999692,0.488195,0.512677,-0.206905,0.999775,0.673316,...,0.501764,0.197716,0.45034,0.900123,0.363398,0.656357,0.697853,0.834042,0.715955,0.260931


In [4]:
# Extract features and class
X = df.drop("label", axis=1)
y = df["label"].astype("int")

In [5]:
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
y_test.head(3)

2515    1
857     0
4429    1
Name: label, dtype: int32

In [7]:
algorithms =[("LR", LogisticRegression()),
         ("SVC", SVC(probability=True)),
         ('KNN',KNeighborsClassifier()),
         ("DTC", DecisionTreeClassifier()),
         ("SGDC", CalibratedClassifierCV(SGDClassifier())),
         ("NB", GaussianNB()),
         ('RF', RandomForestClassifier()),]

models = {}
final_results = []

for name, model in algorithms:
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate model
    model_results = model.predict(X_test)

    p_score = precision_score(y_test, model_results, average=None, labels=[0, 1, 2])
    a_score = accuracy_score(y_test, model_results)
    r_score = recall_score(y_test, model_results, average=None, labels=[0, 1, 2])
    f1_score_result = f1_score(y_test, model_results, average=None, labels=[0, 1, 2])
    cm = confusion_matrix(y_test, model_results, labels=[0, 1, 2])
    final_results.append(( name,  round_up_metric_results(p_score), a_score, round_up_metric_results(r_score), round_up_metric_results(f1_score_result), cm))

In [8]:
# Sort results by F1 score
final_results.sort(key=lambda k: sum(k[4]), reverse=True)

pd.DataFrame(final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,RF,"[0.918, 0.876, 0.0]",0.882521,"[0.589, 0.982, 0.0]","[0.717, 0.926, 0.0]","[[156, 109, 0], [14, 768, 0], [0, 0, 0]]"
1,KNN,"[0.786, 0.863, 0.0]",0.849093,"[0.555, 0.949, 0.0]","[0.65, 0.904, 0.0]","[[147, 118, 0], [40, 742, 0], [0, 0, 0]]"
2,DTC,"[0.642, 0.879, 0.0]",0.818529,"[0.642, 0.879, 0.0]","[0.642, 0.879, 0.0]","[[170, 95, 0], [95, 687, 0], [0, 0, 0]]"
3,SVC,"[0.743, 0.782, 0.0]",0.77937,"[0.196, 0.977, 0.0]","[0.31, 0.869, 0.0]","[[52, 213, 0], [18, 764, 0], [0, 0, 0]]"
4,LR,"[0.636, 0.773, 0.0]",0.764088,"[0.158, 0.969, 0.0]","[0.254, 0.86, 0.0]","[[42, 223, 0], [24, 758, 0], [0, 0, 0]]"
5,NB,"[0.527, 0.768, 0.0]",0.750716,"[0.147, 0.955, 0.0]","[0.23, 0.851, 0.0]","[[39, 226, 0], [35, 747, 0], [0, 0, 0]]"
6,SGDC,"[0.889, 0.752, 0.0]",0.753582,"[0.03, 0.999, 0.0]","[0.058, 0.858, 0.0]","[[8, 257, 0], [1, 781, 0], [0, 0, 0]]"


In [9]:
test_df = describe_dataset("./test.csv")
test_df = test_df.sample(frac=1).reset_index(drop=True)

test_df.loc[test_df["label"] == "no_error", "label"] = 0
test_df.loc[test_df["label"] == "knees_forward_error", "label"] = 1

test_x = test_df.drop("label", axis=1)
test_y = test_df["label"].astype("int")

test_x = pd.DataFrame(sc.transform(test_x))

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v', 'left_knee_x', 'left_knee_y', 'left_knee_z', 'left_knee_v', 'right_knee_x', 'right_knee_y', 'right_knee_z', 'right_knee_v', 'left_ankle_x', 'left_ankle_y', 'left_ankle_z', 'left_ankle_v', 'right_ankle_x', 'right_ankle_y', 'right_ankle_z', 'right_ankle_v']
Number of rows: 2525 
Number of columns: 37

Labels: 
knees_forward_error    1637
no_error                888
Name: label, dtype: int64

Missing values: False

Duplicate Rows : 0


In [10]:
testset_final_results = []

for name, model in models.items():
    # Evaluate model
    model_results = model.predict(test_x)

    p_score = precision_score(test_y, model_results, average=None, labels=[0, 1, 2])
    a_score = accuracy_score(test_y, model_results)
    r_score = recall_score(test_y, model_results, average=None, labels=[0, 1, 2])
    f1_score_result = f1_score(test_y, model_results, average=None, labels=[0, 1, 2])
    cm = confusion_matrix(test_y, model_results, labels=[0, 1, 2])
    testset_final_results.append(( name,  round_up_metric_results(p_score), a_score, round_up_metric_results(r_score), round_up_metric_results(f1_score_result), cm ))


testset_final_results.sort(key=lambda k: sum(k[4]), reverse=True)
pd.DataFrame(testset_final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score", "Confusion Matrix"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score,Confusion Matrix
0,KNN,"[0.58, 0.697, 0.0]",0.676436,"[0.291, 0.886, 0.0]","[0.387, 0.78, 0.0]","[[258, 630, 0], [187, 1450, 0], [0, 0, 0]]"
1,DTC,"[0.452, 0.689, 0.0]",0.620594,"[0.369, 0.757, 0.0]","[0.406, 0.721, 0.0]","[[328, 560, 0], [398, 1239, 0], [0, 0, 0]]"
2,RF,"[0.575, 0.687, 0.0]",0.670495,"[0.242, 0.903, 0.0]","[0.341, 0.78, 0.0]","[[215, 673, 0], [159, 1478, 0], [0, 0, 0]]"
3,LR,"[0.649, 0.677, 0.0]",0.674455,"[0.162, 0.952, 0.0]","[0.259, 0.791, 0.0]","[[144, 744, 0], [78, 1559, 0], [0, 0, 0]]"
4,SVC,"[0.602, 0.672, 0.0]",0.666139,"[0.15, 0.946, 0.0]","[0.24, 0.786, 0.0]","[[133, 755, 0], [88, 1549, 0], [0, 0, 0]]"
5,NB,"[0.432, 0.655, 0.0]",0.63802,"[0.092, 0.934, 0.0]","[0.152, 0.77, 0.0]","[[82, 806, 0], [108, 1529, 0], [0, 0, 0]]"
6,SGDC,"[0.818, 0.652, 0.0]",0.653861,"[0.02, 0.998, 0.0]","[0.04, 0.789, 0.0]","[[18, 870, 0], [4, 1633, 0], [0, 0, 0]]"


In [11]:
with open("./model/all_sklearn.pkl", "wb") as f:
    pickle.dump(models, f)

In [12]:
with open("./model/SGDC_model.pkl", "wb") as f:
    pickle.dump(models["SGDC"], f)

In [13]:
with open("./model/LR_model.pkl", "wb") as f:
    pickle.dump(models["LR"], f)

In [14]:
with open("./model/SVC_model.pkl", "wb") as f:
    pickle.dump(models["SVC"], f)

In [15]:
# Dump input scaler
with open("./model/kf_input_scaler.pkl", "wb") as f:
    pickle.dump(sc, f)