In [24]:
import mediapipe as mp
import cv2
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score

import warnings
warnings.filterwarnings('ignore')

# Drawing helpers
# Short aliases for two MediaPipe solution modules. Neither alias is
# referenced by the cells visible in this part of the notebook.
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

### 1. Train model

#### 1.1. Describe data and split dataset

In [21]:
def rescale_frame(frame, percent=50):
    '''
    Resize `frame` so that its width and height become `percent` percent of
    their current values (each truncated to an integer).

    `frame.shape` is read as (height, width, ...). The resizing itself is
    delegated to cv2.resize with INTER_AREA interpolation.
    '''
    src_height, src_width = frame.shape[0], frame.shape[1]
    # cv2.resize takes the target size as (width, height)
    target_size = (
        int(src_width * percent / 100),
        int(src_height * percent / 100),
    )
    return cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)


def describe_dataset(dataset_path: str) -> pd.DataFrame:
    '''
    Load a CSV dataset and print a short summary of it.

    Printed: the column headers, the row and column counts, the number of
    rows per value of the `label` column, whether any cell is missing, and
    how many rows are exact duplicates of an earlier row.

    Parameters:
        dataset_path: path of the CSV file; it must contain a `label` column.

    Returns:
        The loaded DataFrame, unmodified.

    Raises:
        KeyError: if the file has no `label` column.
    '''

    data = pd.read_csv(dataset_path)
    print(f"Headers: {list(data.columns.values)}")
    print(f'Number of rows: {data.shape[0]} \nNumber of columns: {data.shape[1]}\n')
    print(f"Labels: \n{data['label'].value_counts()}\n")
    print(f"Missing values: {data.isnull().values.any()}\n")

    # Count the duplicate flags directly. Summing the duplicated rows across
    # columns first is unnecessary and breaks on frames that mix the string
    # `label` column with numeric features.
    duplicate_count = int(data.duplicated().sum())
    print(f"Duplicate Rows : {duplicate_count}")

    return data

In [22]:
# Load ./dataset.csv, print its summary (recorded below) and keep the frame
# as `df`; the following cells derive the features and labels from it.
df = describe_dataset("./dataset.csv")

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'right_elbow_x', 'right_elbow_y', 'right_elbow_z', 'right_elbow_v', 'left_elbow_x', 'left_elbow_y', 'left_elbow_z', 'left_elbow_v', 'right_wrist_x', 'right_wrist_y', 'right_wrist_z', 'right_wrist_v', 'left_wrist_x', 'left_wrist_y', 'left_wrist_z', 'left_wrist_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v']
Number of rows: 15372 
Number of columns: 37

Labels: 
C    8238
L    7134
Name: label, dtype: int64

Missing values: False

Duplicate Rows : 0


In [26]:
# Split the frame into the target column and the predictor columns

y = df["label"]  # class labels
X = df.drop(columns=["label"])  # every remaining column is a feature

In [27]:
# Standardise every feature column. `sc` is kept because later cells reuse it
# to transform the external test set and to export the fitted scaler.
# NOTE(review): the scaler is fitted on all rows of X before the train/test
# split further down, so the held-out rows contribute to the scaling
# statistics. Confirm this is intended.
# No `columns=` is passed when re-wrapping, so X presumably ends up with
# default integer column labels instead of the original feature names.
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X))

In [29]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)
# Peek at the first held-out labels
y_test.head(5)

1922     L
6789     L
8933     L
1985     L
14857    C
Name: label, dtype: object

#### 1.2. Train model using Scikit-learn

In [30]:
# Seed for the estimators that draw random numbers while fitting. It uses the
# same value as the train_test_split seed so that re-running the notebook
# reproduces the scores and the exported models.
ESTIMATOR_SEED = 1234

# (short name, unfitted estimator) pairs; the short name becomes the key in `models`
algorithms = [
    ("LR", LogisticRegression()),
    ("SVC", SVC()),
    ("KNN", KNeighborsClassifier()),
    ("DTC", DecisionTreeClassifier(random_state=ESTIMATOR_SEED)),
    ("SGDC", SGDClassifier(random_state=ESTIMATOR_SEED)),
    ("Ridge", RidgeClassifier()),
    ("RF", RandomForestClassifier(random_state=ESTIMATOR_SEED)),
]

models = {}         # short name -> fitted estimator
final_results = []  # rows of (name, macro precision, accuracy, per-class F1)

for name, model in algorithms:
    # fit() returns the estimator itself, so `trained_model` is `model`
    trained_model = model.fit(X_train, y_train)
    models[name] = trained_model

    # Evaluate model on the held-out split
    model_results = model.predict(X_test)

    p_score = precision_score(y_test, model_results, average="macro")
    a_score = accuracy_score(y_test, model_results)
    # average=None keeps one F1 value per class, ordered as in `labels`
    f1_score_result = f1_score(y_test, model_results, average=None, labels=["C", "L"])
    final_results.append(( name, p_score, a_score, f1_score_result ))


# Best model first, ranked by the sum of its two per-class F1 scores
final_results.sort(key=lambda k: k[3][0] + k[3][1], reverse=True)

In [31]:
# Tabulate the rows collected by the training loop; they were already sorted
# by summed per-class F1, best first.
pd.DataFrame(final_results, columns=["Model", "Precision Score", "Accuracy score" , "F1 score"])

Unnamed: 0,Model,Precision Score,Accuracy score,F1 score
0,RF,0.998688,0.998699,"[0.9988088147706968, 0.998567335243553]"
1,KNN,0.998155,0.998049,"[0.9982153480071386, 0.9978479196556671]"
2,SVC,0.996006,0.996098,"[0.9964243146603099, 0.9957050823192556]"
3,DTC,0.993272,0.993496,"[0.9940334128878282, 0.9928520371694067]"
4,LR,0.980368,0.980813,"[0.9823722736779206, 0.9789511237959329]"
5,Ridge,0.974849,0.97561,"[0.9774977497749776, 0.9733759318423855]"
6,SGDC,0.974958,0.97561,"[0.977538185085355, 0.9733191035218783]"


#### 1.3. Evaluation with Test set

In [36]:
# Load ./test.csv and print its summary
test_df = describe_dataset("./test.csv")
# Shuffle the rows with a fixed seed so that re-running the cell yields the
# same frame (the metrics computed later do not depend on row order).
test_df = test_df.sample(frac=1, random_state=1234).reset_index(drop=True)

test_x = test_df.drop("label", axis=1)
test_y = test_df["label"]

# Reuse the scaler fitted earlier: transform only, never re-fit on test data
test_x = pd.DataFrame(sc.transform(test_x))

Headers: ['label', 'nose_x', 'nose_y', 'nose_z', 'nose_v', 'left_shoulder_x', 'left_shoulder_y', 'left_shoulder_z', 'left_shoulder_v', 'right_shoulder_x', 'right_shoulder_y', 'right_shoulder_z', 'right_shoulder_v', 'right_elbow_x', 'right_elbow_y', 'right_elbow_z', 'right_elbow_v', 'left_elbow_x', 'left_elbow_y', 'left_elbow_z', 'left_elbow_v', 'right_wrist_x', 'right_wrist_y', 'right_wrist_z', 'right_wrist_v', 'left_wrist_x', 'left_wrist_y', 'left_wrist_z', 'left_wrist_v', 'left_hip_x', 'left_hip_y', 'left_hip_z', 'left_hip_v', 'right_hip_x', 'right_hip_y', 'right_hip_z', 'right_hip_v']
Number of rows: 604 
Number of columns: 37

Labels: 
C    339
L    265
Name: label, dtype: int64

Missing values: False

Duplicate Rows : 0


In [8]:
# Reload the estimators exported by an earlier run.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this notebook produced itself.
with open("./model/RF_model.pkl", "rb") as f:
    RF_model = pickle.load(f)

with open("./model/KNN_model.pkl", "rb") as f2:
    KNN_model = pickle.load(f2)

# Keep the reloaded estimators under their own name. Rebinding `models` here
# would discard the other entries built by the training loop, and the later
# `models["SVC"]` export would then fail with KeyError. To evaluate the
# reloaded estimators instead, iterate over `saved_models.items()`.
saved_models = {
    "RF": RF_model,
    "KNN": KNN_model,
}

In [37]:
testset_final_results = []  # rows of (name, precision, accuracy, recall, per-class F1)

for name, model in models.items():
    # Predict once and derive every metric from the same predictions
    model_results = model.predict(test_x)

    a_score = accuracy_score(test_y, model_results)
    p_score = precision_score(test_y, model_results, average="macro")
    r_score = recall_score(test_y, model_results, average="macro")
    # average=None keeps one F1 value per class, ordered as in `labels`
    f1_score_result = f1_score(test_y, model_results, labels=["C", "L"], average=None)

    row = (name, p_score, a_score, r_score, f1_score_result)
    testset_final_results.append(row)


# Rank best-first by the sum of the two per-class F1 values
testset_final_results = sorted(
    testset_final_results,
    key=lambda row: row[4][0] + row[4][1],
    reverse=True,
)

LR LogisticRegression()
SVC SVC()
KNN KNeighborsClassifier()
DTC DecisionTreeClassifier()
SGDC SGDClassifier()
Ridge RidgeClassifier()
RF RandomForestClassifier()


In [38]:
# Tabulate the test-set rows; they were already sorted by summed per-class F1,
# best first.
pd.DataFrame(testset_final_results, columns=["Model", "Precision Score", "Accuracy score", "Recall Score", "F1 score"])

Unnamed: 0,Model,Precision Score,Accuracy score,Recall Score,F1 score
0,KNN,0.975401,0.971854,0.968336,"[0.9754689754689754, 0.966990291262136]"
1,RF,0.961853,0.953642,0.94717,"[0.9603399433427762, 0.9442231075697212]"
2,SVC,0.929966,0.932119,0.933762,"[0.9383458646616543, 0.9244935543278086]"
3,Ridge,0.804842,0.804636,0.809484,"[0.8156249999999999, 0.7922535211267606]"
4,LR,0.792694,0.761589,0.737775,"[0.8144329896907215, 0.6666666666666666]"
5,DTC,0.754312,0.735099,0.712529,"[0.7916666666666666, 0.6363636363636364]"
6,SGDC,0.773991,0.731788,0.702989,"[0.7969924812030076, 0.6048780487804878]"


#### 1.4. Dump model pickle

In [39]:
# Serialise the "RF" entry of `models` to ./model/RF_model.pkl
# ("wb" replaces any existing file at that path).
with open("./model/RF_model.pkl", "wb") as f:
    pickle.dump(models["RF"], f)

In [40]:
# Serialise the "KNN" entry of `models` to ./model/KNN_model.pkl
# ("wb" replaces any existing file at that path).
with open("./model/KNN_model.pkl", "wb") as f:
    pickle.dump(models["KNN"], f)

In [42]:
# Serialise the "SVC" entry of `models` to ./model/SVC_model.pkl
# ("wb" replaces any existing file). Raises KeyError if `models` holds no
# "SVC" key when this cell runs.
with open("./model/SVC_model.pkl", "wb") as f:
    pickle.dump(models["SVC"], f)

In [43]:
# Serialise the fitted StandardScaler `sc` to ./model/input_scaler.pkl
# ("wb" replaces any existing file at that path).
with open("./model/input_scaler.pkl", "wb") as f:
    pickle.dump(sc, f)