# Using RandomForestClassifier

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

In [2]:
# Path to the folder containing the training data.
# The loaders below read one subfolder per person label beneath it (e.g. "Gesture/1").
gesture_folder_path = "Gesture" 

In [21]:
# Read every gesture CSV into a DataFrame and concatenate them into a single labelled DataFrame
def load_csv_to_df(gesture_folder_path:str, labels=range(1, 6))->pd.DataFrame:
    """
    Load all gesture CSV files into one DataFrame labelled by person.

    The folder is expected to contain one subfolder per person, named after
    the person's label (e.g. "Gesture/1" ... "Gesture/5"), each holding CSV
    recordings that have at least a "Timestamp" column.

    Parameters:
        - gesture_folder_path (str): Path to the folder containing the per-person subfolders.
        - labels (iterable): Person labels, i.e. subfolder names, to load. Defaults to persons 1 to 5.

    Returns:
        - df (pd.DataFrame): All readings concatenated. Each row carries a "Person" label and a
          "relative_time" column (Timestamp minus the first Timestamp of its own CSV file).
          Every file keeps its own row index, so index values repeat across files.

    Raises:
        - ValueError: If no non-empty CSV file was found for the requested labels.
    """

    dfs = []
    for label in labels:
        # generate path to the person's subfolder, eg "Gesture/1"
        folder_path = os.path.join(gesture_folder_path, str(label))
        # os.listdir order is arbitrary; sort so the row order (and therefore any
        # seeded train/test split made from it) is the same on every machine
        for file_name in sorted(os.listdir(folder_path)):
            # to ignore any DS_Store / hidden file
            if file_name.startswith('.'):
                continue
            df = pd.read_csv(os.path.join(folder_path, file_name))
            # A header-only file has no first reading to be relative to
            if df.empty:
                continue
            # Label each row with the Person Number as the label
            df["Person"] = label
            # Time elapsed since the first reading of this file (vectorized;
            # positional .iloc[0] does not depend on the index labels)
            df["relative_time"] = df["Timestamp"] - df["Timestamp"].iloc[0]
            dfs.append(df)

    if not dfs:
        raise ValueError(f"No CSV readings found under {gesture_folder_path!r}")
    return pd.concat(dfs)

In [22]:
# Load every recording under gesture_folder_path into one labelled DataFrame
df = load_csv_to_df(gesture_folder_path)

# Inspect the last rows as a sanity check on the columns and labels
df.tail()

Unnamed: 0,Timestamp,AccelX,AccelY,AccelZ,GyroX,GyroY,GyroZ,Person,relative_time
2851,150826668708568,-0.280122,3.53863,8.052,1.331534,-0.940274,0.085063,5,2980211645
2852,150826670708368,-0.280122,3.53863,8.052,1.331534,-0.940274,0.085063,5,2982211445
2853,150826670708368,-0.280122,3.53863,8.052,1.33581,-0.978148,0.085674,5,2982211445
2854,150826672708168,-0.289698,3.550601,8.011298,1.33581,-0.978148,0.085674,5,2984211245
2855,150826672708168,-0.289698,3.550601,8.011298,1.338864,-0.995863,0.08262,5,2984211245


In [5]:
df.head()

Unnamed: 0,Timestamp,AccelX,AccelY,AccelZ,GyroX,GyroY,GyroZ,Person,relative_time
0,30779640458668,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,1,0
1,30779642986703,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,1,2528035
2,30779644981203,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,1,4522535
3,30779646981536,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,1,6522868
4,30779648981786,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,1,8523118


In [6]:
# Separate the target from the model inputs
y = df["Person"]  # the person label is what the classifier predicts
X = df.drop(columns=["Person", "Timestamp"])  # everything except the label and the absolute timestamp is a feature

In [7]:
# Inspect the data again 
X.head()

Unnamed: 0,AccelX,AccelY,AccelZ,GyroX,GyroY,GyroZ,relative_time
0,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,0
1,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,2528035
2,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,4522535
3,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,6522868
4,4.419696,7.029675,5.210082,3.634725,-5.124854,-0.266185,8523118


In [8]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Person, dtype: int64

In [8]:
# Split data into training and testing sets (20% held out for testing, fixed seed)
# NOTE(review): the split is made over individual rows, so rows from the same CSV
# recording can presumably land in both the training and the test set. The accuracy
# reported below may therefore overstate performance on unseen recordings;
# consider splitting per recording (e.g. sklearn's GroupShuffleSplit) — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Initialize Random Forest Classifier (100 trees, fixed seed so training is repeatable)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split only
clf.fit(X_train, y_train)

In [10]:
# Predict a person label for every row of the held-out test data
y_pred = clf.predict(X_test)

# Calculate accuracy and confusion matrix
# (sklearn convention: rows are the true labels, columns the predicted labels)
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

# Print accuracy and confusion matrix
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)

Accuracy: 0.9998041882729756
Confusion Matrix:
 [[56635    14     0     0     1]
 [   28 56986     0     0     0]
 [    0     0 57568     3     3]
 [    0     0     0 57074     1]
 [    5     0     0     1 57670]]


In [24]:
# Classify one recording (a single CSV file) by majority vote over its rows
def predict_new_reading(csv_file_path:str, clf:object)->int:
    """
    Predict which person produced the recording stored in one CSV file.

    The file is prepared the same way as the training data (a "relative_time"
    column is added and "Timestamp" is dropped), every row is classified with
    ``clf``, and the most frequent predicted label is returned.

    Parameters:
        - csv_file_path (str): Path to a CSV file with at least a "Timestamp" column.
        - clf (object): Fitted classifier exposing ``predict``.

    Returns:
        - result: The most frequently predicted label as a native Python scalar
          (an int for integer labels). Ties go to the smallest label.

    Raises:
        - ValueError: If the CSV file contains no readings.
    """
    df = pd.read_csv(csv_file_path)
    if df.empty:
        raise ValueError(f"No readings found in {csv_file_path!r}")

    # Time elapsed since the first reading of the file (vectorized; positional
    # .iloc[0] does not depend on the index labels)
    df['relative_time'] = df['Timestamp'] - df['Timestamp'].iloc[0]
    features = df.drop(columns=["Timestamp"])  # "Timestamp" is not a feature

    predictions = np.asarray(clf.predict(features))
    # np.unique returns the labels sorted, so argmax picks the smallest label on a
    # tie (same as bincount + argmax) while also working for non-integer labels
    labels, counts = np.unique(predictions, return_counts=True)
    # .item() converts the numpy scalar to a plain Python value, matching the annotation
    return labels[np.argmax(counts)].item()

# Sanity check on a recording taken from person 3's folder (expected output: 3)
print(predict_new_reading("Gesture/3/1677689464170.csv", clf))

3


In [20]:
# Save the trained model to a file (pickle format, written in binary mode)
filename = 'gesture_random_forest_classifier_model_relative_time.pkl'
with open(filename, 'wb') as file:
    pickle.dump(clf, file)

In [3]:
# Load the trained model from the file; this rebinds `clf`, replacing any
# classifier currently held in the kernel.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
filename = 'gesture_random_forest_classifier_model_relative_time.pkl'
with open(filename, 'rb') as file:
    clf = pickle.load(file)

## Unused functions

In [None]:
def load_csv_to_df_knn(file_path_str):
    """
    Load each gesture CSV as its own DataFrame, with one label per file.

    Parameters:
        - file_path_str (str): Path to the folder containing subfolders "1" to "5"
          (one per person), each holding CSV recordings.

    Returns:
        - X (list of pd.DataFrame): One DataFrame per CSV file, with the "Timestamp" column removed.
        - y (list of int): The person label (subfolder number) of each entry in X.
    """
    X = []
    y = []
    for label in range(1, 6):
        # Use the folder passed by the caller rather than a notebook-level global
        folder_path = f"{file_path_str}/{label}"
        # os.listdir order is arbitrary; sort so the sample order is reproducible
        for file_name in sorted(os.listdir(folder_path)):
            # to ignore any DS_Store / hidden file
            if file_name.startswith('.'):
                continue
            df = pd.read_csv(f"{folder_path}/{file_name}")
            X.append(df.drop("Timestamp", axis=1))  # "Timestamp" is not a feature
            y.append(label)
    return X, y

In [None]:
def load_csv_to_list(file_path_str):
    """
    Load each gesture CSV as a 2-D array of feature values, with one label per file.

    Parameters:
        - file_path_str (str): Path to the folder containing subfolders "1" to "5"
          (one per person), each holding CSV recordings.

    Returns:
        - X (list of np.ndarray): One array per CSV file (rows = readings, columns = the
          CSV columns other than "Timestamp").
        - y (list of int): The person label (subfolder number) of each entry in X.
    """
    X = []
    y = []
    for label in range(1, 6):
        # Use the folder passed by the caller rather than a notebook-level global
        folder_path = f"{file_path_str}/{label}"
        # os.listdir order is arbitrary; sort so the sample order is reproducible
        for file_name in sorted(os.listdir(folder_path)):
            # to ignore any DS_Store / hidden file
            if file_name.startswith('.'):
                continue
            df = pd.read_csv(f"{folder_path}/{file_name}")
            X.append(df.drop(["Timestamp"], axis=1).values)  # "Timestamp" is not a feature
            y.append(label)
    return X, y