In [14]:
#Import Packages and Functions required
import pandas as pd
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling  import SMOTE
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint

# Suppress warnings
warnings.filterwarnings("ignore")


In [15]:
# Import Dataset
# Define the folder path containing CSV files
folder_path = './../har70plus'

# List to hold individual DataFrames
dataframes = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(folder_path, filename)
        # Read the CSV file into a DataFrame
        df_raw = pd.read_csv(file_path)
        # Add a new column with the file name
        df_raw['source_file'] = filename.replace('.csv','')
        # Append the DataFrame to the list
        dataframes.append(df_raw)

# Concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(dataframes, ignore_index=True)

# Convert date time to unix timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['unix_timestamp_ms'] = (df['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1ms')


In [16]:
#Check if any null value is present
df.isna().sum()

timestamp            0
back_x               0
back_y               0
back_z               0
thigh_x              0
thigh_y              0
thigh_z              0
label                0
source_file          0
unix_timestamp_ms    0
dtype: int64

In [17]:
#Sepetrate out the target and features
y = df.iloc[:,7:8]
x = df[df.columns.difference(['label','timestamp']) ]

In [18]:
# Splitting data into training and testing sets (80% training, 20% testing) -- Add stratification as target classes are imbalanced
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# Validation set not required as we have Cross Validation in Grid Search CV
# # Splitting training set into training and validation sets (75% training, 25% validation) -- Add stratification as target classes are imbalanced
# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

In [19]:
x_train.shape, x_test.shape, x.shape

((1807677, 8), (451920, 8), (2259597, 8))

In [20]:
#EDA
def check_df(data, head=5):
    print("\n------Shape------")
    print(f'Shape     : {df.shape}\n'
          f'Size      : {df.size}\n'
          f'Dimension : {df.ndim}')
    print("\n------Types------")
    print(data.dtypes)
    print("\n------Head------")
    print(data.head(head))
    print("\n------Tail------")
    print(data.tail(head))
    print("\n------Missing Values------")
    print(data.isnull().sum())
    print("\n------Duplicated Values------")
    print(data.duplicated().sum())
    print("\n------Unique Values------")
    print(data.nunique())
    print("\n------Describe------")
    print(data.describe().T)

check_df(x_train)


------Shape------
Shape     : (2259597, 10)
Size      : 22595970
Dimension : 2

------Types------
back_x               float64
back_y               float64
back_z               float64
source_file           object
thigh_x              float64
thigh_y              float64
thigh_z              float64
unix_timestamp_ms      int64
dtype: object

------Head------
           back_x    back_y    back_z source_file   thigh_x   thigh_y  \
268015  -0.924072 -0.091064  0.311768         503  0.312500 -0.078369   
2087578 -0.841309 -0.079834 -0.106689         517 -0.845947 -0.010986   
946707  -0.932129  0.003418 -0.328613         508 -0.988281  0.041748   
1312161 -0.924561  0.027832 -0.103027         511 -1.001221 -0.132324   
519686  -0.729492 -0.171875  0.593750         505  0.098145 -0.079590   

          thigh_z  unix_timestamp_ms  
268015  -1.047119      1617029603659  
2087578  0.213623      1623149048927  
946707  -0.144531      1618585773845  
1312161 -0.077637      1620384409703  
519

In [21]:
# Seperate out the num and cat features
cat_cols = ['source_file']
num_cols = [cols for cols in x.columns if cols not in cat_cols]

In [22]:
#Check for imbalance in dataset
y['label'].value_counts()

label
1    1079312
7     483452
6     418055
8     203182
3      66058
5       4978
4       4560
Name: count, dtype: int64

In [23]:
# #Feature Engineering

# #use rf to get feature inportances
# rf_classifier_fi = RandomForestClassifier(n_estimators=100, random_state=42, verbose=0)
# rf_classifier_fi.fit(x_train, y_train)
# feature_importance = rf_classifier_fi.feature_importances_
# feature_importance_df = pd.DataFrame({'Feature':x_train.columns, 'Feature_importance':feature_importance}).sort_values(by='Feature_importance', ascending=False)
# feature_importance_df

In [24]:
#Selecting fetaures who importance more tha 0.05
x_train = x_train[x_train.columns.difference(['source_file'])]
# x_val = x_val[x_val.columns.difference(['source_file'])]
x_test = x_test[x_test.columns.difference(['source_file'])]

In [25]:
# Resample the training data using SMOTE
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

In [26]:
#Apply Random Forest to the data with Grid search CV to get optimal hyperparameters 
# Define the parameter distributions
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 75, 100],     
    'max_depth': [None, 10, 20]
}

# Initialize the Random Forest classifier with class weights
rf_classifier = RandomForestClassifier(random_state=42, class_weight='balanced')

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring='f1_weighted')
grid_search.fit(x_train, y_train)


# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Get the best model
best_model = grid_search.best_estimator_


Best parameters: {'max_depth': None, 'n_estimators': 75}


In [None]:
# #Apply Random Forest to the data
# # Initialize your Random Forest model with class weights
# rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42, verbose=0, max_depth=10 ,class_weight='balanced')

# # Train the model on the selected features
# rf_classifier.fit(x_resampled, y_resampled)
# # Make predictions on the validation set
# predictions = rf_classifier.predict(x_val)

In [32]:
# # Evaluate the model with validation data

# #Calculate accuracy
# accuracy = accuracy_score(y_val, predictions)

# # Calculate precision
# precision = precision_score(y_val, predictions, average='weighted')

# # Calculate recall
# recall = recall_score(y_val, predictions, average='weighted')

# # Calculate F1-score
# f1 = f1_score(y_val, predictions, average='weighted')

# # Calculate ROC-AUC (for multiclass classification, you need to use one-vs-all strategy)
# roc_auc = roc_auc_score(y_val, rf_classifier.predict_proba(x_val), average='weighted', multi_class='ovr')

# # Generate confusion matrix
# conf_matrix = confusion_matrix(y_val, predictions)

# print("Accuracy:", accuracy)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1-score:", f1)
# print("ROC-AUC:", roc_auc)
# print("Confusion Matrix:")
# print(conf_matrix)

# # # Calculate class-wise accuracy
# # class_wise_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

# # # Print class-wise accuracy
# # for i, accuracy in enumerate(class_wise_accuracy):
# #     print(f"Class {i} Accuracy: {accuracy:.4f}")


In [30]:
# Evaluate the model with test data

# Make predictions on the test set
predictions_test = best_model.predict(x_test)

#Calculate accuracy
accuracy = accuracy_score(y_test, predictions_test)

# Calculate precision
precision = precision_score(y_test, predictions_test, average='weighted')

# Calculate recall
recall = recall_score(y_test, predictions_test, average='weighted')

# Calculate F1-score
f1 = f1_score(y_test, predictions_test, average='weighted')

# Calculate ROC-AUC (for multiclass classification, you need to use one-vs-all strategy)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(x_test), average='weighted', multi_class='ovr')

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, predictions_test)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC-AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)

# # Calculate class-wise accuracy
# class_wise_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

# # Print class-wise accuracy
# for i, accuracy in enumerate(class_wise_accuracy):
#     print(f"Class {i} Accuracy: {accuracy:.4f}")

Accuracy: 0.954635776243583
Precision: 0.9525493084277606
Recall: 0.954635776243583
F1-score: 0.9477249139335647
ROC-AUC: 0.9953934904777091
Confusion Matrix:
[[212801    301     18      9   2708     26      0]
 [  7660   3168      6      2   2376      0      0]
 [   569      4    319      5     14      1      0]
 [   740      0     10    236     10      0      0]
 [  5681    311      2      0  77617      0      0]
 [    35      0      0      0      0  96653      2]
 [     0      0      0      0      0     11  40625]]


In [31]:
#Save the trained model using pickle
model_filename = 'HAR_Random_Forest_Model.pkl'
with open(model_filename,'wb') as file:
    pickle.dump(rf_classifier,file)

print(f"Model saved to {model_filename}")

Model saved to HAR_Random_Forest_Model.pkl
