In [14]:
#Import Packages and Functions required
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling  import SMOTE
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Suppress warnings
warnings.filterwarnings("ignore")


In [15]:
# Import Dataset
# Define the folder path containing CSV files
folder_path = './../har70plus'

# List to hold individual DataFrames
dataframes = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(folder_path, filename)
        # Read the CSV file into a DataFrame
        df_raw = pd.read_csv(file_path)
        # Add a new column with the file name
        df_raw['source_file'] = filename.replace('.csv','')
        # Append the DataFrame to the list
        dataframes.append(df_raw)

# Concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(dataframes, ignore_index=True)

# Convert date time to unix timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['unix_timestamp_ms'] = (df['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1ms')


In [16]:
#Check if any null value is present
df.isna().sum()

timestamp            0
back_x               0
back_y               0
back_z               0
thigh_x              0
thigh_y              0
thigh_z              0
label                0
source_file          0
unix_timestamp_ms    0
dtype: int64

In [17]:
#Sepetrate out the target and features
y = df.iloc[:,7:8]
x = df[df.columns.difference(['label','timestamp']) ]

In [18]:
# Splitting data into training and testing sets (80% training, 20% testing) -- Add stratification as target classes are imbalanced
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# Splitting training set into training and validation sets (75% training, 25% validation) -- Add stratification as target classes are imbalanced
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

In [19]:
x_train.shape, x_val.shape, x_test.shape, x.shape

((1355757, 8), (451920, 8), (451920, 8), (2259597, 8))

In [20]:
#EDA
def check_df(data, head=5):
    print("\n------Shape------")
    print(f'Shape     : {df.shape}\n'
          f'Size      : {df.size}\n'
          f'Dimension : {df.ndim}')
    print("\n------Types------")
    print(data.dtypes)
    print("\n------Head------")
    print(data.head(head))
    print("\n------Tail------")
    print(data.tail(head))
    print("\n------Missing Values------")
    print(data.isnull().sum())
    print("\n------Duplicated Values------")
    print(data.duplicated().sum())
    print("\n------Unique Values------")
    print(data.nunique())
    print("\n------Describe------")
    print(data.describe().T)

check_df(x_train)


------Shape------
Shape     : (2259597, 10)
Size      : 22595970
Dimension : 2

------Types------
back_x               float64
back_y               float64
back_z               float64
source_file           object
thigh_x              float64
thigh_y              float64
thigh_z              float64
unix_timestamp_ms      int64
dtype: object

------Head------
           back_x    back_y    back_z source_file   thigh_x   thigh_y  \
1751225 -0.381592  0.109619  0.917236         515  0.034424  0.045166   
2114166 -0.995361  0.064453 -0.018555         517 -0.941895 -0.047363   
1526276 -0.866455 -0.076172 -0.139648         513 -0.525635  0.170654   
1914772 -0.951172 -0.058838 -0.399414         516 -0.983643  0.191406   
918631  -0.533936  0.056396 -0.272705         508 -0.704834  0.236572   

          thigh_z  unix_timestamp_ms  
1751225 -0.995850      1622109167299  
2114166  0.138184      1623149583766  
1526276  0.019287      1621597004619  
1914772 -0.001465      1622469804180  
918

In [21]:
# Seperate out the num and cat features
cat_cols = ['source_file']
num_cols = [cols for cols in x.columns if cols not in cat_cols]

In [22]:
#Check for imbalance in dataset
y['label'].value_counts()

label
1    1079312
7     483452
6     418055
8     203182
3      66058
5       4978
4       4560
Name: count, dtype: int64

In [23]:
# #Feature Engineering

# #use rf to get feature inportances
# rf_classifier_fi = RandomForestClassifier(n_estimators=100, random_state=42, verbose=0)
# rf_classifier_fi.fit(x_train, y_train)
# feature_importance = rf_classifier_fi.feature_importances_
# feature_importance_df = pd.DataFrame({'Feature':x_train.columns, 'Feature_importance':feature_importance}).sort_values(by='Feature_importance', ascending=False)
# feature_importance_df

In [24]:
#Selecting fetaures who importance more tha 0.05
x_train = x_train[x_train.columns.difference(['source_file'])]
x_val = x_val[x_val.columns.difference(['source_file'])]
x_test = x_test[x_test.columns.difference(['source_file'])]

In [25]:
# Resample the training data using SMOTE
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

In [26]:
# #Apply Random Forest to the data
# # Define the parameter distributions
# param_dist = {
#     'n_estimators': randint(50, 100),     
#     'max_depth': [None, 10, 20]
# }

# # Initialize the Random Forest classifier with class weights
# rf_classifier = RandomForestClassifier(random_state=42, class_weight='balanced')

# # Perform RandomizedSearchCV
# random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist, n_iter=10, cv=3, scoring='f1_weighted', random_state=42)
# random_search.fit(x_train, y_train)

# # Get the best parameters
# best_params = random_search.best_params_
# print("Best parameters:", best_params)

# # Get the best model
# best_model = random_search.best_estimator_

# # Evaluate the best model on the test set
# predictions = best_model.predict(x_val)

In [27]:
#Apply Random Forest to the data
# Initialize your Random Forest model with class weights
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, verbose=0, class_weight='balanced')

# Train the model on the selected features
rf_classifier.fit(x_resampled, y_resampled)
# Make predictions on the validation set
predictions = rf_classifier.predict(x_val)

In [29]:
# Evaluate the model with validation data

#Calculate accuracy
accuracy = accuracy_score(y_val, predictions)

# Calculate precision
precision = precision_score(y_val, predictions, average='weighted')

# Calculate recall
recall = recall_score(y_val, predictions, average='weighted')

# Calculate F1-score
f1 = f1_score(y_val, predictions, average='weighted')

# Calculate ROC-AUC (for multiclass classification, you need to use one-vs-all strategy)
roc_auc = roc_auc_score(y_val, rf_classifier.predict_proba(x_val), average='weighted', multi_class='ovr')

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC-AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)

# # Calculate class-wise accuracy
# class_wise_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

# # Print class-wise accuracy
# for i, accuracy in enumerate(class_wise_accuracy):
#     print(f"Class {i} Accuracy: {accuracy:.4f}")


Accuracy: 0.9486059479553903
Precision: 0.9594313675490554
Recall: 0.9486059479553903
F1-score: 0.9527433686734739
ROC-AUC: 0.9960321060192917
Confusion Matrix:
[[203297   8564    261    474   3234     33      0]
 [  2374   9682     21     53   1081      0      0]
 [   156     15    701     31      8      1      0]
 [   278     38     16    650     13      0      0]
 [  2617   3860     30     38  77066      0      0]
 [    26      0      0      0      0  96664      1]
 [     0      0      0      0      0      3  40634]]


In [30]:
# Evaluate the model with test data

# Make predictions on the test set
predictions_test = rf_classifier.predict(x_test)

#Calculate accuracy
accuracy = accuracy_score(y_test, predictions_test)

# Calculate precision
precision = precision_score(y_test, predictions_test, average='weighted')

# Calculate recall
recall = recall_score(y_test, predictions_test, average='weighted')

# Calculate F1-score
f1 = f1_score(y_test, predictions_test, average='weighted')

# Calculate ROC-AUC (for multiclass classification, you need to use one-vs-all strategy)
roc_auc = roc_auc_score(y_test, rf_classifier.predict_proba(x_test), average='weighted', multi_class='ovr')

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, predictions_test)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC-AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)

# # Calculate class-wise accuracy
# class_wise_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

# # Print class-wise accuracy
# for i, accuracy in enumerate(class_wise_accuracy):
#     print(f"Class {i} Accuracy: {accuracy:.4f}")

Accuracy: 0.9486568419189237
Precision: 0.9591083797767159
Recall: 0.9486568419189237
F1-score: 0.9526717971544484
ROC-AUC: 0.9959563283569196
Confusion Matrix:
[[203569   8287    274    479   3225     29      0]
 [  2458   9594     17     51   1092      0      0]
 [   167     17    697     24      7      0      0]
 [   281     22     22    665      6      0      0]
 [  2685   3952     33     41  76900      0      0]
 [    28      0      0      0      0  96660      2]
 [     0      0      0      0      0      4  40632]]
