In [91]:
#Import Packages and Functions required
import pandas as pd
import os
import pickle
# import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
# from imblearn.over_sampling  import SMOTE
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
# from sklearn.model_selection import GridSearchCV

# Suppress warnings
warnings.filterwarnings("ignore")


In [2]:
# Import Dataset
# Define the folder path containing CSV files
folder_path = './../har70plus'

# List to hold individual DataFrames
dataframes = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(folder_path, filename)
        # Read the CSV file into a DataFrame
        df_raw = pd.read_csv(file_path)
        # Add a new column with the file name
        df_raw['source_file'] = filename.replace('.csv','')
        # Append the DataFrame to the list
        dataframes.append(df_raw)

# Concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(dataframes, ignore_index=True)

# Convert date time to unix timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['unix_timestamp_ms'] = (df['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1ms')


In [3]:
#Check if any null value is present
df.isna().sum()

timestamp            0
back_x               0
back_y               0
back_z               0
thigh_x              0
thigh_y              0
thigh_z              0
label                0
source_file          0
unix_timestamp_ms    0
dtype: int64

In [4]:
#Sepetrate out the target and features
y = df.iloc[:,7:8]
x = df[df.columns.difference(['label','timestamp']) ]

In [5]:
# Splitting data into training and testing sets (80% training, 20% testing) -- Add stratification as target classes are imbalanced
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)


In [6]:
#EDA
def check_df(data, head=5):
    print("\n------Shape------")
    print(f'Shape     : {df.shape}\n'
          f'Size      : {df.size}\n'
          f'Dimension : {df.ndim}')
    print("\n------Types------")
    print(data.dtypes)
    print("\n------Head------")
    print(data.head(head))
    print("\n------Tail------")
    print(data.tail(head))
    print("\n------Missing Values------")
    print(data.isnull().sum())
    print("\n------Duplicated Values------")
    print(data.duplicated().sum())
    print("\n------Unique Values------")
    print(data.nunique())
    print("\n------Describe------")
    print(data.describe().T)

check_df(x_train)


------Shape------
Shape     : (2259597, 10)
Size      : 22595970
Dimension : 2

------Types------
back_x               float64
back_y               float64
back_z               float64
source_file           object
thigh_x              float64
thigh_y              float64
thigh_z              float64
unix_timestamp_ms      int64
dtype: object

------Head------
           back_x    back_y    back_z source_file   thigh_x   thigh_y  \
268015  -0.924072 -0.091064  0.311768         503  0.312500 -0.078369   
2087578 -0.841309 -0.079834 -0.106689         517 -0.845947 -0.010986   
946707  -0.932129  0.003418 -0.328613         508 -0.988281  0.041748   
1312161 -0.924561  0.027832 -0.103027         511 -1.001221 -0.132324   
519686  -0.729492 -0.171875  0.593750         505  0.098145 -0.079590   

          thigh_z  unix_timestamp_ms  
268015  -1.047119      1617029603659  
2087578  0.213623      1623149048927  
946707  -0.144531      1618585773845  
1312161 -0.077637      1620384409703  
519

In [7]:
# Seperate out the num and cat features
cat_cols = ['source_file']
num_cols = [cols for cols in x.columns if cols not in cat_cols]

In [8]:
#Check for imbalance in dataset
y['label'].value_counts()

label
1    1079312
7     483452
6     418055
8     203182
3      66058
5       4978
4       4560
Name: count, dtype: int64

In [54]:
#Feature Engineering

#Checking for linearity between target and features using linear regression analysis
# One-hot encode the categorical target variable
y_train_encoded = pd.get_dummies(y_train['label'], prefix='label')

# Function to fit linear regression and return p-values
def get_pvalues(feature, y_encoded):
    X = sm.add_constant(y_encoded)
    model = sm.OLS(feature, X).fit()
    return model.pvalues[1:]  # Exclude the constant

# Dictionary to store p-values
pvalues = {}

# Calculate p-values for each numerical feature
for col in x_train.columns:
    pvalues[col] = get_pvalues(x_train[col].astype(float), y_train_encoded.astype(float))

# Convert p-values dictionary to DataFrame
pvalues_df = pd.DataFrame(pvalues)

# Define a significance level
significance_level = 0.05

# Determine which features have significant p-values for any category
linear_features = [col for col in pvalues_df.columns if any(pvalues_df[col] < significance_level)]
non_linear_features = [col for col in x_train.columns if col not in linear_features]


In [57]:
# # Generate polynomial features as all my features are non linear
# poly = PolynomialFeatures(degree=2, include_bias=False)
# x_train = poly.fit_transform(x_train)
# x_test = poly.transform(x_test)

In [94]:
# Standardize the features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [95]:
# One-vs-Rest strategy with Logistic Regression
log_reg = LogisticRegression(max_iter=10000, penalty='l2')
ovr_classifier = OneVsRestClassifier(log_reg)

# Fit the model
ovr_classifier.fit(x_train, y_train['label'])

In [96]:
predictions_test = ovr_classifier.predict(x_test)
np.unique(predictions_test, return_counts=True)

(array(['1', '6', '7', '8'], dtype='<U1'),
 array([304967,   7500,  98635,  40818], dtype=int64))

In [97]:
# Evaluate the model with test data

# Make predictions on the test set
predictions_test = ovr_classifier.predict(x_test)
predictions_test = predictions_test.astype(int)

#Calculate accuracy
accuracy = accuracy_score(y_test, predictions_test)

# Calculate precision
precision = precision_score(y_test, predictions_test, average='weighted')

# Calculate recall
recall = recall_score(y_test, predictions_test, average='weighted')

# Calculate F1-score
f1 = f1_score(y_test, predictions_test, average='weighted')

# Calculate ROC-AUC (for multiclass classification, you need to use one-vs-all strategy)
roc_auc = roc_auc_score(y_test, ovr_classifier.predict_proba(x_test), average='weighted', multi_class='ovr')

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, predictions_test)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC-AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)

# # Calculate class-wise accuracy
# class_wise_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

# # Print class-wise accuracy
# for i, accuracy in enumerate(class_wise_accuracy):
#     print(f"Class {i} Accuracy: {accuracy:.4f}")

Accuracy: 0.7612431403788281
Precision: 0.6825843319684788
Recall: 0.7612431403788281
F1-score: 0.6851284624753377
ROC-AUC: 0.8647068067324104
Confusion Matrix:
[[208219      0      0      0   4767   2850     27]
 [ 13185      0      0      0     25      2      0]
 [   865      0      0      0      5     42      0]
 [   984      0      0      0      7      5      0]
 [ 80919      0      0      0   2692      0      0]
 [   781      0      0      0      4  94112   1793]
 [    14      0      0      0      0   1624  38998]]
