In [73]:
#Import Packages and Fumctions required
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [44]:
# Import Dataset
# Define the folder path containing CSV files
folder_path = 'har70plus'

# List to hold individual DataFrames
dataframes = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(folder_path, filename)
        # Read the CSV file into a DataFrame
        df_raw = pd.read_csv(file_path)
        # Add a new column with the file name
        df_raw['source_file'] = filename.replace('.csv','')
        # Append the DataFrame to the list
        dataframes.append(df_raw)

# Concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(dataframes, ignore_index=True)

# Convert date time to unix timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['unix_timestamp_ms'] = (df['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1ms')


In [None]:
#Check if any null value is present
df.isna().sum()

In [45]:
#Sepetrate out the target and features
y = df.iloc[:,7:8]
x = df[df.columns.difference(['label','timestamp']) ]

In [46]:
# Splitting data into training and testing sets (80% training, 20% testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Splitting training set into training and validation sets (75% training, 25% validation)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42)

In [50]:
#EDA
def check_df(data, head=5):
    print("\n------Shape------")
    print(f'Shape     : {df.shape}\n'
          f'Size      : {df.size}\n'
          f'Dimension : {df.ndim}')
    print("\n------Types------")
    print(data.dtypes)
    print("\n------Head------")
    print(data.head(head))
    print("\n------Tail------")
    print(data.tail(head))
    print("\n------Missing Values------")
    print(data.isnull().sum())
    print("\n------Duplicated Values------")
    print(data.duplicated().sum())
    print("\n------Unique Values------")
    print(data.nunique())
    print("\n------Describe------")
    print(data.describe().T)

check_df(x_train)


------Shape------
Shape     : (2259597, 10)
Size      : 22595970
Dimension : 2

------Types------
back_x               float64
back_y               float64
back_z               float64
source_file           object
thigh_x              float64
thigh_y              float64
thigh_z              float64
unix_timestamp_ms      int64
dtype: object

------Head------
           back_x    back_y    back_z source_file   thigh_x   thigh_y  \
1390278 -0.993896  0.056152 -0.073730         512 -0.980225 -0.054443   
1630365 -0.919922 -0.186279 -0.579834         514 -0.965576  0.178467   
2160575 -0.676758 -0.063965  0.725098         518  0.002930  0.001221   
2113868 -0.995361  0.053223 -0.074463         517 -0.921631 -0.031250   
1703861 -0.978760  0.044434  0.240234         515 -0.216064  0.231934   

          thigh_z  unix_timestamp_ms  
1390278 -0.171875      1620403139500  
1630365 -0.153076      1621607461200  
2160575 -0.999268      1623407943358  
2113868  0.202637      1623149577806  
170

In [51]:
# Seperate out the num and cat features
cat_cols = ['source_file']
num_cols = [cols for cols in x.columns if cols not in cat_cols]

In [70]:
x.head()

Unnamed: 0,back_x,back_y,back_z,source_file,thigh_x,thigh_y,thigh_z,unix_timestamp_ms
0,-0.999023,-0.063477,0.140625,501,-0.980469,-0.112061,-0.048096,1616596923839
1,-0.980225,-0.079346,0.140625,501,-0.961182,-0.121582,-0.051758,1616596923859
2,-0.950195,-0.076416,0.140625,501,-0.949463,-0.080566,-0.067139,1616596923880
3,-0.954834,-0.059082,0.140381,501,-0.95752,-0.046143,-0.050781,1616596923900
4,-0.972412,-0.042969,0.142822,501,-0.977051,-0.023682,-0.026611,1616596923920


In [55]:
#Check for imbalance in dataset
y['label'].value_counts()

label
1    1079312
7     483452
6     418055
8     203182
3      66058
5       4978
4       4560
Name: count, dtype: int64

In [62]:
#Feature Engineering

#use rf to get feature inportances
rf_classifier_fi = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_fi.fit(x_train, y_train)
feature_importance = rf_classifier_fi.feature_importances_
feature_importance_df = pd.DataFrame({'Feature':x_train.columns, 'Feature_importance':feature_importance}).sort_values(by='Feature_importance', ascending=False)

  return fit_method(estimator, *args, **kwargs)


In [75]:
#Selecting fetaures who importance more tha 0.05
x_train = x_train[x_train.columns.difference(['source_file'])]
x_val = x_val[x_val.columns.difference(['source_file'])]

In [78]:
#Apply Random Forest to the data
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the selected features
rf_classifier.fit(x_train, y_train)
# Make predictions on the test set
predictions = rf_classifier.predict(x_val)

  return fit_method(estimator, *args, **kwargs)


In [79]:
# Evaluate the model

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.329264029031687
