<a href="https://colab.research.google.com/github/MSchukking/FirstRepo/blob/main/240719_2049_interview_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

In [16]:
# Load the dataset: Need to upload the file first to this Jupyter Notebook
# environment
data = pd.read_csv('/dataset_assignment_2.csv')

In [17]:
# Display the first few rows of the dataset
print(data.head())

   User Activity  Activity_Number         Window_Start           Window_End  \
0     7  walking                9  2024-04-11 09:33:43  2024-04-11 09:33:48   
1     7  walking                9  2024-04-11 09:33:44  2024-04-11 09:33:49   
2     7  walking                9  2024-04-11 09:33:45  2024-04-11 09:33:50   
3     7  walking                9  2024-04-11 09:33:46  2024-04-11 09:33:51   
4     7  walking                9  2024-04-11 09:33:47  2024-04-11 09:33:52   

     Mean_x    Mean_y    Mean_z     Std_x     Std_y  ...  PSD_ratio_1_y  \
0 -5.119725  8.190490  1.447585  3.506086  3.752261  ...       0.092969   
1 -5.179565  8.145401  1.610093  3.574599  3.718254  ...       0.092848   
2 -5.145065  8.205241  1.611283  3.585068  3.737471  ...       0.091979   
3 -5.230602  8.196438  1.599386  3.596076  3.690269  ...       0.089673   
4 -5.341122  8.213688  1.540617  3.597734  3.614020  ...       0.085312   

   PSD_ratio_3_y  PSD_ratio_5_y  PSD_ratio_10_y  PSD_ratio_1_z  PSD_ratio_

In [18]:
# 'Activity' is the target variable and the rest are features:
X = data.drop(columns=['Activity', 'Activity_Number'])
y = data['Activity']

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

In [28]:
# Preprocessing pipeline for numerical data:
# 1. SimpleImputer(): Here it replaces missing values with the mean
# 2. StandardScaler(): Performs z-score normalization / feature scaling so
# that each feature contributes equally to the model's performance.
# it:
# A. Centers the data by substracting the mean of the feature from each data point
# B. Scaling to Unit Variance by dividing each feature by its standard deviation
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
    ,('scaler', StandardScaler())])

In [29]:
# Preprocessing pipeline for categorical data:
# 1. SimpleImputer(): Here it replaces missing values with the most frequent value
# 2. OneHotEncoder() transforms string labels into binary so that the
# machine learning algorithm can deal with categorical data.
# * handle_unknown = 'ignore' ensures that unknown categories in the test are
# ignored instead of causing an error
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
    ,('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [21]:
# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols)
        ,('cat', categorical_transformer, categorical_cols)
])

In [22]:
# Create the full pipeline with a RandomForestClassifier
model = Pipeline(steps = [
    ('preprocessor', preprocessor)
    ,('classifier', RandomForestClassifier(random_state=42))
])

In [23]:
# Train the model
model.fit(X_train, y_train)

In [24]:
# Predict on the test set
y_pred = model.predict(X_test)

In [27]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9793333333333333
              precision    recall  f1-score   support

     cycling       0.99      0.94      0.97       679
     running       1.00      0.98      0.99       553
     sitting       0.97      1.00      0.98       843
     walking       0.97      0.99      0.98       925

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

