<a href="https://colab.research.google.com/github/MSchukking/FirstRepo/blob/main/240720_2154_interview_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

In [3]:
# Load the dataset: Need to upload the file first to this Jupyter Notebook
# environment
data = pd.read_csv('/content/dataset_assignment_2.csv')

In [7]:
# Display the first few rows of the dataset
print(data.head())

   User Activity  Activity_Number         Window_Start           Window_End  \
0     7  walking                9  2024-04-11 09:33:43  2024-04-11 09:33:48   
1     7  walking                9  2024-04-11 09:33:44  2024-04-11 09:33:49   
2     7  walking                9  2024-04-11 09:33:45  2024-04-11 09:33:50   
3     7  walking                9  2024-04-11 09:33:46  2024-04-11 09:33:51   
4     7  walking                9  2024-04-11 09:33:47  2024-04-11 09:33:52   

     Mean_x    Mean_y    Mean_z     Std_x     Std_y  ...  PSD_ratio_1_y  \
0 -5.119725  8.190490  1.447585  3.506086  3.752261  ...       0.092969   
1 -5.179565  8.145401  1.610093  3.574599  3.718254  ...       0.092848   
2 -5.145065  8.205241  1.611283  3.585068  3.737471  ...       0.091979   
3 -5.230602  8.196438  1.599386  3.596076  3.690269  ...       0.089673   
4 -5.341122  8.213688  1.540617  3.597734  3.614020  ...       0.085312   

   PSD_ratio_3_y  PSD_ratio_5_y  PSD_ratio_10_y  PSD_ratio_1_z  PSD_ratio_

In [4]:
# 'Activity' with 'Activity_number' are the target variables:
X = data.drop(columns=['Activity', 'Activity_Number'])
y = data['Activity']

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

In [7]:
# Preprocessing pipeline for numerical data:
# 1. SimpleImputer(): Here it replaces missing values with the mean
# 2. StandardScaler(): Performs z-score normalization / feature scaling so
# that each feature contributes equally to the model's performance.
# it:
# A. Centers the data by substracting the mean of the feature from each data point
# B. Scaling to Unit Variance by dividing each feature by its standard deviation
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
    ,('scaler', StandardScaler())])

In [8]:
# Preprocessing pipeline for categorical data:
# 1. SimpleImputer(): Here it replaces missing values with the most frequent value
# 2. OneHotEncoder() transforms string labels into binary so that the
# machine learning algorithm can deal with categorical data.
# * handle_unknown = 'ignore' ensures that unknown categories in the test are
# ignored instead of causing an error
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
    ,('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [9]:
# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols)
        ,('cat', categorical_transformer, categorical_cols)
])

In [10]:
# Create the full pipeline with a RandomForestClassifier
model = Pipeline(steps = [
    ('preprocessor', preprocessor)
    ,('classifier', RandomForestClassifier(random_state=42))
])

In [11]:
# Train the model
model.fit(X_train, y_train)

In [12]:
# Predict on the test set
y_pred = model.predict(X_test)

In [13]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9789144865286997
              precision    recall  f1-score   support

     cycling       0.98      0.97      0.97       610
     running       0.99      0.98      0.99       553
     sitting       0.97      0.99      0.98       747
     walking       0.97      0.98      0.98       651

    accuracy                           0.98      2561
   macro avg       0.98      0.98      0.98      2561
weighted avg       0.98      0.98      0.98      2561



In [14]:
# ------------------------------------------------------------------------------
# Below here is for further model improvement
# - Analyzing the samples that were misclassified
# - And investigating the feature values that might have contributed to the misclassification
# ------------------------------------------------------------------------------
# Step 1. Identify wrongly predicted samples: Compare the predicted samples (y_pred)
# with the true labels (y_test) to identify the misclassified samples

# Create a boolean mask for wrongly predicted samples
wrong_predictions = y_pred != y_test
print(wrong_predictions.head())

3374     False
1074     False
8744     False
10977    False
2294     False
Name: Activity, dtype: bool


In [15]:
# Step 2. Extract wrongly predicted samples and their feature values
# Use the boolean mask to filter out the misclassified samples and their
# corresponding feature values

# Extract the indices of wrongly predicted samples
wrong_indices = np.where(wrong_predictions)[0]

# Get the wrongly predicted samples and their true values
wrong_samples = X_test.iloc[wrong_indices]
wrong_true_labels = y_test.iloc[wrong_indices]
wrong_predicted_lables = y_pred[wrong_indices]

In [17]:
# Step 3. Combine and display the results
# Combine the wrongly predicted samples, their true lables and predicted labels
# into a single DataFrame for better visualization

# Create a DataFrame and display the results
wrong_predictions_df = wrong_samples.copy()
wrong_predictions_df['True Label'] = wrong_true_labels.values
wrong_predictions_df['Predicted Label'] = wrong_predicted_lables
print(wrong_predictions_df.head())

       User         Window_Start           Window_End    Mean_x    Mean_y  \
12188    11  2024-04-25 18:02:32  2024-04-25 18:02:37 -5.177543  9.125804   
12750    20  2024-04-16 15:10:59  2024-04-16 15:11:04 -5.300673 -3.300251   
9989     28  2024-04-27 23:01:48  2024-04-27 23:01:53 -8.842307 -0.395683   
12193    11  2024-04-25 18:02:37  2024-04-25 18:02:42 -6.775264 -3.918044   
3743     16  2024-04-20 09:47:03  2024-04-20 09:47:08 -5.903120  7.155362   

         Mean_z     Std_x     Std_y     Std_z  Skewness_x  ...  PSD_ratio_5_y  \
12188 -0.236862  2.264881  2.337203  1.848208   -1.932785  ...       0.002066   
12750  1.127684  3.440438  3.993118  6.466993   -0.260551  ...       0.017164   
9989  -4.286602  0.135897  0.082636  0.113694   -0.615227  ...       0.007462   
12193 -0.355353  3.985229  5.394687  3.153733   -1.417723  ...       0.006217   
3743   0.518575  3.238599  5.552699  2.654547   -0.141053  ...       0.006581   

       PSD_ratio_10_y  PSD_ratio_1_z  PSD_ratio_3_

In [18]:
# The model performed worst for cycling:
# Get a DataFrame with only True Label = cycling:
cycling_wrong_predictions = wrong_predictions_df[wrong_predictions_df['True Label'] == 'cycling']

# RESEARCH QUESTION: As what activity is cycling wrongly predicted?
predicted_label_counts = cycling_wrong_predictions['Predicted Label'].value_counts()
print(predicted_label_counts)

# RESEARCH QUESTION: For how many users did these wrong predicted cycle activity occur?
sorted_cycling_wrong_predictions = cycling_wrong_predictions.sort_values(by=['User'])
user_ids_with_wrong_predictions = sorted_cycling_wrong_predictions['User'].unique()
print(user_ids_with_wrong_predictions)

Predicted Label
walking    11
sitting     7
running     2
Name: count, dtype: int64
[ 2 10 11 13]


In [19]:
# RESEARCH QUESTION: How many users are there in total in the test set?
print(y_test.head())
print(y_pred)
print(X_test.head())
sorted_X_test = X_test.sort_values(by=['User'])
total_unique_test_users = sorted_X_test['User'].unique()
print(total_unique_test_users)

3374     sitting
1074     walking
8744     running
10977    sitting
2294     sitting
Name: Activity, dtype: object
['sitting' 'walking' 'running' ... 'sitting' 'sitting' 'sitting']
       User         Window_Start           Window_End     Mean_x    Mean_y  \
3374      7  2024-04-11 09:17:49  2024-04-11 09:17:54  -5.137213 -8.383572   
1074     35  2024-04-25 11:19:42  2024-04-25 11:19:47  -4.274944 -2.755027   
8744     28  2024-04-25 22:45:14  2024-04-25 22:45:19 -10.249206  4.855025   
10977     3  2024-04-16 13:00:48  2024-04-16 13:00:53  -5.403103 -2.875540   
2294     24  2024-04-15 11:24:10  2024-04-15 11:24:15  -2.671632 -7.574957   

         Mean_z      Std_x      Std_y     Std_z  Skewness_x  ...  \
3374  -0.752463   0.129860   0.101844  0.153220   -0.823633  ...   
1074   2.141160   3.760148   7.272409  2.260016   -0.985152  ...   
8744   1.309822  15.966101  11.079219  6.965885   -0.844526  ...   
10977  7.830854   0.026034   0.033868  0.024469   -0.504934  ...   
2294   2.7

In [20]:
# RESEARCH QUESTION: How many users were there in the training set?
sorted_X_train = X_train.sort_values(by=['User'])
total_unique_train_users = sorted_X_train['User'].unique()
print(total_unique_train_users)

[ 2  3  7 10 11 12 13 15 16 20 24 28 35]


In [21]:
# RESEARCH QUESTION: How often do all of the 4 activities occur per user in
# training vs test set?
sorted_data = data.sort_values(by=['User', 'Activity'])
activity_count_per_user = sorted_data.groupby(['User', 'Activity']).size().unstack()
print(activity_count_per_user)


Activity  cycling  running  sitting  walking
User                                        
2           575.0      NaN      NaN      NaN
3             NaN      NaN   1463.0      NaN
7             NaN      NaN    606.0   1879.0
10          775.0      NaN      NaN      NaN
11         1342.0      NaN      NaN    690.0
12            NaN      NaN      NaN    293.0
13          355.0      NaN      NaN      NaN
15            NaN      NaN    657.0    100.0
16            NaN    595.0      NaN      NaN
20            NaN      NaN    414.0      NaN
24            NaN      NaN    595.0      NaN
28            NaN   2171.0      NaN      NaN
35            NaN      NaN      NaN    295.0


In [25]:
# RESEARCH QUESTION: Does User 11 have cycling never wrongly predicted as walking?
user11_wrong_predictions = wrong_predictions_df[wrong_predictions_df['User'] == 11]
sorted_user11_wrong_predictions = user11_wrong_predictions.sort_values(by=['Window_Start'])
print(sorted_user11_wrong_predictions)

user11_data = data[data['User'] == 11]
sorted_user11_data = user11_data.sort_values(by=['Window_Start'])
print(sorted_user11_data)


       User         Window_Start           Window_End    Mean_x    Mean_y  \
7373     11  2024-04-22 17:55:13  2024-04-22 17:55:18 -8.458759 -0.371533   
11526    11  2024-04-25 17:51:30  2024-04-25 17:51:35 -4.591157  3.779091   
12111    11  2024-04-25 18:01:15  2024-04-25 18:01:20 -5.763334  7.707247   
12158    11  2024-04-25 18:02:02  2024-04-25 18:02:07 -7.212822 -3.835957   
12159    11  2024-04-25 18:02:03  2024-04-25 18:02:08 -7.175348 -3.685821   
12188    11  2024-04-25 18:02:32  2024-04-25 18:02:37 -5.177543  9.125804   
12189    11  2024-04-25 18:02:33  2024-04-25 18:02:38 -5.805805  7.551877   
12193    11  2024-04-25 18:02:37  2024-04-25 18:02:42 -6.775264 -3.918044   
12196    11  2024-04-25 18:02:40  2024-04-25 18:02:45 -6.574329  2.265837   
12197    11  2024-04-25 18:02:41  2024-04-25 18:02:46 -5.731094  4.004176   
12215    11  2024-04-25 18:02:59  2024-04-25 18:03:04 -4.224859  3.817993   
12248    11  2024-04-25 18:03:32  2024-04-25 18:03:37 -6.632504  3.357117   

In [35]:
# RESEARCH QUESTION: For each user, what are the min en max Window_start
# date/times and the total time between this min and max?
window_per_user = data.groupby('User').agg({
    'Window_Start': 'min'
    ,'Window_End': 'max'
    })

# Ensure the columns are in datetime format before substraction
window_per_user['Window_Start'] = pd.to_datetime(window_per_user['Window_Start'])
window_per_user['Window_End'] = pd.to_datetime(window_per_user['Window_End'])

# Calculate the total time for each user:
window_per_user['Total_time'] = window_per_user['Window_End'] - window_per_user['Window_Start']

# Rename the columns
window_per_user.rename(columns={
    'Window_Start': 'Min_Window_Start'
    ,'Window_End': 'Max_Window_End'
}, inplace=True)

# Display the windo info per user
print(window_per_user)

        Min_Window_Start      Max_Window_End      Total_time
User                                                        
2    2024-04-18 10:07:20 2024-04-18 10:16:59 0 days 00:09:39
3    2024-04-11 11:45:00 2024-04-16 13:09:59 5 days 01:24:59
7    2024-04-11 09:09:25 2024-04-15 18:52:59 4 days 09:43:34
10   2024-04-16 09:12:00 2024-04-16 09:24:59 0 days 00:12:59
11   2024-04-22 17:50:00 2024-04-25 18:05:59 3 days 00:15:59
12   2024-04-18 09:47:00 2024-04-18 09:51:57 0 days 00:04:57
13   2024-04-10 12:47:00 2024-04-10 12:52:59 0 days 00:05:59
15   2024-04-18 12:07:42 2024-04-19 10:03:59 0 days 21:56:17
16   2024-04-20 09:45:00 2024-04-20 09:54:59 0 days 00:09:59
20   2024-04-16 15:05:00 2024-04-16 15:11:58 0 days 00:06:58
24   2024-04-15 11:15:00 2024-04-15 11:24:59 0 days 00:09:59
28   2024-04-25 22:40:00 2024-04-27 23:17:59 2 days 00:37:59
35   2024-04-25 11:15:00 2024-04-25 11:19:59 0 days 00:04:59
