<a href="https://colab.research.google.com/github/MSchukking/FirstRepo/blob/main/240721_1610_interview_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Made by: Monique Schukking
Date: Sunday Jul 21, 2024
Purpose: NOWATCH Interview Assignment

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [17]:
# ------------------------------------------------------------------------------
# BUILD MACHINE LEARNING MODEL
# ------------------------------------------------------------------------------

# Load the dataset:
# NOTE: Need to upload the file first to this Jupyter Notebook environment
data = pd.read_csv('/content/dataset_assignment_2.csv')

In [18]:
# Display the first few rows of the dataset
print(data.head())

   User Activity  Activity_Number         Window_Start           Window_End  \
0     7  walking                9  2024-04-11 09:33:43  2024-04-11 09:33:48   
1     7  walking                9  2024-04-11 09:33:44  2024-04-11 09:33:49   
2     7  walking                9  2024-04-11 09:33:45  2024-04-11 09:33:50   
3     7  walking                9  2024-04-11 09:33:46  2024-04-11 09:33:51   
4     7  walking                9  2024-04-11 09:33:47  2024-04-11 09:33:52   

     Mean_x    Mean_y    Mean_z     Std_x     Std_y  ...  PSD_ratio_1_y  \
0 -5.119725  8.190490  1.447585  3.506086  3.752261  ...       0.092969   
1 -5.179565  8.145401  1.610093  3.574599  3.718254  ...       0.092848   
2 -5.145065  8.205241  1.611283  3.585068  3.737471  ...       0.091979   
3 -5.230602  8.196438  1.599386  3.596076  3.690269  ...       0.089673   
4 -5.341122  8.213688  1.540617  3.597734  3.614020  ...       0.085312   

   PSD_ratio_3_y  PSD_ratio_5_y  PSD_ratio_10_y  PSD_ratio_1_z  PSD_ratio_

In [19]:
# 'Activity' with 'Activity_number' are the target variables:
X = data.drop(columns=['Activity', 'Activity_Number'])
y = data['Activity']

In [20]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

In [21]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

In [22]:
# Preprocessing pipeline for numerical data:
# 1. SimpleImputer(): Here it replaces missing values with the mean
# 2. StandardScaler(): Performs z-score normalization / feature scaling so
# that each feature contributes equally to the model's performance.
# it:
# A. Centers the data by substracting the mean of the feature from each data point
# B. Scaling to Unit Variance by dividing each feature by its standard deviation
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
    ,('scaler', StandardScaler())])

In [23]:
# Preprocessing pipeline for categorical data:
# 1. SimpleImputer(): Here it replaces missing values with the most frequent value
# 2. OneHotEncoder() transforms string labels into binary so that the
# machine learning algorithm can deal with categorical data.
# * handle_unknown = 'ignore' ensures that unknown categories in the test are
# ignored instead of causing an error
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
    ,('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [24]:
# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols)
        ,('cat', categorical_transformer, categorical_cols)
])

In [25]:
# Create the full pipeline with a RandomForestClassifier
activity_type_model = Pipeline(steps = [
    ('preprocessor', preprocessor)
    ,('classifier', RandomForestClassifier(random_state=42))
])

In [26]:
# Train the model
activity_type_model.fit(X_train, y_train)

In [27]:
# Predict on the test set
y_pred = activity_type_model.predict(X_test)

In [28]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9793333333333333
              precision    recall  f1-score   support

     cycling       0.99      0.94      0.97       679
     running       1.00      0.98      0.99       553
     sitting       0.97      1.00      0.98       843
     walking       0.97      0.99      0.98       925

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000



In [29]:
# ------------------------------------------------------------------------------
# MODEL IMPROVEMENTS INVESTIGATION
# ------------------------------------------------------------------------------

# Code below is for further model improvement:
# GOAL: Analyzing the samples that were misclassified for cycling

# Step 1. Identify wrongly predicted samples: Compare the predicted samples (y_pred)
# with the true labels (y_test) to identify the misclassified samples

# Create a boolean mask for wrongly predicted samples
wrong_predictions = y_pred != y_test
print(wrong_predictions.head())

4026     False
10193    False
13658    False
756      False
7126     False
Name: Activity, dtype: bool


In [30]:
# Step 2. Extract wrongly predicted samples and their feature values
# Use the boolean mask to filter out the misclassified samples and their
# corresponding feature values

# Extract the indices of wrongly predicted samples
wrong_indices = np.where(wrong_predictions)[0]

# Get the wrongly predicted samples and their true values
wrong_samples = X_test.iloc[wrong_indices]
wrong_true_labels = y_test.iloc[wrong_indices]
wrong_predicted_lables = y_pred[wrong_indices]

In [31]:
# Step 3. Combine and display the results
# Combine the wrongly predicted samples, their true lables and predicted labels
# into a single DataFrame for better visualization

# Create a DataFrame and display the results
wrong_predictions_df = wrong_samples.copy()
wrong_predictions_df['True Label'] = wrong_true_labels.values
wrong_predictions_df['Predicted Label'] = wrong_predicted_lables
print(wrong_predictions_df.head())

       User         Window_Start           Window_End    Mean_x    Mean_y  \
10012    28  2024-04-27 23:02:11  2024-04-27 23:02:16 -6.969179  4.231164   
12239    11  2024-04-25 18:03:23  2024-04-25 18:03:28 -3.915546  9.231923   
3832     16  2024-04-20 09:48:32  2024-04-20 09:48:37 -4.206657  7.803729   
4992     12  2024-04-18 09:47:02  2024-04-18 09:47:07  7.173682 -1.424267   
9982     28  2024-04-27 23:01:41  2024-04-27 23:01:46 -8.937956 -0.973503   

         Mean_z     Std_x     Std_y     Std_z  Skewness_x  ...  PSD_ratio_5_y  \
10012 -1.939631  3.874660  4.611639  4.229294   -0.919580  ...       0.007035   
12239  0.915567  1.046030  3.224779  0.715942   -0.487373  ...       0.003205   
3832  -4.263642  0.495968  0.370727  0.340531   -3.802613  ...       0.000334   
4992   6.493551  0.330976  0.390785  0.223511   -2.063862  ...       0.000769   
9982  -3.983476  0.139151  0.085940  0.184574   -0.356658  ...       0.001389   

       PSD_ratio_10_y  PSD_ratio_1_z  PSD_ratio_3_

In [46]:
# The model performed worst for cycling:
# Get a DataFrame with only True Label = cycling:
cycling_wrong_predictions = wrong_predictions_df[wrong_predictions_df['True Label'] == 'cycling']
sorted_cycling_wrong_predictions = cycling_wrong_predictions.sort_values(by=['User'])

# RESEARCH QUESTION: As what activity is cycling wrongly predicted?
predicted_label_counts = cycling_wrong_predictions['Predicted Label'].value_counts()
print(predicted_label_counts)
# Print the table with only user, true label and predicted label:
sorted_cycling_wrong_predictions_counts = sorted_cycling_wrong_predictions[['User', 'True Label', 'Predicted Label']].value_counts()
print(sorted_cycling_wrong_predictions_counts)

# RESEARCH QUESTION: For how many users did these wrong predicted cycle activity occur?
user_ids_with_wrong_predictions = sorted_cycling_wrong_predictions['User'].unique()
print(user_ids_with_wrong_predictions)

Predicted Label
walking    23
sitting    15
running     1
Name: count, dtype: int64
User  True Label  Predicted Label
11    cycling     walking            21
                  sitting            12
2     cycling     sitting             3
10    cycling     walking             1
11    cycling     running             1
13    cycling     walking             1
Name: count, dtype: int64
[ 2 10 11 13]


In [50]:
# RESEARCH QUESTION: Which activities were wrongly predicted as cycling?
cycling_wrong_true_label = wrong_predictions_df[wrong_predictions_df['Predicted Label']== 'cycling']
true_label_counts = cycling_wrong_true_label['True Label'].value_counts()
print(true_label_counts)
# Print the user, true label and predicted label from these results:
print(cycling_wrong_true_label[['User', 'True Label', 'Predicted Label']])

True Label
sitting    3
running    1
walking    1
Name: count, dtype: int64
       User True Label Predicted Label
1643     15    sitting         cycling
1642     15    sitting         cycling
3790     16    running         cycling
2390     15    walking         cycling
12753    20    sitting         cycling


In [49]:
# RESEARCH QUESTION: How many users are there in total in the test set?
sorted_X_test = X_test.sort_values(by=['User'])
total_unique_test_users = sorted_X_test['User'].unique()
print(total_unique_test_users)

[ 2  3  7 10 11 12 13 15 16 20 24 28 35]


In [51]:
# RESEARCH QUESTION: How many users were there in the training set?
sorted_X_train = X_train.sort_values(by=['User'])
total_unique_train_users = sorted_X_train['User'].unique()
print(total_unique_train_users)

[ 2  3  7 10 11 12 13 15 16 20 24 28 35]


In [52]:
# RESEARCH QUESTION: How often do all of the 4 activities occur per user in
# the whole dataset?
sorted_data = data.sort_values(by=['User', 'Activity'])
activity_count_per_user = sorted_data.groupby(['User', 'Activity']).size().unstack()
print(activity_count_per_user)


Activity  cycling  running  sitting  walking
User                                        
2           575.0      NaN      NaN      NaN
3             NaN      NaN   1463.0      NaN
7             NaN      NaN    606.0   2297.0
10         1121.0      NaN      NaN      NaN
11         1342.0      NaN      NaN   1165.0
12            NaN      NaN      NaN    768.0
13          355.0      NaN      NaN      NaN
15            NaN      NaN    657.0    100.0
16            NaN    595.0      NaN      NaN
20            NaN      NaN    895.0      NaN
24            NaN      NaN    595.0      NaN
28            NaN   2171.0      NaN      NaN
35            NaN      NaN      NaN    295.0


In [54]:
# RESEARCH QUESTION: For each user, what are the min en max Window_start
# date/times and the total time between this min and max?
window_per_user = data.groupby('User').agg({
    'Window_Start': 'min'
    ,'Window_End': 'max'
    })

# Ensure the columns are in datetime format before substraction
window_per_user['Window_Start'] = pd.to_datetime(window_per_user['Window_Start'])
window_per_user['Window_End'] = pd.to_datetime(window_per_user['Window_End'])

# Calculate the total time for each user:
window_per_user['Total_time'] = window_per_user['Window_End'] - window_per_user['Window_Start']

# Rename the columns
window_per_user.rename(columns={
    'Window_Start': 'Min_Window_Start'
    ,'Window_End': 'Max_Window_End'
}, inplace=True)

# Display the windo info per user
print(window_per_user)

        Min_Window_Start      Max_Window_End      Total_time
User                                                        
2    2024-04-18 10:07:20 2024-04-18 10:16:59 0 days 00:09:39
3    2024-04-11 11:45:00 2024-04-16 13:09:59 5 days 01:24:59
7    2024-04-11 09:09:25 2024-04-15 18:52:59 4 days 09:43:34
10   2024-04-15 17:25:00 2024-04-16 09:24:59 0 days 15:59:59
11   2024-04-22 15:37:00 2024-04-25 18:05:59 3 days 02:28:59
12   2024-04-18 09:47:00 2024-04-23 17:20:28 5 days 07:33:28
13   2024-04-10 12:47:00 2024-04-10 12:52:59 0 days 00:05:59
15   2024-04-18 12:07:42 2024-04-19 10:03:59 0 days 21:56:17
16   2024-04-20 09:45:00 2024-04-20 09:54:59 0 days 00:09:59
20   2024-04-16 15:05:00 2024-04-16 15:19:59 0 days 00:14:59
24   2024-04-15 11:15:00 2024-04-15 11:24:59 0 days 00:09:59
28   2024-04-25 22:40:00 2024-04-27 23:17:59 2 days 00:37:59
35   2024-04-25 11:15:00 2024-04-25 11:19:59 0 days 00:04:59
