# Mount Google Drive

In [52]:
# Mount Google Drive to access dataset
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Unzip the dataset

In [53]:
# Unzip the dataset from the Google Drive location
!unzip "/content/drive/My Drive/UCI HAR Dataset.zip"

Archive:  /content/drive/My Drive/UCI HAR Dataset.zip
replace UCI HAR Dataset/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## Importing libraries

In [54]:
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Read the training dataset with 561 features

In [55]:
# Read the training dataset with 561 features
train_df = pd.read_csv("UCI HAR Dataset/train/X_train.txt", delim_whitespace=True, names=list(range(0, 561)))

# Read and add the 'Subject' column to the training dataset
subject_train = pd.read_csv("UCI HAR Dataset/train/subject_train.txt", header=None)
train_df['Subject'] = subject_train.squeeze()

# Read and add the 'activity' column to the training dataset
activity_train = pd.read_csv("UCI HAR Dataset/train/y_train.txt", header=None)
train_df['activity'] = activity_train.squeeze()

# Display the first few rows of the training dataframe to check the result
print(train_df.head())


          0         1         2         3         4         5         6  \
0  0.288585 -0.020294 -0.132905 -0.995279 -0.983111 -0.913526 -0.995112   
1  0.278419 -0.016411 -0.123520 -0.998245 -0.975300 -0.960322 -0.998807   
2  0.279653 -0.019467 -0.113462 -0.995380 -0.967187 -0.978944 -0.996520   
3  0.279174 -0.026201 -0.123283 -0.996091 -0.983403 -0.990675 -0.997099   
4  0.276629 -0.016570 -0.115362 -0.998139 -0.980817 -0.990482 -0.998321   

          7         8         9  ...       553       554       555       556  \
0 -0.983185 -0.923527 -0.934724  ... -0.710304 -0.112754  0.030400 -0.464761   
1 -0.974914 -0.957686 -0.943068  ... -0.861499  0.053477 -0.007435 -0.732626   
2 -0.963668 -0.977469 -0.938692  ... -0.760104 -0.118559  0.177899  0.100699   
3 -0.982750 -0.989302 -0.938692  ... -0.482845 -0.036788 -0.012892  0.640011   
4 -0.979672 -0.990441 -0.942469  ... -0.699205  0.123320  0.122542  0.693578   

        557       558       559       560  Subject  activity  
0 -0.

# Read feature names

In [56]:
# Read feature names from the features.txt file
features = list()
with open("UCI HAR Dataset/features.txt") as f:
    for line in f:
        features.append(line.split()[1])

In [57]:
#display features
features

['tBodyAcc-mean()-X',
 'tBodyAcc-mean()-Y',
 'tBodyAcc-mean()-Z',
 'tBodyAcc-std()-X',
 'tBodyAcc-std()-Y',
 'tBodyAcc-std()-Z',
 'tBodyAcc-mad()-X',
 'tBodyAcc-mad()-Y',
 'tBodyAcc-mad()-Z',
 'tBodyAcc-max()-X',
 'tBodyAcc-max()-Y',
 'tBodyAcc-max()-Z',
 'tBodyAcc-min()-X',
 'tBodyAcc-min()-Y',
 'tBodyAcc-min()-Z',
 'tBodyAcc-sma()',
 'tBodyAcc-energy()-X',
 'tBodyAcc-energy()-Y',
 'tBodyAcc-energy()-Z',
 'tBodyAcc-iqr()-X',
 'tBodyAcc-iqr()-Y',
 'tBodyAcc-iqr()-Z',
 'tBodyAcc-entropy()-X',
 'tBodyAcc-entropy()-Y',
 'tBodyAcc-entropy()-Z',
 'tBodyAcc-arCoeff()-X,1',
 'tBodyAcc-arCoeff()-X,2',
 'tBodyAcc-arCoeff()-X,3',
 'tBodyAcc-arCoeff()-X,4',
 'tBodyAcc-arCoeff()-Y,1',
 'tBodyAcc-arCoeff()-Y,2',
 'tBodyAcc-arCoeff()-Y,3',
 'tBodyAcc-arCoeff()-Y,4',
 'tBodyAcc-arCoeff()-Z,1',
 'tBodyAcc-arCoeff()-Z,2',
 'tBodyAcc-arCoeff()-Z,3',
 'tBodyAcc-arCoeff()-Z,4',
 'tBodyAcc-correlation()-X,Y',
 'tBodyAcc-correlation()-X,Z',
 'tBodyAcc-correlation()-Y,Z',
 'tGravityAcc-mean()-X',
 'tGravityA

# Read the test dataset

In [58]:
# Read the main test dataset
test_df = pd.read_csv("UCI HAR Dataset/test/X_test.txt", delim_whitespace=True, names=list(range(0, 561)))

# Read and add the 'Subject' column to the test datase
subject_test = pd.read_csv("UCI HAR Dataset/test/subject_test.txt", header=None)
test_df['Subject'] = subject_test.squeeze()

# Read and add the 'activity' column to the test dataset
activity_test = pd.read_csv("UCI HAR Dataset/test/y_test.txt", header=None)
test_df['activity'] = activity_test.squeeze()
print(test_df.head())

          0         1         2         3         4         5         6  \
0  0.257178 -0.023285 -0.014654 -0.938404 -0.920091 -0.667683 -0.952501   
1  0.286027 -0.013163 -0.119083 -0.975415 -0.967458 -0.944958 -0.986799   
2  0.275485 -0.026050 -0.118152 -0.993819 -0.969926 -0.962748 -0.994403   
3  0.270298 -0.032614 -0.117520 -0.994743 -0.973268 -0.967091 -0.995274   
4  0.274833 -0.027848 -0.129527 -0.993852 -0.967445 -0.978295 -0.994111   

          7         8         9  ...       553       554       555       556  \
0 -0.925249 -0.674302 -0.894088  ... -0.705974  0.006462  0.162920 -0.825886   
1 -0.968401 -0.945823 -0.894088  ... -0.594944 -0.083495  0.017500 -0.434375   
2 -0.970735 -0.963483 -0.939260  ... -0.640736 -0.034956  0.202302  0.064103   
3 -0.974471 -0.968897 -0.938610  ... -0.736124 -0.017067  0.154438  0.340134   
4 -0.965953 -0.977346 -0.938610  ... -0.846595 -0.002223 -0.040046  0.736715   

        557       558       559       560  Subject  activity  
0  0.

In [59]:
# Save the processed training and test dataframes to CSV files
train_df.to_csv('train.csv',index_label=False)
test_df.to_csv('test.csv',index_label=False)

In [60]:
# Read the saved CSV files into dataframes
train_df= pd.read_csv('/content/train.csv')

In [61]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,553,554,555,556,557,558,559,560,Subject,activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,5
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,5
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,5
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,5
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,5


In [62]:
test_df= pd.read_csv('/content/test.csv')

In [63]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,553,554,555,556,557,558,559,560,Subject,activity
0,0.257178,-0.023285,-0.014654,-0.938404,-0.920091,-0.667683,-0.952501,-0.925249,-0.674302,-0.894088,...,-0.705974,0.006462,0.16292,-0.825886,0.271151,-0.720009,0.276801,-0.057978,2,5
1,0.286027,-0.013163,-0.119083,-0.975415,-0.967458,-0.944958,-0.986799,-0.968401,-0.945823,-0.894088,...,-0.594944,-0.083495,0.0175,-0.434375,0.920593,-0.698091,0.281343,-0.083898,2,5
2,0.275485,-0.02605,-0.118152,-0.993819,-0.969926,-0.962748,-0.994403,-0.970735,-0.963483,-0.93926,...,-0.640736,-0.034956,0.202302,0.064103,0.145068,-0.702771,0.280083,-0.079346,2,5
3,0.270298,-0.032614,-0.11752,-0.994743,-0.973268,-0.967091,-0.995274,-0.974471,-0.968897,-0.93861,...,-0.736124,-0.017067,0.154438,0.340134,0.296407,-0.698954,0.284114,-0.077108,2,5
4,0.274833,-0.027848,-0.129527,-0.993852,-0.967445,-0.978295,-0.994111,-0.965953,-0.977346,-0.93861,...,-0.846595,-0.002223,-0.040046,0.736715,-0.118545,-0.692245,0.290722,-0.073857,2,5


#Split the training data into features (X) and target (y)

In [64]:
# Split the training data into features (X) and target (y)
from sklearn.model_selection import train_test_split
x=train_df.drop('Subject',axis=1)
y=train_df['Subject']
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size= 0.30)


In [65]:
cols_to_remove = list(range(40, 80))  # 41 to 80 in 0-based index
X_train = X_train.drop(X_train.columns[cols_to_remove], axis=1)
X_train.shape

(5146, 522)

#Feature Extraction

In [66]:
# Compute the correlation matrix between features and the target variable
correlation_matrix = X_train.corrwith(y_train)

# Get absolute correlation values and rank them
correlation_abs = correlation_matrix.abs()
top_features = correlation_abs.sort_values(ascending=False).head(40).index

# Extract the top 50 features
X_train = X_train[top_features]
X_test = X_test[top_features]

In [67]:
X_train

Unnamed: 0,490,317,396,497,403,407,36,118,190,178,...,168,186,548,351,318,255,275,546,541,377
1551,-0.623952,-0.308582,-0.385310,-0.553268,-0.353794,-0.120573,0.067899,-0.336607,0.302409,-0.711503,...,-0.299297,0.323736,0.204319,0.229859,-0.195203,0.087502,-0.152240,0.165287,0.165287,-0.246167
171,-0.951936,-0.756477,-0.778700,-0.942005,-0.728072,-0.729972,0.044600,-0.243832,0.225311,-0.936161,...,-0.649409,0.356741,-0.593434,-0.326339,-0.854191,-0.717430,-0.118790,-0.673200,-0.673200,-0.359219
1377,-0.999959,-0.999861,-0.999884,-0.999952,-0.999838,-0.999825,-0.050203,-0.479535,0.065580,-0.999949,...,-0.995062,0.130407,-0.985467,-0.989202,-0.999905,-0.987060,-0.977218,-0.984204,-0.984204,-0.551310
6610,-0.917430,-0.791467,-0.768802,-0.893579,-0.755970,-0.675935,0.168579,-0.114773,0.078196,-0.883832,...,-0.593861,0.006501,-0.647153,-0.231569,-0.691413,-0.696792,-0.259788,-0.676922,-0.676922,-0.236115
7127,-0.953738,-0.945047,-0.958459,-0.942260,-0.929183,-0.915077,0.130736,-0.273353,0.143408,-0.952807,...,-0.689518,0.002351,-0.754433,-0.641993,-0.933954,-0.834850,-0.242751,-0.824436,-0.824436,-0.416743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3129,-0.998448,-0.999588,-0.999885,-0.997591,-0.999751,-0.999751,0.501884,0.137343,0.386369,-0.999249,...,-0.985954,0.133982,-0.941364,-0.985635,-0.999348,-0.951032,-0.972457,-0.964352,-0.964352,-0.824731
5862,-0.901839,-0.730626,-0.702405,-0.868952,-0.631383,-0.620218,0.364378,-0.103998,0.275611,-0.879645,...,-0.565241,0.118514,-0.731271,-0.196340,-0.807766,-0.669463,-0.246227,-0.690097,-0.690097,-0.355231
994,-0.999863,-0.999287,-0.999519,-0.999842,-0.999508,-0.999580,-0.327210,0.295671,0.247810,-0.999854,...,-0.987362,0.186675,-0.992026,-0.980550,-0.999886,-0.992343,-0.978636,-0.992285,-0.992285,-0.316250
6586,-0.544280,-0.544821,-0.562006,-0.472955,-0.489181,-0.089458,0.080717,-0.160645,-0.015756,-0.665640,...,-0.214148,-0.031323,-0.293851,0.343734,0.063287,-0.332943,-0.355932,-0.288447,-0.288447,-0.366978


In [68]:
X_test

Unnamed: 0,490,317,396,497,403,407,36,118,190,178,...,168,186,548,351,318,255,275,546,541,377
2920,-0.999978,-0.999918,-0.999917,-0.999978,-0.999923,-0.999915,-0.137371,0.587190,-0.036313,-0.999981,...,-0.996944,-0.097704,-0.998713,-0.992992,-0.999938,-0.998689,-0.997090,-0.998998,-0.998998,-0.542373
6900,-0.957974,-0.768410,-0.832768,-0.937808,-0.799264,-0.688287,0.244286,0.157458,0.012023,-0.898840,...,-0.550341,0.103495,-0.559937,-0.160992,-0.707676,-0.622446,-0.437816,-0.622659,-0.622659,-0.515131
4760,-0.999957,-0.999842,-0.999825,-0.999953,-0.999827,-0.999806,0.222464,-0.040655,0.360209,-0.999960,...,-0.995026,-0.013240,-0.986200,-0.988769,-0.999886,-0.985103,-0.994447,-0.985832,-0.985832,-0.338620
4764,-0.998968,-0.996840,-0.995564,-0.998276,-0.994752,-0.994750,0.204992,0.037575,0.425203,-0.998910,...,-0.967124,0.539873,-0.903424,-0.921326,-0.997163,-0.885125,-0.919858,-0.919566,-0.919566,-0.315067
4205,-0.999911,-0.999288,-0.999625,-0.999894,-0.999479,-0.999562,-0.385715,0.070215,0.264978,-0.999870,...,-0.988720,-0.223070,-0.983677,-0.978149,-0.999642,-0.986335,-0.968556,-0.986278,-0.986278,-0.449889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3086,-0.999884,-0.996246,-0.996925,-0.999845,-0.996401,-0.996553,0.008072,-0.006619,0.017840,-0.999338,...,-0.975699,-0.074406,-0.978441,-0.934630,-0.998534,-0.962408,-0.774147,-0.972786,-0.972786,-0.490436
2283,-0.999968,-0.999753,-0.999821,-0.999966,-0.999763,-0.999759,-0.093911,0.200917,0.051700,-0.999933,...,-0.992566,0.020940,-0.993771,-0.987172,-0.999870,-0.993765,-0.970478,-0.992632,-0.992632,-0.282022
1792,-0.999941,-0.999949,-0.999983,-0.999904,-0.999967,-0.999930,-0.293353,0.497517,0.025292,-0.999961,...,-0.995690,-0.042563,-0.996382,-0.992878,-0.999921,-0.998521,-0.982663,-0.998286,-0.998286,-0.512476
1441,-0.908989,-0.826266,-0.832271,-0.886382,-0.731730,-0.626873,0.273110,-0.602817,0.167562,-0.853918,...,-0.460707,0.125815,-0.227123,-0.170091,-0.663232,-0.212275,-0.449557,-0.184928,-0.184928,-0.444964


In [69]:
y_train

1551     7
171      1
1377     7
6610    28
7127    30
        ..
3129    16
5862    26
994      6
6586    28
3288    17
Name: Subject, Length: 5146, dtype: int64

In [70]:
y_test

2920    16
6900    29
4760    23
4764    23
4205    21
        ..
3086    16
2283    14
1792     8
1441     7
5876    27
Name: Subject, Length: 2206, dtype: int64

# Display the shapes of the split data

In [71]:
X_train.shape, X_test.shape

((5146, 40), (2206, 40))

In [72]:
y_train.shape, y_test.shape

((5146,), (2206,))

# Train a Logistic Regression model

In [73]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

# Model's accuracy on the training data

In [74]:
# Evaluate the model's accuracy on the training data
model.score(X_train,y_train)


0.3785464438398756

# Predictions on the test data

In [75]:
# Make predictions on the test data
pred=model.predict(X_test)

In [76]:
y_probs = model.predict_proba(X_test)

In [77]:
y_probs

array([[2.71764397e-02, 1.44520423e-02, 5.67940357e-03, ...,
        3.15047168e-02, 2.19647608e-01, 2.43076572e-02],
       [1.71056032e-02, 1.92708100e-02, 4.46829937e-04, ...,
        3.66294745e-02, 4.50489736e-01, 9.10996192e-02],
       [3.65264142e-02, 3.59218425e-02, 6.17811131e-02, ...,
        5.59528714e-02, 1.31635724e-02, 1.84949457e-02],
       ...,
       [3.42490865e-02, 2.57971360e-02, 1.33918140e-02, ...,
        3.42622772e-02, 1.17030461e-01, 2.41581079e-02],
       [1.00284580e-01, 1.92214105e-03, 6.15333394e-05, ...,
        2.50712364e-02, 2.52878077e-03, 4.44318270e-02],
       [3.34724081e-02, 3.57474798e-02, 1.71674439e-02, ...,
        4.90843518e-02, 8.79385174e-02, 3.14618613e-02]])

In [78]:
pred

array([29, 29, 27, ..., 29,  7, 29])

In [79]:
y_test

2920    16
6900    29
4760    23
4764    23
4205    21
        ..
3086    16
2283    14
1792     8
1441     7
5876    27
Name: Subject, Length: 2206, dtype: int64

# Model's accuracy on the test data

In [80]:
# Evaluate the model's accuracy on the test data
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.34814143245693563

# Classification report

In [81]:
# Print a detailed classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           1       0.55      0.35      0.43        99
           3       0.34      0.31      0.32        96
           5       0.51      0.47      0.49        80
           6       0.42      0.46      0.44        84
           7       0.48      0.32      0.38       101
           8       0.50      0.23      0.31        75
          11       0.45      0.35      0.40        99
          14       0.73      0.48      0.58       102
          15       0.42      0.20      0.27        89
          16       0.28      0.22      0.25       116
          17       0.25      0.23      0.24       122
          19       0.19      0.18      0.18        96
          21       0.30      0.26      0.28       122
          22       0.47      0.30      0.36        94
          23       0.59      0.54      0.56       115
          25       0.28      0.58      0.38       119
          26       0.35      0.40      0.37       130
          27       0.30    

#Confusion matrix

In [82]:
# Compute and display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred)

array([[35,  1,  0,  2,  3,  2,  1,  2,  1,  3,  3,  0,  4,  2,  1,  4,
         7, 10,  2,  8,  8],
       [ 0, 30,  3,  1,  1,  0,  4,  0,  0,  4,  5,  1,  1,  6,  1,  9,
         7,  3,  2,  4, 14],
       [ 0,  3, 38, 10,  3,  0,  0,  0,  1,  2,  3,  2,  4,  0,  0,  1,
         6,  2,  1,  1,  3],
       [ 2,  7,  9, 39,  2,  0,  0,  0,  0,  1,  3,  1,  0,  0,  3,  8,
         2,  2,  3,  0,  2],
       [ 4,  1,  1,  5, 32,  1,  1,  0,  0,  2,  3,  8,  4,  1,  8, 10,
         3,  2,  3,  3,  9],
       [ 2,  3,  0,  0,  0, 17,  2,  1,  0,  0,  7,  2,  1,  1,  1,  9,
         8,  2,  3, 12,  4],
       [ 1,  3,  4,  2,  0,  0, 35,  0,  3,  6,  2,  0,  2,  0,  0, 12,
         3, 13,  0, 12,  1],
       [ 0,  0,  0,  5,  1,  0,  1, 49,  1,  1,  2,  0,  2,  0,  2, 18,
         1,  1,  1,  8,  9],
       [ 3,  1,  2,  0,  2,  1,  8,  0, 18,  2,  9,  2,  4,  4,  1,  7,
         5,  5,  2,  7,  6],
       [ 0,  4,  0,  1,  1,  0,  6,  1,  2, 26, 10,  3, 11,  1,  3, 13,
         3,  9,  8,

#  F1 score

In [83]:
# Compute and display the F1 score
from sklearn.metrics import f1_score
f1_score(y_test, pred, average='macro')

0.3533865866249005