# Mount Google Drive

In [249]:
# Mount Google Drive to access dataset
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Unzip the dataset

In [250]:
# Unzip the dataset from the Google Drive location
!unzip "/content/drive/My Drive/UCI HAR Dataset.zip"

Archive:  /content/drive/My Drive/UCI HAR Dataset.zip
replace UCI HAR Dataset/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## Importing libraries

In [251]:
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Read the training dataset with 561 features

In [252]:
# Read the training dataset with 561 features
train_df = pd.read_csv("UCI HAR Dataset/train/X_train.txt", delim_whitespace=True, names=list(range(0, 561)))

# Read and add the 'Subject' column to the training dataset
subject_train = pd.read_csv("UCI HAR Dataset/train/subject_train.txt", header=None)
train_df['Subject'] = subject_train.squeeze()

# Read and add the 'activity' column to the training dataset
activity_train = pd.read_csv("UCI HAR Dataset/train/y_train.txt", header=None)
train_df['activity'] = activity_train.squeeze()

# Display the first few rows of the training dataframe to check the result
print(train_df.head())


          0         1         2         3         4         5         6  \
0  0.288585 -0.020294 -0.132905 -0.995279 -0.983111 -0.913526 -0.995112   
1  0.278419 -0.016411 -0.123520 -0.998245 -0.975300 -0.960322 -0.998807   
2  0.279653 -0.019467 -0.113462 -0.995380 -0.967187 -0.978944 -0.996520   
3  0.279174 -0.026201 -0.123283 -0.996091 -0.983403 -0.990675 -0.997099   
4  0.276629 -0.016570 -0.115362 -0.998139 -0.980817 -0.990482 -0.998321   

          7         8         9  ...       553       554       555       556  \
0 -0.983185 -0.923527 -0.934724  ... -0.710304 -0.112754  0.030400 -0.464761   
1 -0.974914 -0.957686 -0.943068  ... -0.861499  0.053477 -0.007435 -0.732626   
2 -0.963668 -0.977469 -0.938692  ... -0.760104 -0.118559  0.177899  0.100699   
3 -0.982750 -0.989302 -0.938692  ... -0.482845 -0.036788 -0.012892  0.640011   
4 -0.979672 -0.990441 -0.942469  ... -0.699205  0.123320  0.122542  0.693578   

        557       558       559       560  Subject  activity  
0 -0.

# Read feature names

In [253]:
# Read feature names from the features.txt file
features = list()
with open("UCI HAR Dataset/features.txt") as f:
    for line in f:
        features.append(line.split()[1])

In [254]:
#display features
features

['tBodyAcc-mean()-X',
 'tBodyAcc-mean()-Y',
 'tBodyAcc-mean()-Z',
 'tBodyAcc-std()-X',
 'tBodyAcc-std()-Y',
 'tBodyAcc-std()-Z',
 'tBodyAcc-mad()-X',
 'tBodyAcc-mad()-Y',
 'tBodyAcc-mad()-Z',
 'tBodyAcc-max()-X',
 'tBodyAcc-max()-Y',
 'tBodyAcc-max()-Z',
 'tBodyAcc-min()-X',
 'tBodyAcc-min()-Y',
 'tBodyAcc-min()-Z',
 'tBodyAcc-sma()',
 'tBodyAcc-energy()-X',
 'tBodyAcc-energy()-Y',
 'tBodyAcc-energy()-Z',
 'tBodyAcc-iqr()-X',
 'tBodyAcc-iqr()-Y',
 'tBodyAcc-iqr()-Z',
 'tBodyAcc-entropy()-X',
 'tBodyAcc-entropy()-Y',
 'tBodyAcc-entropy()-Z',
 'tBodyAcc-arCoeff()-X,1',
 'tBodyAcc-arCoeff()-X,2',
 'tBodyAcc-arCoeff()-X,3',
 'tBodyAcc-arCoeff()-X,4',
 'tBodyAcc-arCoeff()-Y,1',
 'tBodyAcc-arCoeff()-Y,2',
 'tBodyAcc-arCoeff()-Y,3',
 'tBodyAcc-arCoeff()-Y,4',
 'tBodyAcc-arCoeff()-Z,1',
 'tBodyAcc-arCoeff()-Z,2',
 'tBodyAcc-arCoeff()-Z,3',
 'tBodyAcc-arCoeff()-Z,4',
 'tBodyAcc-correlation()-X,Y',
 'tBodyAcc-correlation()-X,Z',
 'tBodyAcc-correlation()-Y,Z',
 'tGravityAcc-mean()-X',
 'tGravityA

# Read the test dataset

In [255]:
# Read the main test dataset
test_df = pd.read_csv("UCI HAR Dataset/test/X_test.txt", delim_whitespace=True, names=list(range(0, 561)))

# Read and add the 'Subject' column to the test datase
subject_test = pd.read_csv("UCI HAR Dataset/test/subject_test.txt", header=None)
test_df['Subject'] = subject_test.squeeze()

# Read and add the 'activity' column to the test dataset
activity_test = pd.read_csv("UCI HAR Dataset/test/y_test.txt", header=None)
test_df['activity'] = activity_test.squeeze()
print(test_df.head())

          0         1         2         3         4         5         6  \
0  0.257178 -0.023285 -0.014654 -0.938404 -0.920091 -0.667683 -0.952501   
1  0.286027 -0.013163 -0.119083 -0.975415 -0.967458 -0.944958 -0.986799   
2  0.275485 -0.026050 -0.118152 -0.993819 -0.969926 -0.962748 -0.994403   
3  0.270298 -0.032614 -0.117520 -0.994743 -0.973268 -0.967091 -0.995274   
4  0.274833 -0.027848 -0.129527 -0.993852 -0.967445 -0.978295 -0.994111   

          7         8         9  ...       553       554       555       556  \
0 -0.925249 -0.674302 -0.894088  ... -0.705974  0.006462  0.162920 -0.825886   
1 -0.968401 -0.945823 -0.894088  ... -0.594944 -0.083495  0.017500 -0.434375   
2 -0.970735 -0.963483 -0.939260  ... -0.640736 -0.034956  0.202302  0.064103   
3 -0.974471 -0.968897 -0.938610  ... -0.736124 -0.017067  0.154438  0.340134   
4 -0.965953 -0.977346 -0.938610  ... -0.846595 -0.002223 -0.040046  0.736715   

        557       558       559       560  Subject  activity  
0  0.

In [256]:
# Save the processed training and test dataframes to CSV files
train_df.to_csv('train.csv',index_label=False)
test_df.to_csv('test.csv',index_label=False)

In [257]:
# Read the saved CSV files into dataframes
train_df= pd.read_csv('/content/train.csv')

In [258]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,553,554,555,556,557,558,559,560,Subject,activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,5
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,5
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,5
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,5
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,5


In [259]:
test_df= pd.read_csv('/content/test.csv')

In [260]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,553,554,555,556,557,558,559,560,Subject,activity
0,0.257178,-0.023285,-0.014654,-0.938404,-0.920091,-0.667683,-0.952501,-0.925249,-0.674302,-0.894088,...,-0.705974,0.006462,0.16292,-0.825886,0.271151,-0.720009,0.276801,-0.057978,2,5
1,0.286027,-0.013163,-0.119083,-0.975415,-0.967458,-0.944958,-0.986799,-0.968401,-0.945823,-0.894088,...,-0.594944,-0.083495,0.0175,-0.434375,0.920593,-0.698091,0.281343,-0.083898,2,5
2,0.275485,-0.02605,-0.118152,-0.993819,-0.969926,-0.962748,-0.994403,-0.970735,-0.963483,-0.93926,...,-0.640736,-0.034956,0.202302,0.064103,0.145068,-0.702771,0.280083,-0.079346,2,5
3,0.270298,-0.032614,-0.11752,-0.994743,-0.973268,-0.967091,-0.995274,-0.974471,-0.968897,-0.93861,...,-0.736124,-0.017067,0.154438,0.340134,0.296407,-0.698954,0.284114,-0.077108,2,5
4,0.274833,-0.027848,-0.129527,-0.993852,-0.967445,-0.978295,-0.994111,-0.965953,-0.977346,-0.93861,...,-0.846595,-0.002223,-0.040046,0.736715,-0.118545,-0.692245,0.290722,-0.073857,2,5


#Split the training data into features (X) and target (y)

In [261]:
# Split the training data into features (X) and target (y)
from sklearn.model_selection import train_test_split
x=train_df.drop('Subject',axis=1)
y=train_df['Subject']
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size= 0.30)


In [262]:
cols_to_remove = list(range(40, 80))  # 41 to 80 in 0-based index
X_train = X_train.drop(X_train.columns[cols_to_remove], axis=1)
X_train.shape

(5146, 522)

#Feature Extraction

In [303]:
# Compute the correlation matrix between features and the target variable
correlation_matrix = X_train.corrwith(y_train)

# Get absolute correlation values and rank them
correlation_abs = correlation_matrix.abs()
top_features = correlation_abs.sort_values(ascending=False).head(40).index

# Extract the top 50 features
X_train = X_train[top_features]
X_test = X_test[top_features]

In [264]:
X_train

Unnamed: 0,490,497,396,317,403,407,118,178,36,97,...,554,81,236,371,537,80,334,225,212,192
4899,-0.999957,-0.999947,-0.999896,-0.999880,-0.999858,-0.999771,0.022998,-0.999903,-0.506183,-0.999768,...,-0.005346,0.014753,-0.272995,-0.12,-1.000000,0.074816,-0.999797,-0.074682,-0.074682,-0.418558
5664,-0.875926,-0.844137,-0.908651,-0.897282,-0.831887,-0.830883,-0.115218,-0.870411,-0.197691,-0.841893,...,0.388117,-0.027005,0.123338,-0.24,-0.743590,0.186407,-0.971050,0.285368,0.285368,-0.390069
5426,-0.946549,-0.945418,-0.772411,-0.681440,-0.748326,-0.757769,0.373903,-0.940700,-0.024706,-0.800573,...,-0.453407,-0.163172,0.471029,-0.40,-0.948718,-0.011614,-0.984889,0.224991,0.224991,-0.344070
5327,-0.999952,-0.999943,-0.999891,-0.999835,-0.999881,-0.999834,0.154380,-0.999946,-0.087352,-0.999819,...,-0.019586,0.014212,-0.181954,0.24,-1.000000,0.076118,-0.999910,-0.004389,-0.004389,0.158302
3055,-0.999961,-0.999956,-0.999982,-0.999970,-0.999958,-0.999906,-0.084185,-0.999964,-0.106121,-0.999892,...,0.022757,0.018071,-0.110580,-0.44,-1.000000,0.079256,-0.999858,0.111409,0.111409,-0.023143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4511,-0.948672,-0.921348,-0.706202,-0.626917,-0.605939,-0.642011,0.226254,-0.916099,0.409848,-0.632046,...,-0.460265,-0.354185,0.225806,0.12,-1.000000,0.437384,-0.905995,-0.384310,-0.384310,-0.084471
7314,-0.913084,-0.899703,-0.616637,-0.636196,-0.582952,-0.529158,-0.087170,-0.829253,0.304088,-0.500993,...,-0.287340,-0.222253,0.191910,-0.48,-0.948718,0.050334,-0.682800,0.439950,0.439950,0.248667
1620,-0.935482,-0.929242,-0.904979,-0.905243,-0.871792,-0.855836,0.092462,-0.918992,-0.227059,-0.868574,...,-0.031070,-0.021350,0.137185,-0.88,-0.897436,0.590621,-0.971333,0.174409,0.174409,0.245179
1209,-0.985847,-0.980498,-0.849188,-0.832903,-0.855738,-0.869294,-0.335693,-0.982013,-0.106934,-0.880337,...,-0.112471,-0.244028,-0.267598,-0.36,-0.743590,0.634998,-0.981218,-0.069982,-0.069982,-0.004535


In [265]:
X_test

Unnamed: 0,490,497,396,317,403,407,118,178,36,97,...,554,81,236,371,537,80,334,225,212,192
4778,-0.999955,-0.999952,-0.999954,-0.999929,-0.999954,-0.999883,0.021169,-0.999968,-0.011398,-0.999812,...,-0.020649,0.010671,-0.522916,0.12,-1.000000,0.072267,-0.999663,-0.513413,-0.513413,-0.199504
841,-0.770551,-0.710919,-0.648512,-0.573209,-0.564978,-0.597375,0.208459,-0.752195,-0.417095,-0.669825,...,-0.107090,-0.326899,0.387166,-0.20,-0.846154,-0.293509,-0.991358,0.138373,0.138373,0.025583
989,-0.420167,-0.397632,-0.305295,-0.053207,-0.291386,-0.239658,-0.322303,-0.625119,-0.200255,-0.383064,...,-0.671447,-0.234818,0.188955,-0.24,-0.025641,-0.266081,-0.922262,-0.103562,-0.103562,0.314432
4063,-0.999973,-0.999966,-0.999851,-0.999851,-0.999864,-0.999836,0.025990,-0.999978,-0.264130,-0.999816,...,0.029525,0.015063,-0.176196,-0.12,-0.948718,0.082446,-0.999904,-0.141838,-0.141838,-0.623253
7022,-0.999963,-0.999961,-0.999840,-0.999819,-0.999764,-0.999719,0.263104,-0.999956,-0.049919,-0.999690,...,0.046436,-0.003716,-0.165586,-0.36,-1.000000,0.078088,-0.999708,-0.543791,-0.543791,0.152280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3075,-0.999905,-0.999831,-0.999882,-0.999707,-0.999863,-0.999748,-0.099770,-0.999913,-0.287979,-0.999733,...,-0.077919,0.030242,-0.348325,0.00,-1.000000,0.065809,-0.999030,-0.211978,-0.211978,-0.301095
4726,-0.999559,-0.999480,-0.998982,-0.998581,-0.998653,-0.998857,-0.477991,-0.999177,-0.101263,-0.998966,...,0.042968,0.039058,-0.038106,-0.28,-1.000000,0.096740,-0.999126,-0.214865,-0.214865,-0.677780
3440,-0.999972,-0.999964,-0.999974,-0.999972,-0.999980,-0.999962,0.532370,-0.999967,-0.107693,-0.999955,...,-0.012459,0.008935,-0.086569,-0.24,-1.000000,0.073640,-0.999827,-0.062956,-0.062956,-0.042127
2272,-0.999986,-0.999981,-0.999932,-0.999929,-0.999912,-0.999915,0.301726,-0.999964,-0.192207,-0.999929,...,-0.054788,0.009478,-0.096096,-0.56,-0.846154,0.071741,-0.999860,-0.123522,-0.123522,0.185281


In [266]:
y_train

4899    23
5664    26
5426    25
5327    25
3055    16
        ..
4511    22
7314    30
1620     7
1209     6
1771     8
Name: Subject, Length: 5146, dtype: int64

In [267]:
y_test

4778    23
841      5
989      5
4063    21
7022    30
        ..
3075    16
4726    23
3440    17
2272    14
4108    21
Name: Subject, Length: 2206, dtype: int64

# Display the shapes of the split data

In [268]:
X_train.shape, X_test.shape

((5146, 500), (2206, 500))

In [269]:
y_train.shape, y_test.shape

((5146,), (2206,))

# Train a Logistic Regression model

In [270]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

# Model's accuracy on the training data

In [271]:
# Evaluate the model's accuracy on the training data
model.score(X_train,y_train)


0.8181111542945978

# Predictions on the test data

In [272]:
# Make predictions on the test data
pred=model.predict(X_test)

In [285]:
pred

array([21,  5,  5, ..., 14, 16, 21])

In [288]:
y_probs = model.predict_proba(X_test)

In [290]:
y_probs

array([[3.80901681e-04, 6.97949510e-03, 5.30953724e-04, ...,
        8.07234247e-02, 5.95277079e-05, 3.32822252e-03],
       [2.13218484e-02, 1.45985944e-03, 9.50804354e-01, ...,
        1.20884199e-04, 1.22688349e-05, 9.29209997e-07],
       [5.47411830e-04, 2.47500952e-04, 9.35462191e-01, ...,
        2.95926533e-03, 1.34730933e-05, 3.58862361e-07],
       ...,
       [2.54555270e-02, 7.78688322e-03, 4.15728453e-04, ...,
        3.26690735e-03, 8.32110684e-03, 1.28426394e-02],
       [2.84162989e-03, 3.00807941e-03, 1.26782007e-04, ...,
        4.87241146e-03, 1.23668569e-02, 1.33905328e-01],
       [4.40781549e-04, 9.88483469e-03, 1.20111980e-03, ...,
        5.53791350e-02, 1.12749099e-04, 1.73204520e-04]])

In [289]:
y_pred=[]
for i in range(len(y_probs)):
  flag=0
  for j in range(len(y_probs[i])):
    if y_probs[i][j]>0.5:
      y_pred.append(j+1)
      flag=1
      break
  if flag==0:
    y_pred.append(-1)


In [293]:
for val in pred: print(val,end=",")

21,5,5,21,30,6,6,27,1,16,30,26,21,29,25,26,23,19,14,27,5,28,7,19,26,6,19,29,8,27,26,22,21,25,8,15,5,3,15,11,25,11,3,14,16,1,29,14,15,6,28,14,11,7,28,16,6,28,22,5,14,28,1,22,25,26,7,19,25,30,8,25,8,28,17,25,11,26,16,25,22,17,16,3,27,8,25,29,28,26,3,29,26,21,21,16,1,27,29,11,5,23,1,26,29,19,27,30,30,22,30,29,16,17,27,1,22,15,7,21,23,23,29,8,29,23,21,6,6,6,26,25,27,26,16,15,21,16,28,28,30,14,28,21,23,19,25,27,29,26,11,11,21,7,7,5,1,28,8,14,15,1,26,7,28,19,1,11,3,19,5,28,21,17,3,26,28,5,11,29,30,17,16,17,19,26,1,1,14,19,1,28,5,27,17,22,3,11,26,26,25,16,1,21,8,8,11,7,25,21,3,17,21,26,28,19,30,17,3,30,11,6,14,23,16,30,14,1,17,30,14,1,16,29,27,3,23,19,11,8,26,7,19,29,28,1,29,22,30,25,27,30,27,29,3,17,30,5,14,21,29,21,22,5,30,11,19,6,7,6,29,17,3,19,29,28,23,17,27,7,8,28,21,30,27,30,19,17,5,27,30,26,21,14,15,23,22,1,1,15,14,11,21,27,25,14,16,11,28,27,23,11,1,3,16,30,3,3,11,21,25,22,11,16,16,5,14,1,19,1,23,11,28,30,17,27,22,30,26,29,7,30,30,23,27,22,16,28,25,19,6,5,22,26,7,30,14,15,26,11,16,14,1

In [302]:
y_test

4778    23
841      5
989      5
4063    21
7022    30
        ..
3075    16
4726    23
3440    17
2272    14
4108    21
Name: Subject, Length: 2206, dtype: int64

# Model's accuracy on the test data

In [300]:
# Evaluate the model's accuracy on the test data
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7094288304623754

In [301]:
accuracy_score(y_test, y_pred)

0.02629193109700816

# Classification report

In [297]:
# Print a detailed classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           1       0.73      0.71      0.72        93
           3       0.72      0.57      0.64       120
           5       0.80      0.81      0.81        86
           6       0.77      0.84      0.80        99
           7       0.69      0.55      0.61        93
           8       0.62      0.49      0.55        81
          11       0.68      0.75      0.72        84
          14       0.80      0.79      0.79        98
          15       0.69      0.56      0.62        94
          16       0.73      0.86      0.79       113
          17       0.76      0.76      0.76       104
          19       0.70      0.61      0.65       118
          21       0.64      0.76      0.70       112
          22       0.66      0.64      0.65       101
          23       0.82      0.88      0.85       113
          25       0.75      0.72      0.73       121
          26       0.60      0.71      0.65       110
          27       0.70    

#Confusion matrix

In [298]:
# Compute and display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred)

array([[66,  1,  0,  0,  0,  3,  2,  1,  2,  3,  0,  1,  1,  0,  1,  2,
         4,  0,  0,  3,  3],
       [ 0, 68,  4,  3,  2,  2,  1,  3,  1,  2,  3,  6,  1,  7,  0,  0,
         3,  0,  9,  1,  4],
       [ 1,  2, 70,  5,  0,  0,  0,  0,  1,  0,  2,  0,  4,  0,  0,  0,
         0,  0,  1,  0,  0],
       [ 0,  4,  3, 83,  1,  1,  0,  0,  0,  0,  2,  2,  0,  0,  0,  0,
         1,  0,  2,  0,  0],
       [ 0,  1,  0,  0, 51,  7,  0,  0,  0,  3,  2,  0,  4,  2,  6,  0,
         6,  3,  6,  0,  2],
       [ 4,  1,  2,  1,  3, 40,  4,  0,  0,  3,  0,  0,  2,  3,  2,  0,
         2,  2,  2,  7,  3],
       [ 2,  1,  0,  2,  0,  1, 63,  0,  3,  1,  0,  1,  0,  1,  0,  0,
         0,  1,  3,  3,  2],
       [ 0,  0,  0,  0,  2,  0,  0, 77,  1,  4,  1,  0,  3,  1,  1,  2,
         1,  0,  1,  0,  4],
       [ 1,  1,  4,  0,  1,  0, 12,  1, 53,  1,  2,  2,  0,  0,  0,  1,
        10,  0,  1,  4,  0],
       [ 1,  0,  0,  1,  2,  0,  0,  0,  3, 97,  2,  0,  3,  0,  1,  0,
         1,  1,  0,

#  F1 score

In [299]:
# Compute and display the F1 score
from sklearn.metrics import f1_score
f1_score(y_test, pred, average='macro')

0.7060564843885672