In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

from sklearn.naive_bayes import  GaussianNB

%matplotlib inline

### 1.Load the kinematics dataset as measured on mobile sensors from the file “run_or_walk.csv”. List out the columns in the dataset. 

In [2]:
data = pd.read_csv('run_or_walk.csv')

In [7]:
columns = data.columns
columns

Index(['date', 'time', 'username', 'wrist', 'activity', 'acceleration_x',
       'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z'],
      dtype='object')

In [3]:
data.head()

Unnamed: 0,date,time,username,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
0,2017-6-30,13:51:15:847724020,viktor,0,0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
1,2017-6-30,13:51:16:246945023,viktor,0,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2,2017-6-30,13:51:16:446233987,viktor,0,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
3,2017-6-30,13:51:16:646117985,viktor,0,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
4,2017-6-30,13:51:16:846738994,viktor,0,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [4]:
data.shape

(88588, 11)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88588 entries, 0 to 88587
Data columns (total 11 columns):
date              88588 non-null object
time              88588 non-null object
username          88588 non-null object
wrist             88588 non-null int64
activity          88588 non-null int64
acceleration_x    88588 non-null float64
acceleration_y    88588 non-null float64
acceleration_z    88588 non-null float64
gyro_x            88588 non-null float64
gyro_y            88588 non-null float64
gyro_z            88588 non-null float64
dtypes: float64(6), int64(2), object(3)
memory usage: 7.4+ MB


In [5]:
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
wrist,88588.0,0.52217,0.499511,0.0,0.0,1.0,1.0,1.0
activity,88588.0,0.500801,0.500002,0.0,0.0,1.0,1.0,1.0
acceleration_x,88588.0,-0.074811,1.009299,-5.3505,-0.3818,-0.0595,0.3555,5.6033
acceleration_y,88588.0,-0.562585,0.658458,-3.299,-1.0335,-0.7591,-0.241775,2.668
acceleration_z,88588.0,-0.313956,0.486815,-3.7538,-0.376,-0.221,-0.0859,1.6403
gyro_x,88588.0,0.00416,1.253423,-4.4306,-0.9207,0.0187,0.8888,4.8742
gyro_y,88588.0,0.037203,1.198725,-7.4647,-0.644825,0.0393,0.7337,8.498
gyro_z,88588.0,0.022327,1.914423,-9.48,-1.345125,0.0069,1.3982,11.2662


In [10]:
# Feature engineering: drop the columns- 'date', 'time', 'username'
data = data.drop(['date', 'time', 'username'],axis=1)

In [18]:
# Target variable:
data['activity'].value_counts()

1    44365
0    44223
Name: activity, dtype: int64

### 2. Let the target variable ‘y’ be the activity and assign all the columns after it to ‘x’. 

In [11]:
x = data.drop(['activity'],axis=1)
y = data['activity']

x_train, x_test , y_train, y_test = train_test_split(x,y,train_size=0.8)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(70870, 7)
(17718, 7)
(70870,)
(17718,)


### 3.Using Scikit-learn fit a Gaussian Naive Bayes model and observe the accuracy. Generate a classification report using scikit learn.


In [12]:
gnb = GaussianNB()

gnb = gnb.fit(x_train,y_train)
y_predict = gnb.predict(x_test)

print('Accuracy:',accuracy_score(y_test,y_predict))
print('classification report:', classification_report(y_test,y_predict))
print('confusion_matrix:', confusion_matrix(y_test,y_predict))

Accuracy: 0.9562027316852918
classification report:               precision    recall  f1-score   support

           0       0.93      0.99      0.96      8814
           1       0.99      0.92      0.95      8904

    accuracy                           0.96     17718
   macro avg       0.96      0.96      0.96     17718
weighted avg       0.96      0.96      0.96     17718

confusion_matrix: [[8733   81]
 [ 695 8209]]


### 4.Repeat the model once using only the acceleration values as predictors and then using only the gyro values as predictors. Comment on the difference in accuracy between both the models. 

In [14]:
x = data[['acceleration_x','acceleration_y','acceleration_z']]
y = data['activity']

x_train, x_test , y_train, y_test = train_test_split(x,y,train_size=0.8)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
         
gnb = GaussianNB()

gnb = gnb.fit(x_train,y_train)
y_predict = gnb.predict(x_test)

print('Accuracy:',accuracy_score(y_test,y_predict))
print('classification report:', classification_report(y_test,y_predict))
print('confusion_matrix:', confusion_matrix(y_test,y_predict))

(70870, 3)
(17718, 3)
(70870,)
(17718,)
Accuracy: 0.9559205327915115
classification report:               precision    recall  f1-score   support

           0       0.92      0.99      0.96      8849
           1       0.99      0.92      0.95      8869

    accuracy                           0.96     17718
   macro avg       0.96      0.96      0.96     17718
weighted avg       0.96      0.96      0.96     17718

confusion_matrix: [[8799   50]
 [ 731 8138]]


In [15]:
x = data[['gyro_x','gyro_y','gyro_z']]
y = data['activity']

x_train, x_test , y_train, y_test = train_test_split(x,y,train_size=0.8)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
         
gnb = GaussianNB()

gnb = gnb.fit(x_train,y_train)
y_predict = gnb.predict(x_test)

print('Accuracy:',accuracy_score(y_test,y_predict))
print('classification report:', classification_report(y_test,y_predict))
print('confusion_matrix:', confusion_matrix(y_test,y_predict))

(70870, 3)
(17718, 3)
(70870,)
(17718,)
Accuracy: 0.6512021672875042
classification report:               precision    recall  f1-score   support

           0       0.63      0.75      0.68      8828
           1       0.69      0.56      0.62      8890

    accuracy                           0.65     17718
   macro avg       0.66      0.65      0.65     17718
weighted avg       0.66      0.65      0.65     17718

confusion_matrix: [[6591 2237]
 [3943 4947]]


In [16]:
# From the comparision of accuracy score between both the models it is evident that the acceleration variables have significant
# impact in the prediction of target variable activity