In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

## 1. Load the kinematics dataset as measured on mobile sensors from the file “run_or_walk.csv”. List out the columns in the dataset. 

In [77]:
run_data = pd.read_csv('run_or_walk.csv')
run_data.head()

Unnamed: 0,date,time,username,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
0,2017-6-30,13:51:15:847724020,viktor,0,0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
1,2017-6-30,13:51:16:246945023,viktor,0,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2,2017-6-30,13:51:16:446233987,viktor,0,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
3,2017-6-30,13:51:16:646117985,viktor,0,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
4,2017-6-30,13:51:16:846738994,viktor,0,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [78]:
run_data.activity.value_counts()

1    44365
0    44223
Name: activity, dtype: int64

In [79]:
run_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88588 entries, 0 to 88587
Data columns (total 11 columns):
date              88588 non-null object
time              88588 non-null object
username          88588 non-null object
wrist             88588 non-null int64
activity          88588 non-null int64
acceleration_x    88588 non-null float64
acceleration_y    88588 non-null float64
acceleration_z    88588 non-null float64
gyro_x            88588 non-null float64
gyro_y            88588 non-null float64
gyro_z            88588 non-null float64
dtypes: float64(6), int64(2), object(3)
memory usage: 7.4+ MB


In [80]:
run_data.dtypes

date               object
time               object
username           object
wrist               int64
activity            int64
acceleration_x    float64
acceleration_y    float64
acceleration_z    float64
gyro_x            float64
gyro_y            float64
gyro_z            float64
dtype: object

In [81]:
run_data.wrist.value_counts()

1    46258
0    42330
Name: wrist, dtype: int64

In [82]:
run_data.shape

(88588, 11)

In [83]:
run_data.describe()

Unnamed: 0,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
count,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0
mean,0.52217,0.500801,-0.074811,-0.562585,-0.313956,0.00416,0.037203,0.022327
std,0.499511,0.500002,1.009299,0.658458,0.486815,1.253423,1.198725,1.914423
min,0.0,0.0,-5.3505,-3.299,-3.7538,-4.4306,-7.4647,-9.48
25%,0.0,0.0,-0.3818,-1.0335,-0.376,-0.9207,-0.644825,-1.345125
50%,1.0,1.0,-0.0595,-0.7591,-0.221,0.0187,0.0393,0.0069
75%,1.0,1.0,0.3555,-0.241775,-0.0859,0.8888,0.7337,1.3982
max,1.0,1.0,5.6033,2.668,1.6403,4.8742,8.498,11.2662


## 2. Let the target variable ‘y’ be the activity and assign all the columns after it to ‘x’.  

In [84]:
X = run_data.iloc[:,5:]
y = run_data['activity']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=85)

In [85]:
X.head()

Unnamed: 0,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
1,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
3,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
4,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [86]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: activity, dtype: int64

##  3. Using Scikit-learn fit a Gaussian Naive Bayes model and observe the accuracy. Generate a classification report using scikit learn. 

In [87]:
gnb = GaussianNB()
normal_pred = gnb.fit(X_train,y_train).predict(X_test)

In [88]:
metrics.accuracy_score(y_test,normal_pred)

0.9558076532339993

In [89]:
from sklearn.metrics import confusion_matrix
cnf_matrix_gnb = confusion_matrix(y_test,normal_pred)
print(cnf_matrix_gnb)

[[8745  101]
 [ 682 8190]]


In [90]:
print("Number of mislabeled points out of a total %d points : %d" % (len(y_test),(y_test != normal_pred).sum()))

Number of mislabeled points out of a total 17718 points : 783


## 4.Repeat the model once using only the acceleration values as predictors and then using only the gyro values as predictors. Comment on the difference in accuracy between both the models.

### only the acceleration values as predictors

In [91]:
run_data.head()

Unnamed: 0,date,time,username,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
0,2017-6-30,13:51:15:847724020,viktor,0,0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
1,2017-6-30,13:51:16:246945023,viktor,0,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2,2017-6-30,13:51:16:446233987,viktor,0,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
3,2017-6-30,13:51:16:646117985,viktor,0,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
4,2017-6-30,13:51:16:846738994,viktor,0,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [92]:
X = run_data.iloc[:,5:8]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=85)

In [93]:
X.head()

Unnamed: 0,acceleration_x,acceleration_y,acceleration_z
0,0.265,-0.7814,-0.0076
1,0.6722,-1.1233,-0.2344
2,0.4399,-1.4817,0.0722
3,0.3031,-0.8125,0.0888
4,0.4814,-0.9312,0.0359


In [94]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: activity, dtype: int64

In [95]:
gnb = GaussianNB()
accelerate_pred = gnb.fit(X_train,y_train).predict(X_test)
metrics.accuracy_score(y_test,accelerate_pred)

0.9573879670391692

### only the gyro values as predictors

In [96]:
X = run_data.iloc[:,8:]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=85)

In [97]:
X.head()

Unnamed: 0,gyro_x,gyro_y,gyro_z
0,-0.059,0.0325,-2.9296
1,-0.1757,0.0208,0.1269
2,-0.9105,0.1063,-2.4367
3,0.1199,-0.4099,-2.9336
4,0.0527,0.4379,2.4922


In [98]:
gnb = GaussianNB()
gyro_pred = gnb.fit(X_train,y_train).predict(X_test)
metrics.accuracy_score(y_test,gyro_pred)

0.6511457275087482

In [99]:
## Gyro is not the good predictors to predict the output while we can predict the output better with Acceleration