In [60]:
# # Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [61]:
# # install joblib. This will be used to save your model. 
# # Restart your kernel after installing 
# !pip install joblib

In [62]:
import pandas as pd
import numpy as np
import math
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the CSV and Perform Basic Data Cleaning

In [63]:
# Read columns names
columns_df = pd.read_csv('data/features.txt', header=None)
columns = columns_df[0]
columns

0      tBodyAcc-Mean-1                
1      tBodyAcc-Mean-2                
2      tBodyAcc-Mean-3                
3      tBodyAcc-STD-1                 
4      tBodyAcc-STD-2                 
                    ...               
556    tBodyGyro-AngleWRTGravity-1    
557    tBodyGyroJerk-AngleWRTGravity-1
558    tXAxisAcc-AngleWRTGravity-1    
559    tYAxisAcc-AngleWRTGravity-1    
560    tZAxisAcc-AngleWRTGravity-1    
Name: 0, Length: 561, dtype: object

In [64]:
# Remove extra spaces from the columns' names
columns_clean = []
for i in range(len(columns)):
    column_name = columns[i]
    split_name = column_name.split(" ")[0]
    columns_clean.append(split_name)
print(columns_clean)

['tBodyAcc-Mean-1', 'tBodyAcc-Mean-2', 'tBodyAcc-Mean-3', 'tBodyAcc-STD-1', 'tBodyAcc-STD-2', 'tBodyAcc-STD-3', 'tBodyAcc-Mad-1', 'tBodyAcc-Mad-2', 'tBodyAcc-Mad-3', 'tBodyAcc-Max-1', 'tBodyAcc-Max-2', 'tBodyAcc-Max-3', 'tBodyAcc-Min-1', 'tBodyAcc-Min-2', 'tBodyAcc-Min-3', 'tBodyAcc-SMA-1', 'tBodyAcc-Energy-1', 'tBodyAcc-Energy-2', 'tBodyAcc-Energy-3', 'tBodyAcc-IQR-1', 'tBodyAcc-IQR-2', 'tBodyAcc-IQR-3', 'tBodyAcc-ropy-1', 'tBodyAcc-ropy-1', 'tBodyAcc-ropy-1', 'tBodyAcc-ARCoeff-1', 'tBodyAcc-ARCoeff-2', 'tBodyAcc-ARCoeff-3', 'tBodyAcc-ARCoeff-4', 'tBodyAcc-ARCoeff-5', 'tBodyAcc-ARCoeff-6', 'tBodyAcc-ARCoeff-7', 'tBodyAcc-ARCoeff-8', 'tBodyAcc-ARCoeff-9', 'tBodyAcc-ARCoeff-10', 'tBodyAcc-ARCoeff-11', 'tBodyAcc-ARCoeff-12', 'tBodyAcc-Correlation-1', 'tBodyAcc-Correlation-2', 'tBodyAcc-Correlation-3', 'tGravityAcc-Mean-1', 'tGravityAcc-Mean-2', 'tGravityAcc-Mean-3', 'tGravityAcc-STD-1', 'tGravityAcc-STD-2', 'tGravityAcc-STD-3', 'tGravityAcc-Mad-1', 'tGravityAcc-Mad-2', 'tGravityAcc-Mad-3

In [65]:
# Read Test csv into df and add header
X_train = pd.read_csv('data/Train/X_train.txt', sep='\s+', header=None)
X_train.columns = columns_clean
X_train

Unnamed: 0,tBodyAcc-Mean-1,tBodyAcc-Mean-2,tBodyAcc-Mean-3,tBodyAcc-STD-1,tBodyAcc-STD-2,tBodyAcc-STD-3,tBodyAcc-Mad-1,tBodyAcc-Mad-2,tBodyAcc-Mad-3,tBodyAcc-Max-1,...,fBodyGyroJerkMag-MeanFreq-1,fBodyGyroJerkMag-Skewness-1,fBodyGyroJerkMag-Kurtosis-1,tBodyAcc-AngleWRTGravity-1,tBodyAccJerk-AngleWRTGravity-1,tBodyGyro-AngleWRTGravity-1,tBodyGyroJerk-AngleWRTGravity-1,tXAxisAcc-AngleWRTGravity-1,tYAxisAcc-AngleWRTGravity-1,tZAxisAcc-AngleWRTGravity-1
0,0.043580,-0.005970,-0.035054,-0.995381,-0.988366,-0.937382,-0.995007,-0.988816,-0.953325,-0.794796,...,-0.012236,-0.314848,-0.713308,-0.112754,0.030400,-0.464761,-0.018446,-0.841559,0.179913,-0.051718
1,0.039480,-0.002131,-0.029067,-0.998348,-0.982945,-0.971273,-0.998702,-0.983315,-0.974000,-0.802537,...,0.202804,-0.603199,-0.860677,0.053477,-0.007435,-0.732626,0.703511,-0.845092,0.180261,-0.047436
2,0.039978,-0.005153,-0.022651,-0.995482,-0.977314,-0.984760,-0.996415,-0.975835,-0.985973,-0.798477,...,0.440079,-0.404427,-0.761847,-0.118559,0.177899,0.100699,0.808529,-0.849230,0.180610,-0.042271
3,0.039785,-0.011809,-0.028916,-0.996194,-0.988569,-0.993256,-0.996994,-0.988526,-0.993135,-0.798477,...,0.430891,-0.138373,-0.491604,-0.036788,-0.012892,0.640011,-0.485366,-0.848947,0.181907,-0.040826
4,0.038758,-0.002289,-0.023863,-0.998241,-0.986774,-0.993115,-0.998216,-0.986479,-0.993825,-0.801982,...,0.137735,-0.366214,-0.702490,0.123320,0.122542,0.693578,-0.615971,-0.848164,0.185124,-0.037080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7762,0.048048,-0.042445,-0.065884,-0.195448,-0.278326,-0.219954,-0.282233,-0.305861,-0.357803,0.267874,...,-0.008381,-0.596760,-0.879026,-0.190437,0.829718,0.206972,-0.425619,-0.792292,0.238580,0.056020
7763,0.037639,0.006430,-0.044345,-0.235372,-0.302680,-0.232843,-0.322483,-0.354464,-0.345592,0.181271,...,0.209452,-0.404418,-0.684496,0.064907,0.875679,-0.879033,0.400219,-0.772288,0.252653,0.056252
7764,0.037451,-0.002724,0.021009,-0.218281,-0.378082,-0.076950,-0.304446,-0.400661,-0.193071,0.113141,...,0.237003,0.000207,-0.317314,0.052806,-0.266724,0.864404,0.701169,-0.779566,0.249121,0.047071
7765,0.044011,-0.004536,-0.051242,-0.219202,-0.383350,-0.081035,-0.310419,-0.380233,-0.201007,0.166671,...,0.069366,0.037919,-0.356579,-0.101360,0.700740,0.936674,-0.589479,-0.785603,0.246409,0.031700


In [66]:
# Read target data
labels = pd.read_csv('data/activity_labels.txt', header=None)
labels

Unnamed: 0,0
0,1 WALKING
1,2 WALKING_UPSTAIRS
2,3 WALKING_DOWNSTAIRS
3,4 SITTING
4,5 STANDING
5,6 LAYING
6,7 STAND_TO_SIT
7,8 SIT_TO_STAND
8,9 SIT_TO_LIE
9,10 LIE_TO_SIT


In [67]:
# Read target data and rename the column
y_train = pd.read_csv('data/Train/y_train.txt', sep='\s+', header=None)
y_train = y_train.rename(columns={0: "Activity"})
y_train

Unnamed: 0,Activity
0,5
1,5
2,5
3,5
4,5
...,...
7762,2
7763,2
7764,2
7765,2


In [68]:
# Merge X and y data into one df for correlation matrix
df = y_train.join(X_train, how='outer')

In [69]:
df.to_csv("data/master.csv")

## Use seaborn to check which columns correlate the most

In [70]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [71]:
# Make a copy of the master df
exp_df = df.copy()
exp_df

Unnamed: 0,Activity,tBodyAcc-Mean-1,tBodyAcc-Mean-2,tBodyAcc-Mean-3,tBodyAcc-STD-1,tBodyAcc-STD-2,tBodyAcc-STD-3,tBodyAcc-Mad-1,tBodyAcc-Mad-2,tBodyAcc-Mad-3,...,fBodyGyroJerkMag-MeanFreq-1,fBodyGyroJerkMag-Skewness-1,fBodyGyroJerkMag-Kurtosis-1,tBodyAcc-AngleWRTGravity-1,tBodyAccJerk-AngleWRTGravity-1,tBodyGyro-AngleWRTGravity-1,tBodyGyroJerk-AngleWRTGravity-1,tXAxisAcc-AngleWRTGravity-1,tYAxisAcc-AngleWRTGravity-1,tZAxisAcc-AngleWRTGravity-1
0,5,0.043580,-0.005970,-0.035054,-0.995381,-0.988366,-0.937382,-0.995007,-0.988816,-0.953325,...,-0.012236,-0.314848,-0.713308,-0.112754,0.030400,-0.464761,-0.018446,-0.841559,0.179913,-0.051718
1,5,0.039480,-0.002131,-0.029067,-0.998348,-0.982945,-0.971273,-0.998702,-0.983315,-0.974000,...,0.202804,-0.603199,-0.860677,0.053477,-0.007435,-0.732626,0.703511,-0.845092,0.180261,-0.047436
2,5,0.039978,-0.005153,-0.022651,-0.995482,-0.977314,-0.984760,-0.996415,-0.975835,-0.985973,...,0.440079,-0.404427,-0.761847,-0.118559,0.177899,0.100699,0.808529,-0.849230,0.180610,-0.042271
3,5,0.039785,-0.011809,-0.028916,-0.996194,-0.988569,-0.993256,-0.996994,-0.988526,-0.993135,...,0.430891,-0.138373,-0.491604,-0.036788,-0.012892,0.640011,-0.485366,-0.848947,0.181907,-0.040826
4,5,0.038758,-0.002289,-0.023863,-0.998241,-0.986774,-0.993115,-0.998216,-0.986479,-0.993825,...,0.137735,-0.366214,-0.702490,0.123320,0.122542,0.693578,-0.615971,-0.848164,0.185124,-0.037080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7762,2,0.048048,-0.042445,-0.065884,-0.195448,-0.278326,-0.219954,-0.282233,-0.305861,-0.357803,...,-0.008381,-0.596760,-0.879026,-0.190437,0.829718,0.206972,-0.425619,-0.792292,0.238580,0.056020
7763,2,0.037639,0.006430,-0.044345,-0.235372,-0.302680,-0.232843,-0.322483,-0.354464,-0.345592,...,0.209452,-0.404418,-0.684496,0.064907,0.875679,-0.879033,0.400219,-0.772288,0.252653,0.056252
7764,2,0.037451,-0.002724,0.021009,-0.218281,-0.378082,-0.076950,-0.304446,-0.400661,-0.193071,...,0.237003,0.000207,-0.317314,0.052806,-0.266724,0.864404,0.701169,-0.779566,0.249121,0.047071
7765,2,0.044011,-0.004536,-0.051242,-0.219202,-0.383350,-0.081035,-0.310419,-0.380233,-0.201007,...,0.069366,0.037919,-0.356579,-0.101360,0.700740,0.936674,-0.589479,-0.785603,0.246409,0.031700


# Random Forest classifier

In [72]:
from sklearn.ensemble import RandomForestClassifier

## Convert koi_disposition column to numeric

In [73]:
# Convert "Activity" column to numeric
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
exp_df['Activity'] = le.fit_transform(df['Activity'].astype(str))

In [74]:
# Check unique values
exp_df['Activity'].unique()

array([ 7,  9,  6, 10,  2,  8,  1, 11,  3,  0,  5,  4])

In [75]:
# X = df.drop('koi_disposition', axis=1)
# y = df['koi_disposition']
# print(X.shape, y.shape)

## Create a Train Test Split
- Note: current data set has already split the data into Train and Test sets

Use `Activity` for the y values

In [76]:
# # Make a copy of the df
# feat_df = selected_features

In [77]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [78]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)

In [79]:
X_train.head()

Unnamed: 0,tBodyAcc-Mean-1,tBodyAcc-Mean-2,tBodyAcc-Mean-3,tBodyAcc-STD-1,tBodyAcc-STD-2,tBodyAcc-STD-3,tBodyAcc-Mad-1,tBodyAcc-Mad-2,tBodyAcc-Mad-3,tBodyAcc-Max-1,...,fBodyGyroJerkMag-MeanFreq-1,fBodyGyroJerkMag-Skewness-1,fBodyGyroJerkMag-Kurtosis-1,tBodyAcc-AngleWRTGravity-1,tBodyAccJerk-AngleWRTGravity-1,tBodyGyro-AngleWRTGravity-1,tBodyGyroJerk-AngleWRTGravity-1,tXAxisAcc-AngleWRTGravity-1,tYAxisAcc-AngleWRTGravity-1,tZAxisAcc-AngleWRTGravity-1
0,0.04358,-0.00597,-0.035054,-0.995381,-0.988366,-0.937382,-0.995007,-0.988816,-0.953325,-0.794796,...,-0.012236,-0.314848,-0.713308,-0.112754,0.0304,-0.464761,-0.018446,-0.841559,0.179913,-0.051718
1,0.03948,-0.002131,-0.029067,-0.998348,-0.982945,-0.971273,-0.998702,-0.983315,-0.974,-0.802537,...,0.202804,-0.603199,-0.860677,0.053477,-0.007435,-0.732626,0.703511,-0.845092,0.180261,-0.047436
2,0.039978,-0.005153,-0.022651,-0.995482,-0.977314,-0.98476,-0.996415,-0.975835,-0.985973,-0.798477,...,0.440079,-0.404427,-0.761847,-0.118559,0.177899,0.100699,0.808529,-0.84923,0.18061,-0.042271
3,0.039785,-0.011809,-0.028916,-0.996194,-0.988569,-0.993256,-0.996994,-0.988526,-0.993135,-0.798477,...,0.430891,-0.138373,-0.491604,-0.036788,-0.012892,0.640011,-0.485366,-0.848947,0.181907,-0.040826
4,0.038758,-0.002289,-0.023863,-0.998241,-0.986774,-0.993115,-0.998216,-0.986479,-0.993825,-0.801982,...,0.137735,-0.366214,-0.70249,0.12332,0.122542,0.693578,-0.615971,-0.848164,0.185124,-0.03708


In [80]:
y_train = y_train['Activity']

In [81]:
# y_train = y_train.values.reshape(-1,1)

In [82]:
rf = rf.fit(X_train, y_train)

In [83]:
rf.score(X_train, y_train)

1.0

In [84]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.00143244, 0.00096253, 0.00051117, 0.0083485 , 0.00043654,
       0.00040968, 0.00603915, 0.00041969, 0.00027168, 0.00856978,
       0.00073532, 0.00043975, 0.00059631, 0.00048927, 0.00025186,
       0.00107048, 0.00661453, 0.0010891 , 0.00053336, 0.00239414,
       0.00049176, 0.00025193, 0.00172748, 0.00106122, 0.00046133,
       0.00051004, 0.00016862, 0.00023854, 0.00036941, 0.00021871,
       0.00019469, 0.00017678, 0.00023421, 0.00023997, 0.00021738,
       0.0001969 , 0.00023299, 0.00395948, 0.00109392, 0.00178714,
       0.02329406, 0.02418386, 0.00851795, 0.0026315 , 0.00152872,
       0.00094498, 0.00248281, 0.00203805, 0.00066658, 0.02210072,
       0.02797825, 0.00949879, 0.02284976, 0.02903259, 0.00732658,
       0.00341192, 0.03111578, 0.01295576, 0.00789801, 0.0015824 ,
       0.00095665, 0.00025345, 0.00249319, 0.00720852, 0.00052532,
       0.00514311, 0.00457885, 0.00353843, 0.00203561, 0.00836954,
       0.00595069, 0.0049876 , 0.00425734, 0.00833442, 0.00675

In [85]:
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, exp_df.columns), reverse=True)

[(0.031115779955236095, 'tGravityAcc-SMA-1'),
 (0.02903259469848765, 'tGravityAcc-Min-1'),
 (0.027978250647361907, 'tGravityAcc-Max-1'),
 (0.02754116353968352, 'tBodyGyroJerk-AngleWRTGravity-1'),
 (0.02695395005910205, 'tXAxisAcc-AngleWRTGravity-1'),
 (0.024183855506473617, 'tGravityAcc-Mean-1'),
 (0.02329405951209556, 'tBodyAcc-Correlation-3'),
 (0.02284976368777465, 'tGravityAcc-Max-3'),
 (0.022100717354844087, 'tGravityAcc-Mad-3'),
 (0.012955763356279197, 'tGravityAcc-Energy-1'),
 (0.009755599114900928, 'tYAxisAcc-AngleWRTGravity-1'),
 (0.009498785876175036, 'tGravityAcc-Max-2'),
 (0.009076264726724554, 'fBodyAccJerk-BandsEnergyOld-8'),
 (0.008703534066599903, 'fBodyAcc-BandsEnergyOld-12'),
 (0.008620762078239301, 'fBodyAccMag-Mean-1'),
 (0.008569778478168501, 'tBodyAcc-Mad-3'),
 (0.008517953217336807, 'tGravityAcc-Mean-2'),
 (0.008440208621035058, 'tBodyAccJerk-IQR-1'),
 (0.00836953654046138, 'tGravityAcc-ARCoeff-4'),
 (0.008348496170714299, 'tBodyAcc-Mean-3'),
 (0.0083344156703057

In [87]:
feat_df = exp_df[['tGravityAcc-SMA-1', 'tGravityAcc-Min-1', 'tGravityAcc-Max-1',
                  'tBodyGyroJerk-AngleWRTGravity-1', 'tXAxisAcc-AngleWRTGravity-1',
                  'tGravityAcc-Mean-1', 'tBodyAcc-Correlation-3', 'tGravityAcc-Mad-3',
                  'tGravityAcc-Energy-1', 'Activity']]
feat_df.head()

Unnamed: 0,tGravityAcc-SMA-1,tGravityAcc-Min-1,tGravityAcc-Max-1,tBodyGyroJerk-AngleWRTGravity-1,tXAxisAcc-AngleWRTGravity-1,tGravityAcc-Mean-1,tBodyAcc-Correlation-3,tGravityAcc-Mad-3,tGravityAcc-Energy-1,Activity
0,-0.374464,0.977436,0.892055,-0.018446,-0.841559,0.960051,0.660199,-0.931834,0.891821,7
1,-0.38248,0.98452,0.89206,0.703511,-0.845092,0.963215,0.578649,-0.956796,0.900148,7
2,-0.40068,0.98677,0.892401,0.808529,-0.84923,0.963532,0.608219,-0.995437,0.900984,7
3,-0.399355,0.986821,0.893817,-0.485366,-0.848947,0.964269,0.506602,-0.986065,0.902929,7
4,-0.399553,0.987434,0.893817,-0.615971,-0.848164,0.964878,0.598515,-0.986376,0.904536,7


In [88]:
# # Select the feature and target data
# X = new_df.drop('koi_disposition', axis=1)
# y = new_df['koi_disposition']
# print(X.shape, y.shape)

In [89]:
# # Split into training and testing datasets
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [90]:
# Scale your data
scaler = MinMaxScaler().fit(X_train)

# Train the Model with , K Neighbor classifier



In [94]:
# Read X_test data
X_test = pd.read_csv("data/Test/X_test.txt", sep='\s+', header=None)
X_test.columns = columns_clean
X_test

Unnamed: 0,tBodyAcc-Mean-1,tBodyAcc-Mean-2,tBodyAcc-Mean-3,tBodyAcc-STD-1,tBodyAcc-STD-2,tBodyAcc-STD-3,tBodyAcc-Mad-1,tBodyAcc-Mad-2,tBodyAcc-Mad-3,tBodyAcc-Max-1,...,fBodyGyroJerkMag-MeanFreq-1,fBodyGyroJerkMag-Skewness-1,fBodyGyroJerkMag-Kurtosis-1,tBodyAcc-AngleWRTGravity-1,tBodyAccJerk-AngleWRTGravity-1,tBodyGyro-AngleWRTGravity-1,tBodyGyroJerk-AngleWRTGravity-1,tXAxisAcc-AngleWRTGravity-1,tYAxisAcc-AngleWRTGravity-1,tZAxisAcc-AngleWRTGravity-1
0,0.030914,-0.008927,0.040382,-0.938504,-0.944626,-0.759334,-0.952398,-0.950281,-0.802483,-0.757099,...,0.122830,-0.345684,-0.709087,0.006462,0.162920,-0.825886,0.271151,-0.720559,0.276779,-0.051074
1,0.042548,0.001079,-0.026236,-0.975516,-0.977502,-0.960146,-0.986694,-0.978983,-0.966820,-0.757099,...,-0.314688,-0.142804,-0.600867,-0.083495,0.017500,-0.434375,0.920593,-0.698684,0.281322,-0.076825
2,0.038297,-0.011660,-0.025643,-0.993922,-0.979215,-0.973030,-0.994298,-0.980535,-0.977508,-0.799005,...,0.114730,-0.209525,-0.645500,-0.034956,0.202302,0.064103,0.145068,-0.703355,0.280062,-0.072302
3,0.036205,-0.018148,-0.025240,-0.994845,-0.981534,-0.976175,-0.995169,-0.983020,-0.980785,-0.798401,...,0.164515,-0.359352,-0.738474,-0.017067,0.154438,0.340134,0.296407,-0.699545,0.284093,-0.070079
4,0.038034,-0.013437,-0.032899,-0.993955,-0.977493,-0.984290,-0.994006,-0.977354,-0.985899,-0.798401,...,-0.056085,-0.544467,-0.846150,-0.002223,-0.040046,0.736715,-0.118545,-0.692849,0.290701,-0.066849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3157,0.052279,-0.038687,-0.013494,-0.287933,-0.403601,-0.431550,-0.356012,-0.433826,-0.534817,0.244280,...,0.125445,-0.390349,-0.752788,-0.337422,0.346295,0.884904,-0.698885,-0.652416,0.274605,0.190104
3158,0.073745,-0.024673,-0.017836,-0.305455,-0.286487,-0.417996,-0.373468,-0.354849,-0.557925,0.244280,...,0.150787,-0.336001,-0.703532,-0.736701,-0.372889,-0.657421,0.322548,-0.655858,0.273556,0.187748
3159,0.068334,0.043822,-0.024135,-0.329706,-0.335273,-0.456965,-0.387944,-0.423504,-0.604403,0.079255,...,-0.004765,-0.139894,-0.476335,-0.181560,0.088574,0.696664,0.363139,-0.656034,0.274457,0.186527
3160,0.023016,0.032345,-0.011830,-0.323182,-0.465502,-0.426108,-0.392307,-0.520848,-0.569571,0.079255,...,0.013540,-0.224141,-0.623083,0.444558,-0.819188,0.929294,-0.008398,-0.660387,0.264759,0.192865


In [95]:
# Transform the training and testing data using the X_scaler and y_scaler models
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [96]:
scaler = MinMaxScaler()
scaler.fit(X_train_scaled)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [97]:
from sklearn.neighbors import KNeighborsClassifier
scaler = KNeighborsClassifier()
scaler.fit(X_train_scaled, y_train)
test_score = scaler.predict(X_test_scaled)
test_score

array([5, 5, 5, ..., 2, 2, 1])

In [103]:
# Load y_test data
y_test = pd.read_csv("data/Test/y_test.txt", sep='\s+', header=None)
y_test = y_test.rename(columns={0: "Activity"})
y_test

Unnamed: 0,Activity
0,5
1,5
2,5
3,5
4,5
...,...
3157,2
3158,2
3159,2
3160,2


In [None]:
y_test.to_csv("test.csv")

In [104]:
print(f"Training Data Score: {scaler.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {scaler.score(X_test_scaled, y_test)}")

Training Data Score: 0.9778550276812153
Testing Data Score: 0.8870967741935484


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [105]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

In [110]:
# Train the model with GridSearch
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=300, random_state=0)

rf_feature_imp = RandomForestClassifier(100)

feat_selection = SelectFromModel(rf_feature_imp, threshold=0.5)

model = Pipeline([
          ('fs', feat_selection), 
          ('clf', classifier), 
        ])

grid_param = {
    'n_estimators': [20, 60, 100],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [111]:
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators=300, random_state=0)

In [112]:
grid = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

In [113]:
grid.fit(X_train, y_train)

KeyboardInterrupt: 

In [33]:
print(grid.best_params_)
print(grid.best_score_)

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 20}
0.7354568873299907


# Predict

In [34]:
predicted_class = grid2.predict(X_test)
print(predicted_class)

[1 2 2 ... 2 2 1]


In [35]:
predictions = grid2.predict(X_test)
print(f"True output: {y[0]}")
print(f"Predicted output: {predictions[0]}")
print(f"Prediction Error: {predictions[0]-y[0]}")

True output: 1
Predicted output: 1
Prediction Error: 0


In [36]:
predictions = grid2.predict(X_test)
pred = {}
for i in range(len(predictions)):
    print(f"True output: {y[i]}")
    print(f"Predicted output: {predictions[i]}")
    print(f"Prediction Error: {predictions[i]-y[i]}")

True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True outp

Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Predictio

True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 1
Prediction Error: -1
Tr

True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True ou

Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction

Predicted output: 2
Prediction Error: 2
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 1
Predicted

Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Predicti

Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1


# Save the Model

In [37]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'model_final_grid2.sav'
joblib.dump(grid2, filename)

['model_final_grid2.sav']

In [38]:
model = joblib.load(filename)
model

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

## Predict

In [39]:
predictions = grid2.predict(X_test)
print(f"True output: {y[0]}")
print(f"Predicted output: {predictions[0]}")
print(f"Prediction Error: {predictions[0]-y[0]}")

True output: 1
Predicted output: 1
Prediction Error: 0


In [40]:
pred = grid2.best_estimator_.predict(X_test)
print(f"True output: {y[0]}")
print(f"Predicted output: {predictions[0]}")
print(f"Prediction Error: {predictions[0]-y[0]}")

True output: 1
Predicted output: 1
Prediction Error: 0


# Test 1 - SVM

In [41]:
# Try accuracy_score
from sklearn.svm import SVC
clf_svc = SVC()
# Create classifier / model
clf_svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [42]:
pred = clf_svc.predict(X_test)
print(pred)

[1 2 2 ... 2 2 1]


In [43]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, y_test)

def submitAccuracy():
    return acc
submitAccuracy()

0.7511441647597255