In [81]:
# # Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [82]:
# # install joblib. This will be used to save your model. 
# # Restart your kernel after installing 
# !pip install joblib

In [83]:
import pandas as pd
import numpy as np
import math
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the CSV and Perform Basic Data Cleaning

In [84]:
# Read columns names
columns_df = pd.read_csv('data/features.txt', header=None)
columns = columns_df[0]
columns

0      tBodyAcc-Mean-1                
1      tBodyAcc-Mean-2                
2      tBodyAcc-Mean-3                
3      tBodyAcc-STD-1                 
4      tBodyAcc-STD-2                 
                    ...               
556    tBodyGyro-AngleWRTGravity-1    
557    tBodyGyroJerk-AngleWRTGravity-1
558    tXAxisAcc-AngleWRTGravity-1    
559    tYAxisAcc-AngleWRTGravity-1    
560    tZAxisAcc-AngleWRTGravity-1    
Name: 0, Length: 561, dtype: object

In [85]:
# Read Test csv into df and add header
X_train = pd.read_csv('data/Train/X_train.txt', sep='\s+', header=None)
X_train.columns = columns
X_train

Unnamed: 0,tBodyAcc-Mean-1,tBodyAcc-Mean-2,tBodyAcc-Mean-3,tBodyAcc-STD-1,tBodyAcc-STD-2,tBodyAcc-STD-3,tBodyAcc-Mad-1,tBodyAcc-Mad-2,tBodyAcc-Mad-3,tBodyAcc-Max-1,...,fBodyGyroJerkMag-MeanFreq-1,fBodyGyroJerkMag-Skewness-1,fBodyGyroJerkMag-Kurtosis-1,tBodyAcc-AngleWRTGravity-1,tBodyAccJerk-AngleWRTGravity-1,tBodyGyro-AngleWRTGravity-1,tBodyGyroJerk-AngleWRTGravity-1,tXAxisAcc-AngleWRTGravity-1,tYAxisAcc-AngleWRTGravity-1,tZAxisAcc-AngleWRTGravity-1
0,0.043580,-0.005970,-0.035054,-0.995381,-0.988366,-0.937382,-0.995007,-0.988816,-0.953325,-0.794796,...,-0.012236,-0.314848,-0.713308,-0.112754,0.030400,-0.464761,-0.018446,-0.841559,0.179913,-0.051718
1,0.039480,-0.002131,-0.029067,-0.998348,-0.982945,-0.971273,-0.998702,-0.983315,-0.974000,-0.802537,...,0.202804,-0.603199,-0.860677,0.053477,-0.007435,-0.732626,0.703511,-0.845092,0.180261,-0.047436
2,0.039978,-0.005153,-0.022651,-0.995482,-0.977314,-0.984760,-0.996415,-0.975835,-0.985973,-0.798477,...,0.440079,-0.404427,-0.761847,-0.118559,0.177899,0.100699,0.808529,-0.849230,0.180610,-0.042271
3,0.039785,-0.011809,-0.028916,-0.996194,-0.988569,-0.993256,-0.996994,-0.988526,-0.993135,-0.798477,...,0.430891,-0.138373,-0.491604,-0.036788,-0.012892,0.640011,-0.485366,-0.848947,0.181907,-0.040826
4,0.038758,-0.002289,-0.023863,-0.998241,-0.986774,-0.993115,-0.998216,-0.986479,-0.993825,-0.801982,...,0.137735,-0.366214,-0.702490,0.123320,0.122542,0.693578,-0.615971,-0.848164,0.185124,-0.037080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7762,0.048048,-0.042445,-0.065884,-0.195448,-0.278326,-0.219954,-0.282233,-0.305861,-0.357803,0.267874,...,-0.008381,-0.596760,-0.879026,-0.190437,0.829718,0.206972,-0.425619,-0.792292,0.238580,0.056020
7763,0.037639,0.006430,-0.044345,-0.235372,-0.302680,-0.232843,-0.322483,-0.354464,-0.345592,0.181271,...,0.209452,-0.404418,-0.684496,0.064907,0.875679,-0.879033,0.400219,-0.772288,0.252653,0.056252
7764,0.037451,-0.002724,0.021009,-0.218281,-0.378082,-0.076950,-0.304446,-0.400661,-0.193071,0.113141,...,0.237003,0.000207,-0.317314,0.052806,-0.266724,0.864404,0.701169,-0.779566,0.249121,0.047071
7765,0.044011,-0.004536,-0.051242,-0.219202,-0.383350,-0.081035,-0.310419,-0.380233,-0.201007,0.166671,...,0.069366,0.037919,-0.356579,-0.101360,0.700740,0.936674,-0.589479,-0.785603,0.246409,0.031700


In [86]:
# Read target data
labels = pd.read_csv('data/activity_labels.txt', header=None)
labels

Unnamed: 0,0
0,1 WALKING
1,2 WALKING_UPSTAIRS
2,3 WALKING_DOWNSTAIRS
3,4 SITTING
4,5 STANDING
5,6 LAYING
6,7 STAND_TO_SIT
7,8 SIT_TO_STAND
8,9 SIT_TO_LIE
9,10 LIE_TO_SIT


In [87]:
# Read target data and rename the column
y_train = pd.read_csv('data/Train/y_train.txt', sep='\s+', header=None)
y_train = y_train.rename(columns={0: "Activity"})
y_train

Unnamed: 0,Activity
0,5
1,5
2,5
3,5
4,5
...,...
7762,2
7763,2
7764,2
7765,2


In [88]:
# Merge X and y data into one df for correlation matrix
df = y_train.join(X_train, how='outer')

In [89]:
df.to_csv("data/master.csv")

## Use seaborn to check which columns correlate the most

In [90]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [91]:
# Make a copy of the master df
exp_df = df.copy()
exp_df

Unnamed: 0,Activity,tBodyAcc-Mean-1,tBodyAcc-Mean-2,tBodyAcc-Mean-3,tBodyAcc-STD-1,tBodyAcc-STD-2,tBodyAcc-STD-3,tBodyAcc-Mad-1,tBodyAcc-Mad-2,tBodyAcc-Mad-3,...,fBodyGyroJerkMag-MeanFreq-1,fBodyGyroJerkMag-Skewness-1,fBodyGyroJerkMag-Kurtosis-1,tBodyAcc-AngleWRTGravity-1,tBodyAccJerk-AngleWRTGravity-1,tBodyGyro-AngleWRTGravity-1,tBodyGyroJerk-AngleWRTGravity-1,tXAxisAcc-AngleWRTGravity-1,tYAxisAcc-AngleWRTGravity-1,tZAxisAcc-AngleWRTGravity-1
0,5,0.043580,-0.005970,-0.035054,-0.995381,-0.988366,-0.937382,-0.995007,-0.988816,-0.953325,...,-0.012236,-0.314848,-0.713308,-0.112754,0.030400,-0.464761,-0.018446,-0.841559,0.179913,-0.051718
1,5,0.039480,-0.002131,-0.029067,-0.998348,-0.982945,-0.971273,-0.998702,-0.983315,-0.974000,...,0.202804,-0.603199,-0.860677,0.053477,-0.007435,-0.732626,0.703511,-0.845092,0.180261,-0.047436
2,5,0.039978,-0.005153,-0.022651,-0.995482,-0.977314,-0.984760,-0.996415,-0.975835,-0.985973,...,0.440079,-0.404427,-0.761847,-0.118559,0.177899,0.100699,0.808529,-0.849230,0.180610,-0.042271
3,5,0.039785,-0.011809,-0.028916,-0.996194,-0.988569,-0.993256,-0.996994,-0.988526,-0.993135,...,0.430891,-0.138373,-0.491604,-0.036788,-0.012892,0.640011,-0.485366,-0.848947,0.181907,-0.040826
4,5,0.038758,-0.002289,-0.023863,-0.998241,-0.986774,-0.993115,-0.998216,-0.986479,-0.993825,...,0.137735,-0.366214,-0.702490,0.123320,0.122542,0.693578,-0.615971,-0.848164,0.185124,-0.037080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7762,2,0.048048,-0.042445,-0.065884,-0.195448,-0.278326,-0.219954,-0.282233,-0.305861,-0.357803,...,-0.008381,-0.596760,-0.879026,-0.190437,0.829718,0.206972,-0.425619,-0.792292,0.238580,0.056020
7763,2,0.037639,0.006430,-0.044345,-0.235372,-0.302680,-0.232843,-0.322483,-0.354464,-0.345592,...,0.209452,-0.404418,-0.684496,0.064907,0.875679,-0.879033,0.400219,-0.772288,0.252653,0.056252
7764,2,0.037451,-0.002724,0.021009,-0.218281,-0.378082,-0.076950,-0.304446,-0.400661,-0.193071,...,0.237003,0.000207,-0.317314,0.052806,-0.266724,0.864404,0.701169,-0.779566,0.249121,0.047071
7765,2,0.044011,-0.004536,-0.051242,-0.219202,-0.383350,-0.081035,-0.310419,-0.380233,-0.201007,...,0.069366,0.037919,-0.356579,-0.101360,0.700740,0.936674,-0.589479,-0.785603,0.246409,0.031700


# Random Forest classifier

In [92]:
from sklearn.ensemble import RandomForestClassifier

## Convert koi_disposition column to numeric

In [93]:
# Convert "Activity" column to numeric
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
exp_df['Activity'] = le.fit_transform(df['Activity'].astype(str))

In [94]:
# Check unique values
exp_df['Activity'].unique()

array([ 7,  9,  6, 10,  2,  8,  1, 11,  3,  0,  5,  4])

In [95]:
# X = df.drop('koi_disposition', axis=1)
# y = df['koi_disposition']
# print(X.shape, y.shape)

## Create a Train Test Split
- Note: current data set has already split the data into Train and Test sets

Use `Activity` for the y values

In [96]:
# # Make a copy of the df
# feat_df = selected_features

In [97]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [98]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)

In [99]:
X_train.head()

Unnamed: 0,tBodyAcc-Mean-1,tBodyAcc-Mean-2,tBodyAcc-Mean-3,tBodyAcc-STD-1,tBodyAcc-STD-2,tBodyAcc-STD-3,tBodyAcc-Mad-1,tBodyAcc-Mad-2,tBodyAcc-Mad-3,tBodyAcc-Max-1,...,fBodyGyroJerkMag-MeanFreq-1,fBodyGyroJerkMag-Skewness-1,fBodyGyroJerkMag-Kurtosis-1,tBodyAcc-AngleWRTGravity-1,tBodyAccJerk-AngleWRTGravity-1,tBodyGyro-AngleWRTGravity-1,tBodyGyroJerk-AngleWRTGravity-1,tXAxisAcc-AngleWRTGravity-1,tYAxisAcc-AngleWRTGravity-1,tZAxisAcc-AngleWRTGravity-1
0,0.04358,-0.00597,-0.035054,-0.995381,-0.988366,-0.937382,-0.995007,-0.988816,-0.953325,-0.794796,...,-0.012236,-0.314848,-0.713308,-0.112754,0.0304,-0.464761,-0.018446,-0.841559,0.179913,-0.051718
1,0.03948,-0.002131,-0.029067,-0.998348,-0.982945,-0.971273,-0.998702,-0.983315,-0.974,-0.802537,...,0.202804,-0.603199,-0.860677,0.053477,-0.007435,-0.732626,0.703511,-0.845092,0.180261,-0.047436
2,0.039978,-0.005153,-0.022651,-0.995482,-0.977314,-0.98476,-0.996415,-0.975835,-0.985973,-0.798477,...,0.440079,-0.404427,-0.761847,-0.118559,0.177899,0.100699,0.808529,-0.84923,0.18061,-0.042271
3,0.039785,-0.011809,-0.028916,-0.996194,-0.988569,-0.993256,-0.996994,-0.988526,-0.993135,-0.798477,...,0.430891,-0.138373,-0.491604,-0.036788,-0.012892,0.640011,-0.485366,-0.848947,0.181907,-0.040826
4,0.038758,-0.002289,-0.023863,-0.998241,-0.986774,-0.993115,-0.998216,-0.986479,-0.993825,-0.801982,...,0.137735,-0.366214,-0.70249,0.12332,0.122542,0.693578,-0.615971,-0.848164,0.185124,-0.03708


In [103]:
y_train = y_train['Activity']

In [104]:
# y_train = y_train.values.reshape(-1,1)

In [105]:
rf = rf.fit(X_train, y_train)

In [106]:
rf.score(X_train, y_train)

1.0

In [107]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.00118441, 0.00126466, 0.00047185, 0.00770649, 0.00036634,
       0.00029508, 0.0046141 , 0.00050825, 0.0005209 , 0.00751056,
       0.0008795 , 0.00038932, 0.00110758, 0.00058837, 0.00045929,
       0.00096624, 0.00485183, 0.00110309, 0.0004816 , 0.00230263,
       0.00091292, 0.00032597, 0.00225119, 0.00090646, 0.00060657,
       0.00029978, 0.00019368, 0.00027144, 0.00028232, 0.0002155 ,
       0.00013779, 0.0001616 , 0.00021994, 0.00024494, 0.00016665,
       0.00022671, 0.00030702, 0.00479859, 0.00114068, 0.00186291,
       0.02652214, 0.024501  , 0.01133448, 0.00152525, 0.00231279,
       0.00100419, 0.00162582, 0.00185001, 0.00083166, 0.02802099,
       0.02175579, 0.00786718, 0.02649294, 0.02152908, 0.0092404 ,
       0.00531673, 0.02565486, 0.01857888, 0.00537053, 0.00115894,
       0.00123   , 0.00044511, 0.00444436, 0.00579096, 0.00032237,
       0.00583645, 0.00446925, 0.00381695, 0.00289193, 0.00762356,
       0.00642371, 0.00570976, 0.00385376, 0.00633817, 0.00857

In [108]:
# We can sort the features by their importance
sorted(zip(rf.feature_importances_, df.columns), reverse=True)

[(0.0280209928590966, 'tGravityAcc-Mad-3              '),
 (0.026522139725707324, 'tBodyAcc-Correlation-3         '),
 (0.02649294046637727, 'tGravityAcc-Max-3              '),
 (0.025796750068712012, 'tXAxisAcc-AngleWRTGravity-1    '),
 (0.025654864699614727, 'tGravityAcc-SMA-1              '),
 (0.024500999905833556, 'tGravityAcc-Mean-1             '),
 (0.024347379953889744, 'tBodyGyroJerk-AngleWRTGravity-1'),
 (0.021755786859499034, 'tGravityAcc-Max-1              '),
 (0.02152908368094253, 'tGravityAcc-Min-1              '),
 (0.018578882607237047, 'tGravityAcc-Energy-1           '),
 (0.01133448266714726, 'tGravityAcc-Mean-2             '),
 (0.010866254186973446, 'tYAxisAcc-AngleWRTGravity-1    '),
 (0.010747824952793963, 'fBodyAcc-STD-3                 '),
 (0.010052274918639017, 'fBodyAccJerk-Kurtosis-1        '),
 (0.009500643536287018, 'tBodyAccJerk-IQR-1             '),
 (0.00924040323114767, 'tGravityAcc-Min-2              '),
 (0.008761805539555221, 'fBodyAccMag-Mean-1   

In [20]:
feat_df = df[['koi_fpflag_ss','koi_disposition', 'koi_fpflag_nt','koi_fpflag_co']]
feat_df.head()

Unnamed: 0,koi_fpflag_ss,koi_disposition,koi_fpflag_nt,koi_fpflag_co
0,0,1,0,0
1,1,2,0,0
2,1,2,0,0
3,0,1,0,0
4,0,1,0,0


In [21]:
# Select the feature and target data
X = new_df.drop('koi_disposition', axis=1)
y = new_df['koi_disposition']
print(X.shape, y.shape)

(6991, 3) (6991,)


In [22]:
# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [23]:
# Scale your data
scaler = MinMaxScaler().fit(X_train)

# Train the Model



In [24]:
# Transform the training and testing data using the X_scaler and y_scaler models

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
scaler = MinMaxScaler()
scaler.fit(X_train_scaled)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [26]:
from sklearn.neighbors import KNeighborsClassifier
scaler = KNeighborsClassifier()
scaler.fit(X_train_scaled, y_train)
test_score = scaler.predict(X_test_scaled)
test_score

array([0, 2, 2, ..., 2, 2, 0])

In [27]:
print(f"Training Data Score: {scaler.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {scaler.score(X_test_scaled, y_test)}")

Training Data Score: 0.7228685866870113
Testing Data Score: 0.7368421052631579


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [28]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

In [29]:
# Train the model with GridSearch
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=300, random_state=0)

rf_feature_imp = RandomForestClassifier(100)

feat_selection = SelectFromModel(rf_feature_imp, threshold=0.5)

model = Pipeline([
          ('fs', feat_selection), 
          ('clf', classifier), 
        ])

grid_param = {
    'n_estimators': [20, 40, 60, 80, 100],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [30]:
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators=300, random_state=0)

In [31]:
grid2 = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

In [32]:
grid2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

In [33]:
print(grid2.best_params_)
print(grid2.best_score_)

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 20}
0.7354568873299907


# Predict

In [34]:
predicted_class = grid2.predict(X_test)
print(predicted_class)

[1 2 2 ... 2 2 1]


In [35]:
predictions = grid2.predict(X_test)
print(f"True output: {y[0]}")
print(f"Predicted output: {predictions[0]}")
print(f"Prediction Error: {predictions[0]-y[0]}")

True output: 1
Predicted output: 1
Prediction Error: 0


In [36]:
predictions = grid2.predict(X_test)
pred = {}
for i in range(len(predictions)):
    print(f"True output: {y[i]}")
    print(f"Predicted output: {predictions[i]}")
    print(f"Prediction Error: {predictions[i]-y[i]}")

True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True outp

Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Predictio

True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 1
Prediction Error: -1
Tr

True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True ou

Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction

Predicted output: 2
Prediction Error: 2
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 1
Predicted

Prediction Error: 0
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 1
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 2
Predicted output: 1
Prediction Error: -1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 1
Predicted output: 2
Predicti

Prediction Error: 1
True output: 1
Predicted output: 2
Prediction Error: 1
True output: 0
Predicted output: 1
Prediction Error: 1
True output: 0
Predicted output: 2
Prediction Error: 2
True output: 2
Predicted output: 2
Prediction Error: 0
True output: 2
Predicted output: 1
Prediction Error: -1


# Save the Model

In [37]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'model_final_grid2.sav'
joblib.dump(grid2, filename)

['model_final_grid2.sav']

In [38]:
model = joblib.load(filename)
model

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False, random_state=0,
                                   

## Predict

In [39]:
predictions = grid2.predict(X_test)
print(f"True output: {y[0]}")
print(f"Predicted output: {predictions[0]}")
print(f"Prediction Error: {predictions[0]-y[0]}")

True output: 1
Predicted output: 1
Prediction Error: 0


In [40]:
pred = grid2.best_estimator_.predict(X_test)
print(f"True output: {y[0]}")
print(f"Predicted output: {predictions[0]}")
print(f"Prediction Error: {predictions[0]-y[0]}")

True output: 1
Predicted output: 1
Prediction Error: 0


# Test 1 - SVM

In [41]:
# Try accuracy_score
from sklearn.svm import SVC
clf_svc = SVC()
# Create classifier / model
clf_svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [42]:
pred = clf_svc.predict(X_test)
print(pred)

[1 2 2 ... 2 2 1]


In [43]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, y_test)

def submitAccuracy():
    return acc
submitAccuracy()

0.7511441647597255