In [1]:
import pandas as pd
import numpy as np
from os.path import dirname, abspath


In [2]:
d = dirname(dirname(abspath("preprocessing.ipynb")))
df_wokring = pd.read_csv(d + '/data/Semi_processed_data.csv') #nrows = 1000)

df = df_wokring.sample(frac=0.03, replace=False, random_state=22) #3% smaple from df for 22794 observations

feature_names = ['start station name', 'end station name', 'gender',
                 'tripduration', 'Start Time','End Time']

label = 'usertype'
user_type = df['usertype']
df.head()
df.shape

(22794, 12)

In [3]:
nans = lambda df: df[df.isnull().any(axis=1)]


In [4]:
df.drop([label, 'birth year','start station latitude','end station latitude',
        'start station longitude', 'end station longitude'], axis = 1, inplace = True)


In [5]:
df.head()

Unnamed: 0,tripduration,start station name,end station name,gender,Start Time,End Time
35922,224,Jersey & 6th St,Grove St PATH,1,32214.436,32438.436
493350,217,Grove St PATH,Marin Light Rail,1,73627.443,73845.094
558429,680,Riverview Park,Hilltop,1,76253.519,76934.417
285744,228,McGinley Square,Sip Ave,2,24314.744,24543.641
455471,387,Newport Pkwy,Harborside,2,58258.963,58646.882


In [6]:
X = df.loc[:,df.columns != label].values
y = user_type.values

In [7]:
df_y = user_type


In [8]:
#Balance
balance = user_type.value_counts(normalize = True)
balance

Subscriber    0.923357
Customer      0.076643
Name: usertype, dtype: float64

In [9]:
base = balance[0]

In [10]:
print("balance of the data / baseline accuracy = ",base)


balance of the data / baseline accuracy =  0.9233570237781873


* **ML pipeline with logistic regression**

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib
import statistics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.compose import ColumnTransformer
import matplotlib
from matplotlib import pylab as plt
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")


In [13]:
def ML_pipeline_kfold_GridSearchCV_log(X,y,random_state,n_folds):
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # create the pipeline: preprocessor + supervised ML method
    
    feature_names = ['start station name', 'end station name', 'gender',
                     'tripduration', 'Start Time','End Time']

    cat_ftrs = ['start station name', 'end station name', 'gender']
    num_ftrs = ['tripduration', 'Start Time','End Time']
    
    cat_ftrs_i = [df.columns.get_loc(x) for x in cat_ftrs]
    num_ftrs_i = [df.columns.get_loc(x) for x in num_ftrs]
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown = 'ignore'))])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])


    # collect the transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_ftrs_i),
            ('cat', categorical_transformer, cat_ftrs_i)]) 

    pipe = make_pipeline(preprocessor, LogisticRegression(penalty='l1', solver='saga',
                                                          max_iter=10000, multi_class = 'auto', random_state = 22))
    #pipe = make_pipeline(preprocess, LogisticRegression(penalty='l1', solver='saga', max_iter=10000))
    
    # the parameter(s) we want to tune
    param_grid = {'logisticregression__C': [0.1, 1.0, 10, 100]}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                            cv=kf, return_train_score = True,iid=True)
    # do kfold CV on _other
    print("running")
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)



In [14]:
test_scores = []

for i in range(10):
    grid, test_score = ML_pipeline_kfold_GridSearchCV_log(X,y.ravel(),i*22, 5)
    print(grid.best_params_)
    print('best CV score:',grid.best_score_)
    print('test score:',test_score)
    test_scores.append(test_score)
    
print('test accuracy:',np.around(np.mean(test_scores),2),'+/-',np.around(np.std(test_scores),2))

mean_LR_a = np.mean(test_scores)
mean_LR_sd = np.std(test_scores)

running
{'logisticregression__C': 1.0}
best CV score: 0.943844255552509
test score: 0.9447247203334065
running
{'logisticregression__C': 1.0}
best CV score: 0.94510556621881
test score: 0.9420925641588067
running
{'logisticregression__C': 100}
best CV score: 0.9447216890595009
test score: 0.9431892958982233
running
{'logisticregression__C': 1.0}
best CV score: 0.943953934740883
test score: 0.948672954595306
running
{'logisticregression__C': 10}
best CV score: 0.9433506992048258
test score: 0.9464794911164729
running
{'logisticregression__C': 10}
best CV score: 0.9435152179873869
test score: 0.9425312568545734
running
{'logisticregression__C': 1.0}
best CV score: 0.9442281327118179
test score: 0.9451634130291731
running
{'logisticregression__C': 1.0}
best CV score: 0.9437894159583219
test score: 0.9416538714630401
running
{'logisticregression__C': 1.0}
best CV score: 0.94455717027694
test score: 0.9440666812897566
running
{'logisticregression__C': 1.0}
best CV score: 0.945489443378119
t

In [18]:
print("test accuracy:",mean_LR_a)
print("sd: ",mean_LR_sd) 

test accuracy: 0.9438034656722966
sd:  0.00249090933390277


* **SVC**

In [31]:
from sklearn.svm import SVC

In [32]:
df_SVC = df_wokring.sample(frac=0.01, replace=False, random_state=22) #1% smaple from df for 22794 observations

user_type_SVC = df_SVC['usertype']

print(df_SVC.shape)
df_SVC.head()


(7598, 12)


Unnamed: 0,tripduration,start station name,start station latitude,start station longitude,end station name,end station latitude,end station longitude,usertype,birth year,gender,Start Time,End Time
35922,224,Jersey & 6th St,40.725289,-74.045572,Grove St PATH,40.719586,-74.043117,Subscriber,1970.0,1,32214.436,32438.436
493350,217,Grove St PATH,40.719586,-74.043117,Marin Light Rail,40.714584,-74.042817,Subscriber,1985.0,1,73627.443,73845.094
558429,680,Riverview Park,40.744319,-74.043991,Hilltop,40.731169,-74.057574,Subscriber,1979.0,1,76253.519,76934.417
285744,228,McGinley Square,40.72534,-74.067622,Sip Ave,40.730743,-74.063784,Subscriber,1996.0,2,24314.744,24543.641
455471,387,Newport Pkwy,40.728745,-74.032108,Harborside,40.719252,-74.034234,Subscriber,1990.0,2,58258.963,58646.882


In [33]:
nans = lambda df_SVC: df_SVC[df_SVC.isnull().any(axis=1)]

In [34]:
df_SVC.drop([label, 'birth year','start station latitude','end station latitude',
        'start station longitude', 'end station longitude'], axis = 1, inplace = True)
df_SVC.head()


Unnamed: 0,tripduration,start station name,end station name,gender,Start Time,End Time
35922,224,Jersey & 6th St,Grove St PATH,1,32214.436,32438.436
493350,217,Grove St PATH,Marin Light Rail,1,73627.443,73845.094
558429,680,Riverview Park,Hilltop,1,76253.519,76934.417
285744,228,McGinley Square,Sip Ave,2,24314.744,24543.641
455471,387,Newport Pkwy,Harborside,2,58258.963,58646.882


In [35]:
X_SVC = df_SVC.loc[:,df.columns != label].values
y_SVC = user_type_SVC.values

In [43]:
#Balance
balance_svc = user_type_SVC.value_counts(normalize = True)
balance_svc

Subscriber    0.924059
Customer      0.075941
Name: usertype, dtype: float64

In [45]:
base_SVC = balance_svc[0]

In [36]:
def ML_pipeline_kfold_GridSearchCV_SVC(X,y,random_state,n_folds):
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # create the pipeline: preprocessor + supervised ML method
    
    feature_names = ['start station name', 'end station name', 'gender',
                     'tripduration', 'Start Time','End Time']

    cat_ftrs = ['start station name', 'end station name', 'gender']
    num_ftrs = ['tripduration', 'Start Time','End Time']
    
    cat_ftrs_i = [df_SVC.columns.get_loc(x) for x in cat_ftrs]
    num_ftrs_i = [df_SVC.columns.get_loc(x) for x in num_ftrs]

    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown = 'ignore'))])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])


    # collect the transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_ftrs_i),
            ('cat', categorical_transformer, cat_ftrs_i)]) 

    pipe = make_pipeline(preprocessor, SVC(random_state = 22))
    #pipe = make_pipeline(preprocess, LogisticRegression(penalty='l1', solver='saga', max_iter=10000))
    
    # the parameter(s) we want to tune
    param_grid = {'svc__C': np.logspace(-3,4,num=8),'svc__gamma': np.logspace(-3,4,num=8)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                            cv=kf, return_train_score = True,iid=True)
    # do kfold CV on _other
    print("running")
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

In [37]:
test_scores = []

for i in range(5):
    grid, test_score = ML_pipeline_kfold_GridSearchCV_SVC(X_SVC,y_SVC.ravel(),i*22, 5)
    print(grid.best_params_)
    print('best CV score:',grid.best_score_)
    print('test score:',test_score)
    test_scores.append(test_score)
    
print('test accuracy:',np.around(np.mean(test_scores),2),'+/-',np.around(np.std(test_scores),2))

mean_SVC_a = np.around(np.mean(test_scores),2)
mean_SVC_sd = np.around(np.std(test_scores),2)



running
{'svc__C': 10.0, 'svc__gamma': 0.1}
best CV score: 0.9442250740375123
test score: 0.9427631578947369
running
{'svc__C': 10.0, 'svc__gamma': 0.1}
best CV score: 0.9422507403751234
test score: 0.9381578947368421
running
{'svc__C': 1.0, 'svc__gamma': 0.1}
best CV score: 0.9420862125699243
test score: 0.9407894736842105
running
{'svc__C': 1.0, 'svc__gamma': 0.1}
best CV score: 0.9417571569595261
test score: 0.9388157894736842
running
{'svc__C': 1000.0, 'svc__gamma': 0.01}
best CV score: 0.9422507403751234
test score: 0.944078947368421
running


KeyboardInterrupt: 

In [39]:
mean_SVC_a = np.around(np.mean(test_scores),2)
mean_SVC_sd = np.around(np.std(test_scores),2)

print("test accuracy:",mean_SVC_a)
print("sd: ",mean_SVC_sd)

test accuracy: 0.94
sd:  0.0


* **Random Forest**

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
def ML_pipeline_kfold_GridSearchCV_RF(X,y,random_state,n_folds):
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # create the pipeline: preprocessor + supervised ML method
    
    feature_names = ['start station name', 'end station name', 'gender',
                     'tripduration', 'Start Time','End Time']

    cat_ftrs = ['start station name', 'end station name', 'gender']
    num_ftrs = ['tripduration', 'Start Time','End Time']
    
    cat_ftrs_i = [df.columns.get_loc(x) for x in cat_ftrs]
    num_ftrs_i = [df.columns.get_loc(x) for x in num_ftrs]

    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown = 'ignore'))])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])


    # collect the transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_ftrs_i),
            ('cat', categorical_transformer, cat_ftrs_i)]) 

    pipe = make_pipeline(preprocessor, RandomForestClassifier(random_state = 22))
    #pipe = make_pipeline(preprocess, LogisticRegression(penalty='l1', solver='saga', max_iter=10000))
    
    # the parameter(s) we want to tune
    param_grid = {'randomforestclassifier__min_samples_split': range(2,25,5),
                  'randomforestclassifier__max_depth': range(1,30,5)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                            cv=kf, return_train_score = True,iid=True)
    # do kfold CV on _other
    print("running")
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)




In [22]:
test_scores = []

for i in range(10):
    grid, test_score = ML_pipeline_kfold_GridSearchCV_RF(X,y.ravel(),i*22, 5)
    print(grid.best_params_)
    print('best CV score:',grid.best_score_)
    print('test score:',test_score)
    test_scores.append(test_score)
    
print('test accuracy:',np.around(np.mean(test_scores),2),'+/-',np.around(np.std(test_scores),2))

mean_RF_a = np.mean(test_scores)
mean_RF_sd = np.std(test_scores)


running
{'randomforestclassifier__max_depth': 26, 'randomforestclassifier__min_samples_split': 12}
best CV score: 0.9515217987386894
test score: 0.9530598815529722
running
{'randomforestclassifier__max_depth': 26, 'randomforestclassifier__min_samples_split': 7}
best CV score: 0.9521250342747464
test score: 0.9524018425093222
running
{'randomforestclassifier__max_depth': 21, 'randomforestclassifier__min_samples_split': 17}
best CV score: 0.9525089114340554
test score: 0.9497696863347225
running
{'randomforestclassifier__max_depth': 21, 'randomforestclassifier__min_samples_split': 7}
best CV score: 0.9511379215793803
test score: 0.9561307304233384
running
{'randomforestclassifier__max_depth': 26, 'randomforestclassifier__min_samples_split': 17}
best CV score: 0.9523443926514944
test score: 0.9563500767712217
running
{'randomforestclassifier__max_depth': 21, 'randomforestclassifier__min_samples_split': 7}
best CV score: 0.9535508637236084
test score: 0.9502083790304892
running
{'randomfor

In [23]:
print("test accuracy:",mean_RF_a)
print("sd: ",mean_RF_sd) 

test accuracy: 0.9515463917525773
sd:  0.0031732526162933017


* Test all three

In [61]:
base RF_score = 8.883430152912998

0.9233570237781873

In [46]:
LR_score = (mean_LR_a - base)/mean_LR_sd

RF_score = (mean_RF_a - base)/mean_RF_sd

SVC_score = (mean_SVC_a - base_SVC)/mean_LR_sd #/mean_SVC_sd

print("LR_score =",LR_score)
print("RF_score =",RF_score)
print("SVC_score =",SVC_score)


LR_score = 8.208424777177127
RF_score = 8.883430152912998
SVC_score = 6.3996858087378685


* Saving 10 RF modles

In [49]:
def ML_pipeline_kfold_GridSearchCV_RF_3(X,y,random_state,n_folds):
    # create a test set
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    # splitter for _other
    kf = StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=random_state)
    # create the pipeline: preprocessor + supervised ML method
    
    feature_names = ['start station name', 'end station name', 'gender',
                     'tripduration', 'Start Time','End Time']

    cat_ftrs = ['start station name', 'end station name', 'gender']
    num_ftrs = ['tripduration', 'Start Time','End Time']
    
    cat_ftrs_i = [df.columns.get_loc(x) for x in cat_ftrs]
    num_ftrs_i = [df.columns.get_loc(x) for x in num_ftrs]

    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown = 'ignore'))])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])


    # collect the transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_ftrs_i),
            ('cat', categorical_transformer, cat_ftrs_i)]) 

    pipe = make_pipeline(preprocessor, RandomForestClassifier(random_state = 22))
    #pipe = make_pipeline(preprocess, LogisticRegression(penalty='l1', solver='saga', max_iter=10000))
    
    # the parameter(s) we want to tune
    param_grid = {'randomforestclassifier__min_samples_split': range(2,25,5),
                  'randomforestclassifier__max_depth': range(1,30,5)}
    # prepare gridsearch
    grid = GridSearchCV(pipe, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                            cv=kf, return_train_score = True,iid=True)
    # do kfold CV on _other
    print("running")
    grid.fit(X_other, y_other)
    return grid, X_test, y_test


In [52]:
import pickle

best_estimators = []

for i in range(10):
    grid, X_test, y_test = ML_pipeline_kfold_GridSearchCV_RF_3(X,y,22*i,4)
    print(grid.best_score_)
    print(grid.score(X_test,y_test))
    print(grid.best_params_)
    best_estimators.append(grid.best_estimator_)
    
# save the output so I can use it later
file = open(d + '/results/RF Models_best_estimators.save', 'wb')
pickle.dump((best_estimators),file)
file.close()



running
0.9516314779270634
0.9539372669445054
{'randomforestclassifier__max_depth': 21, 'randomforestclassifier__min_samples_split': 17}
running
0.9530024677817384
0.9524018425093222
{'randomforestclassifier__max_depth': 26, 'randomforestclassifier__min_samples_split': 22}
running
0.9521250342747464
0.9502083790304892
{'randomforestclassifier__max_depth': 26, 'randomforestclassifier__min_samples_split': 22}
running
0.9513024403619413
0.9574468085106383
{'randomforestclassifier__max_depth': 26, 'randomforestclassifier__min_samples_split': 22}
running
0.9519605154921854
0.9532792279008554
{'randomforestclassifier__max_depth': 21, 'randomforestclassifier__min_samples_split': 2}
running
0.9534411845352344
0.9513051107699056
{'randomforestclassifier__max_depth': 21, 'randomforestclassifier__min_samples_split': 12}
running
0.9527831094049904
0.9510857644220224
{'randomforestclassifier__max_depth': 21, 'randomforestclassifier__min_samples_split': 12}
running
0.9533863449410475
0.9484536082474