In [112]:
import pandas as pd
import numpy as np
import csv
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, ParameterGrid, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

- `job`
- `education`
- `device`
- `outcome_old`

# import data

In [113]:
campaign_ad = pd.read_csv("MLUnige2023_subscriptions_train.csv", index_col="Id")
campaign_test = pd.read_csv("MLUnige2023_subscriptions_test.csv", index_col="Id")

In [114]:
campaign_test.groupby(by='job').size()

job
entrepreneur         114
freelance            121
housekeeper          100
industrial_worker    747
manager              852
na                    21
retired              242
salesman             341
student              120
teacher              436
technology           629
unemployed           114
dtype: int64

# dummify marital and outcome_old

In [115]:
campaign_ad = pd.get_dummies(campaign_ad, columns=['marital', 'outcome_old'])

# train-valid-test split

In [116]:
X = campaign_ad.drop(columns='subscription')
y = campaign_ad['subscription']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=46)

In [117]:
print('size of training set:', X_train.shape[0])
print('size of validation set:', X_valid.shape[0])
print('size of test set:', X_test.shape[0])

size of training set: 6266
size of validation set: 1343
size of test set: 1343


# Imputation for Device 
## Train data - `device` 

In [118]:
X_tr_dev = X_train.drop(columns=['job', 'education'])
X_tr_imp_dev = X_tr_dev[X_tr_dev['device'] == 'na'].drop(columns='device') 
#this is the data set that we need to predict and fuse with actual Training data for future imputation of "education"
X_tr_imp_dev


Unnamed: 0_level_0,age,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4359,44,17,6,24.05,1,0,-1,0,0,0,0.076980,0,1,0,0,1,0,0
5378,31,26,5,8.60,5,0,-1,0,0,1,0.068700,1,0,0,0,1,0,0
3095,49,5,6,0.45,3,0,-1,0,0,0,0.088029,0,1,0,0,1,0,0
8737,36,16,6,33.80,11,0,-1,0,0,1,0.075291,0,1,0,0,1,0,0
7745,32,12,5,13.95,3,0,-1,0,0,1,0.085014,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4510,32,6,6,29.80,5,0,-1,0,0,1,0.061255,0,1,0,0,1,0,0
4515,36,11,6,5.35,2,0,-1,0,0,0,0.095655,0,1,0,0,1,0,0
4030,29,6,5,12.90,2,0,-1,0,0,1,0.073194,0,1,0,0,1,0,0
5194,44,29,5,22.30,1,0,-1,0,0,1,0.073457,0,1,0,0,1,0,0


In [119]:
X_tr_dev = X_tr_dev[X_tr_dev['device'] != 'na'] # this is the data set that does not have NAs
y_tr_dev = X_tr_dev[['device']] 
X_tr_dev = X_tr_dev.drop(columns='device')
X_tr_dev

Unnamed: 0_level_0,age,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4949,59,6,8,3.90,2,0,-1,0,0,0,0.073675,0,1,0,0,1,0,0
4298,50,5,8,27.15,1,0,-1,0,0,0,0.093803,0,1,0,0,1,0,0
6904,55,21,11,2.25,5,0,-1,1,0,1,0.085450,0,1,0,0,1,0,0
4642,44,19,11,17.65,2,0,-1,0,0,0,0.086812,0,1,0,0,1,0,0
2403,53,19,11,27.55,1,0,-1,0,0,0,0.078133,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8241,27,4,2,53.40,1,0,-1,0,0,1,0.081256,0,0,1,0,1,0,0
278,42,12,5,0.70,1,0,-1,0,0,0,0.072803,0,1,0,0,1,0,0
3714,39,13,5,98.25,5,0,-1,1,0,1,0.072803,0,0,1,0,1,0,0
3325,60,18,8,4.40,7,0,-1,1,0,1,0.146533,0,1,0,0,1,0,0


In [120]:

enc = OrdinalEncoder(categories=[['smartphone', 'desktop']])
y_tr_dev = enc.fit_transform(y_tr_dev)
y_tr_dev

array([[0.],
       [0.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [121]:
# changing shape of y_tr_dev so RandomForestClassifier stops complaining
print(y_tr_dev.shape)
y_tr_dev.shape = (y_tr_dev.shape[0],)
print(y_tr_dev.shape)

(4830, 1)
(4830,)


In [122]:
rfc_dev = RandomForestClassifier(n_estimators=100, random_state=59, n_jobs=-2)

In [123]:
rfc_dev.fit(X_tr_dev, y_tr_dev)

In [124]:
y_tr_dev_pred = rfc_dev.predict(X_tr_dev)
accuracy_score(y_true=y_tr_dev, y_pred=y_tr_dev_pred)
#np.array(y_tr_dev_pred.shape)

1.0

## Validation `device`

In [125]:
X_v_dev = X_valid.drop(columns=['job', 'education'])
X_v_imp_dev = X_v_dev[X_v_dev['device'] == 'na'].drop(columns='device')
X_v_dev = X_v_dev[X_v_dev['device'] != 'na']
y_v_dev = X_v_dev[['device']]
X_v_dev = X_v_dev.drop(columns='device')
enc = OrdinalEncoder(categories=[['smartphone', 'desktop']])
y_v_dev = enc.fit_transform(y_v_dev)

In [126]:
y_v_dev_pred = rfc_dev.predict(X_v_dev)
accuracy_score(y_pred=y_v_dev_pred, y_true=y_v_dev)

0.913926499032882

## Test `device`

In [127]:
X_te_dev = X_test.drop(columns=['job', 'education'])
X_te_imp_dev = X_te_dev[X_te_dev['device'] == 'na'].drop(columns='device')
X_te_dev = X_te_dev[X_te_dev['device'] != 'na']
y_te_dev = X_te_dev[['device']]
X_te_dev = X_te_dev.drop(columns='device')
enc = OrdinalEncoder(categories=[['smartphone', 'desktop']])
y_te_dev = enc.fit_transform(y_te_dev)

In [128]:
y_te_dev_pred = rfc_dev.predict(X_te_dev)
accuracy_score(y_pred=y_te_dev_pred, y_true=y_te_dev)

0.918111753371869

# Inserting device imputations in our sets

In [129]:
y_tr_imp_dev = rfc_dev.predict(X_tr_imp_dev)
y_v_imp_dev  = rfc_dev.predict(X_v_imp_dev)
y_te_imp_dev = rfc_dev.predict(X_te_imp_dev)

In [130]:
X_train.loc[X_train['device'] == 'smartphone', 'device'] = 0
X_train.loc[X_train['device'] == 'desktop', 'device'] = 1
X_train.loc[X_tr_imp_dev.index, 'device'] = y_tr_imp_dev

In [131]:
X_valid.loc[X_valid['device'] == 'smartphone', 'device'] = 0
X_valid.loc[X_valid['device'] == 'desktop', 'device'] = 1
X_valid.loc[X_v_imp_dev.index, 'device'] = y_v_imp_dev

In [132]:
X_test.loc[X_test['device'] == 'smartphone', 'device'] = 0
X_test.loc[X_test['device'] == 'desktop', 'device'] = 1
X_test.loc[X_te_imp_dev.index, 'device'] = y_te_imp_dev

# Imputing most common value for `job` and `education`

In [133]:
imputer = SimpleImputer(missing_values="na", strategy='most_frequent')
X_train[['job', 'education']] = imputer.fit_transform(X_train[['job', 'education']])
X_valid[['job', 'education']] = imputer.fit_transform(X_valid[['job', 'education']])
X_test[['job', 'education']]  = imputer.fit_transform(X_test[['job', 'education']])

In [134]:
X_train = pd.get_dummies(X_train, columns=['job'])
X_valid = pd.get_dummies(X_valid, columns=['job'])
X_test  = pd.get_dummies(X_test, columns=['job'])

In [135]:
enc = OrdinalEncoder(categories=[['high_school', 'university', 'grad_school']])
X_train['education'] = enc.fit_transform(X_train[['education']])
X_valid['education'] = enc.fit_transform(X_valid[['education']])
X_test['education']  = enc.fit_transform(X_test[['education']])

In [136]:
X_train.columns

Index(['age', 'education', 'device', 'day', 'month', 'time_spent',
       'banner_views', 'banner_views_old', 'days_elapsed_old', 'X1', 'X2',
       'X3', 'X4', 'marital_divorced', 'marital_married', 'marital_single',
       'outcome_old_failure', 'outcome_old_na', 'outcome_old_other',
       'outcome_old_success', 'job_entrepreneur', 'job_freelance',
       'job_housekeeper', 'job_industrial_worker', 'job_manager',
       'job_retired', 'job_salesman', 'job_student', 'job_teacher',
       'job_technology', 'job_unemployed'],
      dtype='object')

In [138]:
X_train.to_csv('0_X_train.csv')
y_train.to_csv('0_y_train.csv')
X_valid.to_csv('1_X_valid.csv')
y_valid.to_csv('1_y_valid.csv')
X_test.to_csv('2_X_test.csv')
y_test.to_csv('2_y_test.csv')

# BELOW IS USELESS FOR NOW Imputation for `education`

## Train data - `education`

In [132]:
X_tr_ed = X_train.drop(columns='job')
X_tr_imp_ed = X_tr_ed[X_tr_ed['education'] == 'na'].drop(columns='education') 
#this is the data set that we need to predict and fuse with actual Training data for future imputation of "education"
X_tr_imp_ed

Unnamed: 0_level_0,age,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
6904,55,1,21,11,2.25,5,0,-1,1,0,1,0.085450,0,1,0,0,1,0,0
6936,44,0,28,12,44.15,1,0,-1,0,0,0,0.097534,0,1,0,0,1,0,0
530,59,0,25,8,10.15,4,0,-1,0,0,0,0.072803,0,1,0,0,1,0,0
7817,46,0.0,20,6,1.40,5,0,-1,0,0,0,0.073557,0,1,0,0,1,0,0
3810,29,0,2,11,1.80,3,0,-1,0,0,0,0.166606,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782,44,0,6,10,14.30,3,0,-1,0,0,0,0.075364,0,1,0,0,1,0,0
4635,32,0,30,3,7.85,2,0,-1,0,0,0,0.093875,0,0,1,0,1,0,0
1158,22,0,19,2,6.75,3,1,192,0,0,0,0.083834,0,0,1,1,0,0,0
4178,41,0,22,4,10.45,1,0,-1,0,0,0,0.081356,0,0,1,0,1,0,0


In [133]:
X_tr_ed = X_tr_ed[X_tr_ed['education'] != 'na'] # this is the data set that does not have NAs
y_tr_ed = X_tr_ed[['education']] 
X_tr_ed = X_tr_ed.drop(columns='education')
X_tr_ed

Unnamed: 0_level_0,age,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
4949,59,0,6,8,3.90,2,0,-1,0,0,0,0.073675,0,1,0,0,1,0,0
4298,50,0,5,8,27.15,1,0,-1,0,0,0,0.093803,0,1,0,0,1,0,0
4359,44,0.0,17,6,24.05,1,0,-1,0,0,0,0.076980,0,1,0,0,1,0,0
5378,31,0.0,26,5,8.60,5,0,-1,0,0,1,0.068700,1,0,0,0,1,0,0
3095,49,0.0,5,6,0.45,3,0,-1,0,0,0,0.088029,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,42,1,12,5,0.70,1,0,-1,0,0,0,0.072803,0,1,0,0,1,0,0
3714,39,0,13,5,98.25,5,0,-1,1,0,1,0.072803,0,0,1,0,1,0,0
7409,46,0.0,18,6,6.25,3,0,-1,0,0,0,0.089354,0,1,0,0,1,0,0
3325,60,0,18,8,4.40,7,0,-1,1,0,1,0.146533,0,1,0,0,1,0,0


In [134]:
enc = OrdinalEncoder(categories=[['high_school', 'university', 'grad_school']])
y_tr_ed = enc.fit_transform(y_tr_ed)
y_tr_ed.shape = (y_tr_ed.shape[0],)
y_tr_ed

array([0., 2., 2., ..., 0., 1., 1.])

In [135]:
rfc_ed = RandomForestClassifier(n_estimators=100, random_state=305, n_jobs=-2)

In [136]:
rfc_ed.fit(X_tr_ed, y_tr_ed)

In [137]:
y_tr_ed_pred = rfc_ed.predict(X_tr_ed)
accuracy_score(y_true=y_tr_ed, y_pred=y_tr_ed_pred)

1.0

## Validation `education`

In [138]:
X_v_ed = X_valid.drop(columns='job')
X_v_imp_ed = X_v_ed[X_v_ed['education'] == 'na'].drop(columns='education')
X_v_ed = X_v_ed[X_v_ed['education'] != 'na']
y_v_ed = X_v_ed[['education']]
X_v_ed = X_v_ed.drop(columns='education')
enc = OrdinalEncoder(categories=[['high_school', 'university', 'grad_school']])
y_v_ed = enc.fit_transform(y_v_ed)

In [139]:
y_v_ed_pred = rfc_ed.predict(X_v_ed)
accuracy_score(y_pred=y_v_ed_pred, y_true=y_v_ed)

0.5236985236985237

In [140]:
# rfc_ed = RandomForestClassifier(n_estimators=100, random_state=305, n_jobs=-2)
# hyperparam_grid={"max_features":[3,4,5,6,7],
#                  'min_samples_leaf':[1,3,5,7,9,11]}

# best_score=0.4

# # takes line 3 minutes to run!!!!
# for g in ParameterGrid(hyperparam_grid):
#     rfc_ed.set_params(**g)
#     #or rfc.set_params(min_samples_leaf=g['min_samples_leaf'], max_features=g['max_features'])
#     rfc_ed.fit(X_tr_ed,y_tr_ed)
#     # save if best
#     y_pred = rfc_ed.predict(X=X_v_ed)
#     acc = accuracy_score(y_true=y_v_ed, y_pred=y_pred)
#     if acc > best_score:
#         best_score = acc
#         best_params = g

# print(f"Accuracy: %0.5f" % best_score)
# print("Best parameters:", best_params)

In [141]:
gbr_ed = GradientBoostingClassifier(max_depth=2, n_estimators=400, learning_rate=0.01)
gbr_ed.fit(X_tr_ed, y_tr_ed)
y_v_ed_pred = gbr_ed.predict(X_v_ed)
accuracy_score(y_pred=y_v_ed_pred, y_true=y_v_ed)

0.5547785547785548

In [142]:
# gbr_ed = GradientBoostingClassifier(random_state=305)
# hyperparam_grid={'max_depth':[1,2,3,4],
#                  'learning_rate':[0.001, 0.01, 0.1, 1],
#                  'n_estimators':[50, 100, 250, 500]}

# best_score=0.4

# # takes line 3 minutes to run!!!!
# for g in ParameterGrid(hyperparam_grid):
#     gbr_ed.set_params(**g)
#     #or rfc.set_params(min_samples_leaf=g['min_samples_leaf'], max_features=g['max_features'])
#     gbr_ed.fit(X_tr_ed,y_tr_ed)
#     # save if best
#     y_pred = gbr_ed.predict(X=X_v_ed)
#     acc = accuracy_score(y_true=y_v_ed, y_pred=y_pred)
#     if acc > best_score:
#         best_score = acc
#         best_params = g

# print(f"Accuracy: %0.5f" % best_score)
# print("Best parameters:", best_params)

In [143]:
num_vars = ['age', 'time_spent', 'banner_views', 'banner_views_old', 'days_elapsed_old', 'X4']

scaler = StandardScaler()
scaler.fit(X_train[num_vars])

In [144]:
X_tr_ed[num_vars] = scaler.transform(X_tr_ed[num_vars])
X_tr_ed

Unnamed: 0_level_0,age,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
4949,1.521285,0,6,8,-0.777026,-0.198329,-0.363414,-0.469994,0,0,0,-0.410806,0,1,0,0,1,0,0
4298,0.746771,0,5,8,0.592095,-0.545265,-0.363414,-0.469994,0,0,0,0.231246,0,1,0,0,1,0,0
4359,0.230429,0.0,17,6,0.409546,-0.545265,-0.363414,-0.469994,0,0,0,-0.305390,0,1,0,0,1,0,0
5378,-0.888314,0.0,26,5,-0.500257,0.842481,-0.363414,-0.469994,0,0,1,-0.569509,1,0,0,0,1,0,0
3095,0.660714,0.0,5,6,-0.980186,0.148608,-0.363414,-0.469994,0,0,0,0.047058,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,0.058314,1,12,5,-0.965464,-0.545265,-0.363414,-0.469994,0,0,0,-0.438608,0,1,0,0,1,0,0
3714,-0.199857,0,13,5,4.778956,0.842481,-0.363414,-0.469994,1,0,1,-0.438608,0,0,1,0,1,0,0
7409,0.402543,0.0,18,6,-0.638642,0.148608,-0.363414,-0.469994,0,0,0,0.089340,0,1,0,0,1,0,0
3325,1.607342,0,18,8,-0.747583,1.536354,-0.363414,-0.469994,1,0,1,1.913266,0,1,0,0,1,0,0


In [145]:
X_v_ed[num_vars] = scaler.transform(X_v_ed[num_vars])
X_v_ed

Unnamed: 0_level_0,age,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
8392,1.004943,0,27,8,-0.756416,1.189418,-0.363414,-0.469994,1,0,0,0.167823,0,1,0,0,1,0,0
2970,-0.285914,0.0,28,5,-0.712250,0.842481,-0.363414,-0.469994,0,0,1,-0.357808,0,1,0,0,1,0,0
1269,2.726085,0,11,11,0.483154,-0.198329,1.043393,0.369540,0,0,0,2.214165,0,1,0,0,0,1,0
5924,-0.458028,0,13,5,-0.653363,-0.198329,-0.363414,-0.469994,0,0,1,-0.271216,0,1,0,0,1,0,0
4121,-1.404656,0,30,7,-0.853579,3.271037,-0.363414,-0.469994,0,0,1,-0.368523,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3271,-0.199857,0,29,1,-0.514979,-0.545265,-0.363414,-0.469994,1,0,1,-0.067625,0,0,1,0,1,0,0
7176,1.693399,0,10,3,-0.332430,-0.545265,-0.363414,-0.469994,1,0,1,0.372573,0,1,0,0,1,0,0
532,0.316486,0,26,8,-0.644530,1.536354,-0.363414,-0.469994,0,0,0,-0.435711,0,1,0,0,1,0,0
1680,-0.027743,0.0,20,6,-0.997852,0.148608,-0.363414,-0.469994,0,0,1,0.149578,0,1,0,0,1,0,0


In [146]:
knn = KNeighborsClassifier()

hyper_parameters = {"n_neighbors" : range(1, 50, 2)}
hyper_parameters
knn_ed = GridSearchCV(estimator=knn,
                     param_grid=hyper_parameters,
                     cv=KFold(n_splits=10, shuffle=True, random_state=1))
knn_ed.fit(X_tr_ed, y_tr_ed)
y_v_ed_pred = knn_ed.predict(X_v_ed)
accuracy_score(y_pred=y_v_ed_pred, y_true=y_v_ed)

0.5182595182595182

trying lasso to select predictors

- logistic regression
- knn
- LDA QDA
- SVM
- trees
- random forest
- bagging
- boosting
- xgboost
- neural networks