In [1]:
##### Module

import numpy as np
import os
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
##### Path and read

fol_path  = './Kaggle/data-science-london-scikit-learn'
train_csv = np.genfromtxt(os.path.join(fol_path, 'train.csv') , delimiter=',')
label_csv = np.genfromtxt(os.path.join(fol_path, 'trainLabels.csv') , delimiter=',')
test_csv  = np.genfromtxt(os.path.join(fol_path, 'test.csv') , delimiter=',')

print(np.shape(train_csv), np.shape(label_csv), np.shape(test_csv), )

(1000, 40) (1000,) (9000, 40)


In [3]:
##### Data Preparation

x_train, x_test, y_train, y_test = train_test_split(train_csv, label_csv, test_size=0.25,
                                                    # random_state=9527
                                                   )
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [4]:
##### Check Acc

acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.896


In [5]:
##### Find best parameter: Generate Grid

loss_candi   = ['deviance', 'exponential']
lr_candi     = [0.01, 0.05, 0.1, 0.5]
n_esti_candi = [100, 200, 300, 400]
param_grid   = dict(loss = loss_candi,
                    learning_rate = lr_candi,
                    n_estimators = n_esti_candi)

In [6]:
##### Find best parameter:

grid_search = GridSearchCV(clf, param_grid, scoring="neg_mean_squared_error", n_jobs=8, verbose=1)
grid_result = grid_search.fit(x_train, y_train)

print(f'Best Accuracy: {grid_result.best_score_} using {grid_result.best_params_}')

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=8)]: Done 160 out of 160 | elapsed:   30.3s finished


Best Accuracy: -0.14266666666666666 using {'learning_rate': 0.05, 'loss': 'exponential', 'n_estimators': 400}


In [7]:
##### Introducing Best Parameters

clf_2 = GradientBoostingClassifier(loss = grid_result.best_params_['loss'],
                                   learning_rate = grid_result.best_params_['learning_rate'],
                                   n_estimators = grid_result.best_params_['n_estimators'])
clf_2.fit(x_train, y_train)
y_pred_2 = clf_2.predict(x_test)
acc_2 = metrics.accuracy_score(y_test, y_pred_2)
print("New acuuracy: ", acc_2)

New acuuracy:  0.892


In [13]:
##### Go predict

submit_pred = clf_2.predict(test_csv).astype('int')

In [17]:
submit_pred

array([1, 0, 0, ..., 1, 0, 1])

In [14]:
print(np.shape(submit_pred))

(9000,)


In [40]:
submit_pred_df = pd.DataFrame({'Id':np.arange(1, 9001), 'Solution': submit_pred})
submit_pred_df

Unnamed: 0,Id,Solution
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
8995,8996,0
8996,8997,1
8997,8998,1
8998,8999,0


In [41]:
submit_pred_df.to_csv(os.path.join(fol_path, 'submit_pred_01.csv'), index = False)

In [28]:
#np.savetxt(os.path.join(fol_path, 'submit_pred_01.csv'), submit_pred, delimiter=',', fmt='%d')