In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
path = '../../Data Science London + Scikit-learn/'
app_train = pd.read_csv(path + 'train.csv', header=None)
app_test = pd.read_csv(path + 'test.csv', header=None)
target = pd.read_csv(path + 'trainLabels.csv', header=None)

In [3]:
train = app_train.copy()
test = app_test.copy()

print(train.shape, test.shape, target.shape)

(1000, 40) (9000, 40) (1000, 1)


In [4]:
reg = GradientBoostingClassifier(random_state = 7)

reg.fit(train, target)

pred = reg.predict(test)

In [5]:
n_estimators = [100 * (i + 1) for i in range(10)]
max_depth = [2 ** i for i in range(10)]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth)

grid_search = GridSearchCV(reg, param_grid, scoring = "neg_mean_squared_error", n_jobs = -1, verbose = 1)

grid_result = grid_search.fit(train, target)
                              
print(f"Best Accuracy: {grid_result.best_score_} using {grid_result.best_params_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  8.6min finished
Best Accuracy: -0.121 using {'max_depth': 4, 'n_estimators': 1000}


In [6]:
best_clf = GradientBoostingClassifier(
        n_estimators = grid_result.best_params_["n_estimators"],
        max_depth = grid_result.best_params_["max_depth"]
)

best_clf.fit(train, target)

pred = best_clf.predict(test)

submission = pd.DataFrame(pred)
print(submission)

      0
0     1
1     0
2     1
3     0
4     0
...  ..
8995  1
8996  1
8997  1
8998  0
8999  1

[9000 rows x 1 columns]


In [7]:
submission.columns = ['Solution']
print(submission)

      Solution
0            1
1            0
2            1
3            0
4            0
...        ...
8995         1
8996         1
8997         1
8998         0
8999         1

[9000 rows x 1 columns]


In [8]:
submission['Id'] = np.arange(1, submission.shape[0]+1)
print(submission)

      Solution    Id
0            1     1
1            0     2
2            1     3
3            0     4
4            0     5
...        ...   ...
8995         1  8996
8996         1  8997
8997         1  8998
8998         0  8999
8999         1  9000

[9000 rows x 2 columns]


In [10]:
submission = submission[['Id', 'Solution']]
pd.DataFrame(submission).to_csv("Day_048_HW.csv", index = False)
print(submission.shape)

(9000, 2)
