In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
app_train = pd.read_csv("./data/train.csv")
app_test = pd.read_csv("./data/test.csv")
target = pd.read_csv("./data/trainLabels.csv")

In [3]:
train = app_train.copy()
test = app_test.copy()

print(train.shape, test.shape, target.shape)

(999, 40) (8999, 40) (999, 1)


In [4]:
reg = GradientBoostingClassifier(random_state = 7)

reg.fit(train, target)

pred = reg.predict(test)

In [5]:
n_estimators = [100 * (i + 1) for i in range(10)]
max_depth = [2 ** i for i in range(10)]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth)

grid_search = GridSearchCV(reg, param_grid, scoring = "neg_mean_squared_error", n_jobs = -1, verbose = 1)

grid_result = grid_search.fit(train, target)
                              
print(f"Best Accuracy: {grid_result.best_score_} using {grid_result.best_params_}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   54.9s finished


Best Accuracy: -0.12012012012012012 using {'max_depth': 4, 'n_estimators': 400}


In [8]:
best_clf = GradientBoostingClassifier(
        n_estimators = grid_result.best_params_["n_estimators"],
        max_depth = grid_result.best_params_["max_depth"]
)

best_clf.fit(train, target)

pred = best_clf.predict(test)

pd.DataFrame(pred).to_csv("HW048.csv", index = False)