In [32]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import KFold, train_test_split, GridSearchCV

In [33]:
# Restore `y_pred` from IPython's %store database (presumably saved by an earlier session).
# NOTE(review): `y_pred` is reassigned further down by tree_best.predict(X_pred), so on a
# top-to-bottom run this restore looks redundant — confirm before removing it.
%store -r y_pred

In [34]:
# Feature tables for the train / validation / test splits, indexed by the 'Id' column.
X_train, X_valid, X_test = (
    pd.read_csv(csv_name, index_col='Id')
    for csv_name in ('0_X_train.csv', '1_X_valid.csv', '2_X_test.csv')
)

# Matching target tables; read_csv returns DataFrames (not Series), also indexed by 'Id'.
y_train, y_valid, y_test = (
    pd.read_csv(csv_name, index_col='Id')
    for csv_name in ('0_y_train.csv', '1_y_valid.csv', '2_y_test.csv')
)

# Rows to predict for the submission file (presumably the unlabelled Kaggle set — verify).
X_pred = pd.read_csv("TEST_KAGGLE.csv", index_col="Id")

In [35]:
# Grow an unpruned reference tree on the training split; its cost-complexity
# pruning path is what supplies the candidate ccp_alpha values.
# random_state is fixed (same seed as the KFold splitter) because scikit-learn
# permutes the features at every split, so ties between equally good splits can
# otherwise be broken differently from run to run.
tree_full = DecisionTreeClassifier(criterion='entropy', random_state=69)  # alternative criterion: 'gini'
tree_full.fit(X_train, y_train)

In [36]:
# Cost-complexity pruning path of the full tree: the effective alphas at which
# subtrees get pruned, together with the corresponding leaf impurities.
path = tree_full.cost_complexity_pruning_path(X_train, y_train)

ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Candidate grid for the cross-validated search. Floating-point round-off in the
# path computation can leave tiny negative alphas, which the estimator rejects
# (ccp_alpha must be >= 0), so clamp them at zero before handing them on.
grid_alphas = np.clip(ccp_alphas, 0, None)

In [37]:
# Shuffled 10-fold splitter with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=10, shuffle=True, random_state=69)

# Unfitted template estimator for the search; the grid overrides ccp_alpha.
# NOTE: this rebinds `tree_full`, replacing the fitted full tree from the earlier cell.
# random_state is fixed so the per-fold trees (and hence the CV scores) are
# reproducible; without it, ties between equally good splits are broken at random.
tree_full = DecisionTreeClassifier(criterion="entropy", ccp_alpha=0, random_state=69)
hyper_params = {"ccp_alpha": grid_alphas}

# No feature scaling is needed: each tree split thresholds a single feature,
# so the result does not depend on how the ranges of different features compare.
treeCV = GridSearchCV(
    estimator=tree_full,
    scoring="accuracy",
    param_grid=hyper_params,
    cv=folds,
)

In [38]:
# Run the cross-validated search over every candidate ccp_alpha on the training split.
treeCV.fit(X=X_train, y=y_train)

In [39]:
# Summarise the grid search and choose two pruning strengths:
#   * best_alpha             - the alpha with the highest mean CV accuracy
#   * one_se_rule_best_alpha - the largest alpha (i.e. strongest pruning) whose mean CV
#                              error is within one standard error of the best error
resCV = treeCV.cv_results_

test_Misclasif = 1 - resCV["mean_test_score"]
# Standard error of the mean across folds. The fold count is read from the fitted
# search (n_splits_) instead of being hard-coded, so it stays correct if the
# KFold configuration changes.
std_mean_err_test = resCV["std_test_score"] / np.sqrt(treeCV.n_splits_)
# param_ccp_alpha is a masked array; .data gives the underlying values.
alpha_grid = resCV["param_ccp_alpha"].data

index_best = treeCV.best_index_
best_alpha = treeCV.best_params_["ccp_alpha"]
# The best candidate always satisfies the threshold, so the selection is never empty.
one_se_threshold = test_Misclasif[index_best] + std_mean_err_test[index_best]
one_se_rule_best_alpha = np.max(alpha_grid[test_Misclasif <= one_se_threshold])

print("Best alpha:", best_alpha)
print("Best 1se alpha:", one_se_rule_best_alpha)

Best alpha: 0.0015374043408464787
Best 1se alpha: 0.002441424556796112


In [40]:
# Refit on the full training split with each selected pruning strength.
# random_state is fixed (same seed as the KFold splitter) so the fitted trees,
# the accuracies reported for them and the exported predictions are reproducible;
# without it scikit-learn may break ties between equally good splits differently
# on each run.
tree_best_1se = DecisionTreeClassifier(
    criterion='entropy', ccp_alpha=one_se_rule_best_alpha, random_state=69
).fit(X_train, y_train)
tree_best = DecisionTreeClassifier(
    criterion='entropy', ccp_alpha=best_alpha, random_state=69
).fit(X_train, y_train)

In [41]:
# Accuracy of the tree pruned with the CV-optimal alpha on each split.
# Context: a fully grown tree behaves much like 1-nearest-neighbour (it memorises the
# training set), so the gap between train and validation/test accuracy shows how much
# overfitting remains after pruning.
for split_label, split_X, split_y in (
    ("Train accuracy:", X_train, y_train),
    ("Validation accuracy:", X_valid, y_valid),
    ("Test accuracy:", X_test, y_test),
):
    print(split_label, tree_best.score(split_X, split_y))

Train accuracy: 0.8495052665177146
Validation accuracy: 0.8160833953834699
Test accuracy: 0.8451228592702904


In [42]:
# Accuracy of the more heavily pruned one-standard-error tree on each split.
# Context: a fully grown tree behaves much like 1-nearest-neighbour (it memorises the
# training set), so the gap between train and validation/test accuracy shows how much
# overfitting remains after pruning.
for split_label, split_X, split_y in (
    ("Train accuracy:", X_train, y_train),
    ("Validation accuracy:", X_valid, y_valid),
    ("Test accuracy:", X_test, y_test),
):
    print(split_label, tree_best_1se.score(split_X, split_y))

Train accuracy: 0.8415256942227897
Validation accuracy: 0.8131049888309755
Test accuracy: 0.8436336559940432


In [43]:
# Predict labels for the rows in X_pred with the CV-optimal tree (this rebinds y_pred),
# then display the shape of the result as a quick sanity check.
y_pred = tree_best.predict(X=X_pred)
np.shape(y_pred)

(3837,)

In [5]:
# Write the submission file: a header row, then one (Id, prediction) row per entry.

# Guard against a stale y_pred that was not produced from the current X_pred.
assert len(y_pred) == len(X_pred), "y_pred does not line up with X_pred; re-run the prediction cell"

# newline='' hands line-ending control to the csv module (otherwise Windows gets a
# blank line after every row); the with-block closes the file even if writing fails.
with open('test_file_trees.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Id', 'subscription'])
    # Pair each prediction with the real Id from X_pred's index instead of a
    # positional counter, so rows stay correctly labelled even when the Ids in
    # TEST_KAGGLE.csv are not exactly 0..n-1.
    writer.writerows(zip(X_pred.index, y_pred))