# Libraries

In [10]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, ParameterGrid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from yellowbrick.classifier import classification_report, confusion_matrix
from yellowbrick.classifier.rocauc import roc_auc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import roc_curve
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import Data

Import the pre-computed train/validation/test partitions and the prediction set

In [11]:
# Pre-computed feature partitions (train / validation / test), each indexed by 'Id'.
# Note: these partition names are reassigned further down by train_test_split.
X_train, X_valid, X_test = (
    pd.read_csv(path, index_col='Id')
    for path in ('0_X_train.csv', '1_X_valid.csv', '2_X_test.csv')
)

# Target partitions stored alongside the feature partitions, read in the same order.
y_train, y_valid, y_test = (
    pd.read_csv(path, index_col='Id')
    for path in ('0_y_train.csv', '1_y_valid.csv', '2_y_test.csv')
)

# Rows the fitted model is asked to predict at the end of the notebook.
X_pred = pd.read_csv("MLUnige2023_subscriptions_test.csv", index_col="Id")

Import original data without partition

In [12]:
# Full, unpartitioned training data, indexed by the 'Id' column.
# Its 'subscription' column is used as the target further down.
campaign_ad = pd.read_csv("MLUnige2023_subscriptions_train.csv", index_col="Id")

Create new partitions from the original data (these replace the partitions loaded above) to train the full model

In [13]:
# Separate the target column from the feature columns.
y = campaign_ad['subscription']
X = campaign_ad.drop(columns='subscription')

# First split: 70 % of the rows for training, 30 % held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=12
)

# Second split: the held-out rows are halved into validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=46
)

Create the boosted tree with best parameters

In [14]:
# Gradient-boosted trees with the hyper-parameters selected as "best" for this task.
# random_state is fixed so that refitting reproduces the same model and scores;
# without it the estimator draws a fresh seed on every fit.
boost = GradientBoostingClassifier(max_depth=9,
                                   n_estimators=1250,
                                   learning_rate=0.01,
                                   random_state=12)

Create column transformer and boosted tree pipeline

In [15]:
# Columns that are one-hot encoded; every other column is passed through unchanged.
string_vars = ['job', 'device', 'education', 'marital', 'outcome_old']

# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising, so predicting on data with a new category
# (e.g. the external prediction set) does not crash the pipeline.
trans = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(handle_unknown='ignore'), string_vars)],
    remainder='passthrough',
)

# Encoding and model are chained so the encoder is always fitted on exactly the
# rows the classifier is trained on.
boost_pipe = Pipeline([
    ("trans", trans),
    ("boost", boost)
])

boost_pipe.fit(X_train, y_train)

Print results

In [16]:
# Report the pipeline's score on each partition, in train / validation / test order.
for label, features, target in (
    ("Train accuracy: ", X_train, y_train),
    ("Validation accuracy: ", X_valid, y_valid),
    ("Test accuracy: ", X_test, y_test),
):
    print(label, boost_pipe.score(features, target))

Train accuracy:  0.9998404085541015
Validation accuracy:  0.857036485480268
Test accuracy:  0.8592702903946389


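Refit on the full data set and print results. Note: the train, validation and test partitions are all subsets of the full data, so the scores below are in-sample and are not estimates of generalization.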

In [17]:
# Refit the same pipeline on every available row before predicting the unseen set.
boost_pipe.fit(X, y)

# X_train, X_valid and X_test were all drawn from X, so after this refit every
# score below is measured on data the model was trained on. They are sanity
# checks only; the held-out estimates are the ones printed before the refit.
print("Full data accuracy: ", boost_pipe.score(X, y))
print("Train accuracy: ", boost_pipe.score(X_train, y_train))
print("Validation accuracy: ", boost_pipe.score(X_valid, y_valid))
print("Test accuracy: ", boost_pipe.score(X_test, y_test))

Train accuracy:  0.9989946380697051
Train accuracy:  0.9992020427705075
Validation accuracy:  0.9985107967237528
Test accuracy:  0.9985107967237528


Predict values for the prediction set

In [None]:
# Predict the target for every row of the prediction set with the fitted pipeline.
# The result is indexed positionally and has a length when it is exported below.
y_pred = boost_pipe.predict(X_pred)

Export to file

In [None]:
# Write the predictions as a two-column CSV: Id, subscription.
# - `with` closes the file even if a write fails.
# - newline='' is what the csv module expects; without it Windows output gets
#   an extra blank line after every row.
# - Ids are taken from X_pred's index (the 'Id' column of the input file) rather
#   than a 0..n-1 counter, so each prediction stays attached to its own row.
with open('test_file_boost.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Id', 'subscription'])
    writer.writerows(zip(X_pred.index, y_pred))