Importing Libraries

In [None]:
import os
import random as rd
from math import isclose

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from xgboost.sklearn import XGBClassifier

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.compose import make_column_selector as selector
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV

from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler

Importing raw data

In [None]:
# Project root on disk. The original machine-specific location stays the
# default so existing runs are unchanged; set the TERM_DEPOSIT_DIR environment
# variable to run the notebook from another checkout. The variable is no longer
# called `dir`, which shadowed the Python builtin of the same name.
project_dir = os.environ.get(
    'TERM_DEPOSIT_DIR',
    'C:/Users/GEOFF/OneDrive/Documents/Apziva/Term_Deposit_Marketing/')
if not project_dir.endswith(('/', '\\')):
    project_dir += '/'
path = project_dir+'data/raw/'
dataframe = pd.read_csv(path+"term-deposit-marketing-2020.csv")

# Separate the label column from the features.
target_name = 'y'
target = dataframe[target_name]
data = dataframe.drop(columns = [target_name])

# Split the feature names by dtype: object columns are treated as categorical,
# everything else as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

print('There are %s costumer datapoints'%len(data))
print('\nThere are %s features, which are:'%len(data.columns))
# A list keeps the column order of the file; a set would print the names in an
# arbitrary order that can change between runs.
print(list(data.columns))

print('\nThe numerical features are:')
print(numerical_columns)

print('\nThe categorical features are:')
print(categorical_columns)

print('\nThe 5 first customers data are:')
print(data.head())

Separating the binary categorical features from the multi-valued ones

In [None]:
print('\nDescription of the numerical features in the data:')
print(data.describe())

# Number of distinct (non-null) values of each categorical feature.
cardinalities = {name: data[name].nunique() for name in categorical_columns}

# Features with more than two distinct values are "non binary"; every other
# categorical feature is handled as binary.
binary_categorical_columns = [
    name for name, count in cardinalities.items() if not count > 2]
non_binary_categorical_columns = [
    name for name, count in cardinalities.items() if count > 2]

print('\nThe binary categorical columns are:')
for name in binary_categorical_columns:
    print(name, ' (2 values found)')
print('\nThe other categorical columns are:')
for name in non_binary_categorical_columns:
    print(name, f' ({cardinalities[name]} values found)')

Encoding the categorical features and scaling the numerical features

In [None]:
# Standardise the numerical features and one-hot encode the categorical ones.
# sparse_threshold=0 makes the transformer always return a dense array; with
# the default threshold the result is sparse or dense depending on the density
# of the encoded data, and calling `.toarray()` on a dense result would fail.
ColumnTransformer_ = make_column_transformer(
    (StandardScaler(), numerical_columns),
    (OneHotEncoder(), categorical_columns),
    remainder='passthrough',
    sparse_threshold=0,
    n_jobs = 4)

Transformed_Columns = ColumnTransformer_.fit_transform(data)
columns = ColumnTransformer_.get_feature_names_out()
print(f'The transformed features names are\n{columns}\n')
transformed_df = pd.DataFrame(Transformed_Columns, columns = columns)

# Dropping the "no" columns resulting from the one-hot encoding of binary categorical features
# (the matching "yes" column already carries the same information).
columns_to_drop = ['onehotencoder__'+feature+'_no' for feature in binary_categorical_columns]
transformed_df = transformed_df.drop(columns = columns_to_drop)

print(f'The transformed dataset has a shape of {transformed_df.shape}')
print(f'A total of {len(columns)-transformed_df.shape[1]} useless encoded features were removed')

# Encode the target labels as integers.
transformed_target = LabelEncoder().fit_transform(target)

Undersampling the data to balance the target classes

In [None]:
print('The data is unbalanced')
print(f'The proportion of the minority class is {transformed_target.mean():.2f}\n')

# Randomly drop majority-class rows until both classes have the same size.
# The sampler is seeded so that the balanced dataset, and every result derived
# from it further down, is identical on each "Restart & Run All".
# CondensedNearestNeighbour(n_neighbors=1) was considered as an alternative sampler.
undersample = RandomUnderSampler(random_state=42)
X, y = undersample.fit_resample(transformed_df, transformed_target)

print('The dataset resulting from undersampling has a shape of', X.shape, 
      '\nThe label array resulting from undersampling has a shape of', y.shape)
print(f'The proportion of the minority class is now {y.mean():.2f}\n')

# Stratified 80/20 split of the balanced data; features are converted to a
# plain array so later cells can index rows and columns positionally.
data_train, data_test, target_train, target_test = train_test_split(
    np.array(X), y, test_size = 0.2, stratify = y, random_state=42)

print('The data resulting from undersampling is now divided into a train and a test set 80/20%')
print('The train set and label array have a shape of ', data_train.shape, target_train.shape)
print('The test set and label array have a shape of ', data_test.shape, target_test.shape)

Trying a Decision Tree model with a depth of 1 to detect the most informative feature

In [None]:
# A decision stump (single split) shows which one feature separates the classes best.
simple_tree_model = DecisionTreeClassifier(max_depth = 1,
                                    max_leaf_nodes = 2,
                                    max_features = None,
                                    random_state = 42)

simple_tree_cv_results = cross_validate(simple_tree_model, data_train, target_train, cv=5, 
                                        return_train_score = True)

t_scores = simple_tree_cv_results["train_score"]
print("The mean cross-validation training accuracy is: "
      f"{t_scores.mean():.3f} ± {t_scores.std():.3f}")
scores = simple_tree_cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} ± {scores.std():.3f}")

simple_tree_model.fit(data_train, target_train)
# data_train has the same column layout as transformed_df, so its column names
# label the plot and identify the split feature.
feature_names = list(transformed_df.columns)
plot_tree(simple_tree_model, feature_names = feature_names)

# Read the split from the fitted stump instead of hard-coding it: node 0 is the
# root, `feature` holds the column index and `threshold` the split value.
root_index = simple_tree_model.tree_.feature[0]
root_threshold = simple_tree_model.tree_.threshold[0]
root_feature = feature_names[root_index]
print(f'\nThe feature that yields the most information gain is {root_feature}')

# Everything below converts the split back to seconds of call duration, which
# is only meaningful if the stump really split on the scaled duration column.
assert root_feature.split('__', 1)[-1] == 'duration', (
    f'the stump split on {root_feature!r}; the duration analysis below does not apply')

this_transformed_column = transformed_df[root_feature]
this_column = np.array(dataframe['duration'])
scaler = StandardScaler()
this_retransformed_column = scaler.fit_transform(this_column.reshape(-1,1))
# mean_ and var_ are one-element arrays; index them explicitly because NumPy
# deprecates converting a non-0-d array with int().
print(f'The average call duration is {int(scaler.mean_[0])} sec\n',
      f'The standard deviation is {int(scaler.var_[0]**0.5)} sec')

# Map the standardised split value back to seconds.
threshold = scaler.inverse_transform(np.array([[root_threshold]]))
print(f'The call duration threshold is {threshold[0,0]:.0f} sec')
print(f'The mean cross-validation accuracy was {scores.mean():.3f} just from using that criteria')

# Checking if the scaler StandardScaler fitted the column with the same parameters as ColumnTranformer did
# (vectorised over every row rather than over a hard-coded row count).
equal = np.isclose(this_retransformed_column.ravel(), this_transformed_column.to_numpy(),
                   rtol = 0, atol = 1e-6)
assert equal.all()

Trying a Logistic Regression model (works)

In [None]:
# L2-regularised logistic regression, scored with 5-fold cross-validation on
# the training split.
logis_model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='newton-cholesky',  # sag and saga solvers can be tried as well
    max_iter=100,
    n_jobs=4,
)

logis_cv_results = cross_validate(
    logis_model, data_train, target_train, cv=5, return_train_score=True
)

t_scores = logis_cv_results["train_score"]
scores = logis_cv_results["test_score"]

# Report mean ± standard deviation over the folds, training scores first.
for headline, fold_scores in (
    ("The mean cross-validation training accuracy is: ", t_scores),
    ("The mean cross-validation accuracy is: ", scores),
):
    print(f"{headline}{fold_scores.mean():.3f} ± {fold_scores.std():.3f}")

Trying a shallow Decision Tree model (works)

In [None]:
# Shallow decision tree (depth 3), scored with 5-fold cross-validation.
tree_model = DecisionTreeClassifier(max_depth = 3,
                                    max_leaf_nodes = 20,
                                    max_features = None,
                                    random_state = 42)

tree_cv_results = cross_validate(tree_model, data_train, target_train, cv=5, 
                                  return_train_score = True)

t_scores = tree_cv_results["train_score"]
print("The mean cross-validation training accuracy is: "
      f"{t_scores.mean():.3f} ± {t_scores.std():.3f}")
scores = tree_cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} ± {scores.std():.3f}")
tree_model.fit(data_train, target_train)
# data_train has the same column layout as transformed_df, so its column names
# label both the plot and the list of selected features.
feature_names = list(transformed_df.columns)
plot_tree(tree_model, feature_names = feature_names)

# Read the features actually used by the fitted tree instead of hard-coding
# their positions: `tree_.feature` holds one column index per node, negative
# for leaves. dict.fromkeys removes repeats while keeping node order.
split_features = [index for index in dict.fromkeys(tree_model.tree_.feature) if index >= 0]
print(f'The selected features in this tree were:\n{[feature_names[index] for index in split_features]}')
# NOTE(review): the interpretation below was written by hand, presumably from
# the tree plotted in one particular run - re-check it against the plot above.
print('From the decision tree, for clients with a call duration below the threshold\n' 
       'if the last call was in April and if the client has no housing loan, or if last call was in March\n'
        'y is more likely to be 1')

Trying an XGBoost model

In [None]:
# Exhaustive 5-fold grid search over the main XGBoost hyper-parameters.
# The estimator is seeded like every other XGBoost model in this notebook.
model = XGBClassifier(random_state = 42)
# n_estimators runs over 100, 200, ..., 500. The previous range(1, 501, 100)
# produced 1, 101, ..., 401, so the value 400 recorded as the best setting in
# the following cells could never have been selected from that grid.
# learning_rate is log-spaced: 0.01, 0.032, 0.1, 0.316.
parameters = {'n_estimators':list(range(100,501,100)), 
              'max_depth':list(range(1,11,1)), 
              'learning_rate':[round(10**(i/10),3) for i in range(-20, 0, 5)]}
clf = GridSearchCV(estimator = model, param_grid = parameters, cv = 5)
clf.fit(data_train, target_train)
print('The best parameters in the specified grid are found to be\n', clf.best_params_)
# This is the accuracy of the refitted best model on the data it was trained
# on, not a cross-validated score.
print(f'With the model accuracy reaching {clf.score(data_train, target_train):0.3f} on the training set')

In [None]:
# Hyper-parameters used for the XGBoost model in the next cells; kept as
# literals (presumably copied from an earlier grid-search run - TODO confirm).
best_params = dict(learning_rate=0.032, max_depth=8, n_estimators=400)

Trying the XGBoost model with the parameters found above

In [None]:
# Unpack the chosen hyper-parameters into the notebook-level names reused by
# the following cells.
n_estimators, max_depth, learning_rate = (
    best_params[name] for name in ('n_estimators', 'max_depth', 'learning_rate'))

model = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    random_state=42,
)

# 5-fold cross-validation, keeping the training scores to compare against the
# validation scores.
cv_results = cross_validate(
    model, data_train, target_train, cv=5, return_train_score=True
)

t_scores = cv_results["train_score"]
scores = cv_results["test_score"]
print(f"The mean training accuracy is: {t_scores.mean():.3f} ± {t_scores.std():.3f}")
print(f"The mean cross-validation accuracy is: {scores.mean():.3f} ± {scores.std():.3f}")

print('The model seems to be overfitting.\n',
     'An overfitting test will be run by changing the variable max_depth')

In [None]:
# Sweep the tree depth and record the mean training / validation accuracy of a
# 5-fold cross-validation at each depth.
# The sweep uses its own names (`depth`, `depth_model`, `depth_cv_results`) so
# that it no longer overwrites the notebook-level `max_depth`, `model`,
# `cv_results`, `t_scores` and `scores` set by the previous cell.
depths = list(range(1,11))
training_accuracies = []
validation_accuracies = []
for depth in depths:
    depth_model = XGBClassifier(n_estimators = n_estimators, 
                                max_depth = depth, 
                                learning_rate = learning_rate, 
                                random_state = 42)

    depth_cv_results = cross_validate(depth_model, data_train, target_train, cv=5,
                                      return_train_score = True)

    training_accuracies.append(depth_cv_results["train_score"].mean())
    validation_accuracies.append(depth_cv_results["test_score"].mean())

# A widening gap between the two curves indicates overfitting.
fig, ax = plt.subplots(1,1)
ax.plot(depths, training_accuracies, color = 'blue', label = 'Training Accuracy')
ax.plot(depths, validation_accuracies, color = 'red', label = 'Validation Accuracy')
ax.legend(fontsize = 12)
ax.set_xlabel('Max_depth')
ax.set_ylabel('Model Accuracy')
ax.set_title('Overfitting test')
plt.show()

From the figure above, it can be seen that the cross-validation accuracy 
starts to plateau when max_depth >= 4, while the training accuracy keeps increasing.
The XGBoost model with a max depth of 4 will therefore be kept.
The cross-validation results for each fold are provided below in more detail.

In [None]:
# Hyper-parameters retained for the final XGBoost model (max_depth set to 4).
best_params = dict(learning_rate=0.032, max_depth=4, n_estimators=400)

# Notebook-level copies of the individual settings.
learning_rate = best_params['learning_rate']
max_depth = best_params['max_depth']
n_estimators = best_params['n_estimators']

# Build the (unfitted) model used by the cells below.
model = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    random_state=42,
)

The cross-validation accuracy results will be displayed in detail for each run

In [None]:
def Kfold_crossvalidation(partition, estimator=None, X=None, y=None):
    """Fit and evaluate an estimator on each fold, printing a report per fold.

    Parameters
    ----------
    partition : iterable of (train_indices, test_indices)
        Folds to evaluate, e.g. the result of ``StratifiedKFold.split``.
    estimator : object with ``fit`` and ``predict``, optional
        Model to evaluate. Defaults to the notebook-level ``model``.
    X, y : array-like, optional
        Features and labels indexed by the fold indices. Default to the
        notebook-level ``data_train`` and ``target_train``.

    Returns
    -------
    float
        Mean accuracy over the folds (also printed).

    Raises
    ------
    ValueError
        If ``partition`` yields no fold.
    """
    # Fall back on the notebook globals so existing one-argument calls keep working.
    estimator = model if estimator is None else estimator
    X = data_train if X is None else X
    y = target_train if y is None else y

    fold_accuracies = []
    for train, test in partition:
        estimator.fit(X[train], y[train])
        predictions = estimator.predict(X[test])
        # Accuracy is computed from the predictions already made, so the
        # fold is not predicted a second time just to score it.
        fold_accuracies.append(np.mean(predictions == y[test]))
        print(classification_report(y[test], predictions))

    # An empty partition previously ended in a ZeroDivisionError.
    if not fold_accuracies:
        raise ValueError('partition yielded no (train, test) folds')

    accuracy = float(np.mean(fold_accuracies))
    print(f'cross validation accuracy = {accuracy}')
    return accuracy

# Five stratified folds taken in the order of the training data (no shuffling).
skf = StratifiedKFold(n_splits = 5, shuffle = False)
folds = skf.split(data_train, target_train)
Kfold_crossvalidation(folds)

Next, trying to identify a customer segment that is more likely to subscribe

In [None]:
# Column positions of the features kept for the customer-segment analysis.
# NOTE(review): the bounds 5 and 23 are hard-coded; confirm that the stop index
# does not cut off the last encoded column of the final feature in the segment.
segment_slice = slice(5, 23)

segment_df = transformed_df.iloc[:, segment_slice]
print(f'The features considered are now\n{list(segment_df.columns)}')

# Same columns, taken from the training array.
segment_data_train = data_train[:, segment_slice]

In [None]:
# Same logistic regression configuration as before, now cross-validated on the
# segment features only.
logis_model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='newton-cholesky',  # sag and saga solvers can be tried as well
    max_iter=100,
    n_jobs=4,
)

logis_cv_results = cross_validate(
    logis_model, segment_data_train, target_train, cv=5, return_train_score=True
)

t_scores = logis_cv_results["train_score"]
scores = logis_cv_results["test_score"]

# Report mean ± standard deviation over the folds, training scores first.
for headline, fold_scores in (
    ("The mean cross-validation training accuracy is: ", t_scores),
    ("The mean cross-validation accuracy is: ", scores),
):
    print(f"{headline}{fold_scores.mean():.3f} ± {fold_scores.std():.3f}")

In [None]:
# Unconstrained decision tree on the segment features, scored with 5-fold
# cross-validation.
segment_tree_model = DecisionTreeClassifier(
    max_depth=None,
    max_features=None,
    random_state=42,
)

segment_tree_cv_results = cross_validate(
    segment_tree_model, segment_data_train, target_train, cv=5,
    return_train_score=True,
)

t_scores = segment_tree_cv_results["train_score"]
scores = segment_tree_cv_results["test_score"]

# Report mean ± standard deviation over the folds, training scores first.
for headline, fold_scores in (
    ("The mean cross-validation training accuracy is: ", t_scores),
    ("The mean cross-validation accuracy is: ", scores),
):
    print(f"{headline}{fold_scores.mean():.3f} ± {fold_scores.std():.3f}")

print('\nThe segment of costumers does not provide enough information to yield a conclusion on the target y.')
print('No costumer segment should be prioritized')

Testing the previous models on test data

In [None]:
# Refit each retained model on the full training split and report its
# performance on the held-out test split, in the same order as before.
final_models = (
    ('\nFor the Logistic Regression model, the test results are:\n', logis_model),
    ('For the Decision Tree model, the test results are:\n', tree_model),
    ('For the XGBoost model, the test results are:\n', model),
)

for headline, estimator in final_models:
    estimator.fit(data_train, target_train)
    target_test_predict = estimator.predict(data_test)
    print(headline)
    print(classification_report(target_test, target_test_predict))