In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.metrics import SCORERS
from sklearn.metrics import classification_report


import seaborn as sns

In [2]:
# Load the raw dataset; the CSV uses ';' as its field separator.
df = pd.read_csv("bank-additional-full.csv", delimiter=";")

In [3]:
# Print a per-column summary of the loaded frame (non-null counts and dtypes).
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [4]:
# Count missing values per column (displayed as the cell result).
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [5]:
# Replace the target column `y` with integer codes produced by a LabelEncoder.
# NOTE(review): this overwrites df["y"] in place, so re-running the cell fits
# the encoder on already-encoded values — confirm that is acceptable.
le = LabelEncoder().fit(df["y"])
df["y"] = le.transform(df["y"])
df["y"]

0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: y, Length: 41188, dtype: int64

In [16]:
# One-hot encode the frame with pandas' default column selection.
# NOTE(review): `duration` is kept as a feature. If this is the UCI Bank
# Marketing data, its description advises excluding `duration` for realistic
# models because it is only known after the call — confirm and consider dropping.
df_dummies = pd.get_dummies(data=df)

In [17]:
# Hold out 20% of the rows as a test set, with a fixed seed for repeatability.
# The stratification labels are taken from the frame that is actually being
# split (df_dummies) rather than from the separate `df`, so labels and rows
# cannot get out of step if either frame is later filtered or re-indexed.
train_set, test_set = train_test_split(
    df_dummies,
    test_size=0.2,
    random_state=50,
    stratify=df_dummies["y"],
)

In [18]:
# Report the (rows, columns) of each split, one per line.
print(train_set.shape, test_set.shape, sep="\n")

(32950, 64)
(8238, 64)


In [19]:
# Separate the feature matrix from the target column `y` for both splits.
x_train, y_train = train_set.drop(columns=["y"]).copy(), train_set["y"].copy()
x_test, y_test = test_set.drop(columns=["y"]).copy(), test_set["y"].copy()

In [20]:
# Report the shapes of the training features and training target, one per line.
print(x_train.shape, y_train.shape, sep="\n")

(32950, 63)
(32950,)


In [21]:
# List the scorer names registered in scikit-learn, alphabetically.
# NOTE(review): newer scikit-learn releases appear to have removed `SCORERS`
# in favour of `sklearn.metrics.get_scorer_names()` — confirm against the
# installed version before upgrading.
sorted(SCORERS)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [24]:
# Random forest with 400 trees, depth capped at 6, and a fixed seed.
clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=6,
    random_state=60,
)

In [25]:
# Estimate generalisation with 10-fold cross-validation on the training split,
# then print the mean and standard deviation of the chosen metric.
scoring = "f1_macro"
scores = cross_val_score(clf, x_train, y_train, cv=10, scoring=scoring)
# A single f-string replaces the earlier mix of f-string and %-formatting, which
# would misbehave if the interpolated scorer name ever contained a '%'.
print(f"{scoring}: {scores.mean():.2f} +/- {scores.std():.2f}")

f1_macro: 0.63 +/- 0.02


In [26]:
# Fit the classifier on the full training split (the fitted estimator is the cell result).
clf.fit(X=x_train, y=y_train)

RandomForestClassifier(max_depth=6, n_estimators=400, random_state=60)

In [27]:
# Predict labels for the held-out test features and display them.
y_test_predictions = clf.predict(X=x_test)
y_test_predictions

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
# Per-class precision / recall / F1 on the test split.
test_report = classification_report(y_true=y_test, y_pred=y_test_predictions)
print(test_report)

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      7310
           1       0.75      0.21      0.33       928

    accuracy                           0.90      8238
   macro avg       0.83      0.60      0.64      8238
weighted avg       0.89      0.90      0.88      8238



In [29]:
# Rebuild and refit the forest, then predict on the test split.
# NOTE(review): the hyperparameters, seed, and training data are the same as in
# the earlier construct / fit / predict cells, so this appears to repeat that
# work — confirm whether the cell can be removed.
clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=6,
    random_state=60,
).fit(x_train, y_train)
y_test_predictions = clf.predict(x_test)

In [30]:
# Pair each training column with the forest's importance score and rank them
# from most to least important; the cell displays the top seven pairs.
features = x_train.columns
importance = clf.feature_importances_
feature_importance_dict = dict(zip(features, importance))
ranked_names = sorted(
    feature_importance_dict,
    key=feature_importance_dict.get,
    reverse=True,
)
sorted_feature_importance = [
    (name, feature_importance_dict[name]) for name in ranked_names
]
sorted_feature_importance[:7]

[('duration', 0.2654758235763924),
 ('nr.employed', 0.14981441221469122),
 ('euribor3m', 0.13316784319044636),
 ('pdays', 0.08292033356901936),
 ('poutcome_success', 0.06803677637404498),
 ('emp.var.rate', 0.053177265363669825),
 ('cons.conf.idx', 0.05292312584919447)]