In [None]:
#import libraries
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,roc_auc_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import f1_score, recall_score

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

RSEED = 42


In [None]:
#load data
df = pd.read_csv('data/Train.csv')
df.head()

In [None]:
df.tail()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
sns.countplot(data=df,x='bank_account',)

In [None]:
sns.pairplot(df, hue='bank_account')

In [None]:
df.columns

In [None]:
df.year.value_counts()

In [None]:
df.country.value_counts()

In [None]:
plt.figure(figsize=(15,5))
fig = sns.countplot(x='country',data=df,hue='bank_account')
plt.xticks(rotation=90);

In [None]:
df.bank_account.value_counts()

In [None]:
df.location_type.value_counts()

In [None]:
df.cellphone_access.value_counts()

In [None]:
df.job_type.value_counts()

In [None]:
plt.figure(figsize=(15,5))
fig = sns.countplot(x='job_type',data=df,hue='bank_account')
plt.xticks(rotation=90);

In [None]:
plt.figure(figsize=(15,5))
fig = sns.histplot(x = df['job_type'], hue=df['bank_account'], multiple="dodge", 
                  stat = 'density', shrink = 0.8, common_norm=True)
plt.xticks(rotation=90);

In [None]:
plt.figure(figsize=(15,5))

x,y = 'job_type', 'bank_account'

df1 = df.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()

g = sns.catplot(x=x,y='percent',hue=y,kind='bar',data=df1)
g.ax.set_ylim(0,100)
plt.xticks(rotation=90);
for p in g.ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x() 
    txt_y = p.get_height()
    g.ax.text(txt_x,txt_y,txt)

In [None]:
plt.figure(figsize=(15,5))
fig = sns.countplot(x='relationship_with_head',data=df,hue='bank_account')
plt.xticks(rotation=90);

In [None]:
df.marital_status.value_counts()

In [None]:
df.education_level.value_counts()

In [None]:
plt.figure(figsize=(15,5))
fig = sns.countplot(x='education_level',data=df,hue='bank_account')
plt.xticks(rotation=90);

In [None]:
df.columns

In [None]:
y = df.bank_account
X = df.drop(['bank_account','uniqueid'],axis=1)

In [None]:
#create dummies
# define list of categorical features
cat_feat = ['country', 'year',  'location_type',
       'cellphone_access', 
       'gender_of_respondent', 'relationship_with_head', 'marital_status',
       'education_level', 'job_type']

In [None]:
X = pd.get_dummies(X,columns=cat_feat,drop_first=True)
X.head()

In [None]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RSEED,stratify=y)

In [None]:
#transform strings to numbers in target
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train =le.fit_transform(y_train)
y_test = le.transform(y_test)
y_test


In [None]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
        print("--------"*5)
    else:
        print('Confusion matrix, without normalization:')
        print("--------"*5)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# from sklearn.metrics import confusion_matrix
# import itertools

# def plot_confusion_matrix(cm, classes,
#                           normalize=True,
#                           title='Confusion matrix',
#                           cmap=plt.cm.Oranges):
#     """
#     This function prints and plots the confusion matrix.
#     Normalization can be applied by setting `normalize=True`.
#     Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
#     """
#     if normalize:
#         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#         print("Normalized confusion matrix")
#     else:
#         print('Confusion matrix, without normalization')

#     print(cm)

#     plt.figure(figsize = (10, 10))
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title, size = 24)
#     plt.colorbar(aspect=4)
#     tick_marks = np.arange(len(classes))
#     plt.xticks(tick_marks, classes, rotation=45, size = 14)
#     plt.yticks(tick_marks, classes, size = 14)

#     fmt = '.2f' if normalize else 'd'
#     thresh = cm.max() / 2.
    
#     # Labeling the plot
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#         plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
#                  horizontalalignment="center",
#                  color="white" if cm[i, j] > thresh else "black")
        
#     plt.grid(None)
#     plt.tight_layout()
#     plt.ylabel('True label', size = 18)
#     plt.xlabel('Predicted label', size = 18)

In [None]:
# test boolean logic for baseline model
test = (X_test['cellphone_access_Yes'] == 1) & (X_test['location_type_Urban'] == 1)
#test = [0 if x == 'True' else 1 for x in test]
test = le.fit_transform(test)
len(test)


In [None]:
#non-algorithmic baseline model
def my_baseline(x):
    selection = (X_test['cellphone_access_Yes'] == 1) & (X_test['location_type_Urban'] == 1) 
    y_pred = le.fit_transform(selection)
    return y_pred   

# predict values with baseline model
y_baseline_test = my_baseline(X_test)
#print(len(y_test))
#print(len(y_baseline_test))
#print(len(X_test))
print('Accuracy on test:', accuracy_score(y_test,y_baseline_test))
print('Recall on test:', recall_score(y_test,y_baseline_test))
print('F1-score on test:', f1_score(y_test,y_baseline_test))
print(classification_report(y_test,y_baseline_test))

# Confusion matrix using pandas crosstab
#confusion_matrix = pd.crosstab(y_test, y_baseline_test, rownames=['Actual'], colnames=['Predicted'],normalize=False)
#sns.heatmap(confusion_matrix, annot=True);

cm = confusion_matrix(y_test, y_baseline_test)
plot_confusion_matrix(cm, classes = ['No bank', 'Bank account'],
                       title = 'Financial inclusion',normalize=True)


In [None]:
#dummy classifier as baseline (Zero Rule)
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy='most_frequent') # most frequent = predicting always the majority class (ZeroR)
dummy_clf.fit(X_train,y_train)
y_pred = dummy_clf.predict(X_test)
print(dummy_clf.score(X_test,y_test))
print('F1-score on test:', f1_score(y_test,y_pred))
print('Recall on test:', recall_score(y_test,y_pred))

print(classification_report(y_test,y_pred))
# Confusion matrix using pandas crosstab
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'],normalize=True)
sns.heatmap(confusion_matrix, annot=True);


In [None]:
#Model 1 with dummies
#logistic regression 
from locale import normalize


logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

print(logistic_regression.score(X_test,y_test))
print('F1-score on test:', f1_score(y_test,y_pred))
print('Recall:', recall_score(y_test,y_pred))
print('ROC-AUC:', roc_auc_score(y_test,y_pred))
print("--------"*10)
print(classification_report(y_test,y_pred))

cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['No bank', 'Bank account'],
                       title = 'Financial inclusion',normalize=True)



In [None]:
# Model 2 with dummies
# AdABoost
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(random_state=RSEED)
ada_clf.fit(X_train,y_train)
y_pred = ada_clf.predict(X_test)

print(logistic_regression.score(X_test,y_test))
print('F1-score on test:', f1_score(y_test,y_pred))
print('Recall:', recall_score(y_test,y_pred))
print('ROC-AUC:', roc_auc_score(y_test,y_pred))
print("--------"*10)
print(classification_report(y_test,y_pred))

cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['No bank', 'Bank account'],
                       title = 'Financial inclusion',normalize=True)

In [None]:
# # TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries


# # TODO: Initialize the classifier
# clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state=RSEED) 

# # TODO: Create the parameters list you wish to tune
# parameters = {'n_estimators':[50, 100],                
#               'learning_rate':[0.1, 0.5, 1.],               
#               'base_estimator__min_samples_split' : np.arange(2, 8, 2),               
#               'base_estimator__max_depth' : np.arange(1, 4, 1)              
#              } 



# # TODO: Perform grid search on the classifier using 'scorer' as the scoring method
# grid_obj = GridSearchCV(clf, parameters, verbose=1, n_jobs=-1)

# # TODO: Fit the grid search object to the training data and find the optimal parameters
# grid_fit = grid_obj.fit(X_train,y_train) 

# # Get the estimator
# best_clf = grid_fit.best_estimator_ 


In [None]:

# # Make predictions using the unoptimized and model
# #predictions = (clf.fit(X_train, y_train)).predict(X_test)
# y_pred = best_clf.predict(X_test) 

# # Compute classification report
# print('Accuracy on test', accuracy_score(y_test,y_pred))
# print('F1-score on test:', f1_score(y_test,y_pred))
# print('Recall:', recall_score(y_test,y_pred))
# print('ROC-AUC-score:',roc_auc_score(y_test,y_pred))
# print("--------"*10)
# print(classification_report(y_test, y_pred))

# # Compute confusion matrix
# cnf_matrix = confusion_matrix(y_test, y_pred)
# cnf_matrix =  cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]

# # Plot non-normalized confusion matrix
# plot_confusion_matrix(cnf_matrix, classes=['No bank', 'Bank account'], normalize=False, title='Confusion matrix')


In [None]:
#standard vector classifier
from sklearn.svm import SVC

svc_clf = SVC(kernel='rbf',C=1000)
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)

print(logistic_regression.score(X_test,y_test))
print('F1-score on test:', f1_score(y_test,y_pred))
print('Recall:', recall_score(y_test,y_pred))
print('ROC-AUC:', roc_auc_score(y_test,y_pred))
print("--------"*10)
print(classification_report(y_test,y_pred))

cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['No bank', 'Bank account'],
                       title = 'Financial inclusion',normalize=True)

In [None]:
# # Define hyperparameter grid 
# param_grid = [{'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 
#                'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 'auto', 'scale'],
#                'C': [0.01, 0.1, 1, 10, 100],
#                'degree': [2, 3, 4]
#               }]

# gs = RandomizedSearchCV(SVC(), param_grid, scoring=f1_score,cv=5, verbose=1, n_jobs=-1);

# gs.fit(X_train, y_train);

In [None]:
# # Get the estimator
# best_clf = gs.best_estimator_ 
# best_clf

In [None]:
# y_pred = best_clf.predict(X_test)
# # Compute classification report
# print(classification_report(y_test, y_pred))

# # Compute confusion matrix
# cnf_matrix = confusion_matrix(y_test, y_pred)

# # Plot non-normalized confusion matrix
# plot_confusion_matrix(cnf_matrix, classes=['No bank', 'Bank account'], normalize=False, title='Confusion matrix')

In [451]:
# new data for label encoding
y = df.bank_account
X = df.drop(['bank_account','uniqueid'],axis=1)

In [452]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RSEED,stratify=y)

In [453]:
#list for encoder
cat_predictor = list(X_train.columns)
cat_predictor.remove("household_size")
cat_predictor.remove("age_of_respondent")
print(cat_predictor)

['country', 'year', 'location_type', 'cellphone_access', 'gender_of_respondent', 'relationship_with_head', 'marital_status', 'education_level', 'job_type']


In [454]:
#label encoding
#X columns
for predictor in cat_predictor:
    X_train[(predictor)] = le.fit_transform(X_train[(predictor)])
    X_test[(predictor)] = le.transform(X_test[(predictor)])
#y column (target)
y_train =le.fit_transform(y_train)
y_test = le.transform(y_test)


In [None]:
#logreg
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)
print(logistic_regression.score(X_test,y_test))
print('F1-score on test:', f1_score(y_test,y_pred))
print('Recall:', recall_score(y_test,y_pred))
print('ROC-AUC:', roc_auc_score(y_test,y_pred))
print("--------"*10)
print(classification_report(y_test,y_pred))

cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['No bank', 'Bank account'],
                       title = 'Financial inclusion',normalize=True)


In [455]:
#AdaBoost
ada_clf = AdaBoostClassifier(random_state=RSEED)
ada_clf.fit(X_train,y_train)
y_pred = ada_clf.predict(X_test)

print(logistic_regression.score(X_test,y_test))
print('F1-score on test:', f1_score(y_test,y_pred))
print('Recall:', recall_score(y_test,y_pred))
print('ROC-AUC:', roc_auc_score(y_test,y_pred))
print("--------"*10)
print(classification_report(y_test,y_pred))

cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['No bank', 'Bank account'],
                       title = 'Financial inclusion',normalize=True)


0.8760414895425948
F1-score on test: 0.45678033306899285
Recall: 0.34782608695652173
ROC-AUC: 0.6595651313468538
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      5053
           1       0.67      0.35      0.46       828

    accuracy                           0.88      5881
   macro avg       0.78      0.66      0.70      5881
weighted avg       0.87      0.88      0.87      5881



TypeError: 'DataFrame' object is not callable

In [456]:
#XGB
from xgboost import XGBClassifier

xg_clf = XGBClassifier()
xg_clf.fit(X_train,y_train)
y_pred = xg_clf.predict(X_test)

print(logistic_regression.score(X_test,y_test))
print('F1-score on test:', f1_score(y_test,y_pred))
print('Recall:', recall_score(y_test,y_pred))
print('ROC-AUC:', roc_auc_score(y_test,y_pred))
print("--------"*10)
print(classification_report(y_test,y_pred))

cm = confusion_matrix(y_test, y_baseline_test)
plot_confusion_matrix(cm, classes = ['No bank', 'Bank account'],
                       title = 'Financial inclusion',normalize=True)


0.8760414895425948
F1-score on test: 0.4992526158445442
Recall: 0.4033816425120773
ROC-AUC: 0.6842754244620549
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      5053
           1       0.65      0.40      0.50       828

    accuracy                           0.89      5881
   macro avg       0.78      0.68      0.72      5881
weighted avg       0.87      0.89      0.87      5881



TypeError: 'DataFrame' object is not callable

In [458]:
# Defining parameter grid (as dictionary)
param_grid = {  "learning_rate"    : [ 0.20, 0.25, 0.30 ] ,
                'min_child_weight': [1, 2],
                'gamma': [0.5, 1, 1.5],
                'subsample': [0.6, 0.8, 1.0],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'max_depth': [ 5, 6, 8]
            
             }

# Instantiate gridsearch and define the metric to optimize 
gs = GridSearchCV(XGBClassifier(), param_grid, scoring='f1',
                  cv=5, verbose=5, n_jobs=-1);

# Fit gridsearch object to data
gs.fit(X_train, y_train);

Fitting 5 folds for each of 3240 candidates, totalling 16200 fits
[CV 1/5] END colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.8;, score=0.406 total time=   0.9s
[CV 2/5] END colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.8;, score=0.393 total time=   0.9s
[CV 3/5] END colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.8;, score=0.380 total time=   0.9s
[CV 4/5] END colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.6;, score=0.384 total time=   0.9s
[CV 2/5] END colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.6;, score=0.401 total time=   0.9s
[CV 5/5] END colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=3, min_child_weight=1, subsample=0.6;, score=0.428 total time=   0.9s
[CV 1/5] END colsample_bytree=0.6, gamma=0.5, learning_rat

In [459]:
# Best score
print('Best score:', round(gs.best_score_, 3))

# Best parameters
print('Best parameters:', gs.best_params_)

Best score: 0.485
Best parameters: {'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.25, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.6}


In [462]:
# Assigning the fitted 
xgb_best = gs.best_estimator_

# Making predictions on the test set
y_pred_train = xgb_best.predict(X_train)
y_pred_test = xgb_best.predict(X_test)

print(logistic_regression.score(X_test,y_test))
print('F1-score on test:', f1_score(y_test,y_pred))
print('Recall:', recall_score(y_test,y_pred))
print('ROC-AUC:', roc_auc_score(y_test,y_pred))
print("--------"*10)
print(classification_report(y_test,y_pred))

cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['No bank', 'Bank account'],
                       title = 'Financial inclusion',normalize=True)

0.8760414895425948
F1-score on test: 0.4992526158445442
Recall: 0.4033816425120773
ROC-AUC: 0.6842754244620549
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      5053
           1       0.65      0.40      0.50       828

    accuracy                           0.89      5881
   macro avg       0.78      0.68      0.72      5881
weighted avg       0.87      0.89      0.87      5881



TypeError: 'DataFrame' object is not callable