# Financial Inclusion in Africa - Notebook

# Part 1 Data prep and cleaning

In [31]:
# Load packages
# dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
#from imblearn.over_sampling import SMOTE

In [32]:
# Load data
df = pd.read_csv('Train.csv')
#zindi_test = pd.read_csv("Test.csv")
#test_df = pd.read_csv("Test.csv")

# Feature Engineering

In [33]:
# Convert all non-binary categories to k categories
cats = ["country", "relationship_with_head", "marital_status", "education_level", "job_type"] 
df_dumm2 = pd.get_dummies(df, prefix_sep="_", columns = cats)

# Convert all binary categories to k-1 categories
bin_cat = ["bank_account", "location_type", "cellphone_access","gender_of_respondent"]
df_dumm2 = pd.get_dummies(df_dumm2, prefix_sep="_", columns = bin_cat, drop_first=True)
# Drop 'uniqueid'
df_dumm2.drop("uniqueid",inplace=True, axis= 1)

df_dumm2.head()


Unnamed: 0,year,household_size,age_of_respondent,country_Kenya,country_Rwanda,country_Tanzania,country_Uganda,relationship_with_head_Child,relationship_with_head_Head of Household,relationship_with_head_Other non-relatives,...,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed,bank_account_Yes,location_type_Urban,cellphone_access_Yes,gender_of_respondent_Male
0,2018,3,24,True,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,True,False
1,2018,5,70,True,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False
2,2018,5,26,True,False,False,False,False,False,False,...,False,False,False,False,False,True,True,True,True,True
3,2018,5,34,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
4,2018,8,26,True,False,False,False,True,False,False,...,False,True,False,False,False,False,False,True,False,True


In [34]:
# Train-Test-Split
y2 = df_dumm2[["bank_account_Yes"]]
X2 = df_dumm2.drop("bank_account_Yes", axis = 1)


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=42, stratify=y2) # Default 25% in test



# Non-Scaled, unbalanced und 'neue' encoding methode. Hat bei mir am Besten funktioniert.

In [35]:
param_grid = {"max_depth": [2, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "min_samples_split": randint(1,100),
              "criterion": ["gini", "entropy"]}

rs = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, scoring='accuracy', # CHANGE TO YOUR CLASSIFIER
                  cv=3, verbose=0, n_jobs=-1, n_iter=300)

rs.fit(X_train, y_train)

6 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/fisch/neuefische/ds-ml-financial-inclusion-project/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/fisch/neuefische/ds-ml-financial-inclusion-project/.venv/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/Users/fisch/neuefische/ds-ml-financial-inclusion-project/.venv/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/Users/fisch/neuefische/ds-

In [36]:
y_pred_rs = rs.predict(X_test)

print(confusion_matrix(y_test, y_pred_rs))
print(classification_report(y_test, y_pred_rs))
print(accuracy_score(y_test, y_pred_rs))

rs.best_params_

[[4891  162]
 [ 527  301]]
              precision    recall  f1-score   support

       False       0.90      0.97      0.93      5053
        True       0.65      0.36      0.47       828

    accuracy                           0.88      5881
   macro avg       0.78      0.67      0.70      5881
weighted avg       0.87      0.88      0.87      5881

0.8828430539023976


{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 8,
 'min_samples_leaf': 6,
 'min_samples_split': 73}

# Scaling

In [37]:
# # Min-Max-Scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [38]:
param_grid = {"criterion" : ["gini"],
              "max_depth" : [2,3,4,5,6,7,8,9,10]#,
              ######## HYPERPARAMETERS HERE #####################
             }

rs = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, scoring='accuracy',
                  cv=3, verbose=0, n_jobs=-1, n_iter=300)

rs.fit(X_train_scaled, y_train)



In [39]:
y_pred_rs = rs.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred_rs))
print(classification_report(y_test, y_pred_rs))
print(accuracy_score(y_test, y_pred_rs))

rs.best_params_

[[4887  166]
 [ 541  287]]
              precision    recall  f1-score   support

       False       0.90      0.97      0.93      5053
        True       0.63      0.35      0.45       828

    accuracy                           0.88      5881
   macro avg       0.77      0.66      0.69      5881
weighted avg       0.86      0.88      0.86      5881

0.8797823499404863


{'max_depth': 6, 'criterion': 'gini'}

# OVERSAMPLING

In [41]:
from imblearn.over_sampling import SMOTE
#sudo install imblearn

smote = SMOTE()

#fit predictor and target variable

X_smote, y_smote = smote.fit_resample(X_train,y_train)

ModuleNotFoundError: No module named 'imblearn'

In [None]:
param_grid = {"criterion" : ["gini"],
              "max_depth" : [2,3,4,5,6,7,8,9,10]#,
              ######## HYPERPARAMETERS HERE #####################
             }

rs = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, scoring='accuracy',
                  cv=3, verbose=0, n_jobs=-1, n_iter=300)

rs.fit(X_smote, y_smote)

In [None]:
y_pred_rs = rs.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred_rs))
print(classification_report(y_test, y_pred_rs))
print(accuracy_score(y_test, y_pred_rs))

rs.best_params_

# Old encoding

In [None]:
#cats = ["country", "relationship_with_head", "marital_status", "education_level", "job_type"] 
#df_dumm2 = pd.get_dummies(df, prefix_sep="_", columns = cats)

# Convert all binary categories to k-1 categories
bin_cat = ["bank_account", "location_type", "cellphone_access","gender_of_respondent", "country", "relationship_with_head", "marital_status", "education_level", "job_type"]
df_dumm3 = pd.get_dummies(df, prefix_sep="_", columns = bin_cat, drop_first=True)
# Drop 'uniqueid'
df_dumm3.drop("uniqueid",inplace=True, axis= 1)

df_dumm3.head()



In [None]:
# Train-Test-Split
y3 = df_dumm3[["bank_account_Yes"]]
X3 = df_dumm3.drop("bank_account_Yes", axis = 1)


from sklearn.model_selection import train_test_split
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X3, y3, random_state=42, stratify=y3) # Default 25% in test



In [None]:
param_grid = {"criterion" : ["gini"],
              "max_depth" : [2,3,4,5,6,7,8,9,10]#,
              ######## HYPERPARAMETERS HERE #####################
             }

rs = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, scoring='accuracy',
                  cv=3, verbose=0, n_jobs=-1, n_iter=300)

rs.fit(X_train_enc, y_train_enc)

In [None]:
y_pred_rs = rs.predict(X_test_enc)

print(confusion_matrix(y_test_enc, y_pred_rs))
print(classification_report(y_test_enc, y_pred_rs))
print(accuracy_score(y_test_enc, y_pred_rs))

rs.best_params_