# Financial Inclusion in Africa - Notebook

# Part 1 Data prep and cleaning

In [1]:
# Load packages
# dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
# Load data
df = pd.read_csv('data/Train.csv')

In [6]:
df.head()
# Column uniqueid is NOT unique. Only unique in combination with country

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [7]:
df.bank_account.head

<bound method NDFrame.head of 0        Yes
1         No
2        Yes
3         No
4         No
        ... 
23519     No
23520     No
23521     No
23522     No
23523     No
Name: bank_account, Length: 23524, dtype: object>

In [8]:
df.info()
# Many categorical variables -> need to create dummies
# bank_account = target = needs to be numerical as well (contains yes and no)
# No NaNs, which is nice!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


Data exploration was mainly done here: https://medium.com/analytics-vidhya/why-you-need-to-explore-your-data-how-you-can-start-13de6f29c8c1

Main takeaways: 14% have a bankaccount, 86% don't. Highly imbalanced. Might need tweaking for modelling!

In [9]:
# Bank account distribution among category levels
ctdf = df.select_dtypes(include='object').drop(["bank_account","uniqueid"], axis=1)
ctdf_y = df.bank_account


for column_name in ctdf.columns:
    print(pd.crosstab(ctdf[column_name], ctdf_y,normalize="index"))
    print("____________")


# Main takeaways:
# - No cellphone = most likely no bank account
# - Differences among countries
# - Small differences between rural and urban and also between genders. Smaller as expected. 
# education_level and job_type have a high influence and are POTENTIALLY correlated.
# Need to research banking practice in Africa !!!

pd.crosstab(ctdf.job_type, ctdf.education_level,normalize="index")

bank_account        No       Yes
country                         
Kenya         0.749341  0.250659
Rwanda        0.885175  0.114825
Tanzania      0.908308  0.091692
Uganda        0.913851  0.086149
____________
bank_account         No       Yes
location_type                    
Rural          0.883497  0.116503
Urban          0.821261  0.178739
____________
bank_account            No       Yes
cellphone_access                    
No                0.982867  0.017133
Yes               0.816203  0.183797
____________
bank_account                No       Yes
gender_of_respondent                    
Female                0.893205  0.106795
Male                  0.810304  0.189696
____________
bank_account                  No       Yes
relationship_with_head                    
Child                   0.913414  0.086586
Head of Household       0.822851  0.177149
Other non-relatives     0.894737  0.105263
Other relative          0.901198  0.098802
Parent                  0.940147  0.059853
S

education_level,No formal education,Other/Dont know/RTA,Primary education,Secondary education,Tertiary education,Vocational/Specialised training
job_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dont Know/Refuse to answer,0.142857,0.0,0.595238,0.238095,0.015873,0.007937
Farming and Fishing,0.258225,0.0,0.552472,0.162103,0.009741,0.01746
Formally employed Government,0.010336,0.005168,0.108527,0.335917,0.312661,0.22739
Formally employed Private,0.03128,0.001896,0.309953,0.251185,0.212322,0.193365
Government Dependent,0.530364,0.004049,0.37247,0.036437,0.032389,0.024291
Informally employed,0.201894,0.002859,0.608719,0.141147,0.031445,0.013936
No Income,0.185008,0.004785,0.499203,0.22488,0.07496,0.011164
Other Income,0.17037,0.002778,0.477778,0.263889,0.043519,0.041667
Remittance Dependent,0.195489,0.000791,0.37871,0.317768,0.068856,0.038385
Self employed,0.155352,0.000932,0.630107,0.137952,0.047382,0.028274


In [10]:
pd.crosstab(ctdf.job_type, ctdf.education_level,normalize="index")

# There are - as expected - correlations between job and education

pd.crosstab(ctdf.country, ctdf.education_level,normalize="index")

education_level,No formal education,Other/Dont know/RTA,Primary education,Secondary education,Tertiary education,Vocational/Specialised training
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Kenya,0.155076,0.001318,0.428477,0.294166,0.048121,0.072841
Rwanda,0.242931,0.002061,0.553635,0.170693,0.020263,0.010418
Tanzania,0.171148,0.000453,0.638066,0.065408,0.099094,0.025831
Uganda,0.151832,0.002856,0.538315,0.244645,0.015231,0.04712


# Feature Engineering

In [11]:
# Convert all categories into dummies
cats = ["country", "bank_account", "location_type", "cellphone_access","gender_of_respondent", "relationship_with_head", "marital_status", "education_level", "job_type"] 
df_dumm = pd.get_dummies(df, prefix_sep="_", columns = cats ,drop_first=True)


# Drop 'uniqueid'
df_dumm.drop("uniqueid",inplace=True, axis= 1)


In [12]:
# Train-Test-Split
y = df_dumm[["bank_account_Yes"]]
X = df_dumm.drop("bank_account_Yes", axis = 1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y) # Default 25% in test



In [13]:
# Min-Max-Scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled


array([[0.        , 0.15      , 0.33333333, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.1       , 0.54761905, ..., 0.        , 1.        ,
        0.        ],
       [0.5       , 0.05      , 0.11904762, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.25      , 0.23809524, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.15      , 0.35714286, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.15      , 0.07142857, ..., 0.        , 0.        ,
        0.        ]])

# Model

- Metric: Our stakeholder - the UN - wants an accurate and unbiased view of the banking situation in Africa. We therefore think that the accuracy - although the data is not balanced - is a good starter metric

- Baseline: To showcase the stakeholder what is possible with data science. Very simple but assumption driven. Selection of 1 or 2 features. 

Idea: Rural and no/early_education

In [15]:
df_dumm.columns

Index(['year', 'household_size', 'age_of_respondent', 'country_Rwanda',
       'country_Tanzania', 'country_Uganda', 'bank_account_Yes',
       'location_type_Urban', 'cellphone_access_Yes',
       'gender_of_respondent_Male', 'relationship_with_head_Head of Household',
       'relationship_with_head_Other non-relatives',
       'relationship_with_head_Other relative',
       'relationship_with_head_Parent', 'relationship_with_head_Spouse',
       'marital_status_Dont know', 'marital_status_Married/Living together',
       'marital_status_Single/Never Married', 'marital_status_Widowed',
       'education_level_Other/Dont know/RTA',
       'education_level_Primary education',
       'education_level_Secondary education',
       'education_level_Tertiary education',
       'education_level_Vocational/Specialised training',
       'job_type_Farming and Fishing', 'job_type_Formally employed Government',
       'job_type_Formally employed Private', 'job_type_Government Dependent',
       'j

In [16]:
y_pred_baseline = []

X_test_scaled_df = pd.DataFrame(X_test_scaled,columns = df_dumm.drop("bank_account_Yes",axis=1).columns)

for index, row in X_test_scaled_df.iterrows():
    if row["location_type_Urban"] or row['education_level_Other/Dont know/RTA'] or row['education_level_Secondary education'] or row['education_level_Tertiary education'] or row[ 'education_level_Vocational/Specialised training']:
        y_pred_baseline.append(1)
    else:
        y_pred_baseline.append(0)

print(y_pred_baseline)

[1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 

In [17]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred_baseline))
print(classification_report(y_test, y_pred_baseline))

# Our baseline model has an accuracy of 55% and many false positives.

[[2574 2479]
 [ 176  652]]
              precision    recall  f1-score   support

       False       0.94      0.51      0.66      5053
        True       0.21      0.79      0.33       828

    accuracy                           0.55      5881
   macro avg       0.57      0.65      0.49      5881
weighted avg       0.83      0.55      0.61      5881



In [71]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
test = DecisionTreeClassifier()

test.fit(X_train_scaled, y_train)

y_pred = test.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[4551  502]
 [ 480  348]]
              precision    recall  f1-score   support

       False       0.90      0.90      0.90      5053
        True       0.41      0.42      0.41       828

    accuracy                           0.83      5881
   macro avg       0.66      0.66      0.66      5881
weighted avg       0.83      0.83      0.83      5881



In [18]:
np.savetxt('data/X_train_scaled.csv', X_train_scaled, delimiter=',')
np.savetxt('data/X_test_scaled.csv', X_test_scaled, delimiter=',')
np.savetxt('data/y_test.csv', y_test, delimiter=',')
np.savetxt('data/y_train.csv', y_train, delimiter=',')

In [19]:
import imblearn

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# from imblearn.under_sampling import RandomUnderSampler, NearMiss

# rus = NearMiss(version=3)
# X_rus, y_rus = rus.fit_resample(X_train_scaled, y_train)



In [None]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# test = DecisionTreeClassifier()

# test.fit(X_rus, y_rus)

# y_pred = test.predict(X_test_scaled)

# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

[[3051 2002]
 [ 291  537]]
              precision    recall  f1-score   support

       False       0.91      0.60      0.73      5053
        True       0.21      0.65      0.32       828

    accuracy                           0.61      5881
   macro avg       0.56      0.63      0.52      5881
weighted avg       0.81      0.61      0.67      5881



# Ideas to improve

- Get rid of year
- transform skewed numerical variables to more normally distributed values (log-scaling) -> age and number of householdmembers


- Unbalanced target variable. Play around with balanced bootstrapping: https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/
https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/