# Financial inclusion in Kenya, Tanzania, Rwanda and Uganda
Group 4: Flo, Markus and Jan

In [7]:
# Load packages
# dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Load data
# Read the training set from 'Train.csv' (a path relative to the current
# working directory) into the DataFrame `df` used by all later cells.
df = pd.read_csv('Train.csv')

# EDA

EDA was partly done in the StarterNotebook and more extensively [here](https://medium.com/analytics-vidhya/why-you-need-to-explore-your-data-how-you-can-start-13de6f29c8c1) and also in the downloadable file at the end of the article.

Key points: there are no NaNs, most features are categorical, and the target is imbalanced (roughly 85% "No" vs. 15% "Yes"). Not much cleaning is needed.

Influence of categorical features on target:

In [4]:
# Categorical predictors only: the object-dtype columns without the target
# and without the `uniqueid` column.
ctdf = df.select_dtypes(include='object').drop(columns=["bank_account", "uniqueid"])
ctdf_y = df.bank_account

# One row-normalised crosstab per categorical feature: each row shows the
# share of "No"/"Yes" answers within that category level.
for column_name, column_values in ctdf.items():
    print(pd.crosstab(column_values, ctdf_y, normalize="index"))
    print("____________")


bank_account        No       Yes
country                         
Kenya         0.749341  0.250659
Rwanda        0.885175  0.114825
Tanzania      0.908308  0.091692
Uganda        0.913851  0.086149
____________
bank_account         No       Yes
location_type                    
Rural          0.883497  0.116503
Urban          0.821261  0.178739
____________
bank_account            No       Yes
cellphone_access                    
No                0.982867  0.017133
Yes               0.816203  0.183797
____________
bank_account                No       Yes
gender_of_respondent                    
Female                0.893205  0.106795
Male                  0.810304  0.189696
____________
bank_account                  No       Yes
relationship_with_head                    
Child                   0.913414  0.086586
Head of Household       0.822851  0.177149
Other non-relatives     0.894737  0.105263
Other relative          0.901198  0.098802
Parent                  0.940147  0.059853
S

Key points:
- People without a cellphone are very unlikely to have a bank account (under 2%).
- There are clear differences among countries: about 25% of respondents in Kenya have a bank account, compared with only roughly 9–11% in Rwanda, Tanzania and Uganda.
- Differences between rural and urban areas and between genders are present but not as prominent as expected.
- Bank account ownership differs strongly across education levels and job types.


# Feature engineering

We tried several ways of treating the features: min-max scaling of the numerical features, random under- and oversampling of the imbalanced target, and `drop_first=True` dummy encoding for the categorical variables. However, we obtained the best score without those transformations. We still apply min-max scaling so that the model coefficients are on a comparable scale, because we want to identify the important features for our stakeholder.  
Additionally, we drop the 'year' column because not all countries were sampled in every year. The number of bank accounts is higher in 2018; however, this is due to the fact that Kenya was sampled only in that year.

In [5]:
# Multi-level categorical features: one indicator column per level (k columns)
cats = ["country", "relationship_with_head", "marital_status", "education_level", "job_type"]
# Two-level features (including the target): a single indicator column (k-1)
bin_cat = ["bank_account", "location_type", "cellphone_access", "gender_of_respondent"]

# Encode both groups in one chain and discard 'uniqueid' at the end
df_dumm2 = (
    pd.get_dummies(df, prefix_sep="_", columns=cats)
    .pipe(pd.get_dummies, prefix_sep="_", columns=bin_cat, drop_first=True)
    .drop(columns="uniqueid")
)

df_dumm2.head()

Unnamed: 0,year,household_size,age_of_respondent,country_Kenya,country_Rwanda,country_Tanzania,country_Uganda,relationship_with_head_Child,relationship_with_head_Head of Household,relationship_with_head_Other non-relatives,...,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed,bank_account_Yes,location_type_Urban,cellphone_access_Yes,gender_of_respondent_Male
0,2018,3,24,True,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,True,False
1,2018,5,70,True,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,False
2,2018,5,26,True,False,False,False,False,False,False,...,False,False,False,False,False,True,True,True,True,True
3,2018,5,34,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
4,2018,8,26,True,False,False,False,True,False,False,...,False,True,False,False,False,False,False,True,False,True


In [23]:
# Train-Test-Split
# Select the target as a 1-D Series. Double brackets would return a
# one-column DataFrame, which makes scikit-learn emit a DataConversionWarning
# ("y = column_or_1d(y, warn=True)") when an estimator is fitted on it.
y2 = df_dumm2["bank_account_Yes"]
# Predictors: everything except the target and the 'year' column.
X2 = df_dumm2.drop(["bank_account_Yes", "year"], axis=1)

# train_test_split is already imported in the first cell of the notebook.
# stratify=y2 keeps the class ratio equal in both splits; the default
# test size (25%) is used and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=42, stratify=y2)

In [32]:
# Min-Max-Scaling
# The scaler learns each feature's range from the training split only; the
# same fitted transformation is then applied to both splits.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Modelling

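Baseline model: our baseline rests on the simple assumption that people who live in rural areas and have only basic education do not own a bank account. Concretely, a respondent is predicted to have an account if they live in an urban area or their education level is secondary, tertiary, vocational/specialised or "other/don't know/RTA"; everyone else is predicted to have none.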

In [62]:
# Baseline rule: predict "has a bank account" (1) for every respondent who is
# urban OR whose education level is one of the four listed here; all other
# respondents are predicted to have no account (0).
baseline_positive_cols = [
    "location_type_Urban",
    "education_level_Other/Dont know/RTA",
    "education_level_Secondary education",
    "education_level_Tertiary education",
    "education_level_Vocational/Specialised training",
]

# X_test is a plain array after scaling, so re-attach the predictor names.
X_test_scaled_df = pd.DataFrame(X_test, columns=X2.columns)

# Vectorised instead of a row-by-row iterrows() loop: a row is flagged as
# soon as any of the rule columns is non-zero. Kept as a list of 0/1 ints.
y_pred_baseline = (
    X_test_scaled_df[baseline_positive_cols]
    .astype(bool)
    .any(axis=1)
    .astype(int)
    .tolist()
)

print(confusion_matrix(y_test, y_pred_baseline))
print(accuracy_score(y_test, y_pred_baseline))

[[2574 2479]
 [ 176  652]]
0.5485461656180921


The baseline model is poor at predicting whether a person owns a bank account. Its accuracy is only 55%, well below the roughly 86% that always predicting "No" would achieve on this test set (5053 of 5881). There are a lot of false positives (2479), which means that many people who live in urban areas or have more than basic education still do not own a bank account. Those two features alone don't predict the target very well.

## Classification by logistic regression.  
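Our best model, selected with a grid search, reaches an accuracy of 88.6% on the test set. Recall for account holders is only 0.35, however, so most people who do have a bank account are still missed.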

In [39]:

# Logistic regression with the settings the notebook text attributes to the
# grid search.
logmod = LogisticRegression(
    C=2,
    solver="liblinear",
    max_iter=1000,
    fit_intercept=False,
    class_weight=None,
)
logmod.fit(X_train, y_train)

# Predict on the held-out split
y_pred = logmod.predict(X_test)

# Report confusion matrix, per-class metrics and overall accuracy, in that order
test_metrics = (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
)
for metric in test_metrics:
    print(metric)


[[4918  135]
 [ 537  291]]
              precision    recall  f1-score   support

       False       0.90      0.97      0.94      5053
        True       0.68      0.35      0.46       828

    accuracy                           0.89      5881
   macro avg       0.79      0.66      0.70      5881
weighted avg       0.87      0.89      0.87      5881

0.8857337187553137


  y = column_or_1d(y, warn=True)


Parameter estimates

In [37]:
# Pair every predictor name with its fitted weight from the first row of
# logmod.coef_, giving a two-column table ("cols", "coefs").
coef_df = (
    pd.Series(logmod.coef_[0], index=X2.columns, name="coefs")
    .rename_axis("cols")
    .reset_index()
)
coef_df

Unnamed: 0,cols,coefs
0,household_size,0.100346
1,age_of_respondent,1.184192
2,country_Kenya,-0.308703
3,country_Rwanda,-0.660514
4,country_Tanzania,-1.491015
5,country_Uganda,-1.653959
6,relationship_with_head_Child,-0.900339
7,relationship_with_head_Head of Household,-0.158948
8,relationship_with_head_Other non-relatives,-0.923693
9,relationship_with_head_Other relative,-0.839349


As expected from the crosstabs in the EDA section, the features with the strongest influence appear to be cellphone_access, job_type, education_level, country and age. There are also differences between certain levels of the marital_status and relationship_with_head features.