# Financial Inclusion in Africa - Notebook

# Part 1 Data prep and cleaning

In [None]:
# Load packages
# dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load data
# Read the training set from the relative path data/Train.csv into `df`,
# the raw frame that the later cells build on.
df = pd.read_csv('data/Train.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()
# Column uniqueid is NOT unique. Only unique in combination with country

In [None]:
# Preview the first values of the target column.
# `head` must be called: the bare attribute only renders the bound-method repr.
df.bank_account.head()

In [None]:
# Summarise the columns of the raw data (dtypes and non-null counts).
df.info()
# Many categorical variables -> need to create dummies
# bank_account = target = needs to be numerical as well (contains yes and no)
# No NaNs, which is nice!

Data exploration was mainly done here: https://medium.com/analytics-vidhya/why-you-need-to-explore-your-data-how-you-can-start-13de6f29c8c1

Main takeaways: 14% have a bank account, 86% don't. The target is highly imbalanced, which might need tweaking for modelling!

In [None]:
# Bank account distribution among category levels
ctdf_y = df["bank_account"]
# Categorical (object-dtype) feature columns only: target and id are excluded.
ctdf = df.select_dtypes(include='object').drop(columns=["bank_account", "uniqueid"])

# One row-normalised table per categorical feature: within each level of the
# feature, the share of each bank_account value.
for feature_name, feature_values in ctdf.items():
    print(pd.crosstab(feature_values, ctdf_y, normalize="index"))
    print("____________")


# Main takeaways:
# - No cellphone = most likely no bank account
# - Differences among countries
# - Small differences between rural and urban and also between genders. Smaller as expected.
# - education_level and job_type have a high influence and are POTENTIALLY correlated.
# Need to research banking practice in Africa !!!

# Share of each education level within every job type (last expression -> rendered).
pd.crosstab(ctdf["job_type"], ctdf["education_level"], normalize="index")

In [None]:
# Share of each education level within every job type (each row sums to 1).
# Only the last expression of a cell is rendered automatically, so this table is
# shown explicitly with display(); as a bare expression its result was discarded.
display(pd.crosstab(ctdf.job_type, ctdf.education_level, normalize="index"))

# There are - as expected - correlations between job and education

# Share of each education level within every country (each row sums to 1).
pd.crosstab(ctdf.country, ctdf.education_level, normalize="index")

# Feature Engineering

In [None]:
# Convert all categories into dummies (the target column is encoded as well).
# drop_first=True omits the first level of every encoded column.
cats = [
    "country",
    "bank_account",
    "location_type",
    "cellphone_access",
    "gender_of_respondent",
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
]

# Encode, then drop 'uniqueid' in the same chain so the cell builds df_dumm in one
# expression instead of mutating it in place afterwards.
df_dumm = pd.get_dummies(
    df, columns=cats, prefix_sep="_", drop_first=True
).drop(columns="uniqueid")


In [None]:
# Train-Test-Split
from sklearn.model_selection import train_test_split

# Features: every encoded column except the target.
X = df_dumm.drop(columns="bank_account_Yes")
# Target: kept as a one-column DataFrame (double brackets).
y = df_dumm[["bank_account_Yes"]]

# Stratified on the target with a fixed seed; 25% of the rows go to the test set
# (this is also the library default when no size is given).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)



In [None]:
# Min-Max-Scaling
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply the same fitted scaler
# to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled


# Model

- Metric: Our stakeholder - the UN - wants an accurate and unbiased view of the banking situation in Africa. We therefore use accuracy as a starter metric, even though the target is imbalanced (always predicting "no bank account" would already reach about 86% accuracy).

- Baseline: Shows the stakeholder what is possible with data science. Very simple but assumption-driven: a hand-written rule based on 1 or 2 features.

Idea: Predict *no bank account* for respondents who are rural and have no/early education; predict *bank account* for everyone else.

In [None]:
# List the encoded column names.
df_dumm.columns

In [None]:
# Rule-based baseline (nothing is fitted).
# A test row is predicted as 1 when at least one of the dummy columns below is
# non-zero, and as 0 otherwise.
baseline_flags = [
    "location_type_Urban",
    "education_level_Other/Dont know/RTA",
    "education_level_Secondary education",
    "education_level_Tertiary education",
    "education_level_Vocational/Specialised training",
]

# Re-attach the feature names to the scaled test matrix so the rule can address
# columns by name.
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Vectorised replacement for the row-by-row loop: "non-zero in any flag column".
# The result stays a plain list of 0/1, as before.
y_pred_baseline = (
    X_test_scaled_df[baseline_flags].ne(0).any(axis=1).astype(int).tolist()
)

# Summarise instead of printing one prediction per test row.
print(f"Baseline predicts 1 for {sum(y_pred_baseline)} of {len(y_pred_baseline)} test rows")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate the baseline on the test split: confusion matrix first, then the
# per-class report.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred_baseline))

# Our baseline model has an accuracy of 55% and many false positives.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Decision tree with default hyperparameters, trained on the min-max-scaled features.
# random_state is fixed (same seed as the train/test split): the tree permutes the
# candidate features at every split, so an unseeded run can produce a different
# tree and different metrics each time the cell is executed.
test = DecisionTreeClassifier(random_state=42)
test.fit(X_train_scaled, y_train)

# Predict on the held-out test split and report the same metrics as for the baseline.
y_pred = test.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Persist the scaled features and the targets as comma-separated text files
# (written in the order listed).
split_exports = {
    'data/X_train_scaled.csv': X_train_scaled,
    'data/X_test_scaled.csv': X_test_scaled,
    'data/y_test.csv': y_test,
    'data/y_train.csv': y_train,
}
for csv_path, split_values in split_exports.items():
    np.savetxt(csv_path, split_values, delimiter=',')

In [None]:
import imblearn

In [None]:
# from imblearn.under_sampling import RandomUnderSampler, NearMiss

# rus = NearMiss(version=3)
# X_rus, y_rus = rus.fit_resample(X_train_scaled, y_train)



In [None]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# test = DecisionTreeClassifier()

# test.fit(X_rus, y_rus)

# y_pred = test.predict(X_test_scaled)

# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

# Ideas to improve

- Get rid of year
- Transform skewed numerical variables to more normally distributed values (log-scaling) -> age and number of household members


- Unbalanced target variable. Play around with balanced bootstrapping: https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/
https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/