# Financial inclusion in Kenya, Tanzania, Rwanda and Uganda
Group 4: Flo, Markus and Jan

In [None]:
# Load packages
# dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
# from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the training set from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="Train.csv")

# EDA

EDA was partly done in the StarterNotebook and more extensively [here](https://medium.com/analytics-vidhya/why-you-need-to-explore-your-data-how-you-can-start-13de6f29c8c1) and also in the downloadable file at the end of the article.

Key points: there are no NaNs, most features are categorical, and the target is unbalanced (roughly 86% without vs. 14% with a bank account). Not much cleaning is needed.

Influence of categorical features on target:

In [None]:
# Object-dtype columns only, without the target and the id column
ctdf = df.select_dtypes(include="object").drop(columns=["bank_account", "uniqueid"])
ctdf_y = df["bank_account"]


# For every remaining column: share of each target value within each level
# (normalize="index" makes every row sum to 1), followed by a separator line.
for column_name in ctdf.columns:
    print(
        pd.crosstab(ctdf[column_name], ctdf_y, normalize="index"),
        "____________",
        sep="\n",
    )

In [None]:
# Share of cellphone access within each country (rows sum to 1)
pd.crosstab(
    index=ctdf["country"],
    columns=ctdf["cellphone_access"],
    normalize="index",
)

Keypoints:  
People without a cellphone are very unlikely to have a bank account.    
Differences among countries. 25% of people in Kenya have a bank account and only 8-11% of people in Rwanda, Uganda and Tanzania. 
Differences between rural and urban and between genders are present but not as prominent as expected.  
Education level and job type have big differences between the bank account proportions.


# Feature engineering

We tried several ways of treating the features: min-max scaling of the numerical features, random under- and oversampling of the unbalanced target, and `drop_first=True` dummies for the categorical variables. However, we obtained the best score without those transformations. We still apply min-max scaling to make the features comparable, because we want to identify the important features for our stakeholder.  
Additionally, we are getting rid of the 'year' column because not all countries have been sampled in all years. There is an increased number of bank_accounts in 2018, however, this is due to the fact that Kenya was only sampled in that year.

In [None]:
# Multi-level categoricals: keep one indicator column per level (k columns)
cats = [
    "country",
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
]
# Two-level categoricals: a single indicator column is enough (k-1 columns)
bin_cat = ["bank_account", "location_type", "cellphone_access", "gender_of_respondent"]

df_dumm2 = (
    df.pipe(pd.get_dummies, prefix_sep="_", columns=cats)
    .pipe(pd.get_dummies, prefix_sep="_", columns=bin_cat, drop_first=True)
    .drop(columns="uniqueid")  # the id column is not used as a feature
)

df_dumm2.head()

In [None]:
# Train-test split
# Target: the single indicator column created from bank_account
y2 = df_dumm2[["bank_account_Yes"]]
# Features: every other column except the survey year
X2 = df_dumm2.drop(["bank_account_Yes", "year"], axis=1)

# train_test_split is imported in the package cell at the top of the notebook.
# Stratifying on the target keeps the class balance identical in both splits;
# test_size=0.25 is the library default, spelled out here for the reader.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.25, random_state=42, stratify=y2
)

In [None]:
# Min-max scaling (MinMaxScaler is imported in the package cell at the top).
# The scaler is fitted on the training split only and then applied unchanged
# to the test split. Both results are NumPy arrays, not DataFrames.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Modelling

Baseline Model: Our baseline model is based on the basic assumption that people living in rural areas and with just basic education do not own a bank account.

In [None]:
# Rule-based baseline: predict "has a bank account" (1) when the respondent is
# urban OR has any education level listed below; predict 0 otherwise, i.e. for
# rural respondents whose education indicator is none of the listed ones.
X_test_scaled_df = pd.DataFrame(X_test, columns=X2.columns)

account_indicators = [
    "location_type_Urban",
    "education_level_Other/Dont know/RTA",
    "education_level_Secondary education",
    "education_level_Tertiary education",
    "education_level_Vocational/Specialised training",
]

# Vectorised replacement for the row-by-row loop: a row is positive when any
# of its indicator values is non-zero (same truthiness test as `or`).
y_pred_baseline = (
    X_test_scaled_df[account_indicators].ne(0).any(axis=1).astype(int).tolist()
)

print(confusion_matrix(y_test, y_pred_baseline))
print(accuracy_score(y_test, y_pred_baseline))

The baseline model is bad at predicting whether a person owns a bank account. The accuracy is only 55%, and there are a lot of false positives, i.e. many people who live in urban areas or have more than primary education do not own a bank account. Those two features alone don't predict the target very well.

## Classification by logistic regression.  
Our best model - determined by a GridSearch - reaches an accuracy of 88.6%

In [None]:
# Logistic regression with the hyper-parameters chosen for the final model
logmod = LogisticRegression(
    max_iter=1000, solver="liblinear", fit_intercept=True, class_weight=None, C=2
)
# y_train is a single-column frame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning (the full-data fit further down does the same).
logmod.fit(X_train, np.ravel(y_train))

y_pred = logmod.predict(X_test)

# Evaluation on the held-out test split
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

Parameter estimates

In [None]:
# One row per feature: column name next to its fitted coefficient
# (first row of logmod.coef_)
coef_df = (
    pd.Series(logmod.coef_[0], index=X2.columns, name="coefs")
    .rename_axis("cols")
    .reset_index()
)
coef_df

As expected from the crosstabs in the EDA section, the features with the highest influence seem to be cellphone_access, job_type, education_level, country, and also age. There are also differences between certain levels of the marital_status and relationship_with_head features.

Heatmap of coefficients:

In [None]:
# Extract variables and estimates
variables = coef_df.cols
estimates = coef_df.coefs

# N x 1 column vector: one heatmap row per feature
heat_values = estimates.to_numpy().reshape(-1, 1)
# Tick positions 0..N-1, one per heatmap row. `grid` was previously used
# without ever being defined, which raised a NameError on a fresh kernel.
grid = np.arange(len(variables))

# Plot heatmap
plt.imshow(heat_values, cmap="RdBu", aspect="auto", origin="lower")

# Customize the plot
plt.yticks(grid, variables)
plt.xlabel("Coefficient")
plt.ylabel("Category-level")
plt.colorbar()

# Display the plot
plt.show()

# Error analysis

Are there certain category levels which are predicted better/worse? Using the whole data set.

In [None]:
# Refit the same model on the complete data set for the error analysis
y2 = df_dumm2[["bank_account_Yes"]]
X2 = df_dumm2.drop(["bank_account_Yes", "year"], axis=1)
# Use a dedicated scaler: calling fit_transform on the shared `scaler` would
# silently replace the ranges it learned from the training split.
scaler_full = MinMaxScaler()
X3 = scaler_full.fit_transform(X2)

logmod_full = LogisticRegression(
    max_iter=1000, solver="liblinear", fit_intercept=True, class_weight=None, C=2
)
logmod_full.fit(X3, y2.values.ravel())

# In-sample predictions (the authors noted an accuracy of 88.53)
y_pred_full = logmod_full.predict(X3)

print(accuracy_score(y2, y_pred_full))
print(confusion_matrix(y2, y_pred_full))

Which category levels have a low accuracy?

In [None]:
# Work on a copy. `ea_data = df` would only alias the raw frame, so the
# assignments below would overwrite its Yes/No target with 1/0 and add helper
# columns to `df`, which other cells still read.
ea_data = df.copy()

ea_data["bank_pred"] = y_pred_full  # Predictions from the full-data model
# Convert Yes/No to 1/0 so the target can be compared with the predictions
ea_data["bank_account"] = ea_data["bank_account"].map({"Yes": 1, "No": 0})
# 1 = prediction is correct, 0 = false positive or false negative
ea_data["bank_correct"] = (
    ea_data["bank_account"] == ea_data["bank_pred"]
).astype(int)

category_columns = [
    "country",
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
    "location_type",
    "cellphone_access",
    "gender_of_respondent",
]

# Collect one small frame per feature and concatenate once at the end
# (no repeated concat onto an initially empty frame).
level_frames = []
for cat in category_columns:
    level_accuracy = ea_data.groupby(cat)["bank_correct"].mean().reset_index()
    level_accuracy.columns = ["cat", "prop_correct"]
    # Keep the feature name: levels such as "Yes"/"No" are ambiguous without it
    level_accuracy.insert(0, "feature", cat)
    level_frames.append(level_accuracy)

prop_df = pd.concat(level_frames, axis=0, ignore_index=True)

prop_df.sort_values("prop_correct")

Are the wrong predictions for certain category levels rather false positives or false negatives?

In [None]:
# Signed error per respondent: +1 = false negative (true 1, predicted 0),
# -1 = false positive (true 0, predicted 1), 0 = correct prediction.
# The mean per category level is therefore the false-negative share minus the
# false-positive share: positive means mostly false negatives, negative means
# mostly false positives.
ea_data["fpfn_prop"] = ea_data["bank_account"] - ea_data["bank_pred"].astype(int)

category_columns = [
    "country",
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
    "location_type",
    "cellphone_access",
    "gender_of_respondent",
]

# Collect one small frame per feature and concatenate once at the end
level_frames = []
for cat in category_columns:
    level_balance = ea_data.groupby(cat)["fpfn_prop"].mean().reset_index()
    level_balance.columns = ["cat", "FPFN_proportion"]
    # Keep the feature name: levels such as "Yes"/"No" are ambiguous without it
    level_balance.insert(0, "feature", cat)
    level_frames.append(level_balance)

prop_df = pd.concat(level_frames, axis=0, ignore_index=True)

prop_df.sort_values("FPFN_proportion")

# Plots

Bank Account Yes / No: 

In [None]:
# Share of respondents per target level in percent, largest share first
value_counts = df["bank_account"].value_counts(normalize=True) * 100
value_counts = value_counts.sort_values(ascending=False)
# bank_account may hold Yes/No or 1/0 depending on which cells ran before;
# show the levels as Yes/No in either case.
value_counts.index = value_counts.index.map(
    lambda level: {1: "Yes", 0: "No"}.get(level, level)
)

fig, ax = plt.subplots()
bars = ax.bar(
    value_counts.index,
    value_counts.values,
    color="#009EDB",
    edgecolor="#009EDB",
    linewidth=3,
)
ax.set_title("Bank Account")

# Minimal look: keep only the bottom spine, no ticks on either axis
for side in ("left", "right", "top"):
    ax.spines[side].set_visible(False)
ax.set_xticks([])
ax.set_yticks([])

# Label each bar with its level and rounded share, taken from the data so the
# text cannot drift away from the bar heights.
for bar, (level, share) in zip(bars, value_counts.items()):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 0.01,
        f"{level} = {share:.0f} %",
        ha="center",
        va="bottom",
    )

# Reference line at the overall share of respondents with a bank account
baseline_pct = value_counts["Yes"]
x_center = np.mean(ax.get_xlim())
ax.text(
    x_center,
    10.5,
    "Bank account baseline",
    ha="center",
    color=[0.25, 0.25, 0.25],
    alpha=0.75,
)
ax.axhline(baseline_pct, color=[0.25, 0.25, 0.25], linestyle="--", alpha=0.75)

plt.show()

In [None]:
# Share of each target value within each cellphone_access level (rows sum to 1)
cellphone_props = pd.crosstab(
    index=ctdf["cellphone_access"],
    columns=ctdf_y,
    normalize="index",
)
cellphone_props

Cellphone:

In [None]:
categories = ["No cellphone", "Cellphone"]  # One stacked bar per category
labels = ["Bank account", "No bank account"]  # Segment labels (only shown if a legend is added)
# Percentage with a bank account per category.
# NOTE(review): hard-coded; presumably read off the cellphone_props crosstab - confirm.
account_pct = [1.7, 18.4]
no_account_pct = [round(100 - pct, 1) for pct in account_pct]
baseline_pct = 14  # Height of the "Bank account baseline" reference line

fig, ax = plt.subplots(figsize=(6, 6))

# Stacked bars: filled bottom segment = bank account, white top segment = none
bottombars = ax.bar(
    categories, account_pct, label=labels[0], color="#009EDB", edgecolor="#009EDB"
)
ax.bar(
    categories,
    no_account_pct,
    label=labels[1],
    bottom=account_pct,  # Top segment starts where the bottom one ends
    color="white",
    edgecolor="#009EDB",
)

ax.set_title("Bank account ownership based on cellphone access")

# Minimal look: only the bottom spine stays, no y axis.
# The bars are already drawn on the category names, so no explicit
# set_xticklabels call is needed.
for side in ("left", "right", "top"):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)

# Label text is generated from the bar values so the two cannot disagree
for bar, pct in zip(bottombars, account_pct):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 0.015,
        f"Bank account = {pct} %",
        ha="center",
        va="bottom",
    )

x_center = np.mean(ax.get_xlim())
ax.text(
    x_center,
    10.5,
    "Bank account baseline",
    ha="center",
    color=[0.25, 0.25, 0.25],
    alpha=0.75,
)
# Horizontal reference line across the whole axes
ax.axhline(baseline_pct, color=[0.25, 0.25, 0.25], linestyle="--", alpha=0.75)

plt.show()



Group education for easier interpretation:

In [None]:
# Collapse the education levels into three groups for plotting.
# NOTE(review): "Other/Dont know/RTA" is placed in the "high" group - confirm
# that this is intended.
mapping = {
    **dict.fromkeys(["No formal education", "Primary education"], "low"),
    "Secondary education": "mid",
    **dict.fromkeys(
        [
            "Vocational/Specialised training",
            "Tertiary education",
            "Other/Dont know/RTA",
        ],
        "high",
    ),
}

# New column 'edu_grouped': each education_level value replaced by its group
df["edu_grouped"] = df["education_level"].map(mapping)

In [None]:
# Share of each target value within each education group (rows sum to 1)
pd.crosstab(
    index=df["edu_grouped"],
    columns=df["bank_account"],
    normalize="index",
)

Education level:

In [None]:
categories = ["Low education level", "Medium education level", "High education level"]  # One stacked bar per category
labels = ["Bank account", "No bank account"]  # Segment labels (only shown if a legend is added)
# Percentage with a bank account per category.
# NOTE(review): hard-coded; presumably read off the edu_grouped crosstab - confirm.
account_pct = [7, 23, 53]
no_account_pct = [100 - pct for pct in account_pct]
baseline_pct = 14  # Height of the "Bank account baseline" reference line

fig, ax = plt.subplots(figsize=(9, 6))

# Stacked bars: filled bottom segment = bank account, white top segment = none
bottombars = ax.bar(
    categories, account_pct, label=labels[0], color="#009EDB", edgecolor="#009EDB"
)
ax.bar(
    categories,
    no_account_pct,
    label=labels[1],
    bottom=account_pct,  # Top segment starts where the bottom one ends
    color="white",
    edgecolor="#009EDB",
)

ax.set_title("Bank account ownership based on education level")

# Minimal look: only the bottom spine stays, no y axis
for side in ("left", "right", "top"):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)

# Label text is generated from the bar values. The previous hand-written
# labels showed 47 % on a bar of height 53.
for bar, pct in zip(bottombars, account_pct):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 0.015,
        f"Bank account = {pct} %",
        ha="center",
        va="bottom",
    )

# Caption centred on the x axis instead of a hard-coded x position
x_center = np.mean(ax.get_xlim())
ax.text(
    x_center,
    10.5,
    "Bank account baseline",
    ha="center",
    color=[0.25, 0.25, 0.25],
    alpha=0.75,
)
# Horizontal reference line across the whole axes (no hard-coded x range)
ax.axhline(baseline_pct, color=[0.25, 0.25, 0.25], linestyle="--", alpha=0.75)

plt.show()

Job:  
Show only the job types whose bank-account share deviates most from the overall mean: the two lowest and the two highest, to keep the minimal look.

In [None]:
categories = ["No income", "Informally employed", "Formally employed private", "Formally employed government"]  # One stacked bar per category
labels = ["Bank account", "No bank account"]  # Segment labels (only shown if a legend is added)
# Percentage with a bank account per category.
# NOTE(review): hard-coded; presumably read off the job_type crosstab - confirm.
account_pct = [2, 8, 54, 78]
no_account_pct = [100 - pct for pct in account_pct]
baseline_pct = 14  # Height of the "Bank account baseline" reference line

fig, ax = plt.subplots(figsize=(12, 6))

# Stacked bars: filled bottom segment = bank account, white top segment = none
bottombars = ax.bar(
    categories, account_pct, label=labels[0], color="#009EDB", edgecolor="#009EDB"
)
ax.bar(
    categories,
    no_account_pct,
    label=labels[1],
    bottom=account_pct,  # Top segment starts where the bottom one ends
    color="white",
    edgecolor="#009EDB",
)

ax.set_title("Bank account ownership based on job type")

# Minimal look: only the bottom spine stays, no y axis
for side in ("left", "right", "top"):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)

# Label text is generated from the bar values. The previous hand-written
# labels showed 12 % on a bar of height 8.
for bar, pct in zip(bottombars, account_pct):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 0.015,
        f"Bank account = {pct} %",
        ha="center",
        va="bottom",
    )

# Caption centred on the x axis; in this plot it sits just above the line
x_center = np.mean(ax.get_xlim())
ax.text(
    x_center,
    15,
    "Bank account baseline",
    ha="center",
    color=[0.25, 0.25, 0.25],
    alpha=0.75,
)
# Horizontal reference line across the whole axes (no hard-coded x range)
ax.axhline(baseline_pct, color=[0.25, 0.25, 0.25], linestyle="--", alpha=0.75)

plt.show()



Gender:

In [None]:
categories = ["Female", "Male"]  # One stacked bar per category
labels = ["Bank account", "No bank account"]  # Segment labels (only shown if a legend is added)
# Percentage with a bank account per category.
# NOTE(review): hard-coded; presumably read off the gender crosstab - confirm.
account_pct = [10.7, 19]
no_account_pct = [round(100 - pct, 1) for pct in account_pct]
baseline_pct = 14  # Height of the "Bank account baseline" reference line
# Extra height per label. The first label is raised, presumably so it does not
# collide with the baseline line and its caption.
label_lift = [8.3, 0]

fig, ax = plt.subplots(figsize=(6, 6))

# Stacked bars: filled bottom segment = bank account, white top segment = none
bottombars = ax.bar(
    categories, account_pct, label=labels[0], color="#009EDB", edgecolor="#009EDB"
)
ax.bar(
    categories,
    no_account_pct,
    label=labels[1],
    bottom=account_pct,  # Top segment starts where the bottom one ends
    color="white",
    edgecolor="#009EDB",
)

ax.set_title("Bank account ownership based on gender")

# Minimal look: only the bottom spine stays, no y axis.
# The bars are already drawn on the category names, so no explicit
# set_xticklabels call is needed.
for side in ("left", "right", "top"):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)

# Label text is generated from the bar values so the two cannot disagree
for bar, pct, lift in zip(bottombars, account_pct, label_lift):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + lift + 0.015,
        f"Bank account = {pct:.1f} %",
        ha="center",
        va="bottom",
    )

x_center = np.mean(ax.get_xlim())
ax.text(
    x_center,
    10.5,
    "Bank account baseline",
    ha="center",
    color=[0.25, 0.25, 0.25],
    alpha=0.75,
)
# Horizontal reference line across the whole axes
ax.axhline(baseline_pct, color=[0.25, 0.25, 0.25], linestyle="--", alpha=0.75)

plt.show()



Location type:

In [None]:
categories = ["Rural", "Urban"]  # One stacked bar per category
labels = ["Bank account", "No bank account"]  # Segment labels (only shown if a legend is added)
# Percentage with a bank account per category.
# NOTE(review): hard-coded; presumably read off the location_type crosstab - confirm.
account_pct = [11.7, 17.9]
no_account_pct = [round(100 - pct, 1) for pct in account_pct]
baseline_pct = 14  # Height of the "Bank account baseline" reference line
# Extra height per label. The first label is raised, presumably so it does not
# collide with the baseline line and its caption.
label_lift = [6.2, 0]

fig, ax = plt.subplots(figsize=(6, 6))

# Stacked bars: filled bottom segment = bank account, white top segment = none
bottombars = ax.bar(
    categories, account_pct, label=labels[0], color="#009EDB", edgecolor="#009EDB"
)
ax.bar(
    categories,
    no_account_pct,
    label=labels[1],
    bottom=account_pct,  # Top segment starts where the bottom one ends
    color="white",
    edgecolor="#009EDB",
)

ax.set_title("Bank account ownership based on location type")

# Minimal look: only the bottom spine stays, no y axis.
# The bars are already drawn on the category names, so no explicit
# set_xticklabels call is needed.
for side in ("left", "right", "top"):
    ax.spines[side].set_visible(False)
ax.yaxis.set_visible(False)

# Label text is generated from the bar values so the two cannot disagree
for bar, pct, lift in zip(bottombars, account_pct, label_lift):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + lift + 0.015,
        f"Bank account = {pct} %",
        ha="center",
        va="bottom",
    )

x_center = np.mean(ax.get_xlim())
ax.text(
    x_center,
    10.5,
    "Bank account baseline",
    ha="center",
    color=[0.25, 0.25, 0.25],
    alpha=0.75,
)
# Horizontal reference line. The previous version drew it from y=14 to y=14.2,
# which made the line slightly sloped.
ax.axhline(baseline_pct, color=[0.25, 0.25, 0.25], linestyle="--", alpha=0.75)

plt.show()

