# Install the necessary packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load your dataset
df = pd.read_csv('/Users/mohamoud/Desktop/streamlit/financial_inclusion_project/data/Financial_inclusion_dataset.csv')

## Display general information about the dataset

In [2]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


## Create a pandas profiling reports to gain insights into the dataset

In [4]:
%pip install -q ydata-profiling

Note: you may need to restart the kernel to use updated packages.


In [5]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Financial Inclusion Profiling Report")
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:00<00:00, 24.18it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



## Handle Missing and corrupted values

In [6]:
df.isnull().sum()

country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

## Remove duplicates, if they exist

In [7]:
df.duplicated().sum()

0

## Handle outliers, if they exist

In [10]:
# Boxplots for quick outlier inspection
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.boxplot(x=df['household_size'], ax=axes[0])
axes[0].set_title("Household Size")
sns.boxplot(x=df['age_of_respondent'], ax=axes[1])
axes[1].set_title("Age of Respondent")
plt.tight_layout()
plt.savefig('boxplots.png')  # Save the plot

In [11]:
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers

outliers_age = detect_outliers_iqr(df, 'age_of_respondent')
outliers_household = detect_outliers_iqr(df, 'household_size')
print(f"Outliers in age: {len(outliers_age)}")
print(f"Outliers in household size: {len(outliers_household)}")

Outliers in age: 241
Outliers in household size: 381


In [12]:
# Capping the outliers
df['age_of_respondent'] = df['age_of_respondent'].apply(lambda x: min(x, 95))
df['household_size'] = df['household_size'].apply(lambda x: min(x, 10))

## Encode categorical features

In [14]:
# Encoding the target variable
df['bank_account'] = df['bank_account'].map({'Yes': 1, 'No': 0})

In [15]:
df['country_freq'] = df['country'].map(df['country'].value_counts())
df.drop('country', axis=1, inplace=True)

In [16]:
df = pd.get_dummies(df, columns=[
    'location_type',
    'cellphone_access',
    'gender_of_respondent',
    'marital_status'
], drop_first=True)

In [17]:
from sklearn.preprocessing import LabelEncoder

label_cols = ['relationship_with_head', 'education_level', 'job_type']
le = LabelEncoder()

for col in label_cols:
    df[col] = le.fit_transform(df[col])

In [18]:
# Dropping it since it is not helping us in prediction
df.drop('uniqueid', axis=1, inplace=True)

In [19]:
# inspecting the various columns
print(df.columns)
print(df.shape)

Index(['year', 'bank_account', 'household_size', 'age_of_respondent',
       'relationship_with_head', 'education_level', 'job_type', 'country_freq',
       'location_type_Urban', 'cellphone_access_Yes',
       'gender_of_respondent_Male', 'marital_status_Dont know',
       'marital_status_Married/Living together',
       'marital_status_Single/Never Married', 'marital_status_Widowed'],
      dtype='object')
(23524, 15)


In [20]:
df.head()

Unnamed: 0,year,bank_account,household_size,age_of_respondent,relationship_with_head,education_level,job_type,country_freq,location_type_Urban,cellphone_access_Yes,gender_of_respondent_Male,marital_status_Dont know,marital_status_Married/Living together,marital_status_Single/Never Married,marital_status_Widowed
0,2018,1,3,24,5,3,9,6068,False,True,False,False,True,False,False
1,2018,0,5,70,1,0,4,6068,False,False,False,False,False,False,True
2,2018,1,5,26,3,5,9,6068,True,True,True,False,False,True,False
3,2018,0,5,34,1,2,3,6068,False,True,False,False,True,False,False
4,2018,0,8,26,0,2,5,6068,True,False,True,False,False,True,False


# Based on the previous data exploration train and test a machine learning classifier

In [None]:
from sklearn.model_selection import train_test_split

# Features and target
X = df.drop('bank_account', axis=1)
y = df['bank_account']

# Train/test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation for model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.8652497343251859
Precision: 0.5080645161290323
Recall: 0.3925233644859813
F1 Score: 0.4428822495606327


In [None]:
import pickle

# Save model to data/model.pkl
with open("models/model.pkl", "wb") as f:
    pickle.dump(rf_model, f)