In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
np.random.seed(123)
warnings.filterwarnings('ignore')

In [78]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [79]:
print("Train dataset shape:", train.shape)
print("Test dataset shape:", test.shape)

Train dataset shape: (23524, 13)
Test dataset shape: (10086, 12)


In [80]:
# Display the first few rows of the training dataset
train.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [81]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


In [None]:
#import preprocessing module
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Convert target label to numerical Data
le = LabelEncoder()
train['bank_account'] = le.fit_transform(train['bank_account'])

# Separate training features from target
X_train = train.drop(['bank_account'], axis=1)
y_train = train['bank_account']

y_train


In [83]:
# function to preprocess our data from train models

def preprocessing_data(data):

    # Convert the following numerical labels from interger to float
    float_array = data[["household_size", "age_of_respondent", "year"]].values.astype(float
    )
    
    # categorical features to be converted to One Hot Encoding
    categ = [
        "relationship_with_head",
        "marital_status",
        "education_level",
        "job_type",
        "country"
    ]
    
    # One Hot Encoding conversion
    data = pd.get_dummies(data, prefix_sep="_", columns=categ)

    # Label Encoder conversion
    data["location_type"] = le.fit_transform(data["location_type"])
    data["cellphone_access"] = le.fit_transform(data["cellphone_access"])
    data["gender_of_respondent"] = le.fit_transform(data["gender_of_respondent"])
    
    # drop uniquid column
    data = data.drop(["uniqueid"], axis=1)
    
    # scale our data into range of 0 and 1
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_array = scaler.fit_transform(data)
    scaled_df = pd.DataFrame(scaled_array, columns=data.columns, index=data.index)
    return scaled_df

In [90]:
processed_train = preprocessing_data(X_train)
processed_test = preprocessing_data(test)
print("Processed Train dataset shape:", processed_train.shape)
processed_train

Processed Train dataset shape: (23524, 37)


Unnamed: 0,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head_Child,relationship_with_head_Head of Household,relationship_with_head_Other non-relatives,relationship_with_head_Other relative,...,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed,country_Kenya,country_Rwanda,country_Tanzania,country_Uganda
0,1.0,0.0,1.0,0.10,0.095238,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.20,0.642857,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.20,0.119048,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.20,0.214286,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.35,0.119048,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23519,1.0,0.0,1.0,0.15,0.380952,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
23520,1.0,0.0,1.0,0.05,0.130952,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
23521,1.0,0.0,1.0,0.20,0.130952,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
23522,1.0,1.0,1.0,0.30,0.166667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [None]:
# After preprocessing the data, we can now use some features selection techniques to select the most important features from the dataset.
# We will use the Random Forest Classifier to select the most important features.

In [99]:
# Ranking features using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


model = RandomForestClassifier()
model.fit(processed_train, y_train)

importances = model.feature_importances_
feature_names = processed_train.columns

importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(importance_df.head(10))
print(classification_report(y_train, model.predict(processed_train)))
# Top 10 most important features

                                            Feature  Importance
4                                 age_of_respondent    0.342450
3                                    household_size    0.144087
26               job_type_Formally employed Private    0.042662
21               education_level_Tertiary education    0.039760
1                                     location_type    0.038061
25            job_type_Formally employed Government    0.036353
22  education_level_Vocational/Specialised training    0.032862
2                                  cellphone_access    0.028154
5                              gender_of_respondent    0.025576
19                education_level_Primary education    0.022724
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     20212
           1       0.97      0.87      0.92      3312

    accuracy                           0.98     23524
   macro avg       0.97      0.93      0.95     23524
weighted avg       0.98 

In [None]:
# Reduce training and test sets to top features
top_features = importance_df["Feature"].head(10).tolist()

X_train_reduced = processed_train.loc[y_train.index, top_features]
X_test_reduced = processed_test[top_features]
print("Reduced Train dataset shape:", X_train_reduced.shape)
print("Training y dataset shape:", y_train.shape)

Reduced Train dataset shape: (21171, 10)
Training y dataset shape: (21171,)


In [115]:
# Splitting training reduced data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_train_reduced, y_train, test_size=0.1, stratify=y_train, random_state=42
)

In [116]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1819
           1       0.51      0.32      0.39       299

    accuracy                           0.86      2118
   macro avg       0.70      0.63      0.66      2118
weighted avg       0.84      0.86      0.85      2118



In [None]:
%pip install xgboost

from xgboost import XGBClassifier

# create model
xg_model = XGBClassifier()

# fitting the model
xg_model.fit(X_train, y_train)