In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
np.random.seed(123)
warnings.filterwarnings('ignore')

In [107]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [108]:
print("Train dataset shape:", train.shape)
print("Test dataset shape:", test.shape)

Train dataset shape: (23524, 13)
Test dataset shape: (10086, 12)


In [109]:
# Display the first few rows of the training dataset
train.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [110]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


In [111]:
#import preprocessing module
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Convert target label to numerical Data
le = LabelEncoder()
train['bank_account'] = le.fit_transform(train['bank_account'])

# Separate training features from target
X_train = train.drop(['bank_account'], axis=1)
y_train = train['bank_account']

X_train


Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed
...,...,...,...,...,...,...,...,...,...,...,...,...
23519,Uganda,2018,uniqueid_2113,Rural,Yes,4,48,Female,Head of Household,Divorced/Seperated,No formal education,Other Income
23520,Uganda,2018,uniqueid_2114,Rural,Yes,2,27,Female,Head of Household,Single/Never Married,Secondary education,Other Income
23521,Uganda,2018,uniqueid_2115,Rural,Yes,5,27,Female,Parent,Widowed,Primary education,Other Income
23522,Uganda,2018,uniqueid_2116,Urban,Yes,7,30,Female,Parent,Divorced/Seperated,Secondary education,Self employed


In [112]:
# function to preprocess our data from train models

def preprocessing_data(data):

    # Convert the following numerical labels from interger to float
    float_array = data[["household_size", "age_of_respondent", "year"]].values.astype(float
    )
    
    # categorical features to be converted to One Hot Encoding
    categ = [
        "relationship_with_head",
        "marital_status",
        "education_level",
        "job_type",
        "country"
    ]
    
    # One Hot Encoding conversion
    data = pd.get_dummies(data, prefix_sep="_", columns=categ)

    # Label Encoder conversion
    data["location_type"] = le.fit_transform(data["location_type"])
    data["cellphone_access"] = le.fit_transform(data["cellphone_access"])
    data["gender_of_respondent"] = le.fit_transform(data["gender_of_respondent"])
    
    # drop uniquid column
    data = data.drop(["uniqueid"], axis=1)
    
    # scale our data into range of 0 and 1
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_array = scaler.fit_transform(data)
    scaled_df = pd.DataFrame(scaled_array, columns=data.columns, index=data.index)
    return scaled_df

In [113]:
processed_train = preprocessing_data(X_train)
processed_test = preprocessing_data(test)
print("Processed Train dataset shape:", processed_train.shape)
print("Y train shape:", y_train.shape)
# processed_train

Processed Train dataset shape: (23524, 37)
Y train shape: (23524,)


In [114]:
# After preprocessing the data, we can now use some features selection techniques to select the most important features from the dataset.
# We will use the Random Forest Classifier to select the most important features.

In [115]:
# Ranking features using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


model = RandomForestClassifier()
model.fit(processed_train, y_train)

importances = model.feature_importances_
feature_names = processed_train.columns

importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(importance_df.head(10))
print(classification_report(y_train, model.predict(processed_train)))
# Top 10 most important features

                                            Feature  Importance
4                                 age_of_respondent    0.343155
3                                    household_size    0.144856
26               job_type_Formally employed Private    0.043322
21               education_level_Tertiary education    0.037648
1                                     location_type    0.037039
25            job_type_Formally employed Government    0.032231
2                                  cellphone_access    0.030587
22  education_level_Vocational/Specialised training    0.030524
19                education_level_Primary education    0.026841
5                              gender_of_respondent    0.024834
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     20212
           1       0.97      0.87      0.92      3312

    accuracy                           0.98     23524
   macro avg       0.97      0.93      0.95     23524
weighted avg       0.98 

In [116]:
# feature selection using xgboost classifier
from xgboost import XGBClassifier
xgb_model = XGBClassifier()
xgb_model.fit(processed_train, y_train)

importances = xgb_model.feature_importances_
feature_names = processed_train.columns
xgb_importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(xgb_importance_df.head(10))


                                            Feature  Importance
25            job_type_Formally employed Government    0.228168
2                                  cellphone_access    0.159908
26               job_type_Formally employed Private    0.099434
17              education_level_No formal education    0.095320
19                education_level_Primary education    0.045115
33                                    country_Kenya    0.044208
35                                 country_Tanzania    0.034309
21               education_level_Tertiary education    0.031950
22  education_level_Vocational/Specialised training    0.021209
7          relationship_with_head_Head of Household    0.020445


In [127]:
# Reduce training and test sets to top features
top_features_xgb = xgb_importance_df["Feature"].head(10).tolist()

X_train_reduced = processed_train.loc[y_train.index, top_features_xgb]
X_test_reduced = processed_test[top_features_xgb]
print("Reduced Train dataset shape:", X_train_reduced.shape)
print("Training y dataset shape:", y_train.shape)
# X_train_reduced
# X_test_reduced

Reduced Train dataset shape: (21171, 10)
Training y dataset shape: (21171,)


In [118]:
# Splitting training reduced data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_train_reduced, y_train, test_size=0.1, stratify=y_train, random_state=42
)

In [119]:
# model = RandomForestClassifier()
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# print(classification_report(y_test,y_pred))

In [120]:
from xgboost import XGBClassifier

# create model
xg_model = XGBClassifier()

# fitting the model
xg_model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [128]:
# Training model accuracy
score = xg_model.score(X_train, y_train)
print("XGBoost Model Accuracy:", score)

XGBoost Model Accuracy: 0.8864484436257144


In [130]:
from sklearn.metrics import accuracy_score

y_pred = xg_model.predict(X_test) # --> testing the model 
# score = xg_model.score(X_test, y_pred)
print("Accuracy score",accuracy_score(y_test, y_pred))
print("XGBoost Model Test Accuracy:", score)

Accuracy score 0.8895027624309392
XGBoost Model Test Accuracy: 0.8864484436257144


In [123]:
# make predictions with the test dataset
y_predictions = xg_model.predict(X_test_reduced)

from sklearn.metrics import classification_report, accuracy_score


print(classification_report(y_test,y_pred))
acc = accuracy_score(y_test,y_pred)
print("Accuracy_score: ",acc)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      2022
           1       0.75      0.32      0.45       331

    accuracy                           0.89      2353
   macro avg       0.82      0.65      0.70      2353
weighted avg       0.88      0.89      0.87      2353

Accuracy_score:  0.8895027624309392


In [124]:
submission = pd.DataFrame({"uniqueid": test["uniqueid"] + " x " + test["country"],
                           "bank_account": y_predictions})

In [125]:
submission.head(10)

Unnamed: 0,uniqueid,bank_account
0,uniqueid_6056 x Kenya,1
1,uniqueid_6060 x Kenya,1
2,uniqueid_6065 x Kenya,0
3,uniqueid_6072 x Kenya,0
4,uniqueid_6073 x Kenya,0
5,uniqueid_6074 x Kenya,0
6,uniqueid_6075 x Kenya,0
7,uniqueid_6076 x Kenya,0
8,uniqueid_6077 x Kenya,0
9,uniqueid_6078 x Kenya,0
