In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/inclusao-financeira-na-africa/train.csv
/kaggle/input/inclusao-financeira-na-africa/test.csv


## Data Ingestion

In [2]:
%%time

df_train = pd.read_csv('/kaggle/input/inclusao-financeira-na-africa/train.csv')
df_train.head()

CPU times: user 30.4 ms, sys: 8.92 ms, total: 39.4 ms
Wall time: 64.6 ms


Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,uid,bank_account
0,Rwanda,2016,uniqueid_4858,Rural,Yes,6,45,Male,Head of Household,Divorced/Seperated,Primary education,Farming and Fishing,Rwanda_uniqueid_4858,No
1,Tanzania,2017,uniqueid_3015,Urban,No,4,33,Female,Head of Household,Single/Never Married,Primary education,Self employed,Tanzania_uniqueid_3015,No
2,Rwanda,2016,uniqueid_103,Rural,Yes,7,43,Male,Head of Household,Married/Living together,Secondary education,Farming and Fishing,Rwanda_uniqueid_103,No
3,Rwanda,2016,uniqueid_4582,Rural,No,6,35,Female,Head of Household,Married/Living together,Primary education,Farming and Fishing,Rwanda_uniqueid_4582,No
4,Tanzania,2017,uniqueid_2854,Urban,Yes,2,30,Male,Head of Household,Single/Never Married,Primary education,Informally employed,Tanzania_uniqueid_2854,No


## Data understanding

## Data dimension

In [3]:
df_train.shape

(11762, 14)

## Class distribution

In [4]:
df_train['bank_account'].value_counts(normalize=True)*100

No     85.674205
Yes    14.325795
Name: bank_account, dtype: float64

## Data types

In [5]:
df_train.dtypes

country                   object
year                       int64
uniqueid                  object
location_type             object
cellphone_access          object
household_size             int64
age_of_respondent          int64
gender_of_respondent      object
relationship_with_head    object
marital_status            object
education_level           object
job_type                  object
uid                       object
bank_account              object
dtype: object

## NAN?

In [6]:
df_train.isna().sum()

country                   0
year                      0
uniqueid                  0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
uid                       0
bank_account              0
dtype: int64

## Descriptive Statistics

In [7]:
df_train.describe(include='object').T.sort_values(by='unique', ascending=False)

Unnamed: 0,count,unique,top,freq
uid,11762,11762,Rwanda_uniqueid_4858,1
uniqueid,11762,6916,uniqueid_403,4
job_type,11762,10,Self employed,3207
relationship_with_head,11762,6,Head of Household,6358
education_level,11762,6,Primary education,6408
marital_status,11762,5,Married/Living together,5433
country,11762,4,Rwanda,4335
location_type,11762,2,Rural,7100
cellphone_access,11762,2,Yes,8786
gender_of_respondent,11762,2,Female,6902


In [8]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,11762.0,2016.983336,0.848669,2016.0,2016.0,2017.0,2018.0,2018.0
household_size,11762.0,3.793913,2.225423,1.0,2.0,3.0,5.0,21.0
age_of_respondent,11762.0,38.602364,16.334624,16.0,26.0,35.0,48.0,100.0


## Data Preparation

In [9]:
df_pp = df_train.copy()

## Remove useless features

In [10]:
cols_remove = ['uid', 'uniqueid']
df_pp = df_pp.drop(columns=cols_remove)

In [11]:
X = df_pp.drop(columns=['bank_account'])
y = df_pp['bank_account']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature Encoding

### Class encoding

In [12]:
#class_col_enc = 'bank_account'

d_class = {
    'No': 0,
    'Yes': 1
}

y_train = y_train.map(d_class)

### One-hot encoding

In [13]:
cols_enc = ['country', 'location_type', 'cellphone_access','gender_of_respondent',
 'relationship_with_head', 'marital_status', 'education_level', 'job_type']

ohe = OneHotEncoder(drop='first')
res_ohe = ohe.fit_transform(X_train[cols_enc]).todense()

X_train = X_train.drop(columns=cols_enc)
X_train = pd.concat([X_train.reset_index(drop=True), pd.DataFrame(res_ohe, columns=ohe.get_feature_names_out()).reset_index(drop=True)], axis=1)

## Modeling & Evaluation

In [14]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier

In [15]:
rf = RandomForestClassifier(random_state=42)
cross_val_score(estimator=rf, X=X_train,y=y_train,scoring='f1_weighted',cv=5, n_jobs=-1)

array([0.85860854, 0.85399546, 0.86212604, 0.86131375, 0.86642492])

In [16]:
rf = RandomForestClassifier(random_state=42, max_depth=8)
cross_validate(estimator=rf, X=X_train,y=y_train,scoring='f1_weighted',cv=5, n_jobs=-1, return_train_score=True)

{'fit_time': array([0.63606596, 0.64067721, 0.63777995, 0.64365315, 0.45519996]),
 'score_time': array([0.05222654, 0.05199885, 0.05172873, 0.05190015, 0.03533649]),
 'test_score': array([0.85228881, 0.85194628, 0.86154706, 0.85554222, 0.86490669]),
 'train_score': array([0.87224576, 0.87204434, 0.86861359, 0.87044888, 0.86873567])}

In [17]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, random_state=42)

In [18]:
X_test

Unnamed: 0,country,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
3717,Rwanda,2016,Rural,Yes,6,48,Female,Spouse,Married/Living together,Other/Dont know/RTA,Informally employed
11036,Rwanda,2016,Urban,No,3,27,Female,Spouse,Married/Living together,No formal education,Formally employed Private
2088,Tanzania,2017,Urban,No,4,32,Female,Spouse,Single/Never Married,Primary education,No Income
1399,Uganda,2018,Rural,No,3,19,Female,Spouse,Married/Living together,Primary education,Self employed
1178,Tanzania,2017,Urban,Yes,2,36,Male,Head of Household,Single/Never Married,Primary education,Self employed
...,...,...,...,...,...,...,...,...,...,...,...
1330,Tanzania,2017,Rural,Yes,3,65,Female,Head of Household,Widowed,No formal education,Informally employed
3194,Tanzania,2017,Urban,No,2,21,Male,Head of Household,Single/Never Married,Primary education,Self employed
957,Rwanda,2016,Rural,Yes,4,32,Male,Head of Household,Married/Living together,Secondary education,Informally employed
10526,Rwanda,2016,Rural,Yes,4,57,Male,Head of Household,Married/Living together,No formal education,Informally employed


In [19]:
X_train

Unnamed: 0,year,household_size,age_of_respondent,country_Rwanda,country_Tanzania,country_Uganda,location_type_Urban,cellphone_access_Yes,gender_of_respondent_Male,relationship_with_head_Head of Household,...,education_level_Vocational/Specialised training,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed
0,2018,4,28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2016,10,56,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2016,2,52,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2016,4,28,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2016,4,29,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9404,2018,6,45,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9405,2018,1,20,0.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9406,2017,3,22,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9407,2018,9,34,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
y_test = y_test.map(d_class)

In [21]:
res_ohe_test = ohe.transform(X_test[cols_enc]).todense()
X_test = X_test.drop(columns=cols_enc)
X_test = pd.concat([X_test.reset_index(drop=True), pd.DataFrame(res_ohe_test, columns=ohe.get_feature_names_out()).reset_index(drop=True)], axis=1)

In [22]:
from sklearn.metrics import f1_score

f1_score(y_true=y_test, y_pred=rf.predict(X_test), average='weighted')

0.8534330617779646

In [23]:
## All dataset

In [24]:
ohe = OneHotEncoder(drop='first')
res_ohe_test = ohe.fit_transform(df_pp[cols_enc]).todense()

df_pp = df_pp.drop(columns=cols_enc)
df_pp = pd.concat([df_pp.reset_index(drop=True), pd.DataFrame(res_ohe_test, columns=ohe.get_feature_names_out()).reset_index(drop=True)], axis=1)

df_pp['bank_account'] = df_pp['bank_account'].map(d_class)

In [25]:
rf.fit(df_pp.drop(columns=['bank_account']), df_pp['bank_account'])

RandomForestClassifier(max_depth=8, random_state=42)

## Preparing submission

In [26]:
df_test

NameError: name 'df_test' is not defined

In [None]:
%%time

df_test = pd.read_csv('/kaggle/input/inclusao-financeira-na-africa/test.csv')
df_test.head()

df_test = df_test.drop(columns=cols_remove)#remove colunas

#df_test['bank_account'] = df_test['bank_account'].map(d_class) #mapeamento class

res_ohe_test = ohe.transform(df_test[cols_enc]).todense() #ohe

df_test = df_test.drop(columns=cols_enc)
df_test = pd.concat([df_test.reset_index(drop=True), pd.DataFrame(res_ohe_test, columns=ohe.get_feature_names_out()).reset_index(drop=True)], axis=1)

In [None]:
df_ss = pd.read_csv('/kaggle/input/inclusao-financeira-na-africa/test.csv')
preds = rf.predict(df_test)


In [None]:
d_class = {
    'No': 0,
    'Yes': 1
}

preds = pd.Series(preds).map({v: k for k, v in d_class.items()})
preds

In [None]:
df_ss['bank_account'] = preds
df_ss[['uid', 'bank_account']].to_csv('submission.csv', index=False)

In [None]:
### Ideias de melhorias
    ### 1. Testar novas formas de encoding (1d)
    ### 2. Feature engineering (2d)
    ### 3. Hyper-parameter tuning (1d)
    ### 4. Combinação de modelos