# 1.0. Imports

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder,StandardScaler, OneHotEncoder, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline 

# 2.0. Data Load

In [2]:
# Load the training set; `data` is the raw frame that the exploration and split cells below read from.
data = pd.read_csv('data/train.csv')
# Preview the first rows.
data.head()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional
0,1767,21,278.172008,2577.05,24196.89636,104.306544,31.038763,6,5,7,21,14,9,15,Não,Não,Negar
1,11920,40,268.874152,2465.39,19227.37796,69.858778,36.917093,5,8,5,40,23,10,18,Não,Não,Negar
2,8910,36,446.643127,1055.29,42822.28223,134.201478,34.561714,0,3,6,26,13,3,15,Sim,Não,Negar
3,4964,58,321.141267,703.05,51786.826,297.350067,31.493561,0,3,7,12,7,2,1,Sim,Não,Negar
4,10100,35,428.716114,891.29,44626.85346,134.201478,28.028887,2,8,7,24,10,8,20,Sim,Não,Negar


# 3.0. Initial Data Exploration

In [3]:
# Summary statistics for the numeric columns.
# In the recorded output several maxima sit far above the 75th percentile
# (e.g. idade, num_emprestimos, taxa_juros), which hints at outliers worth checking.
data.describe()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros
count,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0,9500.0
mean,6210.532947,126.398421,404.532032,1437.08642,195037.3,611.999323,32.270823,10.863684,17.552316,21.433158,20.990316,29.832421,24.937158,66.618632
std,3603.243914,685.036005,217.696725,1156.263217,1454274.0,2002.44388,5.106053,62.555417,120.980186,122.67739,14.800612,222.637763,183.733346,440.118524
min,1.0,18.0,0.088628,0.23,10473.87,0.0,20.992914,0.0,0.0,1.0,-5.0,0.0,0.0,1.0
25%,3091.75,28.0,272.36306,573.62,26207.9,76.083981,28.078872,2.0,3.0,4.0,10.0,9.0,2.0,8.0
50%,6172.5,38.0,337.169588,1175.73,45991.11,134.201478,32.262649,3.0,6.0,6.0,18.0,14.0,4.0,14.0
75%,9320.25,47.0,471.264657,1963.3325,80449.83,249.182915,36.464575,6.0,7.0,7.0,28.0,18.0,8.0,20.0
max,12500.0,8598.0,1602.040519,4998.07,23279660.0,10000.0,49.564519,1496.0,1779.0,1479.0,67.0,4388.0,2589.0,5788.0


In [4]:
# Number of missing values in each column.
data.isnull().sum()

id_cliente                 0
idade                      0
saldo_atual                0
divida_atual               0
renda_anual                0
valor_em_investimentos     0
taxa_utilizacao_credito    0
num_emprestimos            0
num_contas_bancarias       0
num_cartoes_credito        0
dias_atraso_dt_venc        0
num_pgtos_atrasados        0
num_consultas_credito      0
taxa_juros                 0
investe_exterior           0
pessoa_polit_exp           0
limite_adicional           0
dtype: int64

In [5]:
# Column dtypes; the object-typed columns are the categorical ones inspected in the next cells.
data.dtypes

id_cliente                   int64
idade                        int64
saldo_atual                float64
divida_atual               float64
renda_anual                float64
valor_em_investimentos     float64
taxa_utilizacao_credito    float64
num_emprestimos              int64
num_contas_bancarias         int64
num_cartoes_credito          int64
dias_atraso_dt_venc          int64
num_pgtos_atrasados          int64
num_consultas_credito        int64
taxa_juros                   int64
investe_exterior            object
pessoa_polit_exp            object
limite_adicional            object
dtype: object

In [6]:
# Binary feature: two categories (one-hot encoding it yields 2 columns).
data['investe_exterior'].unique()

array(['Não', 'Sim'], dtype=object)

In [7]:
# Binary feature: two categories.
data['pessoa_polit_exp'].unique()

array(['Não', 'Sim'], dtype=object)

In [8]:
# Target column: two categories.
data['limite_adicional'].unique()

array(['Negar', 'Conceder'], dtype=object)

# 4.0. Train Test Split


In [9]:
# Section-local copy of the raw frame.
# NOTE(review): the cells below still derive from `data`, not `df4` — confirm which one is intended.
df4 = data.copy()

In [10]:
# Preview the copied frame.
df4.head()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional
0,1767,21,278.172008,2577.05,24196.89636,104.306544,31.038763,6,5,7,21,14,9,15,Não,Não,Negar
1,11920,40,268.874152,2465.39,19227.37796,69.858778,36.917093,5,8,5,40,23,10,18,Não,Não,Negar
2,8910,36,446.643127,1055.29,42822.28223,134.201478,34.561714,0,3,6,26,13,3,15,Sim,Não,Negar
3,4964,58,321.141267,703.05,51786.826,297.350067,31.493561,0,3,7,12,7,2,1,Sim,Não,Negar
4,10100,35,428.716114,891.29,44626.85346,134.201478,28.028887,2,8,7,24,10,8,20,Sim,Não,Negar


In [11]:
# The client id is only an identifier and carries no signal for the model, so it is removed.
data_dropped = data.drop(columns=['id_cliente'])

In [21]:
# Separate the features (X) from the target (y).
# The target is removed by name rather than by position, so X and y stay consistent
# even if the column order of the frame changes (a positional slice would silently
# leak the target into X in that case).
X = data_dropped.drop(columns=['limite_adicional'])
y = data_dropped['limite_adicional']

In [22]:
# Preview the feature frame (target and id removed).
X.head()

Unnamed: 0,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp
0,21,278.172008,2577.05,24196.89636,104.306544,31.038763,6,5,7,21,14,9,15,Não,Não
1,40,268.874152,2465.39,19227.37796,69.858778,36.917093,5,8,5,40,23,10,18,Não,Não
2,36,446.643127,1055.29,42822.28223,134.201478,34.561714,0,3,6,26,13,3,15,Sim,Não
3,58,321.141267,703.05,51786.826,297.350067,31.493561,0,3,7,12,7,2,1,Sim,Não
4,35,428.716114,891.29,44626.85346,134.201478,28.028887,2,8,7,24,10,8,20,Sim,Não


In [19]:
# Stratified hold-out split: 70% train / 30% validation, keeping the class proportions of y.
# With n_splits=1 a single partition is produced (presumably comparable to
# train_test_split with stratify=y).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=2187)

# split() yields positional row indices, so rows are selected with .iloc;
# .loc would only be correct while the frames keep their default 0..n-1 index.
train_index, test_index = next(split.split(X, y))
X_train, X_val = X.iloc[train_index], X.iloc[test_index]
y_train, y_val = y.iloc[train_index], y.iloc[test_index]

In [20]:
# Preview the training features.
# NOTE(review): the execution counters around this cell are out of order and the recorded
# preview shows fewer columns than X.head() above — re-run the notebook top to bottom to refresh it.
X_train.head()

Unnamed: 0,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior
3796,506,268.332833,2674.89,22572.60257,71.327411,28.266779,8,6,9,55,15,10,22,Não
3706,41,350.902092,566.21,37121.41349,55.794286,35.325584,4,7,4,15,10,5,20,Não
7449,40,421.669669,1911.65,45150.0945,79.93131,35.359605,2,4,5,16,12,6,14,Não
1813,1962,330.55804,900.26,29630.125245,118.208585,27.049205,0,1,1,9,8,1,1,Sim
4783,32,344.885047,1960.58,21142.2043,10000.0,38.608559,2,5,7,18,17,4,15,Não


# 5.0. RandomForest - Pipeline

Parâmetros padrão do RandomForestClassifier, utilizados como ponto de partida:

Standard:
(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, 
bootstrap=True, oob_score=False, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)

In [20]:
# Base estimator; the seed is fixed so runs are reproducible.
rf = RandomForestClassifier(random_state=2187)

# Hyper-parameter grid. Keys use the '<step>__<param>' form so GridSearchCV routes
# each value to the pipeline step named 'model'.
# Not searched yet (left as ideas): max_features, min_samples_split, min_samples_leaf.
params_rf = {
    'model__criterion': ['entropy', 'gini'],
    'model__n_estimators': [100, 150],
    'model__max_depth': [None, 10, 15],
    'model__class_weight': [None, 'balanced', 'balanced_subsample'],
    'model': [rf],
}

# Stratified 5-fold CV so every fold keeps the proportions of the target.
cv_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2187)

# Categorical columns that go through one-hot encoding.
onehot_columns = ['investe_exterior', 'pessoa_polit_exp']

# Column-wise preprocessing: the listed columns are one-hot encoded, and
# remainder='passthrough' forwards every other (numeric) column unchanged.
# Further (name, transformer, columns) entries could be added for other column groups.
encoders = ColumnTransformer(
    transformers=[('onehotencod', OneHotEncoder(), onehot_columns)],
    remainder='passthrough',
    verbose_feature_names_out=True,
    verbose=True,
)

# Pipeline: encoded + passthrough columns feed the model.
# Other steps (e.g. SMOTE, MinMaxScaler, StandardScaler) could be inserted here.
pipeline = Pipeline(
    steps=[
        ('columntransformers', encoders),
        ('model', rf),
    ],
    verbose=True,
)

# Scorer: F1 with average='micro'.
f1_score_micro = make_scorer(f1_score, average='micro')

# Grid search: fits the pipeline for every parameter combination on each CV fold,
# ranks candidates by the scorer above and, with refit=True, retrains the best one at the end.
grid_rf = GridSearchCV(
    estimator=pipeline,
    param_grid=params_rf,
    scoring=f1_score_micro,
    cv=cv_kfold,
    refit=True,
    return_train_score=True,
    verbose=3,
)

In [21]:
# Encode the two target classes as the integers 0 and 1.
# LabelEncoder is the encoder meant for 1-D targets and returns a flat array,
# unlike LabelBinarizer, which yields an (n, 1) column for a binary target.
# Classes are sorted alphabetically, so the 0/1 assignment is the same as before.
target_encod = LabelEncoder()
y_train = target_encod.fit_transform(y_train)

In [22]:
# Run the grid search on the training split.
# .ravel() flattens the target to 1-D (added because of a warning).
grid_rf.fit(X_train,y_train.ravel())

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[ColumnTransformer] ... (1 of 2) Processing onehotencod, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Pipeline]  (step 1 of 2) Processing columntransformers, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   2.8s
[CV 1/5] END model=RandomForestClassifier(random_state=2187), model__class_weight=None, model__criterion=entropy, model__max_depth=None, model__n_estimators=100;, score=(train=0.990, test=0.871) total time=   2.9s
[ColumnTransformer] ... (1 of 2) Processing onehotencod, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Pipeline]  (step 1 of 2) Processing columntransformers, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   2.7s
[CV 2/5] END model=RandomForestClassifier(random_state=2187), model__class_weight=None, model__criterion=entropy, model__max_depth=None, model__n_estimators

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2187, shuffle=True),
             estimator=Pipeline(steps=[('columntransformers',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('onehotencod',
                                                                         OneHotEncoder(),
                                                                         ['country',
                                                                          'relationship_with_head',
                                                                          'marital_status',
                                                                          'education_level',
                                                                          'cellphone_access',
                                                                          'job_type',
                                                 

In [23]:
# Parameters of the best model found by the grid search.
grid_rf.best_params_

{'model': RandomForestClassifier(criterion='entropy', max_depth=10, random_state=2187),
 'model__class_weight': None,
 'model__criterion': 'entropy',
 'model__max_depth': 10,
 'model__n_estimators': 100}

In [24]:
# Cross-validated score of the best model found by the grid search.
grid_rf.best_score_

0.884488900987915

In [25]:
# Score the refit best model on the hold-out validation split.
# The encoded labels are stored in a new variable instead of overwriting y_val, so the
# cell can be re-run without passing already-encoded values back through the encoder.
# .ravel() guarantees a 1-D target, matching how the target was passed to fit().
y_val_encoded = target_encod.transform(y_val).ravel()
grid_rf.score(X_val, y_val_encoded)

0.8821195806177388

# 6.0. Submission

In [26]:
# Load the test set used for the submission.
# NOTE(review): this is a hardcoded absolute path to a different dataset than 'data/train.csv';
# its preview has none of the columns the fitted pipeline expects (e.g. 'investe_exterior',
# 'pessoa_polit_exp'). This cell looks carried over from another notebook — confirm the
# correct test file before running.
teste = pd.read_csv('/kaggle/input/inclusao-financeira-na-africa/test.csv')

# Drop the identifier columns before predicting.
# NOTE(review): the training frame's identifier is 'id_cliente'; 'uniqueid' and 'uid'
# do not appear among the training columns — verify.
teste_clean_dropped = teste.drop(['uniqueid', 'uid'], axis=1)

teste_clean_dropped.head()

Unnamed: 0,country,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Rwanda,2016,Rural,Yes,7,40,Male,Head of Household,Married/Living together,No formal education,Informally employed
1,Rwanda,2016,Rural,Yes,3,24,Male,Child,Single/Never Married,Secondary education,Farming and Fishing
2,Rwanda,2016,Urban,Yes,3,25,Female,Spouse,Married/Living together,Primary education,Farming and Fishing
3,Tanzania,2017,Urban,Yes,1,35,Female,Head of Household,Married/Living together,Primary education,Self employed
4,Rwanda,2016,Rural,Yes,3,60,Male,Head of Household,Married/Living together,Primary education,Farming and Fishing


In [27]:
# Predict with the trained (refit) grid-search model.
# NOTE(review): the model was fitted on the credit-limit features; confirm that
# teste_clean_dropped carries the same columns, otherwise predict() cannot work.
bank_account = grid_rf.predict(teste_clean_dropped)

# Data frame holding the predictions.
submission = pd.DataFrame(bank_account, columns=['bank_account'])

# Map the numeric predictions back to text labels.
# NOTE(review): the training target is 'limite_adicional' with values 'Negar'/'Conceder';
# the 'bank_account' name and the 'Yes'/'No' labels do not match it — confirm the submission format.
submission_words = submission.bank_account.apply(lambda x: 'Yes' if x==1 else 'No' )

# Join the identifier column of the test set with the predicted labels (aligned on the row index).
submission2 = pd.concat([teste.uid, submission_words],axis=1)

In [28]:
submission2.head()

Unnamed: 0,uid,bank_account
0,Rwanda_uniqueid_625,No
1,Rwanda_uniqueid_1561,No
2,Rwanda_uniqueid_4806,No
3,Tanzania_uniqueid_4902,No
4,Rwanda_uniqueid_980,No


In [29]:
# Write the submission file one directory above the notebook, without the index column.
submission2.to_csv('../submission4.csv',index=False) #gridsearch