# Imports

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder,StandardScaler, OneHotEncoder, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

# Data load

In [2]:
data = pd.read_csv('/home/jeffsmedines/repos/HACK/train.csv')
data.head()
# household / age / gender / cellphone / education_level / job_type = DEIXAR
# marital status / relationship with head? / country? /  location_type? / year?  = TIRAR

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,uid,bank_account
0,Rwanda,2016,uniqueid_4858,Rural,Yes,6,45,Male,Head of Household,Divorced/Seperated,Primary education,Farming and Fishing,Rwanda_uniqueid_4858,No
1,Tanzania,2017,uniqueid_3015,Urban,No,4,33,Female,Head of Household,Single/Never Married,Primary education,Self employed,Tanzania_uniqueid_3015,No
2,Rwanda,2016,uniqueid_103,Rural,Yes,7,43,Male,Head of Household,Married/Living together,Secondary education,Farming and Fishing,Rwanda_uniqueid_103,No
3,Rwanda,2016,uniqueid_4582,Rural,No,6,35,Female,Head of Household,Married/Living together,Primary education,Farming and Fishing,Rwanda_uniqueid_4582,No
4,Tanzania,2017,uniqueid_2854,Urban,Yes,2,30,Male,Head of Household,Single/Never Married,Primary education,Informally employed,Tanzania_uniqueid_2854,No


# Initial Data Exploration

In [3]:
data.describe()

Unnamed: 0,year,household_size,age_of_respondent
count,11762.0,11762.0,11762.0
mean,2016.983336,3.793913,38.602364
std,0.848669,2.225423,16.334624
min,2016.0,1.0,16.0
25%,2016.0,2.0,26.0
50%,2017.0,3.0,35.0
75%,2018.0,5.0,48.0
max,2018.0,21.0,100.0


In [4]:
data.isnull().sum()

country                   0
year                      0
uniqueid                  0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
uid                       0
bank_account              0
dtype: int64

In [5]:
data.dtypes

country                   object
year                       int64
uniqueid                  object
location_type             object
cellphone_access          object
household_size             int64
age_of_respondent          int64
gender_of_respondent      object
relationship_with_head    object
marital_status            object
education_level           object
job_type                  object
uid                       object
bank_account              object
dtype: object

In [6]:
data.location_type.unique()

array(['Rural', 'Urban'], dtype=object)

In [7]:
# binário - 0 ou 1 
data.cellphone_access.unique()

array(['Yes', 'No'], dtype=object)

In [8]:
# binário - 0 ou 1 
data.gender_of_respondent.unique()

array(['Male', 'Female'], dtype=object)

In [9]:
# 6 categorias - sem ordem
data.relationship_with_head.unique()

array(['Head of Household', 'Spouse', 'Child', 'Parent', 'Other relative',
       'Other non-relatives'], dtype=object)

In [10]:
# 5 categorias - sem ordem
data.marital_status.unique()

array(['Divorced/Seperated', 'Single/Never Married',
       'Married/Living together', 'Widowed', 'Dont know'], dtype=object)

In [11]:
# 6 categorias - com ordem 
data.education_level.unique()

array(['Primary education', 'Secondary education',
       'Vocational/Specialised training', 'Tertiary education',
       'No formal education', 'Other/Dont know/RTA'], dtype=object)

In [12]:
# 10 categorias - sem ordem
data.job_type.unique()

array(['Farming and Fishing', 'Self employed', 'Informally employed',
       'No Income', 'Other Income', 'Remittance Dependent',
       'Formally employed Private', 'Government Dependent',
       'Dont Know/Refuse to answer', 'Formally employed Government'],
      dtype=object)

In [13]:
data.groupby('bank_account').count()

Unnamed: 0_level_0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,uid
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
No,10077,10077,10077,10077,10077,10077,10077,10077,10077,10077,10077,10077,10077
Yes,1685,1685,1685,1685,1685,1685,1685,1685,1685,1685,1685,1685,1685


In [14]:
print(pd.value_counts(np.array(data['job_type'])))
print(' ')
print(pd.value_counts(np.array(data['marital_status'])) 
)
print('')
print(pd.value_counts(np.array(data['relationship_with_head']))
)
print('')
print(pd.value_counts(np.array(data['education_level'])))

Self employed                   3207
Informally employed             2788
Farming and Fishing             2732
Remittance Dependent            1252
Other Income                     569
Formally employed Private        514
No Income                        310
Formally employed Government     206
Government Dependent             125
Dont Know/Refuse to answer        59
dtype: int64
 
Married/Living together    5433
Single/Never Married       3970
Widowed                    1321
Divorced/Seperated         1034
Dont know                     4
dtype: int64

Head of Household      6358
Spouse                 3287
Child                  1099
Parent                  560
Other relative          368
Other non-relatives      90
dtype: int64

Primary education                  6408
No formal education                2240
Secondary education                2129
Tertiary education                  566
Vocational/Specialised training     399
Other/Dont know/RTA                  20
dtype: int64


# Train/Validation Split 

In [21]:
# drop columns teste - todas as colunas
data_dropped = data.drop(['uniqueid', 'uid'], axis=1)

In [22]:
# separation of x and y 
X = data_dropped.iloc[:,0:11]
y = data_dropped['bank_account']

In [7]:
data_edited = data.drop(['uniqueid', 'uid','country', 'relationship_with_head','location_type'], axis=1)
# separation of x and y 
X = data_edited.iloc[:,0:8]
y = data_edited['bank_account']

In [23]:
# aula 05 - Airbnb classificação 
# stratified split - gera indexes 
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=2187)

for train_index, test_index in split.split(X,y):
    X_train, X_val = X.loc[train_index], X.loc[test_index]
    y_train, y_val = y.loc[train_index], y.loc[test_index]

# RandomForest - Pipeline

parametros padrão utilizados inicialmente:


(n_estimators=100, criterion='gini', max_depth=X, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=X, min_impurity_decrease=0.0, 
bootstrap=True, oob_score=False, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=X)

In [72]:
rf = RandomForestClassifier(random_state=2187)

# parametros random forest - precisa do __ antes de cada parametro para que o gridsearch 
# entenda que são parametros do modelo
params_rf = {}
params_rf['model__criterion'] = ['entropy', 'gini']
params_rf['model__n_estimators'] = [100, 150]
params_rf['model__max_depth'] = [10]
params_rf['model__oob_score'] = [True, False]
# params_rf['model__min_samples_leaf'] = [1,2,] 
# params_rf['model__class_weight'] = [None,'balanced', 'balanced_subsample']
# params_rf['model__max_features'] = [None, 'sqrt']
params_rf['model'] = [rf]

# max_features=sqrt(n_features) - n_features=40 = sqrt(40)

# StratifiedKFold para que cada fold mantenha a proporção
cv_kfold= StratifiedKFold(n_splits=5,random_state=2187,shuffle=True)

# colunas que passarão pelo onehot
onehot_columns = ['country', 'relationship_with_head', 'marital_status',  
                  'education_level', 'cellphone_access', 'job_type' , 'location_type', 
                  'gender_of_respondent']

# onehot_columns = ['marital_status','education_level', 'cellphone_access', 'job_type' , 
#                   'gender_of_respondent']

# transformação - passthrough para que as colunas numericas que nao são transformadas pelo one hot passem reto
encoders = ColumnTransformer(transformers= [('onehotencod', OneHotEncoder(), onehot_columns)],
                                 remainder = 'passthrough', 
                                 verbose_feature_names_out = True,
                                 verbose=True)

# pipeline - colunas transformadas E colunas numericas entram no pipeline para treinar o modelo
pipeline = Pipeline(steps = [('encoders', encoders),
                             ('model', rf)],
                    verbose=True)

# criação de um f1 score em que a label positiva seja 'Yes' - Deu erro com a f1 score que não entendia 
# qual a label positiva, então foi necessário 'personalizar' o score f1.
f1_scorer = make_scorer(f1_score, average='weighted')

# Gridsearch - pega o pipeline, os parametros do modelo, a 'logica' de divisão kfold e treina modelos, 
# avaliando pelo f1 score e no final, retreinando o melhor modelo.
grid_rf = GridSearchCV(pipeline, params_rf, 
                      cv = cv_kfold, 
                      scoring = f1_scorer,
                      return_train_score = True,
                      refit=True,
                      verbose = 3)

In [73]:
target_encod = LabelBinarizer()
y_train = target_encod.fit_transform(y_train)

In [74]:
# fit na grid
grid_rf.fit(X_train,y_train.ravel())

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[ColumnTransformer] ... (1 of 2) Processing onehotencod, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing encoders, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.9s
[CV 1/5] END model=RandomForestClassifier(random_state=2187), model__criterion=entropy, model__max_depth=10, model__n_estimators=100, model__oob_score=True;, score=(train=0.887, test=0.867) total time=   0.9s
[ColumnTransformer] ... (1 of 2) Processing onehotencod, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing encoders, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.9s
[CV 2/5] END model=RandomForestClassifier(random_state=2187), model__criterion=entropy, model__max_depth=10, model__n_estimators=100, model__oob_score=True;, score

[Pipeline] ............. (step 2 of 2) Processing model, total=   1.2s
[CV 3/5] END model=RandomForestClassifier(random_state=2187), model__criterion=entropy, model__max_depth=10, model__n_estimators=150, model__oob_score=False;, score=(train=0.888, test=0.861) total time=   1.3s
[ColumnTransformer] ... (1 of 2) Processing onehotencod, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing encoders, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   1.2s
[CV 4/5] END model=RandomForestClassifier(random_state=2187), model__criterion=entropy, model__max_depth=10, model__n_estimators=150, model__oob_score=False;, score=(train=0.890, test=0.855) total time=   1.3s
[ColumnTransformer] ... (1 of 2) Processing onehotencod, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing encoders, total=   0.0s
[Pipeline] .

[Pipeline] ............. (step 2 of 2) Processing model, total=   1.4s
[CV 5/5] END model=RandomForestClassifier(random_state=2187), model__criterion=gini, model__max_depth=10, model__n_estimators=150, model__oob_score=True;, score=(train=0.887, test=0.865) total time=   1.4s
[ColumnTransformer] ... (1 of 2) Processing onehotencod, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing encoders, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   1.2s
[CV 1/5] END model=RandomForestClassifier(random_state=2187), model__criterion=gini, model__max_depth=10, model__n_estimators=150, model__oob_score=False;, score=(train=0.891, test=0.866) total time=   1.2s
[ColumnTransformer] ... (1 of 2) Processing onehotencod, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing encoders, total=   0.0s
[Pipeline] ........

In [75]:
# quais os parametros do melhor modelo encontrado
grid_rf.best_params_

{'model': RandomForestClassifier(criterion='entropy', max_depth=10, oob_score=True,
                        random_state=2187),
 'model__criterion': 'entropy',
 'model__max_depth': 10,
 'model__n_estimators': 100,
 'model__oob_score': True}

In [76]:
# qual o score do melhor modelo
grid_rf.best_score_

0.8624159964963047

In [77]:
y_val = target_encod.transform(y_val)
grid_rf.score(X_val,y_val)

0.8573607547363677

In [108]:
y_val = target_encod.transform(y_val)
pipeline.score(X_val, y_val)
#1

0.9481439501275148

# Documento da Submissão

In [78]:
teste = pd.read_csv('/home/jeffsmedines/repos/HACK/test.csv')

# drop same columns as the train dataset
teste_clean = teste.drop(['uniqueid', 'uid'], axis=1)
# teste_clean_dropped = teste.drop(['uniqueid', 'uid','country', 'relationship_with_head','location_type'], axis=1)

teste_clean.head()

Unnamed: 0,country,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Rwanda,2016,Rural,Yes,7,40,Male,Head of Household,Married/Living together,No formal education,Informally employed
1,Rwanda,2016,Rural,Yes,3,24,Male,Child,Single/Never Married,Secondary education,Farming and Fishing
2,Rwanda,2016,Urban,Yes,3,25,Female,Spouse,Married/Living together,Primary education,Farming and Fishing
3,Tanzania,2017,Urban,Yes,1,35,Female,Head of Household,Married/Living together,Primary education,Self employed
4,Rwanda,2016,Rural,Yes,3,60,Male,Head of Household,Married/Living together,Primary education,Farming and Fishing


In [79]:
bank_account = grid_rf.predict(teste_clean)

submission = pd.DataFrame(bank_account, columns=['bank_account'])

submission_words = submission.bank_account.apply(lambda x: 'Yes' if x==1 else 'No' )

submission2 = pd.concat([teste.uid, submission_words],axis=1)

In [80]:
submission2.head()

Unnamed: 0,uid,bank_account
0,Rwanda_uniqueid_625,No
1,Rwanda_uniqueid_1561,No
2,Rwanda_uniqueid_4806,No
3,Tanzania_uniqueid_4902,No
4,Rwanda_uniqueid_980,No


In [81]:
submission2.to_csv('/home/jeffsmedines/repos/HACK/submission10.csv',index=False) #gridsearch