# Modelos de regressão com árvores

In [35]:
import pandas as pd
import numpy as np

#training procedures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import tree

In [24]:
#df = pd.read_csv('data/ENEM_CLEAN.csv')
df = pd.read_csv('data/ENEM_CLEAN_WITH_NAN.csv')
df = df.loc[:, ~(df == 'FALTANTE').any()]

  res_values = method(rvalues)


## Modelos com árvore de decisão

Iremos avaliar modelos utilizando três conjuntos diferentes dos dados, as duas primeiras serão utilizando a biblioteca Scikit-learn e a segunda utilizando H20.

- Apenas as variáveis numérica.
- Variáveis numérica e categórica com one hot encoding (Scikit-lean).
- Variáveis numérica e categórica com one hot encoding (H20)

A implementaçã do Scikit-learn de árvores de decisão considera apenas variáveis numéricas, o algoritmo não é capaz de considerar divisões adequadas na hora de decidir se irá criar mais _branchs_, dessa forma, ao utilizar _one hot encoding_, se trata de uma aproximação da solução do problema com variáveis categóricas.

### Variáveis numéricas

In [26]:
X = df[[col for col in df.columns if col[0:3] == 'NUM']].drop(columns = ['NUM_NOTA'])
X_columns_names = X.columns
X = X.values
Y = df.NUM_NOTA.values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)

In [27]:
X

array([[17.,  0.,  5., ...,  1.,  2.,  1.],
       [17.,  0.,  3., ...,  0.,  3.,  0.],
       [22.,  0.,  3., ...,  1.,  2.,  0.],
       ...,
       [18.,  1.,  3., ...,  1.,  1.,  0.],
       [17.,  0.,  3., ...,  1.,  1.,  0.],
       [17.,  0.,  3., ...,  2.,  3.,  1.]])

In [32]:
results = []
i = 0
for max_depth in np.arange(10, 20, 3):
    for min_samples_split in np.arange(20, 45, 3):
        for min_samples_leaf in np.arange(30, 60, 3):
            model = tree.DecisionTreeRegressor(max_depth= max_depth,
                                              min_samples_split = min_samples_split,
                                              min_samples_leaf = min_samples_leaf)
            model.fit(x_train, y_train)
            results.append([max_depth, 
                            min_samples_split, 
                            min_samples_leaf, 
                            model.score(x_train, y_train),
                            model.score(x_test, y_test)])
            i+= 1
            if i % 10 == 0:
                print(f"On iteration {i}.")
                print(f"Best train score: {max([ii[4] for ii in results]):.3f}")
                
decision_tree_numeric_vars_results = pd.DataFrame(results, columns = ['max_depth',
                                                            'min_samples_split',
                                                            'min_samples_leaf',
                                                            'r2_train',
                                                            'r2_test'])

On iteration 10.
Best train score: 0.249
On iteration 20.
Best train score: 0.249
On iteration 30.
Best train score: 0.249
On iteration 40.
Best train score: 0.249
On iteration 50.
Best train score: 0.249
On iteration 60.
Best train score: 0.249
On iteration 70.
Best train score: 0.249
On iteration 80.
Best train score: 0.249
On iteration 90.
Best train score: 0.249
On iteration 100.
Best train score: 0.252
On iteration 110.
Best train score: 0.252
On iteration 120.
Best train score: 0.252
On iteration 130.
Best train score: 0.252
On iteration 140.
Best train score: 0.252
On iteration 150.
Best train score: 0.252
On iteration 160.
Best train score: 0.252
On iteration 170.
Best train score: 0.252
On iteration 180.
Best train score: 0.252
On iteration 190.
Best train score: 0.252
On iteration 200.
Best train score: 0.252
On iteration 210.
Best train score: 0.252
On iteration 220.
Best train score: 0.252
On iteration 230.
Best train score: 0.252
On iteration 240.
Best train score: 0.252
O

In [33]:
decision_tree_numeric_vars_results.sort_values('r2_test', ascending=False)

Unnamed: 0,max_depth,min_samples_split,min_samples_leaft,r2_train,r2_test
138,13,32,54,0.273526,0.252010
178,13,44,54,0.273526,0.252010
128,13,29,54,0.273526,0.252010
168,13,41,54,0.273526,0.252010
158,13,38,54,0.273526,0.252010
...,...,...,...,...,...
270,19,20,30,0.297657,0.241018
350,19,44,30,0.297657,0.241000
330,19,38,30,0.297657,0.241000
310,19,32,30,0.297657,0.241000


In [23]:
x_train.dtype

dtype('float64')

In [33]:
model = tree.DecisionTreeRegressor(max_depth = 13, min_samples_split = 32, min_samples_leaf = 54)
model.fit(x_train, y_train)
columns_importances = list(zip(X_columns_names, model.feature_importances_))
columns_importances.sort(key = lambda x : x[1], reverse = True)
for i in range(len(X_columns_names)):
    print(f"{columns_importances[i][0]}: {columns_importances[i][1]:.3f}")

NUM_BANHEIRO: 0.390
NUM_COMPUTADOR: 0.268
NUM_IDADE: 0.089
NUM_ANO_CONCLUIU: 0.073
NUM_EMPREGADO_DOMESTICO: 0.060
NUM_PESSOAS_RESIDENCIA: 0.042
NUM_FREEZER: 0.020
NUM_CARRO: 0.020
NUM_CELULAR: 0.013
NUM_MOTO: 0.005
NUM_LAVAR_LOUCA: 0.005
NUM_TV: 0.004
NUM_QUARTOS: 0.003
NUM_MAQUINA_SECAR: 0.003
NUM_GELADEIRA: 0.001
NUM_MICRO_ONDAS: 0.001
NUM_MAQUINA_LAVAR: 0.001


## Modelo Adaboost

### Variáveis numérica

In [38]:
#training with numeric variables
X = df[[col for col in df.columns if col[0:3] == 'NUM']].drop(columns = ['NUM_NOTA'])
X_columns_names = X.columns
X = X.values
Y = df.NUM_NOTA.values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)

params = {
 'n_estimators': [50, 100],
 'learning_rate' : [0.01,0.05,0.1,0.3,1],
 'loss' : ['linear', 'square', 'exponential']
}

adaboost_tunning = GridSearchCV(AdaBoostRegressor(), 
             param_grid = params,
             scoring= 'r2',
             verbose= 2.5)

adaboost_tunning.fit(x_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] learning_rate=0.01, loss=linear, n_estimators=50 ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  learning_rate=0.01, loss=linear, n_estimators=50, score=0.186, total=  37.1s
[CV] learning_rate=0.01, loss=linear, n_estimators=50 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.0s remaining:    0.0s


[CV]  learning_rate=0.01, loss=linear, n_estimators=50, score=0.183, total=  36.0s
[CV] learning_rate=0.01, loss=linear, n_estimators=50 ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  learning_rate=0.01, loss=linear, n_estimators=50, score=0.186, total=  37.3s
[CV] learning_rate=0.01, loss=linear, n_estimators=50 ................
[CV]  learning_rate=0.01, loss=linear, n_estimators=50, score=0.184, total=  36.4s
[CV] learning_rate=0.01, loss=linear, n_estimators=50 ................
[CV]  learning_rate=0.01, loss=linear, n_estimators=50, score=0.185, total=  38.3s
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............
[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.186, total= 1.4min
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............
[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.184, total= 1.6min
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............
[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.186, total= 1.5min
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............
[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.185, tot

[CV]  learning_rate=0.05, loss=exponential, n_estimators=100, score=0.200, total= 1.5min
[CV] learning_rate=0.05, loss=exponential, n_estimators=100 ..........
[CV]  learning_rate=0.05, loss=exponential, n_estimators=100, score=0.200, total= 1.6min
[CV] learning_rate=0.05, loss=exponential, n_estimators=100 ..........
[CV]  learning_rate=0.05, loss=exponential, n_estimators=100, score=0.202, total= 1.5min
[CV] learning_rate=0.05, loss=exponential, n_estimators=100 ..........
[CV]  learning_rate=0.05, loss=exponential, n_estimators=100, score=0.199, total= 1.6min
[CV] learning_rate=0.05, loss=exponential, n_estimators=100 ..........
[CV]  learning_rate=0.05, loss=exponential, n_estimators=100, score=0.200, total= 1.6min
[CV] learning_rate=0.1, loss=linear, n_estimators=50 .................
[CV]  learning_rate=0.1, loss=linear, n_estimators=50, score=0.200, total=  41.8s
[CV] learning_rate=0.1, loss=linear, n_estimators=50 .................
[CV]  learning_rate=0.1, loss=linear, n_estimat

[CV]  learning_rate=0.3, loss=square, n_estimators=100, score=0.089, total=  58.2s
[CV] learning_rate=0.3, loss=square, n_estimators=100 ................
[CV]  learning_rate=0.3, loss=square, n_estimators=100, score=0.136, total=  58.5s
[CV] learning_rate=0.3, loss=exponential, n_estimators=50 ............
[CV]  learning_rate=0.3, loss=exponential, n_estimators=50, score=0.200, total=  30.8s
[CV] learning_rate=0.3, loss=exponential, n_estimators=50 ............
[CV]  learning_rate=0.3, loss=exponential, n_estimators=50, score=0.196, total=  30.8s
[CV] learning_rate=0.3, loss=exponential, n_estimators=50 ............
[CV]  learning_rate=0.3, loss=exponential, n_estimators=50, score=0.196, total=  30.8s
[CV] learning_rate=0.3, loss=exponential, n_estimators=50 ............
[CV]  learning_rate=0.3, loss=exponential, n_estimators=50, score=0.201, total=  30.7s
[CV] learning_rate=0.3, loss=exponential, n_estimators=50 ............
[CV]  learning_rate=0.3, loss=exponential, n_estimators=50, 

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 144.1min finished


GridSearchCV(estimator=AdaBoostRegressor(),
             param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.3, 1],
                         'loss': ['linear', 'square', 'exponential'],
                         'n_estimators': [50, 100]},
             scoring='r2', verbose=2.5)

In [49]:
adaboost_tunning_df = pd.DataFrame(adaboost_tunning.cv_results_).sort_values('rank_test_score')
adaboost_tunning_df.to_csv('results/adaboost_tunning_numeric.csv', index = False)

### Variáveis numérica e categóricas

In [54]:
#training with numeric and categoric variables
X = pd.get_dummies(df, drop_first = True).drop(columns = ['NUM_NOTA'])
X_columns_names = X.columns
X = X.values
Y = df.NUM_NOTA.values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)

params = {
 'n_estimators': [100],
 'learning_rate' : [0.01,0.05,0.1],
 'loss' : ['linear', 'exponential']
}

adaboost_tunning = GridSearchCV(AdaBoostRegressor(), 
             param_grid = params,
             scoring= 'r2',
             verbose= 2.5)

adaboost_tunning.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.211, total= 5.3min
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.3min remaining:    0.0s


[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.210, total= 5.2min
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 10.5min remaining:    0.0s


[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.211, total= 5.2min
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............
[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.214, total= 5.2min
[CV] learning_rate=0.01, loss=linear, n_estimators=100 ...............
[CV]  learning_rate=0.01, loss=linear, n_estimators=100, score=0.213, total= 5.2min
[CV] learning_rate=0.01, loss=exponential, n_estimators=100 ..........
[CV]  learning_rate=0.01, loss=exponential, n_estimators=100, score=0.211, total= 5.3min
[CV] learning_rate=0.01, loss=exponential, n_estimators=100 ..........
[CV]  learning_rate=0.01, loss=exponential, n_estimators=100, score=0.209, total= 5.2min
[CV] learning_rate=0.01, loss=exponential, n_estimators=100 ..........
[CV]  learning_rate=0.01, loss=exponential, n_estimators=100, score=0.211, total= 5.0min
[CV] learning_rate=0.01, loss=exponential, n_estimators=100 ..........
[CV]  learning_rate=0.01, loss=exponential, n_estimator

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 155.2min finished


GridSearchCV(estimator=AdaBoostRegressor(),
             param_grid={'learning_rate': [0.01, 0.05, 0.1],
                         'loss': ['linear', 'exponential'],
                         'n_estimators': [100]},
             scoring='r2', verbose=2.5)

In [35]:
for max_depth in [2, 4, 6, 8, 10, 12, 14, 16, 18]:
    model = RandomForestRegressor(max_depth= max_depth)
    model.fit(x_train, y_train)
    print(f"Depth:{max_depth}")
    print(f"Train score: {model.score(x_train, y_train):.3f}")
    print(f"Test score: {model.score(x_test, y_test):.3f}")

Depth:2
Train score: 0.166
Test score: 0.162
Depth:4
Train score: 0.203
Test score: 0.201
Depth:6
Train score: 0.224
Test score: 0.219
Depth:8
Train score: 0.242
Test score: 0.228
Depth:10
Train score: 0.265
Test score: 0.232
Depth:12
Train score: 0.295
Test score: 0.232
Depth:14
Train score: 0.334
Test score: 0.228
Depth:16
Train score: 0.378
Test score: 0.222
Depth:18
Train score: 0.423
Test score: 0.211


In [36]:
model = RandomForestRegressor(max_depth= 12)
model.fit(x_train, y_train)
for i, col in enumerate(X_columns_names):
    print(f"{col}: {model.feature_importances_[i]:.3f}")

NUM_IDADE: 0.043
NUM_ANO_CONCLUIU: 0.000
NUM_PESSOAS_RESIDENCIA: 0.056
NUM_EMPREGADO_DOMESTICO: 0.099
NUM_BANHEIRO: 0.143
NUM_QUARTOS: 0.019
NUM_CARRO: 0.031
NUM_MOTO: 0.017
NUM_GELADEIRA: 0.012
NUM_FREEZER: 0.029
NUM_MAQUINA_LAVAR: 0.011
NUM_MAQUINA_SECAR: 0.014
NUM_MICRO_ONDAS: 0.011
NUM_LAVAR_LOUCA: 0.012
NUM_TV: 0.023
NUM_CELULAR: 0.028
NUM_COMPUTADOR: 0.451
