# Modelos de regressão com árvores

Neste notebook iremos realizar o procedimento de afinamento dos hiperparâmetros dos 3 modelos a serem considerados: árvores de decisão, árvores de decisão com boosting (CatBoost) e florestas aleatórias (Distributed Random Forests). As respectivas bibliotecas utilizadas para esses modelos são Scikit-learn, CatBoost e H2O.

Separando os dados em treino e teste, iremos realizar uma busca em grade (_grid search_) sobre os valores possíveis para os hiperparâmetros, realizando validação cruzada (_cross validation_) em cada uma das combinações possíveis. O melhor conjunto de hiperparâmetros é então validado com os dados de teste.

In [None]:
!pip install catboost

In [None]:
! apt-get install default-jre
!java -version

In [None]:
! pip install h2o

In [7]:
# Mount Google Drive at /content/drive; the dataset and result files used by
# this notebook are read from and written to paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import time

#training aux
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#models
from sklearn import tree
import h2o
from catboost import CatBoostRegressor
from h2o.estimators import H2ORandomForestEstimator

In [8]:
# Load the ENEM dataset and discard every column in which the 'FALTANTE'
# placeholder appears in at least one row.
path = '/content/drive/MyDrive/analise_enem/data/ENEM_CLEAN_WITH_NAN.csv'
df = pd.read_csv(path)
columns_with_placeholder = (df == 'FALTANTE').any()
df = df.loc[:, ~columns_with_placeholder]

## Modelos com árvore de decisão

In [None]:
# Decision tree on the numeric predictors only: 3-fold cross-validated grid
# search on the training split, then one evaluation on the held-out split.
numeric_columns = [col for col in df.columns if col.startswith('NUM')]
X = df[numeric_columns].drop(columns=['NUM_NOTA'])
X_columns_names = X.columns
X = X.values
Y = df.NUM_NOTA.values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

results_path = '/content/drive/MyDrive/analise_enem/results/decision_tree_numeric.csv'
# Recreate the results file so it holds only the header row.
with open(results_path, "w+") as f:
    f.write("max_depth;min_samples_split;min_samples_leaf;time;data;score")

# Every (max_depth, min_samples_split, min_samples_leaf) combination, in the
# same order nested loops over the three lists would visit them.
search_grid = [
    (depth, split, leaf)
    for depth in [10, 15, 20]
    for split in [20, 30, 40]
    for leaf in [30, 45, 60]
]

i = 0
for i, (max_depth, min_samples_split, min_samples_leaf) in enumerate(search_grid, start=1):
    start = time.time()
    model = tree.DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
    )
    cv_scores = cross_val_score(model, x_train, y_train, cv=3)
    end = time.time()

    # One row per combination; the logged time is the average over the 3 folds.
    with open(results_path, "a") as f:
        f.write(f"\n{max_depth};{min_samples_split};{min_samples_leaf};{(end - start)/3:.4f};train;{cv_scores.mean()}")
    if i % 25 == 0:
        print(f"On iteration {i}.")

# Re-read the log and take the combination with the highest mean CV score.
decision_tree_numeric_results = pd.read_csv(results_path, sep=";").sort_values('score', ascending=False)
max_depth, min_samples_split, min_samples_leaf = (
    decision_tree_numeric_results[name].iloc[0]
    for name in ('max_depth', 'min_samples_split', 'min_samples_leaf')
)

# Refit on the whole training split with the selected hyper-parameters.
start = time.time()
model = tree.DecisionTreeRegressor(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
model.fit(x_train, y_train)
end = time.time()

# Append the held-out score as the single 'test' row.
with open(results_path, "a") as f:
    f.write(f"\n{max_depth};{min_samples_split};{min_samples_leaf};{end - start:.4f};test;{model.score(x_test, y_test)}")

On iteration 25.


In [None]:
# Decision tree on numeric predictors plus one-hot encoded categorical ones:
# 3-fold cross-validated grid search, then one evaluation on the held-out split.
categorical_columns = [col for col in df.columns if col.startswith('CAT')]
X = pd.get_dummies(df, drop_first=True, columns=categorical_columns).drop(columns=['NUM_NOTA'])
X_columns_names = X.columns
X = X.values
Y = df.NUM_NOTA.values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

results_path = '/content/drive/MyDrive/analise_enem/results/decision_tree_numeric_categoric.csv'
# Recreate the results file so it holds only the header row.
with open(results_path, "w+") as f:
    f.write("max_depth;min_samples_split;min_samples_leaf;time;data;score")

# Every (max_depth, min_samples_split, min_samples_leaf) combination, in the
# same order nested loops over the three lists would visit them.
search_grid = [
    (depth, split, leaf)
    for depth in [10, 15, 20]
    for split in [20, 30, 40]
    for leaf in [30, 45, 60]
]

i = 0
for i, (max_depth, min_samples_split, min_samples_leaf) in enumerate(search_grid, start=1):
    start = time.time()
    model = tree.DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
    )
    cv_scores = cross_val_score(model, x_train, y_train, cv=3)
    end = time.time()

    # One row per combination; the logged time is the average over the 3 folds.
    with open(results_path, "a") as f:
        f.write(f"\n{max_depth};{min_samples_split};{min_samples_leaf};{(end - start)/3:.4f};train;{cv_scores.mean()}")
    if i % 25 == 0:
        print(f"On iteration {i}.")

# Re-read the log and take the combination with the highest mean CV score.
decision_tree_numeric_categoric_results = pd.read_csv(results_path, sep=";").sort_values('score', ascending=False)
max_depth, min_samples_split, min_samples_leaf = (
    decision_tree_numeric_categoric_results[name].iloc[0]
    for name in ('max_depth', 'min_samples_split', 'min_samples_leaf')
)

# Refit on the whole training split with the selected hyper-parameters.
start = time.time()
model = tree.DecisionTreeRegressor(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
model.fit(x_train, y_train)
end = time.time()

# Append the held-out score as the single 'test' row.
with open(results_path, "a") as f:
    f.write(f"\n{max_depth};{min_samples_split};{min_samples_leaf};{end - start:.4f};test;{model.score(x_test, y_test)}")

On iteration 25.


## Modelo CatBoost

In [None]:
# CatBoost on numeric + categorical predictors: 3-fold cross-validated grid
# search on the training split, then one evaluation on the held-out split.
X = df.drop(columns=['NUM_NOTA'])
X_columns_names = X.columns
# X is intentionally kept as a DataFrame (no `.values`): the categorical
# features below are passed to CatBoost by column name.
Y = df.NUM_NOTA.values
cat_features = [col for col in X_columns_names if col.startswith('CAT')]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

results_path = '/content/drive/MyDrive/analise_enem/results/catboost_numeric_categoric.csv'
# Recreate the results file so it holds only the header row.
with open(results_path, "w+") as f:
    f.write("iterations;learning_rate;depth;time;data;score")

i = 0
for iterations in [400, 500, 650]:
    for learning_rate in [0.1, 0.2, 0.5]:
        for depth in [6, 10, 14]:
            start = time.time()
            model = CatBoostRegressor(
                iterations=iterations,
                learning_rate=learning_rate,
                depth=depth,
                verbose=False,
                cat_features=cat_features,
                random_state=1,
                task_type='GPU',
            )
            cv_scores = cross_val_score(model, x_train, y_train, cv=3)
            # Stop the clock only after cross-validation has run, so the logged
            # time covers the three fits and not just building the estimator.
            end = time.time()

            # One row per combination; the logged time is the average over the 3 folds.
            with open(results_path, "a") as f:
                f.write(f"\n{iterations};{learning_rate};{depth};{(end - start)/3:.4f};train;{cv_scores.mean()}")
            i += 1
            if i % 5 == 0:
                print(f"On iteration {i}.")

# Re-read the log and take the combination with the highest mean CV score.
catboost_results = pd.read_csv(results_path, sep=";").sort_values('score', ascending=False)
# Cast the pandas/NumPy scalars to built-in types before handing them to the
# estimator, as the Distributed Random Forest cell does for its parameters.
iterations = int(catboost_results.iterations.iloc[0])
learning_rate = float(catboost_results.learning_rate.iloc[0])
depth = int(catboost_results.depth.iloc[0])

# Refit on the whole training split with the selected hyper-parameters.
start = time.time()
model = CatBoostRegressor(
    iterations=iterations,
    learning_rate=learning_rate,
    depth=depth,
    verbose=False,
    cat_features=cat_features,
    random_state=1,
    task_type='GPU',
)

model.fit(x_train, y_train)
end = time.time()

# Append the held-out score as the single 'test' row.
with open(results_path, "a") as f:
    f.write(f"\n{iterations};{learning_rate};{depth};{end - start:.4f};test;{model.score(x_test, y_test)}")

On iteration 5.
On iteration 10.
On iteration 15.
On iteration 20.
On iteration 25.


## Modelo Distributed Random Forest

In [None]:
# Distributed Random Forest (H2O) on numeric + categorical predictors:
# optional 3-fold cross-validated grid search, then one fit that is scored on
# the held-out frame.
h2o.init()

h2o_df = h2o.H2OFrame(df)
# Columns whose name starts with 'CAT' are converted to H2O factors.
for col in h2o_df.columns:
    if col.startswith('CAT'):
        h2o_df[col] = h2o_df[col].asfactor()

df_train, df_test = h2o_df.split_frame(ratios=[.8], seed=1)
predictors = [col for col in h2o_df.columns if col != 'NUM_NOTA']

results_path = '/content/drive/MyDrive/analise_enem/results/drf_numeric_categoric.csv'

# The grid search is switched off by default, so this cell reuses the scores
# already stored in the results file. Set the flag to True to rerun the search;
# doing so recreates the results file from scratch.
run_grid_search = False
ntrees_grid = [25, 50, 75]
max_depth_grid = [5, 10, 15]

i = 0
if run_grid_search:
    with open(results_path, "w+") as f:
        f.write("ntrees;max_depth;time;data;score")

    for ntrees in ntrees_grid:
        for max_depth in max_depth_grid:
            start = time.time()
            model = H2ORandomForestEstimator(
                nfolds=3,
                ntrees=ntrees,
                max_depth=max_depth,
                seed=1,
            )
            model.train(x=predictors,
                        y='NUM_NOTA',
                        training_frame=df_train)
            end = time.time()

            # Log the cross-validated R2 (xval=True) rather than the training
            # metric, so the score reflects the nfolds=3 validation requested
            # above.
            with open(results_path, "a") as f:
                f.write(f"\n{ntrees};{max_depth};{(end-start)/3:.4f};train;{model.r2(xval = True)}")
            i += 1
            if i % 5 == 0:
                print(f"On iteration {i}.")

drf_results = pd.read_csv(results_path, sep=";").sort_values('score', ascending=False)
# The file is appended to on every run, so it may also contain 'test' rows from
# earlier executions; only grid-search ('train') rows may drive the selection.
grid_rows = drf_results[drf_results['data'] == 'train']
if grid_rows.empty:
    raise ValueError(f"No grid-search ('train') rows found in {results_path}; "
                     "set run_grid_search = True to generate them.")
ntrees = int(grid_rows.ntrees.iloc[0])
max_depth = int(grid_rows.max_depth.iloc[0])

# Refit with the selected hyper-parameters, scoring on the held-out frame.
start = time.time()
model = H2ORandomForestEstimator(
    ntrees=ntrees,
    max_depth=max_depth,
    seed=1,
)
model.train(x=predictors,
            y='NUM_NOTA',
            training_frame=df_train,
            validation_frame=df_test)
end = time.time()

# Append the held-out score as a 'test' row.
with open(results_path, "a") as f:
    f.write(f"\n{ntrees};{max_depth};{end - start:.4f};test;{model.r2(valid = True)}")