In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
import re
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, mutual_info_regression
from sklearn.feature_selection import RFECV, RFE
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import shap
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import optuna
from metrics import *
from xgboost import XGBClassifier, XGBRegressor

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [2]:
df = pd.read_csv('../tabelas/dados_final.csv', encoding='ISO-8859-1', sep=";")

In [3]:
#pd.set_option('display.max_columns', None)
df

Unnamed: 0,ALUNOS_ID_ESCOLA,ALUNOS_PORCENTAGEM_Q002_Branca,ALUNOS_PORCENTAGEM_Q002_Preta,ALUNOS_PORCENTAGEM_Q002_Parda,ALUNOS_PORCENTAGEM_Q002_Amarela,ALUNOS_PORCENTAGEM_Q002_Indigena,ALUNOS_PORCENTAGEM_Q002_Nao_Declarado,ALUNOS_PORCENTAGEM_Q003A_Nao_moro_com_mae_madrasta,ALUNOS_PORCENTAGEM_Q003A_Sim_moro_com_mae_madrasta,ALUNOS_PORCENTAGEM_Q003B_Nao_moro_com_pai_padrasto,...,MEDIA_EM_LP,MEDIA_EM_MT,Escola_Federal,Escola_Estadual,Escola_Municipal,Escola_Privada,Capital_Area,Interior_Area,Regiao_Urbana,Regiao_Rural
0,11024968,0.303030,0.075758,0.575758,0.000000,0.015152,0.000000,0.121212,0.833333,0.287879,...,306.34,313.39,False,True,False,False,False,True,True,False
1,11025638,0.225352,0.049296,0.654930,0.007042,0.014085,0.014085,0.147887,0.732394,0.218310,...,266.00,272.43,False,True,False,False,False,True,True,False
2,11007168,0.178571,0.285714,0.464286,0.000000,0.035714,0.000000,0.178571,0.821429,0.500000,...,275.19,272.74,False,True,False,False,False,True,True,False
3,11007885,0.353333,0.066667,0.500000,0.046667,0.013333,0.000000,0.146667,0.786667,0.266667,...,302.97,301.57,False,True,False,False,False,True,True,False
4,11007893,0.258929,0.098214,0.531250,0.044643,0.000000,0.013393,0.120536,0.812500,0.263393,...,279.15,278.61,False,True,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10669,53014308,0.149254,0.164179,0.522388,0.074627,0.014925,0.000000,0.089552,0.791045,0.253731,...,298.54,298.90,False,True,False,False,True,False,True,False
10670,53014316,0.120000,0.200000,0.480000,0.060000,0.000000,0.100000,0.140000,0.760000,0.420000,...,276.98,262.95,False,True,False,False,True,False,True,False
10671,53051009,0.247423,0.113402,0.515464,0.030928,0.020619,0.030928,0.144330,0.783505,0.360825,...,300.33,294.99,False,True,False,False,True,False,True,False
10672,53068068,0.180328,0.169399,0.612022,0.005464,0.000000,0.027322,0.087432,0.885246,0.234973,...,284.52,274.22,False,True,False,False,True,False,True,False


In [4]:
df["MEDIA_FINAL"] = df[["MEDIA_EM_LP", "MEDIA_EM_MT"]].mean(axis=1)

In [5]:
def selecionar_melhor_variavel(prefixo, df):
    melhores_variaveis = {}
    
    padrao_pergunta = re.compile(f"{prefixo}_PORCENTAGEM_Q\\d{{3}}[A-Z]?_")
    perguntas_unicas = set()
    
    # Identificar prefixos únicos
    for col in df.columns:
        match = padrao_pergunta.match(col)
        if match:
            perguntas_unicas.add(match.group())
    
    for pergunta_prefixo in perguntas_unicas:
        colunas_pergunta = [col for col in df.columns if col.startswith(pergunta_prefixo)]
        
        if not colunas_pergunta:
            continue  # Pula perguntas que não existem no dataset
        
        X = df[colunas_pergunta].fillna(0)  # Preencher valores NaN com 0
        y = df["MEDIA_FINAL"]
        
        # Calcular Mutual Information
        mi_scores = mutual_info_regression(X, y)
        
        # Selecionar a melhor variável
        melhor_coluna = colunas_pergunta[np.argmax(mi_scores)]
        melhores_variaveis[pergunta_prefixo] = melhor_coluna
    
    return melhores_variaveis

In [6]:
# Aplicar para cada grupo
grupos = ["ALUNOS", "PROFESSOR", "DIRETOR"]
melhores_variaveis_por_grupo = {}

for grupo in grupos:
    melhores_variaveis_por_grupo[grupo] = selecionar_melhor_variavel(grupo, df)

In [7]:
melhores_df = pd.DataFrame.from_dict(melhores_variaveis_por_grupo, orient='index').transpose()

In [8]:
todas_melhores_colunas = [col for grupo in melhores_variaveis_por_grupo.values() for col in grupo.values()]
df_selecionado = df[todas_melhores_colunas + ["MEDIA_FINAL"]]

In [9]:
df_selecionado

Unnamed: 0,ALUNOS_PORCENTAGEM_Q010C_Sim_Quarto_So_Meu,ALUNOS_PORCENTAGEM_Q008B_Sim_Agua_Tratada,ALUNOS_PORCENTAGEM_Q018B_Le_Livros_Quase_Nunca,ALUNOS_PORCENTAGEM_Q010E_Sim_Garagem,ALUNOS_PORCENTAGEM_Q014_Abandonou_Escola_Publica_Particular,ALUNOS_PORCENTAGEM_Q003C_Nao_moro_com_irmaos,ALUNOS_PORCENTAGEM_Q009B_1_Tablet_Em_Casa,ALUNOS_PORCENTAGEM_Q010B_Sim_Wifi,ALUNOS_PORCENTAGEM_Q009A_Nenhum_Geladeira_Em_Casa,ALUNOS_PORCENTAGEM_Q017C_Trabalhos_Domesticos_1_a_2_Horas,...,DIRETOR_PORCENTAGEM_Q038_Ensino_Medio_Nao,DIRETOR_PORCENTAGEM_Q241_Treinamento_Para_Lidar_Educacao_Especial_Sim,DIRETOR_PORCENTAGEM_Q236_Necessidade_Professor_Libras_Sim,DIRETOR_PORCENTAGEM_Q118_Conselho_Escolar_Existe_Ativo,DIRETOR_PORCENTAGEM_Q093_Area_Externa_Equipamento_Tanque_Areia_Sim,DIRETOR_PORCENTAGEM_Q101_Area_Externa_Equipamento_Banco_Nao,DIRETOR_PORCENTAGEM_Q248_Treinamento_Sobre_Baixa_Visao_Nao,DIRETOR_PORCENTAGEM_Q082_Area_Externa_Existe_Area_Coberta_Sim,DIRETOR_PORCENTAGEM_Q202_Para_Reduzir_Repetencia_ReforÃÂÃÂ§o_Escolar_Acao_Nao_Realizada,MEDIA_FINAL
0,0.575758,0.727273,0.333333,0.469697,0.045455,0.272727,0.196970,0.893939,0.000000,0.303030,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,309.865
1,0.669014,0.394366,0.323944,0.478873,0.021127,0.267606,0.161972,0.830986,0.014085,0.316901,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,269.215
2,0.464286,0.892857,0.357143,0.357143,0.035714,0.357143,0.071429,0.607143,0.000000,0.392857,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,273.965
3,0.673333,0.913333,0.340000,0.506667,0.233333,0.313333,0.100000,0.820000,0.000000,0.400000,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,302.270
4,0.633929,0.656250,0.348214,0.522321,0.133929,0.316964,0.147321,0.691964,0.004464,0.352679,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,278.880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10669,0.552239,0.955224,0.328358,0.746269,0.164179,0.134328,0.164179,0.820896,0.000000,0.402985,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,298.720
10670,0.600000,0.860000,0.240000,0.640000,0.100000,0.180000,0.120000,0.860000,0.040000,0.540000,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,269.965
10671,0.412371,0.938144,0.319588,0.721649,0.154639,0.247423,0.206186,0.896907,0.000000,0.371134,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,297.660
10672,0.595628,0.950820,0.245902,0.726776,0.038251,0.125683,0.120219,0.852459,0.000000,0.448087,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,279.370


In [10]:
# Lista das colunas adicionais que devem ser incluídas
colunas_adicionais = [
    "ALUNOS_ID_ESCOLA", "ALUNOS_PORCENTAGEM_Regiao_Norte", "ALUNOS_PORCENTAGEM_Regiao_Nordeste",
    "ALUNOS_PORCENTAGEM_Regiao_Sudeste", "ALUNOS_PORCENTAGEM_Regiao_Sul", "ALUNOS_PORCENTAGEM_Regiao_Centro_Oeste",
    "ALUNOS_PORCENTAGEM_Capital_Area", "ALUNOS_PORCENTAGEM_Interior_Area", "ALUNOS_PORCENTAGEM_Escola_Federal",
    "ALUNOS_PORCENTAGEM_Escola_Estadual", "ALUNOS_PORCENTAGEM_Escola_Municipal", "ALUNOS_PORCENTAGEM_Escola_Privada",
    "ALUNOS_PORCENTAGEM_Regiao_Urbana", "ALUNOS_PORCENTAGEM_Regiao_Rural", "ALUNOS_PORCENTAGEM_Ensino_Medio_Tradicional",
    "ALUNOS_PORCENTAGEM_Ensino_Medio_Integral", "PROFESSOR_PORCENTAGEM_Regiao_Norte", "PROFESSOR_PORCENTAGEM_Regiao_Nordeste",
    "PROFESSOR_PORCENTAGEM_Regiao_Sudeste", "PROFESSOR_PORCENTAGEM_Regiao_Sul", "PROFESSOR_PORCENTAGEM_Regiao_Centro_Oeste",
    "PROFESSOR_PORCENTAGEM_Capital_Area", "PROFESSOR_PORCENTAGEM_Interior_Area", "PROFESSOR_PORCENTAGEM_Escola_Federal",
    "PROFESSOR_PORCENTAGEM_Escola_Estadual", "PROFESSOR_PORCENTAGEM_Escola_Municipal", "PROFESSOR_PORCENTAGEM_Escola_Privada",
    "PROFESSOR_PORCENTAGEM_Regiao_Urbana", "PROFESSOR_PORCENTAGEM_Regiao_Rural", "DIRETOR_PORCENTAGEM_Regiao_Norte",
    "DIRETOR_PORCENTAGEM_Regiao_Nordeste", "DIRETOR_PORCENTAGEM_Regiao_Sudeste", "DIRETOR_PORCENTAGEM_Regiao_Sul",
    "DIRETOR_PORCENTAGEM_Regiao_Centro_Oeste", "DIRETOR_PORCENTAGEM_Capital_Area", "DIRETOR_PORCENTAGEM_Interior_Area",
    "DIRETOR_PORCENTAGEM_Escola_Federal", "DIRETOR_PORCENTAGEM_Escola_Estadual", "DIRETOR_PORCENTAGEM_Escola_Municipal",
    "DIRETOR_PORCENTAGEM_Escola_Privada", "DIRETOR_PORCENTAGEM_Regiao_Urbana", "DIRETOR_PORCENTAGEM_Regiao_Rural",
    "Escola_Federal", "Escola_Estadual", "Escola_Municipal", "Escola_Privada", "Capital_Area", "Interior_Area",
    "Regiao_Urbana", "Regiao_Rural"
]

# Garantir que só adicionamos colunas que realmente existem no df
colunas_adicionais = [col for col in colunas_adicionais if col in df.columns]

# Concatenar as colunas selecionadas com as colunas adicionais
df_final = df_selecionado.join(df[colunas_adicionais])

In [11]:
df_final

Unnamed: 0,ALUNOS_PORCENTAGEM_Q010C_Sim_Quarto_So_Meu,ALUNOS_PORCENTAGEM_Q008B_Sim_Agua_Tratada,ALUNOS_PORCENTAGEM_Q018B_Le_Livros_Quase_Nunca,ALUNOS_PORCENTAGEM_Q010E_Sim_Garagem,ALUNOS_PORCENTAGEM_Q014_Abandonou_Escola_Publica_Particular,ALUNOS_PORCENTAGEM_Q003C_Nao_moro_com_irmaos,ALUNOS_PORCENTAGEM_Q009B_1_Tablet_Em_Casa,ALUNOS_PORCENTAGEM_Q010B_Sim_Wifi,ALUNOS_PORCENTAGEM_Q009A_Nenhum_Geladeira_Em_Casa,ALUNOS_PORCENTAGEM_Q017C_Trabalhos_Domesticos_1_a_2_Horas,...,DIRETOR_PORCENTAGEM_Regiao_Urbana,DIRETOR_PORCENTAGEM_Regiao_Rural,Escola_Federal,Escola_Estadual,Escola_Municipal,Escola_Privada,Capital_Area,Interior_Area,Regiao_Urbana,Regiao_Rural
0,0.575758,0.727273,0.333333,0.469697,0.045455,0.272727,0.196970,0.893939,0.000000,0.303030,...,1.0,0.0,False,True,False,False,False,True,True,False
1,0.669014,0.394366,0.323944,0.478873,0.021127,0.267606,0.161972,0.830986,0.014085,0.316901,...,1.0,0.0,False,True,False,False,False,True,True,False
2,0.464286,0.892857,0.357143,0.357143,0.035714,0.357143,0.071429,0.607143,0.000000,0.392857,...,1.0,0.0,False,True,False,False,False,True,True,False
3,0.673333,0.913333,0.340000,0.506667,0.233333,0.313333,0.100000,0.820000,0.000000,0.400000,...,1.0,0.0,False,True,False,False,False,True,True,False
4,0.633929,0.656250,0.348214,0.522321,0.133929,0.316964,0.147321,0.691964,0.004464,0.352679,...,1.0,0.0,False,True,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10669,0.552239,0.955224,0.328358,0.746269,0.164179,0.134328,0.164179,0.820896,0.000000,0.402985,...,1.0,0.0,False,True,False,False,True,False,True,False
10670,0.600000,0.860000,0.240000,0.640000,0.100000,0.180000,0.120000,0.860000,0.040000,0.540000,...,1.0,0.0,False,True,False,False,True,False,True,False
10671,0.412371,0.938144,0.319588,0.721649,0.154639,0.247423,0.206186,0.896907,0.000000,0.371134,...,1.0,0.0,False,True,False,False,True,False,True,False
10672,0.595628,0.950820,0.245902,0.726776,0.038251,0.125683,0.120219,0.852459,0.000000,0.448087,...,1.0,0.0,False,True,False,False,True,False,True,False


In [12]:
X = df_final.drop(columns=['ALUNOS_ID_ESCOLA', 'MEDIA_FINAL'])  # X agora contém apenas as features
y = df_final['MEDIA_FINAL']  # y contém a variável alvo

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
parameters = {
    'lambda': 0.007342571956423321, 
    'alpha': 0.0049894947342883815, 
    'colsample_bytree': 0.5, 
    'subsample': 0.7, 
    'learning_rate': 0.03263662172830031, 
    'n_estimators': 801, 
    'max_depth': 4, 
    'min_child_weight': 9, 
    'gamma': 0.0014832545774497129
}


model = XGBRegressor()#LGBMRegressor(**parameters)
# fit model
model.fit(X_train, y_train)

In [15]:
estimator = DecisionTreeRegressor(random_state=42)
params = {'lambda': 5.727904470799623, 'alpha': 3.7958531426706403, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.02447244097399012, 'n_estimators': 344, 'max_depth': 9, 'min_child_weight': 4, 'gamma': 0.013296521457299506}


pipe = Pipeline([('select_k', SelectKBest(mutual_info_regression, k=200)),
                 ('feature_selector', RFE(estimator, step=0.05, n_features_to_select=50, verbose=3)),
                 ('xgb', XGBRegressor(**params))])
pipe.fit(X_train, y_train)

Fitting estimator with 200 features.
Fitting estimator with 190 features.
Fitting estimator with 180 features.
Fitting estimator with 170 features.
Fitting estimator with 160 features.
Fitting estimator with 150 features.
Fitting estimator with 140 features.
Fitting estimator with 130 features.
Fitting estimator with 120 features.
Fitting estimator with 110 features.
Fitting estimator with 100 features.
Fitting estimator with 90 features.
Fitting estimator with 80 features.
Fitting estimator with 70 features.
Fitting estimator with 60 features.


In [16]:
# Filtragem das colunas por prefixo
alunos_cols = [col for col in X_train.columns if col.startswith('ALUNOS_PORCENTAGEM_')]
professor_cols = [col for col in X_train.columns if col.startswith('PROFESSOR_PORCENTAGEM_')]
diretor_cols = [col for col in X_train.columns if col.startswith('DIRETOR_PORCENTAGEM_')]
outros_cols = [col for col in X_train.columns if not col.startswith(('ALUNOS_PORCENTAGEM_', 'PROFESSOR_PORCENTAGEM_', 'DIRETOR_PORCENTAGEM_'))]

# Pipeline de seleção de features e modelo
estimator = DecisionTreeRegressor(random_state=42)
params = {'lambda': 5.7279, 'alpha': 3.7958, 'colsample_bytree': 0.7, 'subsample': 1.0, 
          'learning_rate': 0.02447, 'n_estimators': 344, 'max_depth': 9, 'min_child_weight': 4, 'gamma': 0.01329}

pipe = Pipeline([
    ('select_k_alunos', SelectKBest(mutual_info_regression, k=min(30, len(alunos_cols)))),
    ('select_k_professor', SelectKBest(mutual_info_regression, k=min(10, len(professor_cols)))),
    ('select_k_diretor', SelectKBest(mutual_info_regression, k=min(10, len(diretor_cols)))),
    ('xgb', XGBRegressor(**params))
])

pipe.fit(X_train, y_train)

# Obtendo as colunas selecionadas
selected_features = []
for step_name, cols in zip(['select_k_alunos', 'select_k_professor', 'select_k_diretor'], [alunos_cols, professor_cols, diretor_cols]):
    step = pipe.named_steps[step_name]
    selected_mask = step.get_support()
    selected_features.extend([col for col, selected in zip(cols, selected_mask) if selected])

In [17]:
selected_features

['ALUNOS_PORCENTAGEM_Q010C_Sim_Quarto_So_Meu',
 'ALUNOS_PORCENTAGEM_Q008B_Sim_Agua_Tratada',
 'ALUNOS_PORCENTAGEM_Q010E_Sim_Garagem',
 'ALUNOS_PORCENTAGEM_Q010B_Sim_Wifi',
 'ALUNOS_PORCENTAGEM_Q006A_Conversa_Sobre_Escola_Quase_Sempre',
 'ALUNOS_PORCENTAGEM_Q009F_2_Banheiro_Em_Casa',
 'ALUNOS_PORCENTAGEM_Q008C_Sim_Iluminacao_Na_Rua',
 'ALUNOS_PORCENTAGEM_Q009E_2_Televisao_Em_Casa',
 'ALUNOS_PORCENTAGEM_Q018C_Le_Quadrinhos_Quase_Nunca',
 'ALUNOS_PORCENTAGEM_Q012_Meio_Transporte_Carro',
 'ALUNOS_PORCENTAGEM_Q019_Depois_EM_Somente_Trabalhar',
 'ALUNOS_PORCENTAGEM_Q016_Abandonou_Escola_Nunca',
 'ALUNOS_PORCENTAGEM_Q010F_Sim_Microondas',
 'ALUNOS_PORCENTAGEM_Q004_Mae_Nao_Sabe_Escolaridade',
 'ALUNOS_PORCENTAGEM_Q005_Pai_Nao_Sabe_Escolaridade',
 'ALUNOS_PORCENTAGEM_Q002_Branca',
 'ALUNOS_PORCENTAGEM_Q009G_Nenhum_Carro_Em_Casa',
 'ALUNOS_PORCENTAGEM_Q020_Nao_Concluiu_EM_Eja',
 'ALUNOS_PORCENTAGEM_Q017E_Trabalhar_Mais_2_Hora',
 'ALUNOS_PORCENTAGEM_Q010H_Sim_Lavadoura_Roupas',
 'ALUNOS_PORCENTAG

In [18]:
# Criando um subconjunto de X e y com as colunas selecionadas
X_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Treinando um novo modelo apenas com as colunas selecionadas
model = XGBRegressor(**params)
model.fit(X_selected, y_train)

# Fazendo previsões
y_pred = model.predict(X_test_selected)

# Avaliando o modelo
error_metrics(y_test, y_pred)


RMSE:  136.02979598250562
Max error:  57.520502929687495
MAPE:  0.032939015987930205
R2:  0.7052818839930994
MAE:  8.912808305584017


In [None]:
resultados com 30 colunas antigo:
RMSE:  165.0146211243572
Max error:  60.860867309570324
MAPE:  0.03619159377763197
R2:  0.695623850037049
MAE:  9.791737287534762

resultados com 30 colunas nesse notebook:
RMSE:  132.2445886124595
Max error:  56.11862915039063
MAPE:  0.032711287728734384
R2:  0.7134828018636143
MAE:  8.847187283875511

resultados com 50 colunas nesse notebook:
RMSE:  128.12717280248
Max error:  53.31000915527346
MAPE:  0.03204183413783028
R2:  0.7224034726737818
MAE:  8.664571427999672

In [19]:
# Acessa o seletor SelectKBest dentro do pipeline
select_k_step = pipe.named_steps['select_k']
selected_k_features_mask = select_k_step.get_support()  # Máscara booleana das 200 melhores features

# Obtém os nomes das features selecionadas pelo SelectKBest
selected_k_features = X_train.columns[selected_k_features_mask]

# Acessa o seletor RFE dentro do pipeline
rfe_step = pipe.named_steps['feature_selector']
selected_rfe_features_mask = rfe_step.support_  # Máscara booleana das 29 melhores features

# Aplica a máscara do RFE às features selecionadas pelo SelectKBest
final_selected_features = selected_k_features[selected_rfe_features_mask]

# Exibe as features escolhidas
print("Features escolhidas:", final_selected_features.tolist())

KeyError: 'select_k'

In [None]:
#pd.set_option('display.max_columns', None)
dados_selecionados = df_final[final_selected_features]

# Seleciona as colunas adicionais manualmente
colunas_adicionais = df_final[['ALUNOS_ID_ESCOLA', 'MEDIA_FINAL']]

# Junta as colunas selecionadas com as adicionais
dados = pd.concat([dados_selecionados, colunas_adicionais], axis=1)

In [None]:
dados.columns

In [None]:
error_metrics(y_test, pipe.predict(X_test))

resultados com 30 colunas antigo:
RMSE:  165.0146211243572
Max error:  60.860867309570324
MAPE:  0.03619159377763197
R2:  0.695623850037049
MAE:  9.791737287534762

resultados com 30 colunas nesse notebook:
RMSE:  132.2445886124595
Max error:  56.11862915039063
MAPE:  0.032711287728734384
R2:  0.7134828018636143
MAE:  8.847187283875511

resultados com 50 colunas nesse notebook:
RMSE:  128.12717280248
Max error:  53.31000915527346
MAPE:  0.03204183413783028
R2:  0.7224034726737818
MAE:  8.664571427999672

In [None]:
dados

In [None]:
#dados.to_csv('novas_30_colunas.csv', index=False)