# bibliotecas

In [5]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go

import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler


# variáveis iniciais

In [None]:
# Seed shared by train_test_split and by every estimator created below,
# so the split and the fitted models are reproducible between runs.
random_state = 42

# importação dos dados e tratamento

In [7]:
# Load the housing dataset and expose the regression target under a
# generic name ("target") so the modelling cells do not depend on the
# original column label.
data = (
    pd.read_csv('housing.csv')
    .rename(columns={"median_house_value":"target"})
)
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,target,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


## ocean_proximity to dummies

In [11]:
# One-hot encode the categorical column(s). The string is the *prefix*
# given to the generated dummy columns (get_dummies' second parameter);
# with no `columns=` argument pandas encodes every object/category column,
# which per the frame displayed above is only `ocean_proximity`.
data = pd.get_dummies(data, prefix='ocean_proximity')
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,target,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,1,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,1,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,1,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,1,0,0,0


# previsão com grid search

In [12]:
# Separate features from the target and hold out 20% of the rows for testing.
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Hyper-parameter grids, one per model family.
# Random Forest
rf_params = {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, 30]}

# XGBoost
xgb_params = {'n_estimators': [100, 200, 500], 'max_depth': [5, 10, 15]}

# LightGBM
lgb_params = {'n_estimators': [100, 200, 500], 'max_depth': [5, 10, 15]}

rf = RandomForestRegressor(random_state=random_state)
xgb_model = xgb.XGBRegressor(random_state=random_state)
lgb_model = lgb.LGBMRegressor(random_state=random_state)

models = [
    {'estimator': rf, 'params': rf_params, 'name': 'Random Forest'},
    {'estimator': xgb_model, 'params': xgb_params, 'name': 'XGBoost'},
    {'estimator': lgb_model, 'params': lgb_params, 'name': 'LightGBM'}
]

# Scalers to compare in front of each estimator.
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

# Book-keeping for the best (model, scaler) combination found so far.
best_model = None          # fitted best pipeline, filled in after the search
best_model_grid = None     # GridSearchCV object that produced the best score
best_model_name = None
best_scaler = None
best_score = float('inf')  # lowest cross-validated RMSE seen so far

for model in models:
    # Pipeline steps are addressed as "<step>__<param>", so prefix each
    # grid key with the estimator step name. The grid depends only on the
    # model, so it is built once per model rather than once per scaler.
    param_grid = {f'estimator__{k}': v for k, v in model['params'].items()}

    for scaler in scalers:
        pipeline = Pipeline([
            ('scaler', scaler),
            ('estimator', model['estimator'])
        ])

        # The built-in scorer returns the *negated* RMSE (higher is better),
        # the same quantity the previous make_scorer(..., squared=False)
        # produced, without relying on the `squared` argument of
        # mean_squared_error, which newer scikit-learn releases deprecate.
        grid_search = GridSearchCV(pipeline, param_grid, scoring='neg_root_mean_squared_error', cv=5)
        grid_search.fit(X_train, y_train)

        cv_rmse = -grid_search.best_score_  # flip the sign back to a positive RMSE

        if cv_rmse < best_score:
            best_score = cv_rmse
            best_model_grid = grid_search
            best_model_name = model['name']
            best_scaler = scaler

        print(50*'-')
        print(model['name'])
        print('Scaler:', scaler)
        print('RMSE:', cv_rmse)
        print(50*'-')

# Every candidate can end up with a NaN score (e.g. all fits failed), in
# which case nothing beats the initial `inf`; fail loudly here instead of
# with a NameError/AttributeError further down.
if best_model_grid is None:
    raise RuntimeError('Grid search did not produce any valid score.')

# GridSearchCV refits the best parameter set on the full training data by
# default, so best_estimator_ is ready to predict.
best_model = best_model_grid.best_estimator_

print(best_model_grid)
print(best_model_grid.best_params_)

# Record which combination won, plus its score on the held-out test set
# (the RMSE values printed in the loop are cross-validation scores on the
# training split only).
print('Best model:', best_model_name)
print('Best scaler:', best_scaler)
print('Best CV RMSE:', best_score)
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, best_model_grid.predict(X_test))))

--------------------------------------------------
Random Forest
Scaler: StandardScaler()
RMSE: 49004.110834580875
--------------------------------------------------
--------------------------------------------------
Random Forest
Scaler: MinMaxScaler()
RMSE: 49018.779055404266
--------------------------------------------------
--------------------------------------------------
Random Forest
Scaler: RobustScaler()
RMSE: 49007.795741102855
--------------------------------------------------
--------------------------------------------------
XGBoost
Scaler: StandardScaler()
RMSE: 47922.41918073893
--------------------------------------------------
--------------------------------------------------
XGBoost
Scaler: MinMaxScaler()
RMSE: 47931.53432816075
--------------------------------------------------
--------------------------------------------------
XGBoost
Scaler: RobustScaler()
RMSE: 47952.52388546918
--------------------------------------------------
---------------------------------

**O melhor modelo foi o LGBMRegressor, combinado com o MinMaxScaler e com os parâmetros {'estimator__max_depth': 15, 'estimator__n_estimators': 500}, obtendo o menor RMSE de validação cruzada: 46222.**

# plot

In [31]:
# Predictions from the best grid-search pipeline; each one is computed once
# and reused by the figures below.
y_train_pred = best_model_grid.predict(X_train)
y_test_pred = best_model_grid.predict(X_test)
# NOTE: this array is in train-then-test order, so it is NOT aligned
# row-by-row with `y` (train_test_split shuffles rows by default). It is
# kept for backward compatibility and is not plotted.
y_all_pred = np.concatenate([y_train_pred, y_test_pred])
# Predictions in the original row order of X, aligned with `y`.
y_full_pred = best_model_grid.predict(X)

# Figure 1: held-out test targets vs. their predictions (x axis is the
# position within the test split, not the original DataFrame index).
fig1 = go.Figure()

fig1.add_trace(go.Scatter(y=y_test, mode='lines', name='Test Data'))
fig1.add_trace(go.Scatter(y=y_test_pred, mode='lines', name='Test Prediction'))

# Only test data is drawn here, so the title no longer mentions train data.
fig1.update_layout(title='Test Data vs Predictions', xaxis_title='Index', yaxis_title='Value')

fig1.show()

# Figure 2: every row of the dataset vs. its prediction. X contains the
# training rows as well, so most of this curve is in-sample fit.
fig2 = go.Figure()

fig2.add_trace(go.Scatter(y=y, mode='lines', name='Original Data'))
fig2.add_trace(go.Scatter(y=y_full_pred, mode='lines', name='All Predictions'))

fig2.update_layout(title='Original Data vs All Predictions', xaxis_title='Index', yaxis_title='Value')

fig2.show()


Professor, plotei novamente, desta vez com um zoom manual, pois achei difícil enxergar os detalhes no gráfico acima. Fique à vontade para navegar pelo gráfico arrastando o eixo x com o mouse.

In [30]:
# Predictions from the best grid-search pipeline; each one is computed once
# and reused by the figures below.
y_train_pred = best_model_grid.predict(X_train)
y_test_pred = best_model_grid.predict(X_test)
# NOTE: this array is in train-then-test order, so it is NOT aligned
# row-by-row with `y` (train_test_split shuffles rows by default). It is
# kept for backward compatibility and is not plotted.
y_all_pred = np.concatenate([y_train_pred, y_test_pred])
# Predictions in the original row order of X, aligned with `y`.
y_full_pred = best_model_grid.predict(X)

# Manually chosen x-axis windows so individual points are distinguishable;
# the figures remain interactive and can be panned by dragging the x axis.
test_zoom_range = [300, 600]
full_zoom_range = [16200, 16500]

# Figure 1: held-out test targets vs. their predictions (x axis is the
# position within the test split, not the original DataFrame index).
fig1 = go.Figure()

fig1.add_trace(go.Scatter(y=y_test, mode='lines', name='Test Data'))
fig1.add_trace(go.Scatter(y=y_test_pred, mode='lines', name='Test Prediction'))

# Only test data is drawn here, so the title no longer mentions train data.
fig1.update_layout(title='Test Data vs Predictions', xaxis_title='Index', yaxis_title='Value',
                   xaxis_range=test_zoom_range)  # manual zoom

fig1.show()

# Figure 2: every row of the dataset vs. its prediction. X contains the
# training rows as well, so most of this curve is in-sample fit.
fig2 = go.Figure()

fig2.add_trace(go.Scatter(y=y, mode='lines', name='Original Data'))
fig2.add_trace(go.Scatter(y=y_full_pred, mode='lines', name='All Predictions'))

fig2.update_layout(title='Original Data vs All Predictions', xaxis_title='Index', yaxis_title='Value',
                   xaxis_range=full_zoom_range)  # manual zoom

fig2.show()
