In [1]:
import pandas as pd
import numpy as np
import math
import statsmodels.formula.api as sm
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import seaborn as sns

In [2]:
# Display Pipeline
from sklearn import set_config
set_config(display='diagram')


In [3]:
def imprimir_metricas (y, y_pred, dataset_name='valid'):
  mean_y = np.mean(y)
  mean_pred = np.mean(y_pred)

  rmse = round(math.sqrt(mean_squared_error(y, y_pred)), 2)
  mae = mean_absolute_error(y, y_pred)
  r2 = r2_score(y, y_pred)
  mape = mean_absolute_percentage_error(y, y_pred)
  
  report = pd.DataFrame({
    'metric': ['RMSE', 'MAE', 'R^2', 'MAPE', 'Avg. target', 'Avg. Prediction'],
    dataset_name: [rmse, mae, r2, mape, mean_y, mean_pred]
  })

  report[dataset_name] = report[dataset_name].round(2)
  return report

In [32]:
df = pd.read_csv('Base_EMBRAESP_trabalhada.csv')

In [61]:
df.shape

(16935, 87)

In [67]:
target = 'PC_AU_ATU'
numericas = ['ANO_LAN', 'DORM_UNID', 'BANH_UNID', 'GAR_UNID']
categoricas = ['REGIAO']
features = numericas + categoricas
#['', 'ANO_LAN', 'DORM_UNID', 'BANH_UNID', 'GAR_UNID']

In [63]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(target, axis=1), df[target], test_size=0.2)

In [78]:
preprocessing = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(), categoricas)], remainder='passthrough')

In [79]:
model = Pipeline(steps=[
    ('preprocessamento', preprocessing),
    ('regressao', RandomForestRegressor())
])

In [80]:
model.fit(X_train[features], y_train)

In [81]:
model.score(X_test, y_test)

0.658913751857553

In [82]:
y_pred = model.predict(X_test)

In [83]:
baseline_report = imprimir_metricas(y_test, y_pred)
baseline_report

Unnamed: 0,metric,valid
0,RMSE,1403.41
1,MAE,1011.03
2,R^2,0.66
3,MAPE,0.2
4,Avg. target,5455.46
5,Avg. Prediction,5478.53


In [84]:
reg_linear1 = sm.ols(formula='PC_AU_ATU ~ REGIAO', data=df.iloc[X_train.index,:]).fit()
reg_linear1.summary()

0,1,2,3
Dep. Variable:,PC_AU_ATU,R-squared:,0.314
Model:,OLS,Adj. R-squared:,0.314
Method:,Least Squares,F-statistic:,684.6
Date:,"Wed, 01 Mar 2023",Prob (F-statistic):,0.0
Time:,20:36:58,Log-Likelihood:,-121400.0
No. Observations:,13443,AIC:,242800.0
Df Residuals:,13433,BIC:,242900.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7081.7674,80.319,88.171,0.000,6924.331,7239.204
REGIAO[T.Centro-Sul],-201.5390,90.720,-2.222,0.026,-379.363,-23.715
REGIAO[T.Leste 1],-3447.5500,107.896,-31.953,0.000,-3659.041,-3236.059
REGIAO[T.Leste 2],-4110.7311,177.604,-23.145,0.000,-4458.860,-3762.602
REGIAO[T.Nordeste],-1975.6289,102.164,-19.338,0.000,-2175.885,-1775.373
REGIAO[T.Noroeste],-3502.5926,170.070,-20.595,0.000,-3835.953,-3169.232
REGIAO[T.Oeste],23.9266,89.290,0.268,0.789,-151.094,198.947
REGIAO[T.RMSP],-3136.2708,88.517,-35.431,0.000,-3309.776,-2962.766
REGIAO[T.Sudeste],-2078.8997,93.239,-22.297,0.000,-2261.660,-1896.139

0,1,2,3
Omnibus:,624.694,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,713.646
Skew:,0.556,Prob(JB):,1.08e-155
Kurtosis:,3.194,Cond. No.,16.6
