In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb 
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV


In [2]:
caminho = "../dados/CO2_Emissions_Canada.csv"
df_raw = pd.read_csv(caminho)
df_raw.shape

(7385, 12)

In [3]:
# Remover duplicados
df_trat = df_raw.drop_duplicates()
print(f"Dataset após remover duplicatas: {df_trat.shape}")

# Remover coluna redundante
base = df_trat.drop(columns=['Fuel Consumption Comb (mpg)'])

Dataset após remover duplicatas: (6282, 12)


In [17]:
df_trat


Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...
7380,VOLVO,XC40 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,10.7,7.7,9.4,30,219
7381,VOLVO,XC60 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,11.2,8.3,9.9,29,232
7382,VOLVO,XC60 T6 AWD,SUV - SMALL,2.0,4,AS8,Z,11.7,8.6,10.3,27,240
7383,VOLVO,XC90 T5 AWD,SUV - STANDARD,2.0,4,AS8,Z,11.2,8.3,9.9,29,232


In [4]:
# Separar target e features
target = 'CO2 Emissions(g/km)'

In [5]:
# Features numéricas
num_features = [
    'Engine Size(L)',
    'Cylinders',
    'Fuel Consumption City (L/100 km)',
    'Fuel Consumption Hwy (L/100 km)',
    'Fuel Consumption Comb (L/100 km)'
]

# Features categóricas
cat_features = [
    'Make',
    'Model',
    'Vehicle Class',
    'Transmission',
    'Fuel Type'
]

features = num_features + cat_features

In [6]:
X = base[features]
y = base[target]

In [7]:
print("Features (X):")
print(X.head())

Features (X):
   Engine Size(L)  Cylinders  Fuel Consumption City (L/100 km)  \
0             2.0          4                               9.9   
1             2.4          4                              11.2   
2             1.5          4                               6.0   
3             3.5          6                              12.7   
4             3.5          6                              12.1   

   Fuel Consumption Hwy (L/100 km)  Fuel Consumption Comb (L/100 km)   Make  \
0                              6.7                               8.5  ACURA   
1                              7.7                               9.6  ACURA   
2                              5.8                               5.9  ACURA   
3                              9.1                              11.1  ACURA   
4                              8.7                              10.6  ACURA   

        Model Vehicle Class Transmission Fuel Type  
0         ILX       COMPACT          AS5         Z  
1       

In [8]:
print("\nTarget (y):")
print(y.head())


Target (y):
0    196
1    221
2    136
3    255
4    244
Name: CO2 Emissions(g/km), dtype: int64


### Transformar categóricas em numéricas (One-Hot Encoding ou Label Encoding)  
    Usamos o one hot encoder pois não há uma sequência lógica nos dados categóricos.


In [9]:
X = pd.get_dummies(X, columns=cat_features)

In [10]:
X

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Make_ACURA,Make_ALFA ROMEO,Make_ASTON MARTIN,Make_AUDI,Make_BENTLEY,...,Transmission_AV7,Transmission_AV8,Transmission_M5,Transmission_M6,Transmission_M7,Fuel Type_D,Fuel Type_E,Fuel Type_N,Fuel Type_X,Fuel Type_Z
0,2.0,4,9.9,6.7,8.5,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,2.4,4,11.2,7.7,9.6,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,1.5,4,6.0,5.8,5.9,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,3.5,6,12.7,9.1,11.1,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,3.5,6,12.1,8.7,10.6,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7380,2.0,4,10.7,7.7,9.4,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7381,2.0,4,11.2,8.3,9.9,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7382,2.0,4,11.7,8.6,10.3,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7383,2.0,4,11.2,8.3,9.9,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


### Padronizar ou normalizar features numéricas
    A normalização neste caso não é bem vista, visto que é sensível a outliers então um carro com um consumo muito alto de combustivel estaria em uma mesma escala de carros com combustiveis normais, já a Padronização é ótimo para esses tipos de caso.


In [11]:
scaler = StandardScaler()

In [12]:
X[num_features] = scaler.fit_transform(X[num_features])

In [13]:
X

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Make_ACURA,Make_ALFA ROMEO,Make_ASTON MARTIN,Make_AUDI,Make_BENTLEY,...,Transmission_AV7,Transmission_AV8,Transmission_M5,Transmission_M6,Transmission_M7,Fuel Type_D,Fuel Type_E,Fuel Type_N,Fuel Type_X,Fuel Type_Z
0,-0.851086,-0.876934,-0.762844,-1.040321,-0.854490,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,-0.558066,-0.876934,-0.396934,-0.601475,-0.481184,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,-1.217362,-0.876934,-1.860575,-1.435283,-1.736851,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,0.247740,0.206429,0.025270,0.012910,0.027870,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,0.247740,0.206429,-0.143611,-0.162629,-0.141815,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7380,-0.851086,-0.876934,-0.537668,-0.601475,-0.549058,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7381,-0.851086,-0.876934,-0.396934,-0.338167,-0.379373,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7382,-0.851086,-0.876934,-0.256199,-0.206513,-0.243625,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
7383,-0.851086,-0.876934,-0.396934,-0.338167,-0.379373,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [16]:
y

0       196
1       221
2       136
3       255
4       244
       ... 
7380    219
7381    232
7382    240
7383    232
7384    248
Name: CO2 Emissions(g/km), Length: 6282, dtype: int64

### Dividir dataset em treino/teste


In [15]:
X_train, X_test, y_train , y_test = train_test_split(X,y, test_size= 0.2 , random_state=42)

### LinearRegression 


In [20]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

mae_linear = mean_absolute_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

print(f"MAE (Erro Absoluto Médio): {mae_linear:.2f}")
print(f"R² (Coeficiente de Determinação): {r2_linear:.2f}")
print("-" * 30)

MAE (Erro Absoluto Médio): 3.68
R² (Coeficiente de Determinação): 0.99
------------------------------


### RandomForestRegressor 

In [28]:
rf_model = RandomForestRegressor(max_depth=None, min_samples_leaf=1,n_estimators= 200) 

rf_model.fit(X_train, y_train)


y_pred_rf = rf_model.predict(X_test)


mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"MAE (Erro Absoluto Médio): {mae_rf:.2f}")
print(f"R² (Coeficiente de Determinação): {r2_rf:.2f}")
print("-" * 30)

MAE (Erro Absoluto Médio): 2.17
R² (Coeficiente de Determinação): 1.00
------------------------------


In [30]:
print("--- Tuning do Random Forest ---")


param_grid_rf = {
    'n_estimators': [100, 200],         
    'max_depth': [10, 20, None],      
    'min_samples_leaf': [1, 2, 4]       
}


grid_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=param_grid_rf,
    scoring='neg_mean_absolute_error',
    cv=5, 
    verbose=1,
    n_jobs=-1
)


grid_rf.fit(X_train, y_train)


best_rf = grid_rf.best_estimator_
print(f"Melhores parâmetros para RF: {grid_rf.best_params_}")


y_pred_rf_tuned_tuned = best_rf.predict(X_test)


mae_rf = mean_absolute_error(y_test, y_pred_rf_tuned)
r2_rf = r2_score(y_test, y_pred_rf_tuned)

print(f"MAE (Erro Absoluto Médio): {mae_rf:.2f}")
print(f"R² (Coeficiente de Determinação): {r2_rf:.2f}")
print("-" * 30)

--- Tuning do Random Forest ---
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Melhores parâmetros para RF: {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 200}
MAE (Erro Absoluto Médio): 2.17
R² (Coeficiente de Determinação): 1.00
------------------------------


### XGBoost

In [22]:

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)


xgb_model.fit(X_train, y_train)


y_pred_xgb = xgb_model.predict(X_test)


mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"MAE (Erro Absoluto Médio): {mae_xgb:.2f}")
print(f"R² (Coeficiente de Determinação): {r2_xgb:.2f}")
print("-" * 30)

MAE (Erro Absoluto Médio): 2.33
R² (Coeficiente de Determinação): 0.99
------------------------------


In [29]:
print("--- Tuning do XGBoost ---")

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'colsample_bytree': [0.7, 1.0] 
}


grid_xgb = GridSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    param_grid=param_grid_xgb,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)


grid_xgb.fit(X_train, y_train)


best_xgb = grid_xgb.best_estimator_
print(f"Melhores parâmetros para XGBoost: {grid_xgb.best_params_}")


y_pred_xgb_tuned_tuned = best_xgb.predict(X_test)

mae_xgb = mean_absolute_error(y_test, y_pred_xgb_tuned)
r2_xgb = r2_score(y_test, y_pred_xgb_tuned)

print(f"MAE (Erro Absoluto Médio): {mae_xgb:.2f}")
print(f"R² (Coeficiente de Determinação): {r2_xgb:.2f}")
print("-" * 30)


--- Tuning do XGBoost ---
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Melhores parâmetros para XGBoost: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
MAE (Erro Absoluto Médio): 2.23
R² (Coeficiente de Determinação): 0.99
------------------------------


### Comparar Modelos

In [31]:
resultados = pd.DataFrame({
    'Modelo': ['Regressão Linear', 'Random Forest', 'XGBoost'],
    'MAE': [mae_linear, mae_rf, mae_xgb],
    'R²': [r2_linear, r2_rf, r2_xgb]
})

resultados = resultados.sort_values(by='MAE', ascending=True)

print("--- Tabela de Comparação de Modelos ---")
print(resultados.round(2))

--- Tabela de Comparação de Modelos ---
             Modelo   MAE    R²
1     Random Forest  2.17  1.00
2           XGBoost  2.23  0.99
0  Regressão Linear  3.68  0.99
