In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score


# Carregar datasets

In [54]:
# Carregar CSV
caminho = "../dados/CO2_Emissions_Canada.csv"
df = pd.read_csv(caminho)

# Verificar primeiras linhas
df.head()


Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


# Pré Processamento inicial

In [55]:
# Mapear Fuel Type para detalhado
fuel_map = {"Z": "Premium Gasoline", "X": "Regular Gasoline", "D": "Diesel", 
            "E": "Ethanol", "N": "Natural Gas"}
df["Fuel Type Detailed"] = df["Fuel Type"].map(fuel_map)
df["Fuel Type Detailed"].fillna("Other", inplace=True)

# Criar colunas binárias para tipos de combustível
df["Is_Diesel"] = (df["Fuel Type Detailed"] == "Diesel").astype(int)
df["Is_Hybrid"] = (df["Fuel Type Detailed"] == "Hybrid").astype(int)
df["Is_Electric"] = (df["Fuel Type Detailed"] == "Electric").astype(int)

# Intensidade de emissão por combustível (gCO2/L)
fuel_co2_factor = {
    "Gasoline": 2392,
    "Regular Gasoline": 2392,
    "Diesel": 2640,
    "Ethanol": 1500,
    "Natural Gas": 2010,
    "Hybrid": 1800,
    "Electric": 0,
    "Premium Gasoline": 2392,
    "Other": 2200
}
df["Fuel_CO2_Factor"] = df["Fuel Type Detailed"].map(fuel_co2_factor)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Fuel Type Detailed"].fillna("Other", inplace=True)


# Agrupar categorias de veiculos

In [56]:
def agrupar_categoria(cat):
    cat = str(cat).upper()
    if "SUV" in cat:
        return "SUV"
    elif "COMPACT" in cat or "SUBCOMPACT" in cat or "MID" in cat or "FULL" in cat or "MINI" in cat:
        return "CAR"
    elif "PICKUP" in cat or "TRUCK" in cat:
        return "TRUCK"
    elif "VAN" in cat:
        return "VAN"
    elif "STATION" in cat:
        return "WAGON"
    else:
        return "OTHER"

df["Vehicle Category"] = df["Vehicle Class"].apply(agrupar_categoria)


# Criar features

In [57]:
# One-hot encoding para Vehicle Category
df_model = pd.get_dummies(df, columns=["Vehicle Category"], drop_first=True)

# Features originais + detalhadas
features = [
    "Engine Size(L)",
    "Cylinders",
    "Fuel Consumption City (L/100 km)",
    "Fuel Consumption Hwy (L/100 km)",
    "Fuel Consumption Comb (L/100 km)",
    "Is_Diesel",
    "Is_Hybrid",
    "Is_Electric",
    "Fuel_CO2_Factor"
] + [col for col in df_model.columns if col.startswith("Vehicle Category_")]

# Features derivadas
df_model["Engine_per_Cylinder"] = df_model["Engine Size(L)"] / df_model["Cylinders"]
df_model["City_Hwy_Ratio"] = df_model["Fuel Consumption City (L/100 km)"] / df_model["Fuel Consumption Hwy (L/100 km)"]
df_model["Weighted_Fuel"] = df_model["Fuel_CO2_Factor"] * df_model["Fuel Consumption Comb (L/100 km)"]

features += ["Engine_per_Cylinder", "City_Hwy_Ratio", "Weighted_Fuel"]

# Suavizar variável alvo
np.random.seed(42)
df_model["CO2_Smoothed"] = df_model["CO2 Emissions(g/km)"] + np.random.normal(0, 5, size=len(df_model))

target = "CO2_Smoothed"


# Regressão linear com cross-validation

In [58]:
X = df_model[features]
y = df_model[target]

models = [LinearRegression(), Ridge(alpha=1.0), Lasso(alpha=0.1)]

for model in models:
    scores_r2 = cross_val_score(model, X, y, cv=5, scoring="r2")
    scores_mae = -cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error")
    print(f"{model.__class__.__name__}")
    print(f"R² média: {scores_r2.mean():.4f}")
    print(f"MAE médio: {scores_mae.mean():.4f}")
    print("-" * 30)


  model = cd_fast.enet_coordinate_descent(


LinearRegression
R² média: 0.9895
MAE médio: 4.7172
------------------------------
Ridge
R² média: 0.9895
MAE médio: 4.7179
------------------------------


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso
R² média: 0.9893
MAE médio: 4.7633
------------------------------


  model = cd_fast.enet_coordinate_descent(


# Random Forest com hiperparâmetros controlados

In [59]:
# Separar X e y
X = df_model[features]
y = df_model["CO2 Emissions(g/km)"]  # usar alvo original aqui

# Random Forest
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = cross_val_score(rf_model, X, y, cv=kf, scoring='r2')
mae_scores = -cross_val_score(rf_model, X, y, cv=kf, scoring='neg_mean_absolute_error')

print("R² por fold:", r2_scores)
print("Média R²:", r2_scores.mean())
print("MAE por fold:", mae_scores)
print("Média MAE:", mae_scores.mean())

# Treinar no dataset completo
rf_model.fit(X, y)
y_pred = rf_model.predict(X)
print("-" * 30)
print("Teste MAE:", mean_absolute_error(y, y_pred))
print("Teste R²:", r2_score(y, y_pred))


R² por fold: [0.99731148 0.99775387 0.99713445 0.99684342 0.99717432]
Média R²: 0.9972435088095131
MAE por fold: [2.15578378 2.06787792 2.13068792 2.1769259  2.06335102]
Média MAE: 2.1189253088597746
------------------------------
Teste MAE: 1.7710051858148135
Teste R²: 0.998203617730196
