### Importar bibliotecas

In [0]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


### Carregar os dados

In [0]:
# Relative location of the raw car-features CSV that feeds the whole notebook.
caminho = "../datasets/input/car_features.csv"

# Parse the file into a DataFrame using pandas' default CSV settings.
df = pd.read_csv(filepath_or_buffer=caminho)

# Show the first five rows as the cell output for a quick sanity check.
df.head(n=5)

### Tratamentos de nulos e duplicados

In [0]:
# Fill missing values column by column. The fill values are passed as a single
# mapping to DataFrame.fillna and the result is assigned back, instead of
# calling `df[col].fillna(..., inplace=True)`: that chained in-place form acts
# on an intermediate Series, emits a FutureWarning on recent pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is active (default from pandas 3.0).
# NOTE(review): median/mode are computed on the full dataset, i.e. before the
# train/test split further down; move imputation into the pipeline if that
# leakage matters.
fill_values = {
    "Engine Fuel Type": "Unknown",                      # placeholder label
    "Engine HP": df["Engine HP"].median(),              # column median
    "Engine Cylinders": df["Engine Cylinders"].median(),  # column median
    "Number of Doors": df["Number of Doors"].mode()[0],   # most frequent value
    "Market Category": "Unknown",                       # placeholder label
}
df = df.fillna(value=fill_values)

# Drop rows that are exact duplicates across all columns (first occurrence kept).
df = df.drop_duplicates()

### Features e targets

In [0]:
# Columns the model has to predict (multi-output: price plus the two mpg figures).
target_columns = ["MSRP", "city mpg", "highway MPG"]

# Targets: one column per predicted quantity.
y = df[target_columns]

# Features: every remaining column of the cleaned frame.
X = df.drop(columns=target_columns)

print("X shape:", X.shape)
print("y shape:", y.shape)


### Separar numéricas e categóricas

In [0]:
# Columns treated as numeric; every other feature column is treated as categorical.
num_features = ["Engine HP", "Engine Cylinders", "Popularity", "Year", "Number of Doors"]

# Keep the original column order of X while filtering out the numeric names.
numeric_lookup = set(num_features)
cat_features = [name for name in X.columns if name not in numeric_lookup]

print("Numéricas:", num_features)
print("Categóricas:", cat_features)


### Criar pipeline de pré-processamento

In [0]:
# Numeric columns: standardisation via StandardScaler.
num_transformer = StandardScaler()

# Categorical columns: dense one-hot encoding; categories not seen during
# fitting are ignored at transform time instead of raising.
cat_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Route each group of columns to its own transformer.
column_steps = [
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

### Separar treino e teste

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Treino:", X_train.shape, y_train.shape)
print("Teste:", X_test.shape, y_test.shape)

### Criar modelo
Esse modelo é um `RandomForestRegressor` encapsulado em um `MultiOutputRegressor`: um regressor independente é treinado para cada alvo, de modo que o pipeline prevê várias variáveis contínuas ao mesmo tempo com base nas características do carro.
No caso estamos prevendo:

- Preço do carro (MSRP) – preço sugerido pelo fabricante, em dólares americanos (USD) -> Prediz quanto custaria um carro com determinadas características (marca, motor, cilindros, transmissão, estilo, etc.).
- Consumo em cidade (city mpg) – milhas por galão (mpg); quanto maior o valor, mais econômico o carro -> Prediz o rendimento médio do carro na cidade, útil para estimar eficiência energética.
- Consumo na estrada (highway MPG) – milhas por galão (mpg) -> Prediz o rendimento médio do carro em rodovias.

In [0]:
# Base learner: a random forest with 200 trees, depth capped at 10, fixed seed.
base_forest = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)

# Full pipeline: preprocessing followed by the multi-output wrapper around the forest.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", MultiOutputRegressor(base_forest)),
    ]
)

### Treinar modelo

In [0]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(X=X_train, y=y_train)

### Avaliar modelo

In [0]:
# Predict all targets for the held-out rows; the result is indexed below as a
# 2-D array with one column per target, in the same order as y.columns.
y_pred = model.predict(X_test)

# Report MAE, RMSE and R2 for each target separately.
for i, col in enumerate(y.columns):
    y_true_col = y_test.iloc[:, i]
    y_pred_col = y_pred[:, i]

    mae = mean_absolute_error(y_true_col, y_pred_col)
    rmse = np.sqrt(mean_squared_error(y_true_col, y_pred_col))
    r2 = r2_score(y_true_col, y_pred_col)

    print(f"--- {col} ---")
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.2f}\n")

In [0]:
# # Evaluation on the training set (disabled; uncomment the code below to
# # compare against the test metrics and check for overfitting)
# y_train_pred = model.predict(X_train)

# for i, col in enumerate(y.columns):
#     mae_train = mean_absolute_error(y_train.iloc[:, i], y_train_pred[:, i])
#     rmse_train = np.sqrt(mean_squared_error(y_train.iloc[:, i], y_train_pred[:, i]))
#     r2_train = r2_score(y_train.iloc[:, i], y_train_pred[:, i])
#     print(f"--- {col} (Treino) ---")
#     print(f"MAE: {mae_train:.2f}, RMSE: {rmse_train:.2f}, R2: {r2_train:.2f}\n")

### Gerar previsão para toda a base

In [0]:
# Predict the three targets for every row of the cleaned dataset. This covers
# the rows used for training as well, so these values are not an unbiased
# estimate of out-of-sample error.
y_pred_full = model.predict(X)

pred_df = pd.DataFrame(y_pred_full, columns=["MSRP_pred", "city_mpg_pred", "highway_mpg_pred"])

# Reset the index left over from drop_duplicates so rows align positionally
# with pred_df, which has a fresh 0..n-1 index.
final_df = pd.concat([df.reset_index(drop=True), pred_df], axis=1)

# Save to CSV. to_csv does not create missing parent directories, so make sure
# the output folder exists first (no-op when it is already there).
output_path = Path("../datasets/output/car_features_predictions.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)
