In [None]:
import os
from pathlib import Path
from typing import Final, Optional

import numpy as np
import pandas as pd
import statsmodels.api as sm

from infra.ai_utils import DataIngestor, DataProcessor
from infra.common import logger
from infra.common.config import RAW_DIR

In [None]:
# Shared helper instances reused by the cells below: the ingestor's get_data()
# loads the raw dataset, and the processor provides the categorical-encoding
# and feature-scaling steps.
data_ingestor: DataIngestor = DataIngestor()
data_processor: DataProcessor = DataProcessor()

In [None]:
# Ingest data from the file

# os.getenv returns None when the variable is unset, so the constant is typed
# as Optional[str] rather than a plain str.
GDRIVE_FILE_ID: Final[Optional[str]] = os.getenv("GDRIVE_FILE_ID")
RAW_FILE_NAME: Final[str] = "cars.xlsx"
local_raw_path: Path = RAW_DIR / RAW_FILE_NAME

df_raw: pd.DataFrame = data_ingestor.get_data(local_raw_path, "cars")
logger.info(f"\n{df_raw}")

# Normalise the headers (trim surrounding whitespace, upper-case the first
# letter and lower-case the rest) so the column names referenced by later
# cells, such as "Price" and "Mileage", match exactly.
df_raw.columns = df_raw.columns.str.strip().str.capitalize()

In [None]:
# Column groups that drive preprocessing and model specification.

# Columns handed to the categorical encoder.
categorical_features: list[str] = [
    "Make",
    "Model",
    "Trim",
    "Type",
]

# Columns handed to the scaler.
numerical_features: list[str] = [
    "Mileage",
    "Cylinder",
    "Liter",
    "Doors",
]

# Columns that are neither encoded nor scaled in this notebook; they are
# only added to the baseline model's feature list.
indicator_features: list[str] = [
    "Cruise",
    "Sound",
    "Leather",
]

In [None]:
# Categorical encoding of the raw frame.
# The feature-selection cell expects the result to expose columns prefixed
# with "Make_", "Model_", "Trim_" and "Type_".

df_encoded: pd.DataFrame = data_processor.encode_categorical_features(
    columns=categorical_features,
    df=df_raw,
)
logger.info(f"\n{df_encoded}")

In [None]:
# Scaling: run the processor's scaler over the numerical columns of the
# encoded frame.

df_scaled: pd.DataFrame = data_processor.scale_features(
    df=df_encoded, columns=numerical_features
)

# Log the scaled frame a single time.
logger.info(f"\n{df_scaled}")

In [None]:
# Feature selection

# Encoded columns are recognised by their "<source column>_" prefix. The
# prefixes are derived from categorical_features so this cell stays in sync
# with the encoding cell instead of repeating the column names by hand.
encoded_prefixes: tuple[str, ...] = tuple(f"{name}_" for name in categorical_features)

encoded_features: list[str] = [
    col for col in df_encoded.columns if col.startswith(encoded_prefixes)
]

# Baseline feature set: scaled numericals, encoded categoricals, raw indicators.
active_features: list[str] = numerical_features + encoded_features + indicator_features

In [None]:
# Training: baseline OLS fit on every active feature.

# Design matrix comes from the scaled frame; the target is read from the raw frame.
X: pd.DataFrame = df_scaled.loc[:, active_features]
y: pd.Series = df_raw["Price"]

# sm.OLS does not add an intercept on its own, so a constant column is added first.
X_final: pd.DataFrame = sm.add_constant(X)
ols_model = sm.OLS(y, X_final)
results = ols_model.fit()

logger.info(results.summary())

In [None]:
# Refined model: refit the OLS without any of the indicator_features
# (only the numerical and encoded columns are kept).

active_features_refined: list[str] = [*numerical_features, *encoded_features]

X_refined: pd.DataFrame = df_scaled.loc[:, active_features_refined]
y_refined: pd.Series = df_raw["Price"]

X_final_refined: pd.DataFrame = sm.add_constant(X_refined)
# NOTE: this rebinds `results`, replacing the fit produced by the training cell.
refined_model = sm.OLS(y_refined, X_final_refined)
results = refined_model.fit()

logger.info(results.summary())

In [None]:
# Diagnostic plot: residuals vs. predictions, to check for heteroscedasticity.

# NOTE(review): consider moving these two imports to the notebook's import cell.
import matplotlib.pyplot as plt
import seaborn as sns

# `results` is whichever model was fitted last (the refined fit when the
# notebook is executed top to bottom).
predictions = results.predict(X_final_refined)
residuals = results.resid

# Residuals should scatter evenly around the dashed zero line; a funnel shape
# would point to non-constant variance.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(x=predictions, y=residuals, alpha=0.6, ax=ax)
ax.axhline(0, color="red", linestyle="--")
ax.set_xlabel("Predicted Values (Price)")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs. Predicted Values Plot")
plt.show()

In [None]:
# --- Log-Transformed Model ---
logger.section("Training Log-Transformed Model (log(Price))")

# 1. Transform the target. The logarithm is only defined for strictly positive
#    values, so fail early with a clear message rather than fitting on -inf/NaN.
if (df_raw["Price"] <= 0).any():
    raise ValueError("Price must be strictly positive to apply the log transform")
y_log = np.log(df_raw["Price"])

# 2. Fit the model on the same refined features, requesting the HC3
#    covariance estimator for the reported standard errors.
results_log = sm.OLS(y_log, X_final_refined).fit(cov_type="HC3")

logger.info(results_log.summary())

# 3. Residual plot for the log model
predictions_log = results_log.predict(X_final_refined)
residuals_log = results_log.resid

plt.figure(figsize=(12, 7))
sns.scatterplot(x=predictions_log, y=residuals_log, alpha=0.6)
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Predicted Values (log(Price))")
plt.ylabel("Residuals")
plt.title("Residuals vs. Predicted Values (Log-Transformed)")
plt.show()