In [None]:
import os
import sys
from pathlib import Path
from typing import Any

import joblib
import pandas as pd
import statsmodels.api as sm
from ai_utils_client.data_ingestor_client import DataIngestorClient

# Automation Hub Client Imports
from ai_utils_client.data_processor_client import DataProcessorClient
from core_lib_client.logger_client import logger
from dotenv import load_dotenv
from gdrive_client import GDriveClient
from IPython.display import display
from sklearn.preprocessing import StandardScaler

# Make the sibling "automation-hub" checkout importable so that the
# library-internal `from clients.gdrive ...` import can be resolved.
# NOTE(review): this runs after the client imports at the top of the cell;
# confirm the `clients` package is only imported lazily by those libraries.
hub_path = str(Path(os.getcwd()).parents[1].joinpath("automation-hub"))
if hub_path not in sys.path:
    sys.path.append(hub_path)

# Populate os.environ from the project's .env file before reading any setting
load_dotenv()

# --- Settings read from the environment (.env is the single source of truth) ---

# GDrive file ID of the raw car dataset (None when the variable is unset)
GDRIVE_FILE_ID: str | None = os.environ.get("CAR_DATA_FILE_ID")

# Local working directories, each with a relative-path fallback
RAW_DIR: str = os.environ.get("LOCAL_RAW_DIR", "data/raw")
PROCESSED_DIR: str = os.environ.get("LOCAL_PROCESSED_DIR", "data/processed")
MODELS_DIR: str = os.environ.get("LOCAL_MODELS_DIR", "data/models")

# GDrive destination folder IDs (None when the variable is unset)
GDRIVE_PROCESSED_DATA_ID: str | None = os.environ.get(
    "GDRIVE_DATA_PROCESSED_FOLDER_ID"
)
GDRIVE_MODELS_PROD_ID: str | None = os.environ.get("GDRIVE_MODELS_PROD_FOLDER_ID")
GDRIVE_MODELS_DEV_ID: str | None = os.environ.get("GDRIVE_MODELS_DEV_FOLDER_ID")

# Shared automation clients, created once and reused by the functions below
gdrive: GDriveClient = GDriveClient()
data_processor: DataProcessorClient = DataProcessorClient()
data_ingestor: DataIngestorClient = DataIngestorClient()

In [None]:
def train_linear_model(
    data: pd.DataFrame, features: list[str], target: str = "Price"
) -> tuple[Any, StandardScaler]:
    """
    Standardizes features and fits an Ordinary Least Squares (OLS) regression model.

    Args:
        data: Prepared dataset containing every feature column and the target.
        features: Names of the predictor columns to standardize and regress on.
        target: Name of the response column. Defaults to "Price".

    Returns:
        A ``(model, scaler)`` tuple: the fitted statsmodels OLS results object
        and the fitted ``StandardScaler`` (required to transform new inputs the
        same way before predicting).

    Raises:
        ValueError: If ``features`` is empty or contains ``target``.
        KeyError: If ``target`` or any feature is not a column of ``data``.
        statsmodels.tools.sm_exceptions.MissingDataError: If the target or the
            scaled features contain NaN values.
    """
    # Fail early with explicit messages instead of deep inside sklearn/statsmodels
    if not features:
        raise ValueError("At least one feature column is required to fit the model.")
    if target in features:
        # Regressing the target on itself would silently yield a "perfect" fit
        raise ValueError(f"Target column {target!r} must not be listed in features.")
    missing_cols: list[str] = [
        col for col in [*features, target] if col not in data.columns
    ]
    if missing_cols:
        raise KeyError(f"Columns missing from data: {missing_cols}")

    # Feature selection and target isolation
    X: pd.DataFrame = data[features].copy()
    y: pd.Series = data[target]

    # Feature scaling (Standardization)
    scaler: StandardScaler = StandardScaler()
    X_scaled: Any = scaler.fit_transform(X)

    # Reconstruct DataFrame so Statsmodels keeps column names and row index
    X_scaled_df: pd.DataFrame = pd.DataFrame(X_scaled, columns=features, index=X.index)

    # Add constant for Intercept (Alpha)
    X_final: pd.DataFrame = sm.add_constant(X_scaled_df)

    # Fit OLS Model; missing="raise" turns NaNs into an explicit error rather
    # than an all-NaN parameter vector that would still be exported downstream.
    model: Any = sm.OLS(y, X_final, missing="raise").fit()

    return model, scaler

In [None]:
def export_assets(
    model: Any,
    scaler: StandardScaler,
    df_prepared: pd.DataFrame,
    env: str = "prod",  # Added environment toggle
) -> None:
    """
    Persists artifacts locally and syncs to specific GDrive folders.

    Local files are always written: the prepared dataset as ``df_prepared.csv``
    under ``PROCESSED_DIR``, and the model and scaler as joblib pickles under
    ``MODELS_DIR``. GDrive uploads only happen for folder IDs that are configured.

    Args:
        model: Fitted model object to serialize with joblib.
        scaler: Fitted scaler to serialize with joblib.
        df_prepared: Prepared dataset to write as CSV (without the index).
        env: ``"prod"`` uploads model artifacts to the PROD models folder; any
            other value uploads to the DEV models folder.
    """
    # 1. Local Persistence (Always ensure directories exist)
    os.makedirs(PROCESSED_DIR, exist_ok=True)
    os.makedirs(MODELS_DIR, exist_ok=True)

    csv_path: str = os.path.join(PROCESSED_DIR, "df_prepared.csv")
    model_path: str = os.path.join(MODELS_DIR, "car_price_model.pkl")
    scaler_path: str = os.path.join(MODELS_DIR, "scaler.pkl")

    df_prepared.to_csv(csv_path, index=False)
    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)

    # 2. GDrive Sync - Processed Data
    if GDRIVE_PROCESSED_DATA_ID:
        logger.info(f"Syncing processed data to GDrive ID: {GDRIVE_PROCESSED_DATA_ID}")
        gdrive.upload_file(file_path=csv_path, folder_id=GDRIVE_PROCESSED_DATA_ID)
    else:
        # Make the skip visible instead of silently doing nothing
        logger.info("No processed-data GDrive folder configured; skipping upload.")

    # 3. GDrive Sync - Models (Logic for Dev vs Prod)
    # Only the exact value "prod" selects the PROD folder; everything else is
    # routed to DEV. Track the environment actually used so the logs cannot
    # claim a destination that was not written to.
    resolved_env: str = "prod" if env == "prod" else "dev"
    if env not in ("prod", "dev"):
        logger.info(f"Unrecognized env {env!r}; using the DEV models folder.")

    target_folder: str | None = (
        GDRIVE_MODELS_PROD_ID if resolved_env == "prod" else GDRIVE_MODELS_DEV_ID
    )

    if target_folder:
        logger.info(f"Syncing model artifacts to GDrive Folder: {target_folder}")
        gdrive.upload_file(file_path=model_path, folder_id=target_folder)
        gdrive.upload_file(file_path=scaler_path, folder_id=target_folder)
        logger.success(
            f"Artifacts successfully synced to {resolved_env.upper()} environment."
        )
    else:
        logger.info(
            f"No GDrive models folder configured for {resolved_env.upper()}; "
            "model artifacts were saved locally only."
        )

In [None]:
def run_experiment(env: str = "prod") -> None:
    """
    Main pipeline execution: Ingestion, Engineering, Training, and Export.

    Args:
        env: Environment forwarded to ``export_assets`` to choose the GDrive
            models folder. Defaults to "prod", the value this pipeline
            previously hard-coded.
    """

    logger.section("STARTING STANDARDIZED EXPERIMENT")

    # Ingestion Phase
    # NOTE(review): GDRIVE_FILE_ID is None when CAR_DATA_FILE_ID is unset;
    # confirm the ingestor handles that case.
    local_raw_file_path: str = os.path.join(RAW_DIR, "cars.xls")
    df_raw: pd.DataFrame = data_ingestor.get_spreadsheet_data(
        file_id=GDRIVE_FILE_ID, local_file_path=local_raw_file_path
    )

    # Normalize columns: trim whitespace, then "First letter upper, rest lower"
    df_raw.columns = df_raw.columns.str.strip().str.capitalize()

    # Categorical Encoding (Make, Model, Type)
    # NOTE(review): Model is presumably nested within Make, which would make the
    # dummy columns collinear - confirm this is acceptable for the OLS report.
    logger.info("Encoding categorical features...")
    categorical_cols: list[str] = ["Make", "Model", "Type"]
    df_prepared: pd.DataFrame = data_processor.encode_categorical_features(
        df=df_raw, columns=categorical_cols, drop_first=True
    )

    # Feature Selection
    base_numerical: list[str] = ["Mileage", "Doors"]
    binary_flags: list[str] = ["Leather"]
    # Derive the dummy-column prefixes from the encoded columns so the two lists
    # cannot drift apart (assumes the encoder names dummies "<column>_<value>").
    encoded_prefixes: tuple[str, ...] = tuple(f"{col}_" for col in categorical_cols)
    encoded_features: list[str] = [
        col for col in df_prepared.columns if col.startswith(encoded_prefixes)
    ]

    active_features: list[str] = base_numerical + binary_flags + encoded_features
    logger.info(f"Training with {len(active_features)} active features.")

    # Model Training
    model, scaler = train_linear_model(df_prepared, active_features)

    # Reporting
    logger.section("REGRESSION SUMMARY REPORT")
    display(model.summary())

    # Persistence
    logger.section("EXPORT ASSETS")
    export_assets(model, scaler, df_prepared, env=env)
    logger.success("EXPERIMENT COMPLETED AND ARCHIVED")


# Execute the experiment as soon as this cell runs.
# Side effects: run_experiment() writes the prepared CSV, model and scaler to
# the local data directories and exports with the "prod" environment, so any
# configured GDrive folders receive uploads.
run_experiment()