# Model Building & Evaluation


In [None]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Resolve project locations relative to the current working directory:
# the project root is taken to be two directory levels above it.
cwd = os.getcwd()
project_root = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))
src_dir = os.path.join(project_root, 'src')

# Input: the processed dataset CSV. Output: directory for model artifacts.
processed_data_path = os.path.join(
    src_dir, 'Data', 'processed', 'youtube_ad_revenue_processed.csv'
)
model_dir = os.path.join(src_dir, 'models')

# Create the artifact directory up front; no error if it already exists.
os.makedirs(model_dir, exist_ok=True)


In [None]:
def load_data(path):
    """Read the CSV file at *path* into a pandas DataFrame.

    Args:
        path: Filesystem location of the CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.

    Raises:
        FileNotFoundError: If nothing exists at *path*.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"File not found: {path}")

# Load the processed dataset; load_data raises FileNotFoundError if the CSV is missing.
df = load_data(processed_data_path)
# Preview the first rows as the cell's output.
df.head()


## Train/Test Split


In [None]:
# Separate features and target
X = df.drop(columns=['ad_revenue_usd'])
y = df['ad_revenue_usd']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance used to transform the training
# data (train/test leakage), which makes the test metrics optimistic and bakes
# test-set statistics into the scaler that is later saved for inference.
# The split depends only on the number of rows and random_state, so the same
# rows land in train/test as when splitting the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features (StandardScaler) - Important for Linear Models.
# Statistics are learned from the training rows only and then applied,
# unchanged, to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Fully scaled feature matrix, kept so that any code relying on `X_scaled`
# keeps working; it now uses the train-fitted statistics.
X_scaled = scaler.transform(X)


## Model Training


In [None]:
# Define models: candidate regressors, all trained and scored the same way.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100)
}

# Per-model metrics keyed by model name, plus running "best so far" state.
results = {}
best_model = None
best_r2 = -float('inf')
best_model_name = ""

# Size the separator rules from the header so they span the whole table
# (a fixed width of 50 was narrower than the 59-character rows).
table_header = f"{'Model':<20} | {'R2 Score':<10} | {'RMSE':<10} | {'MAE':<10}"
table_rule = "-" * len(table_header)

print(table_rule)
print(table_header)
print(table_rule)

for name, model in models.items():
    # Fit on the training split and score on the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    results[name] = {"R2": r2, "RMSE": rmse, "MAE": mae}

    print(f"{name:<20} | {r2:<10.4f} | {rmse:<10.4f} | {mae:<10.4f}")

    # Keep the model with the highest R2; ties keep the earlier model.
    # NOTE(review): the winner is chosen on the same test split that produces
    # the reported metrics, so its reported R2 is likely optimistic - consider
    # a separate validation split or cross-validation for model selection.
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_model_name = name

print(table_rule)
print(f"Best Model: {best_model_name} with R2: {best_r2:.4f}")


## Save Best Model


In [None]:
# Persist the selected estimator and the fitted scaler into the model directory.
for artifact, filename in ((best_model, 'best_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, os.path.join(model_dir, filename))
print(f"Saved best model and scaler to {model_dir}")

# Store the feature column order as well, so inference code can present
# columns in the same order that was used for training.
feature_names = X.columns.tolist()
feature_names_path = os.path.join(model_dir, 'feature_names.pkl')
joblib.dump(feature_names, feature_names_path)
