# Model Building & Evaluation


In [1]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Define paths
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
processed_data_path = os.path.join(project_root, 'src', 'Data', 'processed', 'youtube_ad_revenue_processed.csv')
model_dir = os.path.join(project_root, 'src', 'models')
os.makedirs(model_dir, exist_ok=True)


In [2]:
def load_data(path):
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    return pd.read_csv(path)

df = load_data(processed_data_path)
df.head()


Unnamed: 0,views,likes,comments,watch_time_minutes,video_length_minutes,subscribers,ad_revenue_usd,engagement_rate,category_Entertainment,category_Gaming,...,category_Music,category_Tech,device_Mobile,device_TV,device_Tablet,country_CA,country_DE,country_IN,country_UK,country_US
0,9936,1221.0,320.0,26497.214184,2.862137,228086,203.178237,0.155093,True,False,...,False,False,False,True,False,False,False,True,False,False
1,10017,642.0,346.0,15209.747445,23.738069,736015,140.880508,0.098632,False,True,...,False,False,False,False,True,True,False,False,False,False
2,10097,1979.0,187.0,57332.658498,26.200634,240534,360.134008,0.214519,False,False,...,False,False,False,True,False,True,False,False,False,False
3,10034,1191.0,242.0,31334.517771,11.77034,434482,224.638261,0.142814,True,False,...,False,False,True,False,False,False,False,False,True,False
4,9889,1858.0,477.0,15665.666434,6.635854,42030,165.514388,0.236121,False,False,...,False,False,True,False,False,True,False,False,False,False


## Train/Test Split


In [3]:
# Separate features and target
X = df.drop(columns=['ad_revenue_usd'])
y = df['ad_revenue_usd']

# Scale features (StandardScaler) - Important for Linear Models
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


## Model Training


In [4]:
# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100)
}

results = {}
best_model = None
best_r2 = -float('inf')
best_model_name = ""

print("-" * 50)
print(f"{'Model':<20} | {'R2 Score':<10} | {'RMSE':<10} | {'MAE':<10}")
print("-" * 50)

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    
    results[name] = {"R2": r2, "RMSE": rmse, "MAE": mae}
    
    print(f"{name:<20} | {r2:<10.4f} | {rmse:<10.4f} | {mae:<10.4f}")
    
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_model_name = name
        
print("-" * 50)
print(f"Best Model: {best_model_name} with R2: {best_r2:.4f}")


--------------------------------------------------
Model                | R2 Score   | RMSE       | MAE       
--------------------------------------------------
Linear Regression    | 0.9526     | 13.4806    | 3.1119    
Ridge Regression     | 0.9526     | 13.4803    | 3.1096    
Lasso Regression     | 0.9519     | 13.5754    | 4.1427    
Decision Tree        | 0.9023     | 19.3491    | 5.3111    
Random Forest        | 0.9499     | 13.8534    | 3.5837    
--------------------------------------------------
Best Model: Ridge Regression with R2: 0.9526


## Save Best Model


In [5]:
# Save best model and scaler
joblib.dump(best_model, os.path.join(model_dir, 'best_model.pkl'))
joblib.dump(scaler, os.path.join(model_dir, 'scaler.pkl'))
print(f"Saved best model and scaler to {model_dir}")

# Also save column names to ensure correct order during inference
feature_names = X.columns.tolist()
joblib.dump(feature_names, os.path.join(model_dir, 'feature_names.pkl'))


Saved best model and scaler to d:\Jeet\projects\Data_Science\Project\Project_Content_Monetization_Model\src\models


['d:\\Jeet\\projects\\Data_Science\\Project\\Project_Content_Monetization_Model\\src\\models\\feature_names.pkl']