In [None]:
import pandas as pd

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/youtube_ad_revenue_dataset.csv")

# A notebook cell only renders its LAST bare expression, so intermediate
# results must go through display() (injected by the IPython kernel) to be
# visible. df.info() prints on its own and needs no wrapper.
display(df.head())            # first rows, as a rendered table
df.info()                     # column names, dtypes and non-null counts
display(df.isnull().sum())    # missing values per column
df.duplicated().sum()         # number of duplicated rows (cell output)


## Exploratory Data Analysis (EDA)
In this section, we analyze the distribution of YouTube ad revenue and study how different features influence monetization.


In [None]:
%pip install matplotlib seaborn
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8,5))
sns.histplot(df["ad_revenue_usd"], bins=50, kde=True)
plt.title("Distribution of YouTube Ad Revenue")
plt.xlabel("Ad Revenue (USD)")
plt.ylabel("Frequency")
plt.show()


Most videos generate low ad revenue, while a small number of videos earn significantly higher revenue, indicating a right-skewed distribution.


In [None]:
# Scatter of ad revenue against views; alpha=0.4 keeps overlapping points readable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="views", y="ad_revenue_usd", alpha=0.4, ax=ax)
ax.set_title("Ad Revenue vs Views")
ax.set_xlabel("Views")
ax.set_ylabel("Ad Revenue (USD)")
plt.show()


Ad revenue increases with the number of views, showing a strong positive relationship between video reach and monetization.


In [None]:
# Pairwise correlation of the numeric columns only, rendered as a heatmap
# without per-cell annotations.
numeric_corr = df.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


Views, watch time, and subscribers are strongly correlated with ad revenue, which suggests they will be important features for revenue prediction.


In [None]:
# Mean ad revenue per category, ordered from highest to lowest.
category_revenue = (
    df.groupby("category")["ad_revenue_usd"]
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the per-category means.
ax = category_revenue.plot.bar(figsize=(10, 5), title="Average Ad Revenue by Category")
ax.set_ylabel("Average Revenue (USD)")
plt.show()


Certain content categories generate higher average ad revenue than others, suggesting that category selection plays a key role in monetization strategy.


In [None]:
import matplotlib.pyplot as plt

# NOTE(review): this plots three hard-coded points and does not use the
# dataset — it looks like a leftover matplotlib smoke test; confirm whether
# the cell is still needed.
xs = [1, 2, 3]
ys = [4, 5, 6]
plt.plot(xs, ys)
plt.show()


In [None]:
# Per-column count of missing values, before any cleaning is applied.
df.isna().sum()


In [None]:
# Fill missing values in these numeric columns with each column's median.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, emits a FutureWarning on pandas 2.x, and leaves `df` unchanged once
# Copy-on-Write is active (the default from pandas 3.0).
for column in ["likes", "comments", "watch_time_minutes"]:
    df[column] = df[column].fillna(df[column].median())


In [None]:
# Per-column count of missing values, after the median imputation cell.
df.isna().sum()


In [None]:
# Drop rows that are exact duplicates of an earlier row, modifying df in place
# (the original index labels of the surviving rows are kept, not renumbered).
df.drop_duplicates(inplace=True)


In [None]:
# Count the rows that are still flagged as duplicates of an earlier row.
df.duplicated().sum()


In [None]:
# Engagement rate = (likes + comments) per view.
# A row with zero views would otherwise yield inf (or NaN for 0/0), which the
# scaling and model-fitting cells cannot handle, so those rows are assigned an
# engagement rate of 0. Rows with non-zero views are computed exactly as before.
interactions = df["likes"] + df["comments"]
df["engagement_rate"] = (interactions / df["views"]).mask(df["views"] == 0, 0.0)


In [None]:
# Parse the date column, then derive day-of-month and month-number features from it.
df["date"] = pd.to_datetime(df["date"])

date_parts = df["date"].dt
df["day"] = date_parts.day
df["month"] = date_parts.month


In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column so the indicator columns are not perfectly collinear.
# NOTE(review): the source columns are removed by this step, so the cell
# cannot be re-run without reloading df.
categorical_columns = ["category", "device", "country"]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)


In [None]:
# Print the column/dtype summary of the frame, then show its first rows as
# the cell output (only the last expression is rendered).
df.info()
df.head()


In [None]:
# Separate features and target
X = df.drop(columns=["ad_revenue_usd", "video_id", "date"])
y = df["ad_revenue_usd"]


In [None]:
%pip install scikit-learn
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
# NOTE(review): X_train / X_test are overwritten with the scaled arrays, so
# re-running this cell would scale already-scaled data.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Shapes of the training and test feature arrays, in that order.
tuple(split.shape for split in (X_train, X_test))


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [None]:
# Candidate regressors, keyed by the name used when reporting results.
# The two ensemble models share one seed so their results are reproducible.
seed = 42
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=seed),
    "Gradient Boosting": GradientBoostingRegressor(random_state=seed),
}

# Fit every candidate on the training split.
for estimator in models.values():
    estimator.fit(X_train, y_train)


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np


def _test_metrics(estimator):
    """Return R2, RMSE and MAE of a fitted estimator on the test split."""
    predictions = estimator.predict(X_test)
    return {
        "R2": r2_score(y_test, predictions),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_test, predictions)),
        "MAE": mean_absolute_error(y_test, predictions),
    }


# One record per model, in the order the models were declared.
results = [
    {"Model": label, **_test_metrics(estimator)}
    for label, estimator in models.items()
]

results_df = pd.DataFrame(results)
results_df


In [None]:
import pickle

# NOTE(review): the "best" model is selected by name here, not derived from
# results_df — confirm Ridge is still the intended choice after re-training.
best_model = models["Ridge Regression"]

# Serialize the chosen model and the fitted scaler next to the notebook.
artifacts = {
    "model.pkl": best_model,
    "scaler.pkl": scaler,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)


In [None]:
# Column names of the feature matrix, in column order, shown with their count.
feature_names = list(X.columns)
(len(feature_names), feature_names)

In [None]:
# Number of feature columns in X.
X.shape[1]
