# Data Preprocessing & Feature Engineering


In [1]:
import pandas as pd
import os

# Locate the project root: two directory levels above the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Both datasets live under <project_root>/src/Data; build that prefix once.
data_dir = os.path.join(project_root, 'src', 'Data')
raw_data_path = os.path.join(data_dir, 'raw', 'youtube_ad_revenue_dataset.csv')
processed_data_path = os.path.join(data_dir, 'processed', 'youtube_ad_revenue_processed.csv')

# Make sure the output folder is present before anything is written to it
# (exist_ok=True makes this a no-op when the folder already exists).
processed_dir = os.path.dirname(processed_data_path)
os.makedirs(processed_dir, exist_ok=True)


In [2]:
def load_data(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    Args:
        path: Filesystem path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    # Fail early with an explicit message naming the missing path.
    raise FileNotFoundError(f"File not found: {path}")

# Read the raw dataset and report its (rows, columns) before any cleaning.
df = load_data(path=raw_data_path)
print("Original shape:", df.shape)


Original shape: (122400, 12)


## 1. Remove Duplicates


In [3]:
# Remove rows that are exact duplicates across every column, keeping the
# first occurrence of each (the pandas default, spelled out here).
df = df.drop_duplicates(keep='first')
print("Shape after dropping duplicates:", df.shape)


Shape after dropping duplicates: (120000, 12)


## 2. Handle Missing Values


In [4]:
# Fill missing values in the listed numeric columns with each column's own median.
numeric_cols_with_nan = ['likes', 'comments', 'watch_time_minutes']

# Compute all medians in one pass; fillna with a Series maps each median onto
# the column carrying the same label.
column_medians = df[numeric_cols_with_nan].median()
df[numeric_cols_with_nan] = df[numeric_cols_with_nan].fillna(column_medians)

# Per-column count of remaining nulls, to confirm the imputation.
print("Missing values after imputation:\n", df.isnull().sum())


Missing values after imputation:
 video_id                0
date                    0
views                   0
likes                   0
comments                0
watch_time_minutes      0
video_length_minutes    0
subscribers             0
category                0
device                  0
country                 0
ad_revenue_usd          0
dtype: int64


## 3. Feature Engineering


In [5]:
# Engagement Rate = (likes + comments) / views
#
# A row with zero views would make the plain division produce inf (or NaN for
# 0 / 0), and that value would then be written into the processed dataset.
# Non-positive view counts are therefore masked to NaN before dividing, and the
# resulting gaps are set to 0.0: a video nobody has viewed has no measurable
# engagement. Rows with views > 0 get exactly the same value as plain division.
df['engagement_rate'] = (
    (df['likes'] + df['comments'])
    .div(df['views'].where(df['views'] > 0))
    .fillna(0.0)
)
df.head()


Unnamed: 0,video_id,date,views,likes,comments,watch_time_minutes,video_length_minutes,subscribers,category,device,country,ad_revenue_usd,engagement_rate
0,vid_3092,2024-09-24 10:50:40.993199,9936,1221.0,320.0,26497.214184,2.862137,228086,Entertainment,TV,IN,203.178237,0.155093
1,vid_3459,2024-09-22 10:50:40.993199,10017,642.0,346.0,15209.747445,23.738069,736015,Gaming,Tablet,CA,140.880508,0.098632
2,vid_4784,2024-11-21 10:50:40.993199,10097,1979.0,187.0,57332.658498,26.200634,240534,Education,TV,CA,360.134008,0.214519
3,vid_4078,2025-01-28 10:50:40.993199,10034,1191.0,242.0,31334.517771,11.77034,434482,Entertainment,Mobile,UK,224.638261,0.142814
4,vid_3522,2025-04-28 10:50:40.993199,9889,1858.0,477.0,15665.666434,6.635854,42030,Education,Mobile,CA,165.514388,0.236121


## 4. Encoding Categorical Features


In [6]:
# Column groups for the modeling table. Only categorical_features is used in
# this cell; numerical_features and target are declared here but not
# referenced again in this section.
target = 'ad_revenue_usd'
categorical_features = ['category', 'device', 'country']
numerical_features = [
    'views',
    'likes',
    'comments',
    'watch_time_minutes',
    'video_length_minutes',
    'subscribers',
    'engagement_rate',
]

# Leave the identifier and timestamp columns out of the modeling table.
model_df = df.drop(['video_id', 'date'], axis=1)

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so k categories become k-1 indicator columns.
model_df = pd.get_dummies(model_df, columns=categorical_features, drop_first=True)

print("Shape after encoding:", model_df.shape)
print("Columns:", list(model_df.columns))


Shape after encoding: (120000, 21)
Columns: ['views', 'likes', 'comments', 'watch_time_minutes', 'video_length_minutes', 'subscribers', 'ad_revenue_usd', 'engagement_rate', 'category_Entertainment', 'category_Gaming', 'category_Lifestyle', 'category_Music', 'category_Tech', 'device_Mobile', 'device_TV', 'device_Tablet', 'country_CA', 'country_DE', 'country_IN', 'country_UK', 'country_US']


## 5. Save Processed Data


In [7]:
# Write the encoded table to disk; index=False keeps the row index out of the
# CSV so the file contains only the model_df columns.
model_df.to_csv(path_or_buf=processed_data_path, index=False)
print(f"Processed data saved to {processed_data_path}")


Processed data saved to d:\Jeet\projects\Data_Science\Project\Project_Content_Monetization_Model\src\Data\processed\youtube_ad_revenue_processed.csv
