# Data Preprocessing & Feature Engineering


In [None]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Resolve the project root: two directory levels above the current working directory
two_levels_up = os.path.join(os.getcwd(), '..', '..')
project_root = os.path.abspath(two_levels_up)

# Raw input and processed output both live under <project_root>/src/Data
data_dir = os.path.join(project_root, 'src', 'Data')
raw_data_path = os.path.join(data_dir, 'raw', 'youtube_ad_revenue_dataset.csv')
processed_data_path = os.path.join(data_dir, 'processed', 'youtube_ad_revenue_processed.csv')

# Make sure the output directory exists before anything is written into it
processed_dir = os.path.dirname(processed_data_path)
os.makedirs(processed_dir, exist_ok=True)


In [None]:
def load_data(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    Raises:
        FileNotFoundError: if nothing exists at ``path``.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"File not found: {path}")

# Read the raw CSV into memory and report its dimensions before any cleaning
df = load_data(path=raw_data_path)
print("Original shape:", df.shape)


## 1. Remove Duplicates


In [None]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence
df = df.drop_duplicates(keep='first')
print("Shape after dropping duplicates:", df.shape)


## 2. Handle Missing Values


In [None]:
# Fill gaps in the listed numeric columns with each column's own median
numeric_cols_with_nan = ['likes', 'comments', 'watch_time_minutes']
df = df.assign(**{
    name: df[name].fillna(df[name].median())
    for name in numeric_cols_with_nan
})

print("Missing values after imputation:\n", df.isnull().sum())


## 3. Feature Engineering


In [None]:
# Engagement Rate = (likes + comments) / views
# A row with views == 0 would otherwise produce inf (or NaN for 0/0); such
# non-finite values are not usable as a feature, so the rate for zero-view
# rows is defined as 0.0. Rows where views is missing keep a NaN rate.
interactions = df['likes'] + df['comments']
raw_rate = interactions / df['views']
df['engagement_rate'] = raw_rate.mask(df['views'] == 0, 0.0)
df.head()


## 4. Encoding & Scaling


In [None]:
# Feature groups and the name of the target column
target = 'ad_revenue_usd'
categorical_features = ['category', 'device', 'country']
numerical_features = [
    'views', 'likes', 'comments', 'watch_time_minutes',
    'video_length_minutes', 'subscribers', 'engagement_rate',
]

# Remove the identifier and date columns, which are not used for modeling
model_df = df.drop(['video_id', 'date'], axis=1)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each one.
# NOTE: no scaling is applied in this cell; numeric columns are left as-is.
model_df = pd.get_dummies(model_df, columns=categorical_features, drop_first=True)

print("Shape after encoding:", model_df.shape)
print("Columns:", model_df.columns.tolist())


## 5. Save Processed Data


In [None]:
# Write the encoded frame to disk without the pandas row index
model_df.to_csv(path_or_buf=processed_data_path, index=False)
print(f"Processed data saved to {processed_data_path}")
