# Exploratory Data Analysis (EDA)


In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Define paths
# The notebook is expected to live in src/notebooks, so the project root is
# normally two directories above the current working directory.
dataset_rel_path = os.path.join('src', 'Data', 'raw', 'youtube_ad_revenue_dataset.csv')
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
raw_data_path = os.path.join(project_root, dataset_rel_path)

# Check if file exists
if not os.path.exists(raw_data_path):
    # Fallback for different CWD: walk upwards from the working directory and
    # use the first ancestor that actually contains the dataset. This covers
    # launching the notebook from the project root, from src/, or from a
    # deeper sub-directory.
    search_dir = os.path.abspath(os.getcwd())
    while True:
        candidate_path = os.path.join(search_dir, dataset_rel_path)
        if os.path.exists(candidate_path):
            project_root = search_dir
            raw_data_path = candidate_path
            break
        parent_dir = os.path.dirname(search_dir)
        if parent_dir == search_dir:
            # Reached the filesystem root without a match; keep the default
            # (normalised) path so the failure names a readable location.
            break
        search_dir = parent_dir

print(f"Loading data from: {raw_data_path}")


In [None]:
# Read the raw CSV located at raw_data_path into a DataFrame, then preview
# its first rows (head() is the cell's last expression, so it is displayed).
df = pd.read_csv(raw_data_path)
df.head()


## Dataset Info


In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes and memory usage.
df.info()


## Missing Values


In [None]:
# Count the missing (null) entries in each column.
df.isnull().sum()


## Duplicates


In [None]:
# Count rows that exactly repeat an earlier row; with the default settings
# the first occurrence of each row is not counted as a duplicate.
df.duplicated().sum()


## Descriptive Statistics


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). By default
# pandas restricts this to the numeric columns when any are present.
df.describe()


## Correlation Matrix


In [None]:
# Select every numeric column regardless of bit width or nullability
# (int32, float32, Int64, ... as well as int64/float64) so that no numeric
# feature is silently left out of the correlation analysis.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlation between the numeric columns (pandas default method).
corr_matrix = numeric_df.corr()

# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

# Rank the features by their correlation with the target column.
print(corr_matrix['ad_revenue_usd'].sort_values(ascending=False))


## Outlier Detection


In [None]:
# Flag ad_revenue_usd values that fall more than 1.5 * IQR outside the
# first/third quartiles.
revenue = df['ad_revenue_usd']

Q1 = revenue.quantile(0.25)
Q3 = revenue.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier when its revenue lies strictly outside either bound.
is_below = revenue < lower_bound
is_above = revenue > upper_bound
outliers = df[is_below | is_above]

outlier_count = len(outliers)
print(f"Number of outliers: {outlier_count}")
print(f"Percentage of outliers: {outlier_count / len(df) * 100:.2f}%")


## Categorical Analysis


In [None]:
# Report the cardinality of every object-typed column and, for columns with
# fewer than 20 distinct values, the frequency of each value.
categorical_cols = df.select_dtypes(include=['object']).columns
unique_counts = df[categorical_cols].nunique()
for col, n_unique in unique_counts.items():
    print(f"{col}: {n_unique} unique values")
    if n_unique >= 20:
        # High-cardinality column: the full value listing would be too long.
        continue
    print(df[col].value_counts())
    print("-" * 10)
