## Loading Data Set

In [None]:
import pandas as pd
# Read the advertising dataset; the path is relative to the notebook's working directory.
csv_path = "../data/advertising.csv"
df = pd.read_csv(csv_path)


## Basic Info

In [None]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels exactly as they appear in the CSV header (before renaming).
df.columns

In [None]:
# Dtype pandas inferred for each column.
df.dtypes

## Renaming columns 

In [None]:
# Map the original CSV headers to short snake_case names used in the rest of
# the notebook. Headers not listed here keep their original name.
column_renames = {
    "Daily Time Spent on Site": "daily_time_on_site",
    "Age": "age",
    "Area Income": "area_income",
    "Daily Internet Usage": "internet_usage",
    "Ad Topic Line": "ad_topic",
    "City": "city",
    "Male": "male",
    "Timestamp": "timestamp",
    "Clicked on Ad": "clicked",
}
df = df.rename(columns=column_renames)


## Checking any missing values

In [None]:
# Number of null entries in each column.
df.isna().sum()

## Basic statistics:

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


## Target distribution:

In [None]:
# Class balance of the target: absolute counts first, then proportions.
target = df["clicked"]
print(target.value_counts())
print(target.value_counts(normalize=True))

## Dropping unnecessary columns

In [None]:
# Drop the `city` column. errors="ignore" keeps this cell re-runnable: without
# it, executing the cell a second time raises KeyError because `city` has
# already been removed from `df`.
df = df.drop(columns=["city"], errors="ignore")
df.head()

## Converting Timestamp

In [None]:
# Parse `timestamp` into datetimes and derive the hour of day as a feature.
# The parsed `timestamp` column is kept for later visualizations.
parsed_ts = pd.to_datetime(df["timestamp"])
df["timestamp"] = parsed_ts
df["hour"] = parsed_ts.dt.hour
df.head()

## Analyzing ad categories

In [None]:
# Eyeball the ad topic text: the first 20 rows, then a reproducible random 20.
topics = df["ad_topic"]
print(topics.head(20))
print(topics.sample(n=20, random_state=42))

In [None]:
# Number of distinct ad topic lines, followed by the ten most frequent ones.
topic_counts = df["ad_topic"].value_counts()
print(df["ad_topic"].nunique())
print(topic_counts.head(10))


## Creating numeric features for ad_topics

In [None]:
# Two simple numeric descriptors of each ad topic line:
# its character count and its whitespace-separated word count.
topic_text = df["ad_topic"]
df["ad_topic_len"] = topic_text.str.len()
df["ad_topic_words"] = topic_text.str.split().str.len()
df.head()

## EDA plots and tables

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the null count in each column.
missing_counts = df.isnull().sum()
ax = missing_counts.plot(kind="bar")
ax.set_title("Missing values per column")
plt.show()


In [None]:
# Histograms of the numeric features, including the two derived from ad_topic.
num_cols = [
    "age",
    "area_income",
    "daily_time_on_site",
    "internet_usage",
    "ad_topic_len",
    "ad_topic_words",
]

numeric_features = df[num_cols]
numeric_features.hist(bins=20, figsize=(12, 8))
plt.tight_layout()
plt.show()


## Box plots of features by click outcome

In [None]:
# Distribution of `age` for each value of `clicked`.
# seaborn (`sns`) and pyplot (`plt`) are already imported in the first
# plotting cell of the "EDA plots and tables" section, so they are not
# re-imported here.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="clicked", y="age", ax=ax)
ax.set_title("Age vs Clicked")
plt.show()


In [None]:
# Distribution of `daily_time_on_site` for each value of `clicked`.
# seaborn (`sns`) and pyplot (`plt`) are already imported in the first
# plotting cell of the "EDA plots and tables" section, so they are not
# re-imported here.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="clicked", y="daily_time_on_site", ax=ax)
ax.set_title("daily_time_on_site vs Clicked")
plt.show()


In [None]:
# Distribution of `internet_usage` for each value of `clicked`.
# seaborn (`sns`) and pyplot (`plt`) are already imported in the first
# plotting cell of the "EDA plots and tables" section, so they are not
# re-imported here.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="clicked", y="internet_usage", ax=ax)
ax.set_title("internet_usage vs Clicked")
plt.show()


In [None]:
# Distribution of `area_income` for each value of `clicked`.
# seaborn (`sns`) and pyplot (`plt`) are already imported in the first
# plotting cell of the "EDA plots and tables" section, so they are not
# re-imported here.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="clicked", y="area_income", ax=ax)
ax.set_title("area_income vs Clicked")
plt.show()


In [None]:
# Average click rate by gender (male = 0 or 1)
gender_click = df.groupby("male")["clicked"].mean()

gender_click.plot(kind="bar")
plt.title("Click Rate by Gender (male = 1)")
plt.ylabel("Average click rate")
plt.xlabel("male")
plt.show()


In [None]:

# Bucket ages into ordered bands and compare the average click rate per band.
# Intervals are right-closed: (0, 25], (25, 35], (35, 45], (45, 60], (60, inf].
# The last edge is open-ended so that an age above 100 is labelled "60+"
# instead of silently becoming NaN and dropping out of the groupby below.
bins = [0, 25, 35, 45, 60, float("inf")]
labels = ["<=25", "26-35", "36-45", "46-60", "60+"]

df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=True)

# Average click rate per age group. observed=False keeps every labelled band
# in the result, including bands that contain no rows.
age_click = df.groupby("age_group", observed=False)["clicked"].mean()

ax = age_click.plot(kind="bar")
ax.set_title("Click Rate by Age Group")
ax.set_ylabel("Average click rate")
ax.set_xlabel("Age group")
plt.show()


## Correlation analysis

In [None]:
# Pairwise correlations (DataFrame.corr defaults) between the numeric
# features and the `clicked` target, rendered as an annotated heatmap.
corr_cols = [
    "age",
    "area_income",
    "daily_time_on_site",
    "internet_usage",
    "ad_topic_len",
    "ad_topic_words",
    "hour",
    "male",
    "clicked",
]
corr = df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
