In [None]:
import kagglehub
# Fetch the "ashydv/advertising-dataset" dataset from Kaggle via kagglehub.
# `path` is the location kagglehub reports for the downloaded files; later
# cells list it as a directory.
dataset_handle = "ashydv/advertising-dataset"
path = kagglehub.dataset_download(dataset_handle)
print(f"Dataset downloaded to: {path}")


In [None]:
import os
# Locate the dataset's CSV file inside the downloaded folder.
files = os.listdir(path)
print(f"Files in folder: {files}")

# os.listdir returns entries in arbitrary order, so sort the candidates to make
# the pick deterministic, and fail with a descriptive error (rather than a bare
# IndexError from `[0]`) when the folder contains no CSV at all.
csv_candidates = sorted(file for file in files if file.endswith('.csv'))
if not csv_candidates:
    raise FileNotFoundError(f"No .csv file found in {path!r}; folder contains: {files}")

csv_file = csv_candidates[0]
csv_path = os.path.join(path, csv_file)  # Full path to the CSV file
print(f"CSV file path: {csv_path}")


In [None]:
import pandas as pd

# Load the CSV into a DataFrame and print a quick overview of its contents.
data = pd.read_csv(csv_path)
print(data.head())  # first 5 rows
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data.info()  # data summary
print(data.describe())  # statistical summary
print(data.isnull().sum())  # number of missing values per column

# The per-column missing-value counts printed above are all zero for this
# dataset, so no cleaning step is needed before the analysis.



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")  # Set theme for plots

# Univariate analysis: one histogram per column, arranged on a 2x2 grid.
axes = data.hist(bins=15, figsize=(10,6), layout=(2,2))
plt.suptitle("Distribution of Advertising Budgets and Sales", fontsize=16)

for ax in axes.ravel():
    # DataFrame.hist titles each subplot with the column it shows; a grid slot
    # without a title holds no histogram and needs no labels.
    column = ax.get_title()
    if not column:
        continue
    # The Sales histogram is not advertising spend, so it gets its own x-label
    # (same wording as the Sales axis in the bivariate scatter plots).
    # NOTE(review): the unit ("USD dollars") is taken from the existing labels;
    # confirm it against the dataset description.
    if column == "Sales":
        ax.set_xlabel("Sales in USD dollars")
    else:
        ax.set_xlabel("Advertising Spend in USD dollars")
    ax.set_ylabel("Frequency")

# Leave the top 5% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

In [None]:
# Bivariate analysis: one scatter panel per advertising channel, each plotted
# against Sales, to show how the channels relate to sales.
channels = ["TV", "Radio", "Newspaper"]
pair = sns.pairplot(
    data,
    x_vars=channels,
    y_vars="Sales",
    height=4,
    aspect=1.1,
    kind='scatter',
)
pair.figure.suptitle("Advertising Spend vs Sales", fontsize=16)

# Wrap each panel's existing x-label in a more descriptive one and give every
# panel the same y-label.
for panel in pair.axes.flat:
    channel = panel.get_xlabel()
    panel.set_xlabel(f"Advertising Spend in USD dollars ({channel})")
    panel.set_ylabel("Sales in USD dollars")

# Reserve the top 5% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [None]:
# Correlation matrix: pairwise correlations between the dataset's columns,
# drawn as an annotated heatmap with the colormap centered on zero.
correlations = data.corr()
heatmap_ax = sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title("Correlation Matrix of Advertising Dataset", fontsize=16)
plt.show()
# Reading the heatmap: TV and Radio advertising have a strong positive
# correlation with sales, while Newspaper advertising has a weak correlation
# with sales.

In [None]:
# Pairplot of all features: scatter plots show the relationship between each
# pair of features, and the diagonal shows each feature's own distribution
# (seaborn's default diagonal plot).
pair = sns.pairplot(data)
pair.figure.suptitle("Pairplot of Advertising Dataset", fontsize=16)
for ax in pair.axes.flatten():
    # get_xlabel()/get_ylabel() return "" for axes that carry no label; only
    # append the unit where there is a variable name to append it to, so those
    # axes are not given a bare " (USD dollars)" label.
    x_name = ax.get_xlabel()
    y_name = ax.get_ylabel()
    if x_name:
        ax.set_xlabel(f"{x_name} (USD dollars)")
    if y_name:
        ax.set_ylabel(f"{y_name} (USD dollars)")
# rect is (left, bottom, right, top): keep the top 5% free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()