# In [None]:
#-----------------------------
# Problem Statement:ADVERTISING SALES PREDICTION
#This dataset contains information about advertising spend on different platforms (TV, Radio, Newspaper)
#and the resulting sales. The goal is to build a model that can predict sales based on advertising spend.
#-----------------------------
# Importing libraries
#-----------------------------------------------------------

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
#-----------------------------------------------------------
# Load the advertising spend / sales records from the CSV file
data_ads = pd.read_csv(filepath_or_buffer='Advertising.csv')
# Keep the raw frame untouched; every later step works on this deep copy
data = data_ads.copy(deep=True)
# Preview the first five rows to confirm the file was read
print(data.head(n=5))

"""
EXPLORATORY DATA ANALYSIS
1.Getting to know the data
2.Data processing(missing values)
3.Cross tables and data visualization

"""

# --- Getting to know the data ------------------------------------------
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly instead of being wrapped in print() (which would
# add a stray "None" line to the output).
data.info()
print("Data columns with null values:\n",data.isnull().sum())
#No null values Detected
#Summary of Numerical data
summary_num = data.describe()
print(summary_num)

"""
              TV        Radio    Newspaper     Sales
count  200.000000  200.000000   200.000000  200.00000
mean   147.042500   23.264150    30.554000   14.02250
std     85.854236   14.846809    21.778621    5.21745
min      0.700000    0.000000     0.300000    1.60000
25%     74.375000   11.250000    12.075000   10.37500
50%    149.050000   22.900000    25.750000   12.90000
75%    218.825000   31.275000    45.100000   17.40000
max    296.400000  112.000000   105.400000   27.00000"""

# Reading the recorded summary above: TV and Radio have mean ~= median,
# which is consistent with a roughly symmetric spread, while Newspaper
# (mean 30.6 vs median 25.8) and Sales (mean 14.0 vs median 12.9) have
# mean > median, which points to some right skew.

# Summary of categorical values -- disabled by the author, presumably
# because the frame has no object (string) columns at this point; confirm.
#   summary_cat = data.describe(include='O')
#   print(summary_cat)
#   print(data.columns)

# Frequency of each value per column, used to spot special characters or
# other unexpected entries. Row truncation is switched off so that every
# distinct value is printed (this is a global pandas display option).
pd.set_option('display.max_rows', None)
print(data['TV'].value_counts())
print(data['Radio'].value_counts())
print(data['Newspaper'].value_counts())
print(data['Sales'].value_counts())
# Author's finding: no special/unexpected values in any of the columns.

# --- Duplicate handling ------------------------------------------------
# Show the fully duplicated rows (a bare expression prints nothing when
# this file is run as a script, so it is printed explicitly).
print("Duplicate rows:\n", data[data.duplicated()])
# drop_duplicates() returns a new frame; the result has to be assigned
# back, otherwise the duplicates silently stay in `data`.
data = data.drop_duplicates()

#Normalized value counts (percentage)
# NOTE(review): this counts the column *labels*, so each column name simply
# gets an equal share; it says nothing about the values in the data.
# Confirm whether a per-column value_counts(normalize=True) was intended.
print(data.columns.value_counts(normalize=True))

#DATA VISUALIZATION:

# Every chart below gets its own figure/axes and an explicit plt.show().
# Without an `ax`, DataFrame.boxplot and Series.plot draw onto the current
# matplotlib axes, so in a script (or a single notebook cell) the box plots
# would be painted over the last histogram panel and over each other.

#Histogram for numerical columns (DataFrame.hist creates its own figure)
data.hist(figsize=(10, 5))
plt.show()
# Author's reading of the histograms:
#   TV:        most TV spend sits in the lower range, with very few high
#              spends.
#   Radio:     right-skewed; most radio spend is under 30, few high spends.
#   Newspaper: right-skewed; most newspaper spend is under 50, very high
#              spends are rare.
#   Sales:     right-skewed; the bulk of sales values lie between 10 and 20,
#              with a few very high values.
# NOTE(review): the TV shape was originally labelled "left-skewed", but a
# concentration of low values with a thin tail of high values is right
# (positive) skew. Please re-check the TV and Sales readings against the
# plots and against data.describe().

#Box plots for outlier detection
fig, ax = plt.subplots(figsize=(10, 5))
data.boxplot(ax=ax)
plt.show()
# Author's reading of the box plots:
#   TV, Radio, Newspaper, Sales: compressed boxes, i.e. most values are
#   clustered in a narrow range, each with a few high-value outliers.
#   Newspaper: a relatively small box (the middle 50% of spends are
#   similar) with numerous high outliers, i.e. a few campaigns with very
#   high newspaper spend.

#Box plot for sales
fig, ax = plt.subplots()
data["Sales"].plot(kind="box", ax=ax)
plt.show()
#Author's reading: the plot is right skewed. Outliers were detected.

#Box plot for TV
fig, ax = plt.subplots()
data["TV"].plot(kind="box", ax=ax)
plt.show()
#Author's reading: the plot is also right skewed, with a few outliers.
# NOTE(review): verify the TV outliers numerically against the 1.5*IQR
# fences computed from data['TV'].quantile(...) before relying on this.

#Scatter plot between Sales and TV to find the relation between them
# (DataFrame.plot opens a new figure on its own)
data.plot(kind="scatter", x="TV", y="Sales")
plt.show()
#A strong positive correlation is observed between the two variables
#This may make TV a strong factor while preparing the model.

# Number of missing entries per column
print(data.isna().sum())
# Rows that contain at least one missing value
missing = data.loc[data.isna().any(axis=1)]
print(missing)
# Author's finding: no missing values in any column

# Pairwise Pearson correlation, restricted to the numeric columns
numeric_data = data.select_dtypes(include=np.number)
correlation = numeric_data.corr(method="pearson")
print(correlation)
"""
                   TV     Radio  Newspaper     Sales
TV          1.000000  0.054809   0.056648  0.782224
Radio       0.054809  1.000000   0.354104  0.576223
Newspaper   0.056648  0.354104   1.000000  0.228299
Sales       0.782224  0.576223   0.228299  1.000000"""
# Reading of the recorded matrix above:
#   TV vs Sales        -> high positive correlation (0.78)
#   Radio vs Sales     -> moderate positive correlation (0.58)
#   Newspaper vs Sales -> low positive correlation (0.23)

# Column overview before building cross tables
print(data.columns)

# Proportion table: share of rows for every distinct Sales value
Sales = pd.crosstab(
    index=data["Sales"],
    columns='count',
    normalize=True,
)
print(Sales)
# On its own this table is not informative; Sales has to be looked at
# together with the spend columns to find the factors that drive it.

#Sales vs TV
# Add a "TV_Category" column: pd.cut with bins=5 splits the observed TV
# range into five equal-width intervals and labels them from lowest to
# highest spend.
data["TV_Category"] = pd.cut(data["TV"], bins=5, labels=['Very_Low_TV', 'Low_TV', 'Medium_TV', 'High_TV', 'Very_High_TV'])

#Using a boxplot
plt.figure(figsize=(15,8))
sns.boxplot(x='Sales', y='TV_Category', data=data)
plt.xlabel("Sales")
plt.ylabel("TV_Category")
plt.title("Distribution of Sales by TV_Category")
plt.show()
# Author's reading of the box plot:
#   - Very_High_TV boxes sit far to the right, i.e. high median sales.
#   - Very_High_TV also has a very wide box, i.e. its sales are spread out.
#   - Very_Low_TV boxes cluster on the far left, i.e. low sales.

#Using a bar plot
# observed=False is spelled out because grouping by a categorical column
# without it relies on a default that newer pandas versions warn about.
TV_sales = data.groupby('TV_Category', observed=False)["Sales"].mean().sort_values(ascending=False)
plt.figure(figsize=(12, 8))
# `order` is passed so the bars follow the descending sort computed above;
# without it seaborn should arrange categorical data in category order.
sns.barplot(x=TV_sales.values, y=TV_sales.index, order=list(TV_sales.index))
plt.xlabel("Average Sales")
plt.ylabel("TV_Category")
plt.title("Average Sales by TV_Category")
plt.show()
# Author's reading of the bar plot: Very_High_TV has the highest average
# sales, followed by High_TV, Medium_TV and so on; Very_Low_TV and Low_TV
# have low average sales.

#Sales vs Radio
# Bin the Radio spend into five labelled intervals, stored as "Radio_Category"
data["Radio_Category"] = pd.cut(
    data["Radio"],
    bins=5,
    labels=[
        'Very_Low_Radio',
        'Low_Radio',
        'Medium_Radio',
        'High_Radio',
        'Very_High_Radio',
    ],
)
# Scatter plot of Sales against Radio spend, drawn on a fresh 10x6 figure
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=data, x='Radio', y='Sales')
ax.set_xlabel("Radio")
ax.set_ylabel("Sales")
ax.set_title("Sales vs. Radio")
ax.grid(True)
plt.show()
# Author's reading: a positive correlation, with some outliers

#Sales vs Newspaper
# Bin the Newspaper spend into five labelled intervals, stored as
# "Newspaper_Category"
data["Newspaper_Category"] = pd.cut(
    data["Newspaper"],
    bins=5,
    labels=[
        'Very_Low_News',
        'Low_News',
        'Medium_News',
        'High_News',
        'Very_High_News',
    ],
)
# Scatter plot of Sales against Newspaper spend, drawn on a fresh 10x6 figure
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=data, x='Newspaper', y='Sales')
ax.set_xlabel("Newspaper")
ax.set_ylabel("Sales")
ax.set_title("Sales vs. Newspaper")
ax.grid(True)
plt.show()
# Author's reading: right skewness, with a few outliers

#Sales vs TV

# Scatter plot of Sales against TV spend, drawn on a fresh 10x6 figure
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=data, x='TV', y='Sales')
ax.set_xlabel("TV")
ax.set_ylabel("Sales")
ax.set_title("Sales vs. TV")
ax.grid(True)
plt.show()
# This relationship was already examined earlier in the analysis

#DATA PREPROCESSING
# Order of operations:
#   1. one-hot encode the binned spend categories
#   2. drop TV outliers with the 1.5*IQR rule
#   3. split into train / test
#   4. standardise the numeric features using statistics of the TRAINING
#      rows only
# The scaler is fitted after the split so that no information from the test
# rows leaks into preprocessing, and the target (Sales) is left unscaled so
# that the error metrics are expressed in the original Sales units.

#Handling categorical values
# drop_first=True drops one dummy per category so the dummies are not
# perfectly collinear with the intercept.
data = pd.get_dummies(data, columns=['TV_Category', 'Radio_Category', 'Newspaper_Category'], drop_first=True)

#Outlier Handling
# Only TV is filtered with the interquartile-range rule (author's choice).
# The fences scale with the data, so applying them to the raw TV values
# selects the same rows as applying them to standardised values.
Q1 = data['TV'].quantile(0.25)
Q3 = data['TV'].quantile(0.75)
IQR = Q3 - Q1
rows_before = len(data)
data = data[~((data['TV'] < (Q1 - 1.5 * IQR)) | (data['TV'] > (Q3 + 1.5 * IQR)))]
# Report the real number of removed rows instead of a hard-coded figure.
# NOTE(review): the original comment claimed 200-190 = 10 rows removed;
# check that against this printed count.
print(f"Rows removed as TV outliers: {rows_before - len(data)}")
print(data.columns)

#Creating a ML Model

# X contains all the predictive features
X = data.drop('Sales', axis=1)
# y is the target variable we want to predict (kept in original units)
y = data['Sales']

#Splitting the data so we can train the model and test it on unseen data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the continuous spend columns to mean 0 / std 1.
# fit_transform learns mean and std from the training rows; the test rows
# are only transformed with those learned statistics.
numeric_features = ['TV', 'Radio', 'Newspaper']
scaler = StandardScaler()
# Explicit copies so the column assignments below never write into a view
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

#Create and Train the Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

#Make Predictions on the Test Data
predictions = model.predict(X_test)

#R-squared score
r2 = metrics.r2_score(y_test, predictions)
print(f"R-squared Score: {r2:.2%}")

#Mean Absolute Error (MAE), in the same units as Sales
mae = metrics.mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error (MAE): {mae:,.2f}")
# Previously recorded results, obtained with a standardised target and a
# scaler fitted on the full data set: R-squared 90.39%, MAE 0.24 (in
# standard deviations of Sales). Re-run to refresh these figures.