In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
# Load dataset
df = pd.read_csv('ads.csv')

In [3]:
# Data Profiling and Analysis
print(df.head())
print(df.info())

   Unnamed: 0     TV  radio  newspaper  sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
3           4  151.5   41.3       58.5   18.5
4           5  180.8   10.8       58.4   12.9
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  200 non-null    int64  
 1   TV          200 non-null    float64
 2   radio       200 non-null    float64
 3   newspaper   200 non-null    float64
 4   sales       200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB
None


In [5]:
# Feature Engineering
# Creating Interaction feature
df['TV_radio_interaction'] = df['TV'] * df['radio']
print(df.describe())

       Unnamed: 0          TV       radio   newspaper       sales  \
count  200.000000  200.000000  200.000000  200.000000  200.000000   
mean   100.500000  147.042500   23.264000   30.554000   14.022500   
std     57.879185   85.854236   14.846809   21.778621    5.217457   
min      1.000000    0.700000    0.000000    0.300000    1.600000   
25%     50.750000   74.375000    9.975000   12.750000   10.375000   
50%    100.500000  149.750000   22.900000   25.750000   12.900000   
75%    150.250000  218.825000   36.525000   45.100000   17.400000   
max    200.000000  296.400000   49.600000  114.000000   27.000000   

       TV_radio_interaction  
count            200.000000  
mean            3490.309900  
std             3360.740127  
min                0.000000  
25%              773.445000  
50%             2069.065000  
75%             5516.197500  
max            13540.410000  


In [6]:
# Selecting dependent and independent variables
X = df.loc[:, ["TV", "radio", "TV_radio_interaction"]]  # DataFrame 2-Dimension
y = df['sales']  # Series 1-Dimension

In [7]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
# Machine Learning model: Using Support Vector Regression(SVR)
model = Pipeline([('scaler', StandardScaler()), ('svm', SVR(kernel = 'rbf', gamma = 'scale'))])
model.fit(X_train, y_train)

In [9]:
y_pred = model.predict(X_test)

In [10]:
# using MSE, MAE and r2 score to check accuracy
print("Using Support Vector Regression")
print(f'mean_squared_error = {mean_squared_error(y_true = y_test, y_pred = y_pred)}')
print(f'mean_absolute_error = {mean_absolute_error(y_true = y_test, y_pred = y_pred)}')
print(f'r2_score = {r2_score(y_true = y_test, y_pred = y_pred)}')

Using Support Vector Regression
mean_squared_error = 0.9447123111344264
mean_absolute_error = 0.6729553763518041
r2_score = 0.9700695580288785


In [11]:
# Model dumping
with open("modelSVR.pickle", 'wb') as file:
    pickle.dump(model, file)