In [22]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

In [23]:
url = 'https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/Advertising.csv'
df = pd.read_csv(url, usecols=[1,2,3,4])
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


## Train test split

In [24]:
df.columns

Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

In [25]:
x = df[['TV','Radio','Newspaper']]
y = df['Sales']

In [26]:
x

Unnamed: 0,TV,Radio,Newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
...,...,...,...
195,38.2,3.7,13.8
196,94.2,4.9,8.1
197,177.0,9.3,6.4
198,283.6,42.0,66.2


In [27]:
y

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: Sales, Length: 200, dtype: float64

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x,y, 
                                                    test_size=0.3,
                                                    random_state=7)

In [29]:
x_train.head()

Unnamed: 0,TV,Radio,Newspaper
88,88.3,25.5,73.4
58,210.8,49.6,37.7
113,209.6,20.6,10.7
149,44.7,25.8,20.6
36,266.9,43.8,5.0


In [30]:
x_test.head()

Unnamed: 0,TV,Radio,Newspaper
86,76.3,27.5,16.0
120,141.3,26.8,46.2
22,13.2,15.9,49.6
11,214.7,24.0,4.0
195,38.2,3.7,13.8


In [31]:
y_train.head()

88     12.9
58     23.8
113    15.9
149    10.1
36     25.4
Name: Sales, dtype: float64

In [32]:
y_test.head()

86     12.0
120    15.5
22      5.6
11     17.4
195     7.6
Name: Sales, dtype: float64

## LinearRegression

In [34]:
import sklearn
from sklearn.linear_model import LinearRegression

In [36]:
model = LinearRegression()
model

In [37]:
model.fit(x_train,y_train)

In [38]:
model.score(x_train,y_train) #R^2

0.8970470429900155

In [39]:
model.intercept_

2.5971913990213054

In [40]:
model.coef_

array([ 4.71259657e-02,  1.90987993e-01, -1.93812266e-05])

In [42]:
model.predict([[300, 50, 70],
               [200, 80, 50],
               [100,90,30]])



array([26.28302406, 27.30045491, 24.4981259 ])

In [43]:
y_hat = model.predict(x_train)
y_hat

array([11.62718541, 22.00361874, 16.40893908,  9.63081303, 23.54028882,
        4.19019633,  6.33372661,  9.16205331, 18.6252243 ,  9.30278314,
       10.02220298, 12.81990978, 16.3270569 , 15.55142217,  8.18788222,
       17.73045095, 24.32738343,  7.98376479, 12.34504657, 23.42048119,
       10.19514611, 17.25432468, 15.17745615, 16.45100472, 16.83421976,
       16.94913148, 14.92719588, 12.08456358, 17.31673623, 20.75868218,
       12.31079013, 14.68033357, 12.00487231,  4.23083643, 11.88897069,
       18.47946596, 17.82345369, 10.62147609, 21.24463139,  3.40353011,
       20.37338186,  9.53396325,  8.95973195, 10.92777341,  8.97930171,
       17.62344551, 20.54153871, 12.60435152, 13.71875285, 17.33050151,
       15.22001133, 20.47732861,  9.86186951, 14.93210984, 13.95134117,
       12.31718956, 16.32085396,  7.16859379, 18.16059791, 20.91470709,
       20.02490033, 18.17883475, 18.49821442, 17.0839661 , 19.35270232,
        4.19833505, 12.50911724, 17.95656145,  6.91422545,  9.90

In [45]:
train = pd.concat([x_train,y_train],axis='columns')
train

Unnamed: 0,TV,Radio,Newspaper,Sales
88,88.3,25.5,73.4,12.9
58,210.8,49.6,37.7,23.8
113,209.6,20.6,10.7,15.9
149,44.7,25.8,20.6,10.1
36,266.9,43.8,5.0,25.4
...,...,...,...,...
151,121.0,8.4,48.7,11.6
67,139.3,14.5,10.2,13.4
25,262.9,3.5,19.5,12.0
196,94.2,4.9,8.1,9.7


In [48]:
dc = pd.concat([train.reset_index(), pd.Series(y_hat, name = 'Predicted')],axis='columns')
dc.head()

Unnamed: 0,index,TV,Radio,Newspaper,Sales,Predicted
0,88,88.3,25.5,73.4,12.9,11.627185
1,58,210.8,49.6,37.7,23.8,22.003619
2,113,209.6,20.6,10.7,15.9,16.408939
3,149,44.7,25.8,20.6,10.1,9.630813
4,36,266.9,43.8,5.0,25.4,23.540289


In [49]:
y_hat_test = model.predict(x_test)
y_hat

array([11.62718541, 22.00361874, 16.40893908,  9.63081303, 23.54028882,
        4.19019633,  6.33372661,  9.16205331, 18.6252243 ,  9.30278314,
       10.02220298, 12.81990978, 16.3270569 , 15.55142217,  8.18788222,
       17.73045095, 24.32738343,  7.98376479, 12.34504657, 23.42048119,
       10.19514611, 17.25432468, 15.17745615, 16.45100472, 16.83421976,
       16.94913148, 14.92719588, 12.08456358, 17.31673623, 20.75868218,
       12.31079013, 14.68033357, 12.00487231,  4.23083643, 11.88897069,
       18.47946596, 17.82345369, 10.62147609, 21.24463139,  3.40353011,
       20.37338186,  9.53396325,  8.95973195, 10.92777341,  8.97930171,
       17.62344551, 20.54153871, 12.60435152, 13.71875285, 17.33050151,
       15.22001133, 20.47732861,  9.86186951, 14.93210984, 13.95134117,
       12.31718956, 16.32085396,  7.16859379, 18.16059791, 20.91470709,
       20.02490033, 18.17883475, 18.49821442, 17.0839661 , 19.35270232,
        4.19833505, 12.50911724, 17.95656145,  6.91422545,  9.90

In [50]:
test = pd.concat([x_test,y_test],axis='columns')
test.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
86,76.3,27.5,16.0,12.0
120,141.3,26.8,46.2,15.5
22,13.2,15.9,49.6,5.6
11,214.7,24.0,4.0,17.4
195,38.2,3.7,13.8,7.6


In [52]:
dt = pd.concat([test.reset_index(), pd.Series(y_hat_test, name = 'Predicted')],axis='columns')
dt


Unnamed: 0,index,TV,Radio,Newspaper,Sales,Predicted
0,86,76.3,27.5,16.0,12.0,11.444762
1,120,141.3,26.8,46.2,15.5,14.373673
2,22,13.2,15.9,49.6,5.6,6.255002
3,11,214.7,24.0,4.0,17.4,17.298771
4,195,38.2,3.7,13.8,7.6,5.103791
5,2,17.2,45.9,69.3,9.3,12.172764
6,121,18.8,21.7,50.4,7.0,7.626622
7,94,107.4,14.0,10.9,11.5,10.332141
8,66,31.5,24.6,2.2,9.5,8.779921
9,63,102.7,29.6,8.4,14.0,13.09011


In [54]:
dt.corr()

Unnamed: 0,index,TV,Radio,Newspaper,Sales,Predicted
index,1.0,0.216008,-0.091564,0.001199,0.095705,0.128056
TV,0.216008,1.0,0.128668,0.079286,0.812386,0.868771
Radio,-0.091564,0.128668,1.0,0.364127,0.587212,0.60288
Newspaper,0.001199,0.079286,0.364127,1.0,0.227696,0.245546
Sales,0.095705,0.812386,0.587212,0.227696,1.0,0.946814
Predicted,0.128056,0.868771,0.60288,0.245546,0.946814,1.0


## Stats model

In [55]:
train, test = train_test_split(df, train_size=0.7, random_state=5)

In [56]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [57]:
model_1 = smf.ols(formula='Sales ~ TV + Radio + Newspaper', data = train).fit()

In [58]:
print(model_1.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.900
Model:                            OLS   Adj. R-squared:                  0.898
Method:                 Least Squares   F-statistic:                     407.0
Date:                Wed, 15 Jun 2022   Prob (F-statistic):           1.04e-67
Time:                        16:12:58   Log-Likelihood:                -273.79
No. Observations:                 140   AIC:                             555.6
Df Residuals:                     136   BIC:                             567.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.9788      0.370      8.046      0.0

In [59]:
model_1.params #coeff.

Intercept    2.978767
TV           0.046685
Radio        0.185016
Newspaper   -0.001530
dtype: float64

In [60]:
model_1.pvalues

Intercept    3.757802e-13
TV           6.635435e-57
Radio        3.098625e-37
Newspaper    8.239084e-01
dtype: float64