In [53]:
# Common imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import random
from sklearn.linear_model import LinearRegression
import math

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10, 5)

In [54]:
import os
import urllib
import shutil

def download_file(url, dir_path="data"):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
        
    file_name = os.path.split(url)[-1]
    file_path = os.path.join(dir_path, file_name)
    
    with urllib.request.urlopen(url) as response, open(file_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
        
    return file_path

In [55]:
from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler

In [56]:
download_file("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv")
adv_df = pd.read_csv('data/Advertising.csv', usecols=[1,2,3,4])
adv_df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [57]:
adv_df['log_tv'] = adv_df.TV.apply(lambda x: math.pow(x, 0.4))

дата сет разделен 50/50

In [58]:
adv_df_test = adv_df.sample(int(len(adv_df)*0.5))
# create transactions dataset for train
adv_train = adv_df[~adv_df.isin(adv_df_test)].copy()
print("Total transactions in train dataset: ", len(adv_train))
# create transactions dataset for test
adv_test = adv_df[adv_df.isin(adv_df_test)].copy()
print("Total transactions in test dataset: ", len(adv_test))

Total transactions in train dataset:  200
Total transactions in test dataset:  200


In [59]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 200.2349956038338
RSE: 1.0107458238487674
R^2: 0.9273049082016976


In [60]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_test).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 165.22761992229803
RSE: 0.9181492584694774
R^2: 0.9368194886475539


In [61]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 269.7967207754173
RSE: 1.173249283616756
R^2: 0.9020506014720118


In [62]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_test).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 268.16303073846944
RSE: 1.1696917235996402
R^2: 0.8974585640352025


In [63]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 263.7072834762901
RSE: 1.1599333273333527
R^2: 0.9042613648908893


дата сет разделен 70/30

In [64]:
adv_df_test = adv_df.sample(int(len(adv_df)*0.7))
# create transactions dataset for train
adv_train = adv_df[~adv_df.isin(adv_df_test)].copy()
print("Total transactions in train dataset: ", len(adv_train))
# create transactions dataset for test
adv_test = adv_df[adv_df.isin(adv_df_test)].copy()
print("Total transactions in test dataset: ", len(adv_test))

Total transactions in train dataset:  200
Total transactions in test dataset:  200


In [65]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 125.55972098639923
RSE: 0.8003816723033953
R^2: 0.9361058034203864


In [66]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_test).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 233.58677300139493
RSE: 1.091681844681461
R^2: 0.9315456100577986


In [67]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 183.5052313239895
RSE: 0.9676007338899842
R^2: 0.9066187848181623


In [68]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_test).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 357.01429931249777
RSE: 1.3496301445598273
R^2: 0.8953742296875106


In [69]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 183.501073630311
RSE: 0.9675897723217142
R^2: 0.9066209005643198


In [70]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_test).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 357.0128583605467
RSE: 1.3496274209236088
R^2: 0.895374651969498


In [71]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 183.501073630311
RSE: 0.9675897723217142
R^2: 0.9066209005643198


дата сет разделен 80/20

In [72]:
adv_df_test = adv_df.sample(int(len(adv_df)*0.1))
# create transactions dataset for train
adv_train = adv_df[~adv_df.isin(adv_df_test)].copy()
print("Total transactions in train dataset: ", len(adv_train))
# create transactions dataset for test
adv_test = adv_df[adv_df.isin(adv_df_test)].copy()
print("Total transactions in test dataset: ", len(adv_test))

Total transactions in train dataset:  200
Total transactions in test dataset:  200


In [73]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 321.59483528227855
RSE: 1.280933244134913
R^2: 0.9333801307021505


In [74]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_test).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 43.19457059261591
RSE: 0.4694469748621499
R^2: 0.9209524042683456


In [75]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 489.27635504211105
RSE: 1.5799708648859072
R^2: 0.8986441222079234


In [76]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_test).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 44.07445248139375
RSE: 0.47420423397236244
R^2: 0.9193421892139655


In [77]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_train).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 488.7879819463587
RSE: 1.579182140989505
R^2: 0.8987452909713429


In [78]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_test).fit()
rss = np.sum(three_x_lm.resid ** 2)
print("RSS:", rss)
print("RSE:", np.sqrt(rss / (adv_df.shape[0] - 3 - 1)))
print("R^2:", three_x_lm.rsquared)

RSS: 42.17042799158744
RSE: 0.46384829939866973
R^2: 0.9228266215411873
