In [1]:
# Mount Google Drive in the Colab runtime so files under /content/gdrive can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd

# Read the advertising dataset from the Drive folder mounted at /content/gdrive.
csv_path = "/content/gdrive/MyDrive/Colab Notebooks/ml/Advertising.csv"
data = pd.read_csv(csv_path)

In [3]:
# Remove the "Unnamed: 0" column (presumably the row index that was saved with the CSV).
# Reassigning instead of inplace=True, together with errors="ignore", makes this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Features: every column except the target.
X = data.drop(columns="Sales")
# Target: double brackets keep it as a one-column DataFrame rather than a Series.
y = data[["Sales"]]

# **Statsmodels**
Statsmodels is a Python library for building regression models and analyzing the effect of independent variables on a dependent variable. Its OLS (Ordinary Least Squares) estimator fits a linear regression model and reports a statistical evaluation of the results.

In [5]:
# Fit an ordinary least squares regression with statsmodels
import statsmodels.api as sm
# NOTE(review): X is passed as-is, so no intercept column is part of the design matrix;
# the summary printed by this cell accordingly reports "R-squared (uncentered)".
# Confirm that a regression through the origin is intended.
lm = sm.OLS(endog=y, exog=X)  # model specification: response y, regressors X
model = lm.fit()  # estimate the coefficients
print(model.summary())  # text table describing the fit

                                 OLS Regression Results                                
Dep. Variable:                  Sales   R-squared (uncentered):                   0.982
Model:                            OLS   Adj. R-squared (uncentered):              0.982
Method:                 Least Squares   F-statistic:                              3566.
Date:                Fri, 13 Sep 2024   Prob (F-statistic):                   2.43e-171
Time:                        04:27:20   Log-Likelihood:                         -423.54
No. Observations:                 200   AIC:                                      853.1
Df Residuals:                     197   BIC:                                      863.0
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [6]:
# sm.OLS(y, X) creates the model object (a linear regression model)
# lm.fit() fits (trains) the model that was created
# summary() is a statsmodels method; it returns the model's summary table

# **Sklearn**

**Model creation**

In [7]:
from sklearn.linear_model import LinearRegression
# Create an unfitted scikit-learn linear regression estimator.
# This rebinds `lm`, which previously held the statsmodels OLS object.
lm = LinearRegression()

In [8]:
# Fit the regression on the full dataset (every row of X and y).
model = lm.fit(X, y)

In [9]:
# Intercept term of the fitted model
model.intercept_

array([2.93888937])

In [10]:
# Fitted coefficients, one per column of X (TV, Radio, Newspaper)
model.coef_

array([[ 0.04576465,  0.18853002, -0.00103749]])

In [11]:
# model equation: Sales ≈ 2.94 + 0.046*TV + 0.189*Radio - 0.001*Newspaper

**Prediction**

In [12]:
# One hypothetical advertising budget to score. It is built as a DataFrame whose
# column names are TV, Radio and Newspaper so predict() receives named columns.
variables = pd.DataFrame({"TV": [10], "Radio": [20], "Newspaper": [30]})

In [13]:
# Predict Sales for the budget held in `variables`; the printed result is a 2-D array.
predictions = model.predict(variables)
print(predictions)

[[7.13601137]]


**Model performance**

In [14]:
from sklearn.metrics import mean_squared_error

# In-sample mean squared error: actual Sales versus the model's predictions on the same X.
MSE = mean_squared_error(y_true=y, y_pred=model.predict(X))
MSE

2.784126314510936

In [15]:
import numpy as np
# Root mean squared error: the square root of the MSE computed above.
RMSE = np.sqrt(MSE)
RMSE

1.6685701407225697

# **Model Validation**
**Hold-out method**

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [17]:
X_train.head() # first rows of the training features (independent variables)

Unnamed: 0,TV,Radio,Newspaper
108,13.1,0.4,25.6
107,90.4,0.3,23.2
189,18.7,12.1,23.4
14,204.1,32.9,46.0
56,7.3,28.1,41.4


In [18]:
y_train.head() # first rows of the training target (dependent variable)

Unnamed: 0,Sales
108,5.3
107,8.7
189,6.7
14,19.0
56,5.5


In [19]:
X_test.head() # first rows of the test features (independent variables)

Unnamed: 0,TV,Radio,Newspaper
58,210.8,49.6,37.7
40,202.5,22.3,31.6
34,95.7,1.4,7.4
102,280.2,10.1,21.4
184,253.8,21.3,30.0


In [20]:
y_test.head() # first rows of the test target (dependent variable)

Unnamed: 0,Sales
58,23.8
40,16.6
34,9.5
102,14.8
184,17.6


In [21]:
# Refit a fresh estimator on the training split only; the test split is not used for fitting.
lm = LinearRegression()
model = lm.fit(X_train, y_train)

In [23]:
# Training-set MSE: error on the same rows the model was fitted on.
mse = mean_squared_error(y_true=y_train, y_pred=model.predict(X_train))
mse

3.0168306076596774

In [25]:
# Test-set MSE: error on the held-out rows (this overwrites the training value in `mse`).
mse = mean_squared_error(y_true=y_test, y_pred=model.predict(X_test))
mse

1.9918855518287906

# **K-fold cross validation**

In [26]:
from sklearn.model_selection import cross_val_score

In [36]:
# 10-fold cross-validation on the training split. The "neg_mean_squared_error" scorer
# reports negated MSE, which is why every fold score shown is negative.
cross_val_score(model, X_train, y_train, cv=10, scoring="neg_mean_squared_error")

array([-1.62375953, -3.81875608, -3.43828142, -2.27748673, -7.25325414,
       -1.88303708, -2.80517715, -3.68594486, -1.12810834, -3.96330989])

In [37]:
# Mean MSE across the 10 folds (sign flipped because the scorer returns negated MSE).
(-cross_val_score(model, X_train, y_train, cv=10, scoring="neg_mean_squared_error")).mean()

3.187711520944357

In [38]:
# Cross-validated RMSE: square root of the mean fold MSE.
np.sqrt((-cross_val_score(model, X_train, y_train, cv=10, scoring="neg_mean_squared_error")).mean())

1.7854163438661463