In [37]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

In [38]:
sales_df = pd.read_csv('Advertising.csv')
sales_df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales,Unnamed: 4,TV^2,tv.radio,tv.news,radio^2,radio.news,news^2
0,230.1,37.8,69.2,22.1,,52946.0,8698.0,15923.0,1429.0,2616.0,4789.0
1,44.5,39.3,45.1,10.4,,,,,,,
2,17.2,45.9,69.3,9.3,,,,,,,
3,151.5,41.3,58.5,18.5,,,,,,,
4,180.8,10.8,58.4,12.9,,,,,,,


In [39]:
X = sales_df[['TV', 'Radio', 'Newspaper']]
y = sales_df['Sales']

In [40]:
# X = X.apply(lambda rec: (rec - rec.mean())/rec.std(), axis= 0)
# y = np.array((y - y.mean())/y.std())

In [41]:
X

Unnamed: 0,TV,Radio,Newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
...,...,...,...
195,38.2,3.7,13.8
196,94.2,4.9,8.1
197,177.0,9.3,6.4
198,283.6,42.0,66.2


### Scikit-Learn Library for Machine Learning

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, 
random_state = 42)

In [43]:
len(X_train)

140

In [44]:
len(X_test)

60

In [45]:
# Initializing the model
model = LinearRegression()
# Fitting training data to the model
model.fit(X_train, y_train)

LinearRegression()

In [46]:
model.intercept_

2.70894909251591

In [47]:
model.coef_

array([0.04405928, 0.1992875 , 0.00688245])

In [48]:
# Pair every feature name with its fitted coefficient so they are easier to read.
[(feature, weight) for feature, weight in zip(X_train.columns, model.coef_)]

[('TV', 0.0440592809574652),
 ('Radio', 0.19928749689893954),
 ('Newspaper', 0.006882452222275456)]

Sales = 2.709 + 0.044*TV + 0.199*Radio + 0.007*Newspaper

In [49]:
# Predicting the y value from the test set

In [50]:
y_pred = model.predict(X_test)

In [51]:
# Build a two-column DataFrame (indexed like y_test) that places the observed
# Sales values next to the model's predictions, rounded to 2 decimals.
test_pred_df = pd.DataFrame({'actual': y_test}).assign(
    predicted=np.round(y_pred, 2)
)

In [52]:
test_pred_df

Unnamed: 0,actual,predicted
95,16.9,16.57
15,22.4,21.19
30,21.4,21.55
158,7.3,10.89
128,24.7,22.2
115,12.6,13.36
69,22.3,21.2
170,8.4,7.35
174,11.5,13.28
45,14.9,15.12


# Below we check whether the model is overfitting (i.e. whether the coefficients are too large)

The next two cells compute the R² score on the training data and on the test data. The two scores are close (about 0.91 on train vs. 0.86 on test); such a small gap suggests the model is not overfitting, i.e. the coefficients are not too large.

In [53]:
# R^2 on the training split: score() predicts on X_train and compares the
# predictions with the actual y_train values.
r2 = model.score(X_train, y_train)
print('R Squared: ', r2)

R Squared:  0.9055159502227753


In [54]:
# R^2 on the held-out test split, to compare against the training R^2.
r2 = model.score(X_test, y_test)
print('R Squared: ', r2)

R Squared:  0.8609466508230368


In [55]:
# y_pred contains predicted value of test data
mse = metrics.mean_squared_error(y_test, y_pred)

In [56]:
# Taking square root of MSE and then round off to two decimal values
rmse = round(np.sqrt(mse), 2)
print('RMSE: ', rmse)

RMSE:  1.95


# So far we have split the data into train and test sets, fitted the model, predicted, and compared the train and test R² scores to confirm the model is not overfitting

### Validation Dataset

In [57]:
sales_df = pd.read_csv('Advertising.csv')
sales_df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales,Unnamed: 4,TV^2,tv.radio,tv.news,radio^2,radio.news,news^2
0,230.1,37.8,69.2,22.1,,52946.0,8698.0,15923.0,1429.0,2616.0,4789.0
1,44.5,39.3,45.1,10.4,,,,,,,
2,17.2,45.9,69.3,9.3,,,,,,,
3,151.5,41.3,58.5,18.5,,,,,,,
4,180.8,10.8,58.4,12.9,,,,,,,


# Here we split the data into three parts (training, validation, and test) instead of only train and test.

In [58]:
# First split: 70% of the rows go to training; the remaining 30% (X_other,
# y_other) are held back to be divided into validation and test sets next.
X_train, X_other, y_train, y_other = train_test_split(sales_df[['TV', 'Radio',
'Newspaper']], sales_df['Sales'], train_size=0.7, random_state = 42)

In [64]:
# Second split: divide the held-back 30% (X_other, y_other) in half, giving a
# validation set (X_eval, y_eval) and a test set (X_test, y_test), i.e. about
# 15% of the original rows each.
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, 
train_size=0.5, random_state = 42)

In [67]:
from sklearn.preprocessing import StandardScaler
# StandardScaler rescales each feature so that, on the data it was fitted on,
# the feature has mean 0 and standard deviation 1:
#     z = (value - mean) / standard deviation    (the z-score)
# Scaling matters when the features are measured in different units or on very
# different numeric ranges (e.g. weight in kg vs. height in cm), so that no
# feature dominates only because of its scale.

In [68]:
# Standardization (z-score): z = (X - mean(X)) / std(X), computed per feature.
scaler = StandardScaler()

In [69]:
X_train = scaler.fit_transform(X_train)

In [70]:
X_test = scaler.transform(X_test)

In [71]:
X_eval = scaler.transform(X_eval)

In [72]:
lm = LinearRegression()

In [14]:
lm.fit(X_train, y_train)

LinearRegression()

In [68]:
pred = lm.predict(X_eval)

In [69]:
metrics.mean_squared_error(y_eval, pred)

2.549038568717419

In [70]:
# Evaluate and compare models on the validation set (X_eval, y_eval) as usual; use the test set only once, at the very end.

### KFold Validation

In [79]:
sales_df = pd.read_csv('Advertising.csv')
# Printing first few records
sales_df.head()
X = sales_df[['TV', 'Radio', 'Newspaper']]
y = sales_df['Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [80]:
lm = LinearRegression()

In [81]:
# Standardize the features: the scaler's statistics are learned from the
# training split only and then reused, unchanged, on the test split.
# NOTE(review): the scaler is fitted on all of X_train before cross_val_score
# is called below, so every CV validation fold has already contributed to the
# scaling statistics; consider wrapping scaler + model in a Pipeline instead.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [82]:
# it is nothing but cross validation 
from sklearn.model_selection import cross_val_score

In [83]:
# 5-fold cross-validation of `lm` on the training data, scored with R^2.
# Arguments: the estimator, the features, the target, the metric name
# (scoring) and the number of folds (cv). One score per fold is returned.
scores = cross_val_score(lm, X_train, y_train, scoring = 'r2', cv=5)

In [84]:
# r2 = .75, .85
# MSE = -2.5, -3.5

In [85]:
# Registry of the built-in scoring metrics that can be passed as `scoring=`.
# `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.1 and removed in
# 1.3 (the deprecation warning is hidden by warnings.filterwarnings('ignore')),
# so when the old import is unavailable, rebuild the same name -> scorer
# mapping from the public replacement API.
try:
    from sklearn.metrics import SCORERS
except ImportError:
    from sklearn.metrics import get_scorer, get_scorer_names
    SCORERS = {name: get_scorer(name) for name in get_scorer_names()}

In [86]:
# number of score metrics , which you can perform as you want
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [87]:
# https://scikit-learn.org/stable/modules/model_evaluation.html

In [91]:
# Average R^2 across the CV folds (about 0.86 here). Report the signed mean:
# R^2 is negative when a model does worse than always predicting the mean, and
# wrapping it in abs() would make such a model look like a good one.
scores.mean()

0.8627872839147471

In [92]:
# if you want to know scores of each fold . Below are the outputs
scores

array([0.89667066, 0.92803914, 0.80641155, 0.91762366, 0.7651914 ])

In [93]:
lm.fit(X_train, y_train)

LinearRegression()

In [94]:
lm.coef_

array([ 3.76599021,  2.76548662, -0.00690986])

In [95]:
pred = lm.predict(X_test)

In [96]:
metrics.r2_score(y_test, pred)

0.9185780903322446

The choice of k is usually 5 or 10.

### Lasso Regression

# L1 Norm

In [102]:
# for perform Lasso Regularisation This library should be called
from sklearn.linear_model import Lasso

In [107]:
# X_train and X_test were already standardized in the KFold Validation section,
# so they are reused here as-is. Re-fitting a fresh StandardScaler on arrays
# that are already scaled leaves the data unchanged (mean 0 / std 1 already)
# but overwrites `scaler` with one fitted on scaled values, which can no longer
# transform raw features; that redundant re-fit has been removed.

In [108]:
# it is the penality factor represents as alpha
lasso_model = Lasso(alpha = 0.01)

In [109]:
lasso_model.fit(X_train, y_train)

Lasso(alpha=0.01)

In [110]:
# Lasso (L1) can shrink coefficients to exactly zero: here the third feature's
# coefficient comes out as 0 after fitting.
lasso_model.coef_

array([ 3.75598404,  2.7540475 , -0.        ])

In [111]:
predict = lasso_model.predict(X_test)

In [112]:
metrics.r2_score(y_test, predict)

0.9182975460753905

### Lasso with CV

In [116]:
# how will i know that 0.01 is the ideal Lasso penality factor which actually support with all Features
# for checking that we will perform Lasso with Cross validation
from sklearn.linear_model import LassoCV

In [119]:
# Supply three candidate penalty strengths explicitly; LassoCV evaluates each
# of them by cross-validation and keeps the best-scoring one.
lassoCV_model = LassoCV(alphas=[.1, 0.01, 10])

In [120]:
lassoCV_model.fit(X_train, y_train)

LassoCV(alphas=[0.1, 0.01, 10])

In [122]:
# After fitting, alpha_ holds the penalty chosen by cross-validation; of the
# three candidates supplied above, 0.1 was selected.
lassoCV_model.alpha_

0.1

In [155]:
predict = lassoCV_model.predict(X_test)

In [156]:
metrics.r2_score(y_test, predict)

0.915438906791447

In [124]:
# Alternatively, let LassoCV build the candidate list itself: with n_alphas=1000
# we do not supply any penalty factors; it generates a grid of 1000 penalty
# values (not random ones) and picks the best one by cross-validation.

In [126]:
lassoCV_model = LassoCV(n_alphas=1000)

In [127]:
lassoCV_model.fit(X_train, y_train)

LassoCV(n_alphas=1000)

In [131]:
# here are the choosen one which good suitable for performing the model
lassoCV_model.alpha_

0.09472651317300003

In [132]:
predict = lassoCV_model.predict(X_test)

In [133]:
metrics.r2_score(y_test, predict)

0.9156366097134607

# L2 Norm

In [134]:
# 2nd form of regularization is Ridge
from sklearn.linear_model import Ridge

In [135]:
# Same as we previous did. giving the penality factor
ridge_model  = Ridge(alpha = .1)

In [136]:
ridge_model.fit(X_train, y_train)

Ridge(alpha=0.1)

In [137]:
ridge_model.coef_

array([ 3.76333559,  2.76339947, -0.00593674])

In [138]:
predict = ridge_model.predict(X_test)

In [139]:
metrics.r2_score(y_test, predict)

0.918509515975792

In [146]:
# change alpha parameter penality factor

In [147]:
ridge_model  = Ridge(alpha = 5)

In [148]:
ridge_model.fit(X_train, y_train)

Ridge(alpha=5)

In [149]:
ridge_model.coef_

array([3.63784885, 2.66537911, 0.03829115])

In [150]:
predict = ridge_model.predict(X_test)

In [151]:
metrics.r2_score(y_test, predict)

0.9142964955815641

### Ridge with CV

In [152]:
from sklearn.linear_model import RidgeCV

In [153]:
ridge_modelCV  = RidgeCV(alphas=(0.1, 1.0, 10))

In [154]:
ridge_modelCV.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

In [155]:
# ridge_modelCV.cv_values_

In [156]:
ridge_modelCV.coef_

array([ 3.76333559,  2.76339947, -0.00593674])

In [157]:
predict = ridge_modelCV.predict(X_test)

In [158]:
metrics.r2_score(y_test, predict)

0.9185095159757849

### Elastic Net

In [159]:
from sklearn.linear_model import ElasticNet

In [161]:
# ElasticNet combines the lasso (L1) and ridge (L2) penalties; the extra
# parameter l1_ratio sets the mix. l1_ratio = 0.5 means 50% lasso and 50% ridge.
enet = ElasticNet(alpha = .01, l1_ratio = 0.5)

In [162]:
enet.fit(X_train, y_train)

ElasticNet(alpha=0.01)

In [163]:
pred = enet.predict(X_test)

In [164]:
metrics.r2_score(y_test, pred)

0.9179478260809931

### Elastic Net with CV

In [188]:
from sklearn.linear_model import ElasticNetCV

In [165]:
# l1_ratio lists the L1 shares to try (.1 = 10% L1, .5 = 50%, ... .9 = 90%);
# n_alphas=100 is the number of candidate alpha values evaluated for each
# l1_ratio (a generated grid, not random draws).
enetCV = ElasticNetCV(l1_ratio = [.1, .5, .7, .8, .9], 
                     n_alphas=100)

In [166]:
enetCV.fit(X_train, y_train)

ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.8, 0.9])

In [170]:
# here l1_raio is selected as 0.9
enetCV.l1_ratio_

0.9

In [171]:
# here alpha is selected as 0.088....
enetCV.alpha_

0.08848730467563237

In [172]:
pred = enetCV.predict(X_test)

In [173]:
metrics.r2_score(y_test, pred)

0.9150494125440356

### Grid Search CV

In [177]:
sales_df = pd.read_csv('Advertising.csv')
# Printing first few records
sales_df.head()
X = sales_df[['TV', 'Radio', 'Newspaper']]
y = sales_df['Sales'] 

In [178]:
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.7, random_state = 101)

In [179]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [180]:
base_model = ElasticNet()

# HYPERPARAMETER(Tuning the Hyperparameter)

In [None]:
# A hyperparameter is a setting chosen before training (e.g. alpha, l1_ratio) rather than learned from the data. Tuning means trying several values for it and keeping the one that scores best.

A single ElasticNet instance accepts only one value per parameter. To compare several values we therefore tune the hyperparameters: we try each candidate value in turn
until we reach the score we expect.

In [187]:
# Search grid passed to GridSearchCV below: each key is an ElasticNet parameter
# name (alpha, l1_ratio) and each value is the list of candidates to try for it.
param = {'alpha': [ 0.001, 0.01, 1, 5, 10, 50], 'l1_ratio': [.1, .5, .7, .95, .99, 1]}

In [188]:
from sklearn.model_selection import GridSearchCV

In [189]:
# Wrap the base ElasticNet instance in a grid search: the alpha / l1_ratio
# candidates in `param` are passed as param_grid and scored with 5-fold
# cross-validated R^2.
grid_model = GridSearchCV(estimator=base_model, param_grid=param, scoring='r2', 
                          cv=5,verbose=0)

In [190]:
grid_model.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.001, 0.01, 1, 5, 10, 50],
                         'l1_ratio': [0.1, 0.5, 0.7, 0.95, 0.99, 1]},
             scoring='r2')

In [192]:
# best_estimator_ is the ElasticNet configured with the best-scoring
# alpha / l1_ratio combination found by the grid search.
grid_model.best_estimator_

ElasticNet(alpha=0.01, l1_ratio=1)

In [193]:
# grid_model.cv_results_
# pd.DataFrame(grid_model.cv_results_)

In [203]:
y_pred = grid_model.predict(X_test)

In [204]:
metrics.r2_score(y_test, y_pred)

0.9182975460753906