In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which files are available in the read-only input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

In [None]:
## Load the advertising data, report its (rows, columns) shape and show summary statistics
dataset = pd.read_csv('../input/advertising.csv')
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))
dataset.describe()

In [None]:
## Scatter plot: money spent on TV ads (x) against Sales (y)
tv_ax = dataset.plot(x='TV', y='Sales', style='o')
# Label the axes object returned by pandas directly instead of relying on pyplot's "current axes"
tv_ax.set_title('TV vs Sales')
tv_ax.set_xlabel('Money spent on TV Ads')
tv_ax.set_ylabel('Sales')
plt.show()

In [None]:
## Scatter plot: money spent on Newspaper ads (x) against Sales (y)
newspaper_ax = dataset.plot(x='Newspaper', y='Sales', style='o')
# Label the axes object returned by pandas directly instead of relying on pyplot's "current axes"
newspaper_ax.set_title('Newspaper vs Sales')
newspaper_ax.set_xlabel('Money spent on Newspaper Ads')
newspaper_ax.set_ylabel('Sales')
plt.show()

In [None]:
## Scatter plot: money spent on Radio ads (x) against Sales (y)
radio_ax = dataset.plot(x='Radio', y='Sales', style='o')
# Label the axes object returned by pandas directly instead of relying on pyplot's "current axes"
radio_ax.set_title('Radio vs Sales')
radio_ax.set_xlabel('Money spent on Radio Ads')
radio_ax.set_ylabel('Sales')
plt.show()

In [None]:
## Distribution of the target variable (Sales): histogram with a KDE curve overlaid
plt.figure(figsize=(15,10))
# `distplot` is deprecated in seaborn and scheduled for removal; `histplot` with
# kde=True and stat='density' draws the same kind of figure (density-normalised
# histogram plus KDE curve).
seabornInstance.histplot(dataset['Sales'], kde=True, stat='density')
# tight_layout only has an effect once something has been drawn, so call it after plotting
plt.tight_layout()
plt.show()
## Original author's reading of the plot: the average seems to lie roughly between 15 and 20

In [None]:
## Basic preprocessing: report, per column, whether it contains any missing (NaN) value
has_missing_by_column = dataset.isna().any()
print(has_missing_by_column)

In [None]:
## Split the data into "attributes" (independent variables) and "labels"
## (the dependent variable whose values are to be predicted).
## Selecting with a list of one column keeps the result two-dimensional, i.e. shape (n_samples, 1).
X = dataset[['TV']].to_numpy()     # Attributes/Features
y = dataset[['Sales']].to_numpy()  # Label

In [None]:
## Hold out 20% of the rows as a test set and keep 80% for training (fixed seed for repeatability)
simple_split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = simple_split

In [None]:
# Simple linear regression model: construct and train in one step (fit returns the estimator)
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
## The linear regression model finds the intercept and slope of the line that best fits the training data.
## Print both values as estimated for our dataset.

# Intercept of the fitted line
print('Intercept:', regressor.intercept_)
# Slope of the fitted line (coefficient of TV)
print('Slope:', regressor.coef_)

## Interpretation: the slope is expressed in Sales units per unit of TV spend, not as a percentage.
## Each additional unit spent on TV ads changes the predicted Sales by the slope printed above
## (the original author reported a value of about 0.054 -- confirm against the printed output).

In [None]:
## Predict Sales for the held-out test rows and tabulate them next to the true values
y_pred = regressor.predict(X_test)
# Both arrays are column vectors here; flatten them to 1-D so each becomes one DataFrame column
actual_sales = y_test.flatten()
predicted_sales = y_pred.flatten()
df = pd.DataFrame({'Actual': actual_sales, 'Predicted': predicted_sales})
df

In [None]:
## Grouped bar chart: actual vs. predicted Sales for every test row
df1 = df
comparison_ax = df1.plot(kind='bar', figsize=(16,10))
# Style the major and minor grids on the axes returned by pandas
comparison_ax.grid(which='major', linestyle='-', linewidth='0.5', color='green')
comparison_ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
## Visualise the fitted line (red) on top of the actual test observations (gray)

fit_ax = plt.gca()
fit_ax.scatter(X_test, y_test, color='gray')
fit_ax.plot(X_test, y_pred, color='red', linewidth=2)
plt.show()

In [None]:
## Three common error metrics for regression, computed on the test set:
##   MAE  - mean of the absolute errors
##   MSE  - mean of the squared errors
##   RMSE - square root of the MSE

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

In [None]:
## In the cells above we modelled Sales using TV spend only, whereas real-world problems usually require several explanatory variables
## to be considered. Linear regression involving multiple variables is called "multiple linear regression" (sometimes loosely called multivariate linear regression).

## The steps to perform multiple linear regression are almost similar to that of simple linear regression. The difference lies in the evaluation. 
## You can use it to find out which factor has the highest impact on the predicted output and how different variables relate to each other.

In [None]:
## For multiple linear regression the preprocessing steps above stay the same;
## only the feature matrix changes: it now holds all three advertising channels.

feature_columns = ['TV', 'Newspaper', 'Radio']
target_column = 'Sales'

X = dataset[feature_columns]
y = dataset[target_column]

In [None]:
## Split the multi-feature data: 80% of the rows for training, 20% for testing (fixed seed for repeatability)
multi_split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = multi_split

In [None]:
## Multiple linear model: construct and train in one step (fit returns the estimator)
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
## In multiple linear regression the model estimates one coefficient per attribute.
## Tabulate the fitted coefficients, indexed by feature name.

coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
coeff_df

## How to read the table displayed by this cell:
## each coefficient is the change in predicted Sales (in Sales units, not percent) for a
## one-unit increase in that channel's spend while the other two channels are held fixed.
## Values reported by the original author (confirm against the displayed table):
## 1. TV:        about +0.05 Sales units per unit of TV spend
## 2. Newspaper: about -0.003 Sales units per unit of Newspaper spend (a slight decrease)
## 3. Radio:     about +0.11 Sales units per unit of Radio spend

In [None]:
## Predict Sales for the held-out test rows using the model trained on all three attributes.
## Note: this rebinds `y_pred`, replacing the predictions of the earlier single-feature model.
y_pred = regressor.predict(X_test)


In [None]:
# Put true and predicted Sales side by side, then keep only the first 25 rows for display
comparison_columns = {'Actual': y_test, 'Predicted': y_pred}
df = pd.DataFrame(comparison_columns)
df1 = df.iloc[:25]
print(df1)

In [None]:
# Grouped bar chart of actual vs. predicted Sales for the rows kept in df1
comparison_ax = df1.plot(kind='bar', figsize=(10,8))
# Style the major and minor grids on the axes returned by pandas
comparison_ax.grid(which='major', linestyle='-', linewidth='0.5', color='green')
comparison_ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
## Hold-out error of the multiple regression model on the 20% test split
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


## Compare this RMSE with the one printed for the simple (TV-only) model. The original author
## observed about 2.5 for the simple model and a lower value here, i.e. using all three
## channels gave more accurate predictions than the single-feature model.


## Alternative estimate of the RMSE: 5-fold cross-validation over the whole dataset
from sklearn.model_selection import cross_val_score

# With scoring='neg_mean_squared_error' scikit-learn returns the NEGATED MSE of each fold
# (scorers follow a "higher is better" convention), so the sign has to be flipped before
# the value can be read as an error or passed to a square root.
MSEs = cross_val_score(regressor, X, y, scoring='neg_mean_squared_error', cv=5)
mean_MSE = -np.mean(MSEs)
cv_RMSE = np.sqrt(mean_MSE)

print("Cross-validated Mean Squared Error (5-fold):", mean_MSE)
print("Cross-validated Root Mean Squared Error (5-fold):", cv_RMSE)

Overall, the RMSE seems to be greater than 10% of the mean Sales value, which means the model is not very accurate, although it can still make reasonable predictions. Below are some of the factors that limit the model's accuracy and that could be addressed to improve it.

1. Need more data: We need to have a huge amount of data to get the best possible prediction. 
2. Poor features: The features we used may not have had a high enough correlation to the values we were trying to predict.

# ADD-ON 

### Regularization penalty for linear models

Regularization penalizes parameters for being too large and keeps them from being weighted too heavily. The penalty grows with the size of the coefficient (quadratically for the ridge/L2 penalty, linearly for the lasso/L1 penalty), so the larger a coefficient gets, the heavier the penalty.
Regularization is used to keep your sample's idiosyncrasies from having too much influence on your model. It's a trade-off: you get a more generalized model, but it loses accuracy (in the sample/training set).

#### Problem with Linear Regression Model

linear models actually look at average squared error.

Finding such a line of best fit based on minimizing this mean squared error is actually a really easy problem to solve and there are tons of algorithms (the most common one being ordinary least squares(OLS)) that will find a solution for us. And the solution is guaranteed to be optimal from the perspective of minimizing average squared error within the dataset given.

One key observation is that even though the linear model may be optimal for the data given to create the model, it is not necessarily guaranteed to be the best model for predictions on unseen data.
One of the primary reasons that our model ends up performing on unseen data so poorly is directly related to the complexity of the model. 
If our underlying data follows a relatively simple model, and the model we use is too complex for the task, what we are essentially doing is we are putting too much weight on any possible change or variance in the data. Our model is overreacting and overcompensating for even the slightest change in our data. People in the field of statistics and machine learning call this phenomenon overfitting.


### Ridge Regression

Ridge regression addresses this problem of the linear model by adding a penalty to models that have overly large coefficients. Remember that fitting a linear model is just a matter of minimizing the average squared error between the true data points and the data points estimated by our model. To get ridge regression, we add a constraint that penalizes models with large coefficients: specifically, we penalize the sum of the squares of the coefficients.

Let's see the Math behind this

The norm of a vector 𝑤 is denoted as ‖𝑤‖.

With this knowledge in hand, let's see how the math works out. In ordinary least squares, given a dataset $X$ and the true values $y$ (in our example the matrix $X$ has three columns, TV, Newspaper and Radio, and one row per observation), we try to find a set of weights or coefficients $w$ that minimizes the average squared error, i.e.

$$\frac{1}{N}\lVert Xw - y\rVert^2$$

Now, in ridge regression we try find a set of coefficients 𝑤 to minimize

$$\frac{1}{N}\left(\lVert Xw - y\rVert^2 + \lambda\lVert w\rVert^2\right)$$

Notice that the only difference is the term $\lambda\lVert w\rVert^2$. What does this mean? It turns out this is a natural way to express the idea of making sure that our weights do not get too large. Notice that if our coefficients become too large, the norm of $w$ becomes increasingly large; consequently, ridge regression will avoid these solutions. The $\lambda$ parameter is a so-called hyperparameter that represents the degree to which we want to penalize complex models.
In a way, you can think of ridge regression as a way to incorporate prior knowledge into a linear model. If you know ahead of time that your model is likely to be simple, ridge regression is likely preferable to OLS.

The idea of penalizing complex models through large coefficients is central to ridge regression and a general powerful tool in machine learning and statistics called regularization. Regularization is something which battles against overfitting.


In [None]:
## Ridge regression on the advertising dataset: choose the regularisation strength by 5-fold cross-validation

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Candidate regularisation strengths; `alpha` is scikit-learn's name for the lambda
# multiplying the squared-norm penalty lambda * ||w||^2
alpha = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]

ridge = Ridge()
# Reuse the list above instead of repeating the literal values a second time
parameters = {'alpha': alpha}
ridge_regressor = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5)
ridge_regressor.fit(X, y)

# best_score_ is the mean cross-validated score of the best alpha; with
# scoring='neg_mean_squared_error' that score is the NEGATED MSE (closer to 0 is better)
print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)

### LASSO Regression

The main goal of introducing Regularization in a statistical model is to avoid over-fitting. The same thing can be said and interpreted in many different ways -  to improve out-of-sample model performance or to reduce model complexity. Another important issue tackled by using Regularization is handling Multicollinearity i.e. it can fit a model to data even when we have correlated features.

Another method of regularization for statistical models is LASSO (Least Absolute Shrinkage and Selection Operator).

Lasso regression analysis is a shrinkage and variable selection method for linear regression models. The goal of lasso regression is to obtain the subset of predictors that minimizes prediction error for a quantitative response variable. The lasso does this by imposing a constraint on the model parameters that causes the regression coefficients for some variables to shrink toward zero. Variables with a regression coefficient equal to zero after the shrinkage process are excluded from the model. Variables with non-zero regression coefficients are the ones most strongly associated with the response variable.

To test a lasso regression model, you will need to identify a quantitative response variable from your data set if you haven’t already done so, and choose a few additional quantitative and categorical predictor (i.e. explanatory) variables to develop a larger pool of predictors. Having a larger pool of predictors to test will maximize your experience with lasso regression analysis.

The lasso regression analysis will help you determine which of your predictors are most important. 
##### Note also that if you are working with a relatively small data set, you do not need to split your data into training and test data sets. The cross-validation method you apply is designed to eliminate the need to split your data when you have a limited number of observations.

For more detailed math related to regularization refer the below link.
http://www.holehouse.org/mlclass/07_Regularization.html


In [None]:
## LASSO regression on the advertising dataset: pick alpha by 5-fold cross-validated grid search

from sklearn.linear_model import Lasso

lasso = Lasso()
parameters = dict(alpha=[1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20])
lasso_regressor = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_regressor.fit(X, y)

# Best alpha found and its mean cross-validated score
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)


#### Thus, regularization methods are applied to the linear regression model by tuning their hyperparameters, which helps to mitigate multicollinearity and overfitting when predicting on unseen data.