In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error

# Linear Regression

* Basis of many more recent and complex learning methods
* Quantitative (continuous real-valued, $\in R^1$) response (Y) as a linear function of one or more features (X)
    - e.g. height as a function of weight, temperature as a function of pressure

#### Linear Regression Model (single feature)

* **Data** $\{(x_1,y_1),(x_2,y_2),...,(x_n,y_n)\}, x_i \in R^d, y_i \in R$, n is the number of observations (i.e. samples)  


* The expected value of Y given X as a linear function of X (i.e. a line)

<div style="font-size: 125%;">
$$ E[Y|X] = f(X) = b_0 + b_1X$$
</div>

#### Uses

* Explain the relationship between response variable and feature variable(s)
    - Strength of association (i.e. the correlation)
    - How much of the variance of the response can be explained by the predictors?
* Predict the response variable from the feature variables

#### Types

* Simple Linear Regression
    - One quantitative feature
* Multiple Regression
    - Two or more features
    - Quantitative or categorical features
* Polynomial Regression
    - Polynomial terms of single feature



## Simple Linear Regression

* Dependent variable (response, outcome variable) y is a function of a single independent (feature, predictor) variable x
* y = mx + b, where m is the slope and b is the intercept
    - $y,x,b \in R$
* Does y increase or decrease as x increases or decreases and by how much?


In [None]:
howell = pd.read_csv("Howell.csv",sep=';')
howell.tail()

In [None]:
adult = howell.query("age > 17")
adult.tail()

In [None]:
sns.distplot(adult.height);

#### Plot relationship

In [None]:
xlabel,ylabel = 'weight','height'
plt.scatter(adult.weight, adult.height, color = 'red')
plt.title(f'{ylabel} vs. {xlabel}')
plt.xlabel(xlabel)
plt.ylabel(ylabel);

In [None]:
print('Missing values:\n',adult.isnull().sum())
print(f'\nCorrelation: {np.round(np.corrcoef(adult.weight, adult.height)[0,1],3)}')


### Linear Model for Simple Linear Regression

* Dependent variable y (height) is a linear function of the independent variable x (weight)

<div style="font-size: 125%;">
$$ y = b_0 + b_1*x + \epsilon$$
</div>

y is the dependent variable  
x is the independent variable  
$b_0$ is the intercept  
$b_1$ is the slope  
$\epsilon$ is the error term (noise)

#### Fitting the model

* The parameters of the model are $b_0$ and $b_1$ 
* Learn $b_0$ and $b_1$ from the data
* $b_0$ and $b_1$ will determine a line through the data

### Goal: Find best fitting line

* Learn $b_0$ and $b_1$ to find the best fitting line
* Use best fitting line to predict new data
* The best fitting line is the line with the smallest error $\epsilon$
    - It's an **optimization** problem
* We want to fit the linear regression model to the data to estimate the values of the coefficients that minimize the error

<div style="font-size: 125%;">
$$\hat{y_i} = \hat{b}_0 + \hat{b}_1x_i$$
</div>

* $x_i$ is the ith value of the predictor
* $\hat{y_i}$ is the predicted response (or fitted value) for observation i
* $\hat{b}_0$ is an estimate of the intercept
* $\hat{b}_1$ is an estimate of the slope

### Model Code

#### sklearn.LinearRegression

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

* Methods
    - fit: fit model to training data
    - predict: predict new data
    - score: R-squared
    
* intercept_ and coef_ to get the fitted parameters

#### Create arrays from data frame

* x should be 2-dimensional

In [None]:
# The feature column is reshaped to 2-D, shape (n_samples, 1), as required for x in this notebook's model-fitting cells
X = adult.weight.values.reshape(-1,1)
# The response (height) is kept as a 1-D array
y = adult.height.values

X.shape, X.ndim, y.shape,y.ndim

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=1234)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Create linear model and fit to data

In [None]:
# Fit the linear model on the training split only; the test split is held back for evaluation
model = LinearRegression()
model.fit(X_train, y_train)
# There is a single feature here, so the slope is the first (and only) entry of coef_
print(f'Intercept: {model.intercept_} Slope: {model.coef_[0]} ')

#### Predict new points

In [None]:
new = np.array([30.0,45.0,65.0])
predictions = model.predict(new.reshape(-1,1)) # Model fitted to 2-d array
predictions

In [None]:
xlabel,ylabel = 'Weight','Height'

plt.scatter(X, y, color = 'red')
plt.plot(new,predictions,'bo')
plt.title(f'Predict {new}')
plt.xlabel(xlabel)
plt.ylabel(ylabel);

### Regression Line

#### Predict each value of test set using the  model fitted to the training data

In [None]:
yhat = model.predict(X_test) # model.intercept_  + model.coef_[0]* X_test

In [None]:
plt.scatter(X_test, y_test, color = 'red') # the data points
plt.plot(X_test, yhat, color = 'blue') # the predicted points on the line
plt.plot(new,predictions,'bo')
plt.title(f'Best Fitting Line, y = {model.intercept_:.2f} + {model.coef_[0]:.2f}*x')
plt.xlabel(xlabel)
plt.ylabel(ylabel);

### Ordinary Least Squares (OLS) (i.e. how the model does it)

* Method for estimating the coefficients

#### Residuals: Difference between actual response value $y_i$ and fitted value $\hat{y_i}$

<div style="font-size: 125%;">
$$\epsilon_i = y_i - \hat{y_i}$$
</div> 

* Best fitting line is one that makes this prediction error "as small as possible" 

#### Minimize Square Error

* One way is to minimize the sum of the squared prediction errors: Ordinary Least Squares method 
* Find parameters $\hat{b}_0$ and $\hat{b}_1$ that minimizes 

<div style="font-size: 125%;">
$$RSS = SSE = \sum{\epsilon^2_i} = \sum{(y_i - \hat{y_i})^2} = \sum{(y_i - \hat{b}_0 - \hat{b}_1x_i)^2}$$
</div>

 * RSS is called the Residual Sum of Squares, SSE = Sum of Squared Estimate of Errors

#### Plot Residuals

In [None]:
# Test observations (red) drawn against the fitted regression line (blue)
plt.scatter(X_test, y_test, color = 'red')
plt.plot(X_test, yhat, color = 'blue')
plt.title('Residuals')
plt.xlabel(xlabel)
plt.ylabel(ylabel)
# One green vertical segment per observation, running from the fitted value to the actual value
for x_i, fitted, actual in zip(X_test, yhat, y_test):
    plt.plot((x_i, x_i), (fitted, actual), 'g-')
#plt.savefig("Residuals.png")


##### Why squared error? Why not absolute error?
 
* Distance measure
* Smooth function, differentiable

#### Least squares estimate for the coefficients $\hat{b}_0$ and $\hat{b}_1$
 
* Take the partial derivative with respect to $\hat{b}_1$, set it to 0 and solve for $\hat{b}_1$

<div style="font-size: 125%;">

$$\frac{\partial(RSS)}{\partial(\hat{b}_1)} = 0$$$$\hat{b}_1 = \frac{\sum^n_{i=1}(x_i-\bar{x})(y_i-\bar{y})}{\sum^n_{i=1}(x_i-\bar{x})^2}$$

$$\hat{b}_0 = \bar{y} - \hat{b}_1\bar{x}$$
</div>

$\bar{x}$ - mean of x  
$\bar{y}$ - mean of y

#### Note:

<div style="font-size: 125%;">
$$\hat{b}_1 = \frac{COV(x,y)}{VAR(x)}$$
</div>

#### Derivation

https://are.berkeley.edu/courses/EEP118/current/derive_ols.pdf


### Interpreting the coefficients
 
* $\hat{b}_0$: the value of the response when the predictor is equal to 0
    - The estimated mean of y, the dependent variable, when x = 0
* $\hat{b}_1$: the amount (in units of y) that the mean response will increase or decrease by for every one unit increase in x.

#### What do $\hat{b}_0$ and $\hat{b}_1$ estimate?

* Population Mean
* Will get a different estimate with a different sample


### Assumptions of Least Squares
 
* **Linear** relationship between response and predictor variables
* Error terms are **Independent**
* The error term is **Normally distributed** $N(0,\sigma^2)$
* Homoscedastic: **Equal** variance of the error term
* Under these assumptions, the Method of Least Squares is the Maximum Likelihood Estimate (MLE)
    - MLE is a probabilistic method (will cover later)

In [None]:
def residuals(y,yhat):
    """Return the prediction errors: actual values minus fitted values.

    Parameters
    ----------
    y : actual response values
    yhat : fitted (predicted) values, subtracted from ``y``

    Returns
    -------
    The result of ``y - yhat``.
    """
    errors = y - yhat
    return errors

In [None]:
# Compute the test-set residuals once and reuse them for both the plot and the fit
test_residuals = residuals(y_test,yhat)
# Histogram of the residuals with a fitted normal curve overlaid
sns.distplot(test_residuals,fit=stats.norm);
# stats.norm.fit returns the fitted parameters; the first two are printed as mean and standard deviation
params = stats.norm.fit(test_residuals)
print(f'Mean: {np.round(params[0],3)}, Standard Deviation: {np.round(params[1],3)}')

#### Check for Homoscedasticity

* Should not be funnel shaped

In [None]:
plt.plot(yhat,residuals(y_test,yhat),'bo')
plt.title("Fitted vs. Residuals")
plt.xlabel("Fitted")
plt.ylabel("Residuals");

### Assessing the accuracy of the model

 * Goodness of fit

####  R-squared (coefficient of determination) 
 
* Fraction of the total variance in y explained by the predictor(s)
* A number between 0 and 1 (i.e. independent of the scale of Y)
* $R^2$ close to 1 means a large proportion of the variance in the response is explained by the regression
* $R^2$ close to 0 means that not much of the variance is explained: wrong model, inherently high variance or both 
* Residual Sum of Squares (RSS): squared difference of actual response value and fitted values (i.e. the residuals)
* Total Sum of Squares (TSS): squared difference of actual response value and mean response value.

<div style="font-size: 125%;"> 
$$R^2 = \frac{TSS - RSS}{TSS} = 1 - \frac{RSS}{TSS}$$
$$TSS = \sum^n_{i=1}(y_i-\bar{y})^2$$
</div>

In [None]:
def rss(y,yhat):
    """Residual Sum of Squares: the sum of the squared residuals.

    Parameters
    ----------
    y : actual response values
    yhat : fitted (predicted) values

    Returns
    -------
    Sum over all observations of ``(y - yhat)**2``.
    """
    errors = residuals(y,yhat)
    squared_errors = errors**2
    return np.sum(squared_errors)

In [None]:
def tss(y):
    """Total Sum of Squares: squared deviations of ``y`` from its mean, summed.

    Parameters
    ----------
    y : actual response values

    Returns
    -------
    Sum of ``(y - mean(y))**2``.
    """
    deviations = y - np.mean(y)
    return np.sum(deviations**2)

def R_squared(y,yhat):
    """Coefficient of determination, computed as 1 - RSS/TSS.

    Parameters
    ----------
    y : actual response values
    yhat : fitted (predicted) values

    Returns
    -------
    ``1 - rss(y, yhat) / tss(y)``.
    """
    unexplained_fraction = rss(y,yhat)/tss(y)
    return 1 - unexplained_fraction

R_squared(y_test,yhat)

#### R-squared from model

In [None]:
model.score(X_test,y_test)

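* For a single predictor, R-squared is equal to Pearson's Correlation squared (exactly on the data the model was fit to; on held-out test data the two values are only approximately equal)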
<div style="font-size: 125%;"> 
$$R^2 = r^2$$
</div> 

In [None]:
r = stats.pearsonr(X_test[:,0],y_test)[0]
r**2


#### Mean Squared Error

* The average of the squared error (i.e. residuals)

$$MSE = \frac{1}{n}\sum(y - \hat{y})^2$$

* Root Mean Squared Error
    - Same unit as response variable

$$RMSE = \sqrt{\frac{1}{n}\sum(y - \hat{y})^2}$$

In [None]:
def mse(y,yhat):
    """Mean Squared Error: the average of the squared prediction errors.

    Parameters
    ----------
    y : actual response values
    yhat : fitted (predicted) values

    Returns
    -------
    Mean of ``(y - yhat)**2``.
    """
    errors = y - yhat
    return np.mean(errors**2)

def rmse(y,yhat):
    """Root Mean Squared Error: the square root of the mean squared prediction error.

    Parameters
    ----------
    y : actual response values
    yhat : fitted (predicted) values

    Returns
    -------
    ``sqrt(mean((y - yhat)**2))``.
    """
    errors = y - yhat
    mean_squared = np.mean(errors**2)
    return np.sqrt(mean_squared)

In [None]:
print(f'MSE: {mse(y_test,yhat)}\nRMSE: {rmse(y_test, yhat)}')

### Exercises

1. Change value of random_state in train_test_split

2. Change test_size in train_test_split


## Multiple Linear Regression

* The response is a linear function of p predictors

<div style="font-size: 125%;">
$$Y = \beta_0 + \beta_1{X_1} + \beta_2{X_2} +...+\beta_p{X_p} + \epsilon $$
</div>

* $X_j$ is the jth predictor and $\beta_j$ is the average effect on Y of a one unit increase in $X_j$ holding all other predictors fixed
    
* Estimating Regression Coefficients
<div style="font-size: 125%;">
$$\hat{y} = \hat{b}_0 + \hat{b}_1{x_1} + \hat{b}_2{x_2} +...+ \hat{b}_p{x_p}$$
</div>
Minimize:
<div style="font-size: 125%;">
$$RSS = \sum^n_{i=1}(y_i - \hat{y}_i)^2 = \sum^n_{i=1}(y_i - \hat{b}_0 - \hat{b}_1{x_{i1}} - \hat{b}_2{x_{i2}} -...- \hat{b}_p{x_{ip}})^2$$
</div>



### Matrix Formulation

* X is called the Design Matrix (1's in the first column for the intercept and the remaining columns are the predictors)  
* $\beta = (b_0,b_1,...,b_p)$
* How to find the $\beta$ that minimizes the RSS? 
    - Take the partial derivative of RSS with respect to $\beta$, 
    - Set it equal to 0
    - Solve for $\beta$
* $X\beta$ is the dot product, $(y-X\beta)^T(y-X\beta)$ is the Squared Error

<div style="font-size: 125%;">
$$RSS(\beta) = (y-X\beta)^T(y-X\beta)$$
$$\frac{\partial{RSS}}{\partial{\beta}} = -2X^T(y-X\beta)$$
$$X^T(y-X\beta) = 0$$
$$X^Ty = X^TX\beta$$
$$\beta = (X^TX)^{-1}X^Ty$$
</div>
* Called the Normal Equation, closed form solution to the optimization

### Issues with multiple predictors

#### Which predictors best explain the response?

    
#### Is there multicollinearity (i.e. are two or more predictors highly correlated)?

In [None]:
sns.pairplot(adult,diag_kind='kde');

In [None]:
adult.corr()

In [None]:
# Feature matrix for multiple regression: one column each for weight, age and male (in that order).
# The response y is not rebuilt here; the y created earlier from adult.height is reused by the split below.
X = adult.loc[:,['weight','age','male']].values
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Standardize the continuous predictors (columns 0-1: weight, age) using statistics
# learned from the training split only, then re-attach the remaining column ('male')
# unscaled. Previously the [:,0:2] slice silently dropped 'male', so the "full" model
# was fitted on weight and age alone.
# NOTE(review): 'male' is presumably a 0/1 indicator, which is why it is passed through
# without scaling - confirm against the data.
scaler = StandardScaler()
X_train = np.hstack([scaler.fit_transform(X_train[:,0:2]), X_train[:,2:]])
X_test = np.hstack([scaler.transform(X_test[:,0:2]), X_test[:,2:]])
X_train.shape,X_test.shape

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
print(f'Intercept: {model.intercept_} Coefficients:: {model.coef_} ')

In [None]:
yhat = model.predict(X_test)
yhat.shape

In [None]:
np.round(model.score(X_test,y_test),2)

In [None]:
np.round(rmse(y_test,yhat),2)

#### Exercise

1. Create and run a model with just weight and age as predictors. Compare with full model

## Polynomial Linear Regression
 
* Linear model assumes a linear relationship between response and predictors
* But what if the relationship is non-linear? Can we extend the model to fit these cases?
* Polynomial Regression

<div style="font-size: 115%;">
$$y = \beta_0 + \beta_1 X + \beta_2 X^2 + \epsilon$$
</div>

* **This is still a linear model - linear in the coefficients**

<div style="font-size: 115%;">
$$Z = X^2$$
$$y = \beta_0 + \beta_1 X + \beta_2 Z + \epsilon$$
</div>


### Data

How does height vary with weight for all subjects?

* Dependent Variable: height
* Independent Variable: weight

In [None]:
X = howell.weight.values.reshape(-1,1)
y = howell.height.values

plt.scatter(X, y, color = 'red')
plt.title(f'{ylabel} vs. {xlabel} all ages')
plt.xlabel(xlabel)
plt.ylabel(ylabel);

#### Split data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Fit model

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

#### Predict test data

In [None]:
yhat = model.predict(X_test)
yhat.shape

#### Model Parameters

In [None]:
print(f'Intercept: {model.intercept_} Coefficients: {model.coef_} ')

#### Model Accuracy

In [None]:
# Score on the held-out test split (not the full data set, which includes the training
# rows) so that R-squared and RMSE are computed on the same unseen observations.
print(f'R-squared: {np.round(model.score(X_test,y_test),2)}')

print(f'RMSE: {np.round(rmse(y_test,yhat),2)}')



In [None]:
plt.scatter(X, y, color = 'red')
plt.plot(X_test, yhat, color = 'blue')
plt.title('Regression Line')
plt.xlabel(xlabel)
plt.ylabel(ylabel);

### Try adding a quadratic term

<div style="font-size: 125%;">
$$ height = \beta_0 + \beta_1\cdot{weight} + \beta_2\cdot{weight^2} + \epsilon$$
</div>

In [None]:
howell['weight2'] = howell.weight**2
howell.head()

#### Transform to array

In [None]:
X2 = howell.loc[:,['weight','weight2']].values
X2.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size = 0.2, random_state = 1234)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Fit quadratic model

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)



#### Predict test data

In [None]:
yhat = model.predict(X_test)
yhat.shape

#### Model Parameters

In [None]:
print(f'Intercept: {model.intercept_} Coefficients: {model.coef_} ')

#### Model Accuracy

In [None]:
# Score on the held-out test split (not the full X2, which includes the training rows)
# so that R-squared and RMSE are computed on the same unseen observations.
print(f'R-squared: {np.round(model.score(X_test,y_test),2)}')

print(f'RMSE: {np.round(rmse(y_test,yhat),2)}')



In [None]:
X_test[:,0].shape

In [None]:
plt.scatter(X, y, color = 'red')
plt.plot(X_test[:,0], yhat, 'bo')
plt.title('Predicted Values')
plt.xlabel(xlabel)
plt.ylabel(ylabel);

#### Exercise

1. Add cubic term to model and compare with quadratic model.