In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

### Metrics for choosing the best model from a set of models (for Linear Regression)

* We want the model with the lowest test error, but training error alone is a poor (overly optimistic) estimate of test error
* Determine model with best generalization by **adjusting** training error
* Penalizes training error for more complex models (i.e. models with more predictors)

In [None]:
# Read the Boston dataset from the working directory and preview its last
# rows to confirm the file parsed as expected.
boston = pd.read_csv(filepath_or_buffer='Boston.csv')
boston.tail(n=5)

In [None]:
# Every column except the last is a predictor; the last column is the target.
# NOTE(review): assumes the response is the final column of Boston.csv and
# that no leftover index column sits among the predictors -- confirm.
X = boston.iloc[:, :-1].values
y = boston.iloc[:, -1].values

# Hold out 25% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1234
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Full model: least-squares fit on all predictors. `yhat` holds the
# predictions on the training rows themselves (used for training-set metrics).
model = LinearRegression()
model.fit(X_train, y_train)
yhat = model.predict(X_train)

In [None]:
# Reduced model: keep only the first two predictor columns.
X2 = X[:, :2]

# Same test_size and random_state as the full-model split, so both models
# are compared under identical split settings.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y, test_size=0.25, random_state=1234
)
print(X_train2.shape, X_test2.shape, y_train2.shape, y_test2.shape)

# Fit the two-predictor model and predict on its own training rows.
model2 = LinearRegression()
model2.fit(X_train2, y_train2)
yhat2 = model2.predict(X_train2)

#### Adjusted $R^2$: 

* Can't use $R^2$ to compare models with different numbers of predictors: it never decreases (and typically increases) as predictors are added, even uninformative ones
    
$$\text{Adjusted }R^2 = 1 - \frac{RSS/(n - d - 1)}{TSS/(n - 1)} = 1 - \frac{(1 - R^2)(n-1)}{(n - d -1)}$$

n: number of observations  
d: number of predictors (excluding the intercept; the $-1$ in $n - d - 1$ accounts for it)  
RSS: Residual Sum of Squares  
TSS: Total Sum of Squares  

* Adjusted R2 is always less than or equal to R2. 
    - = 1 indicates a model that perfectly predicts the target values 
    - <= 0 indicates a model that has no predictive value. 

In [None]:
def rss(y,yhat):
    """Residual sum of squares: sum over observations of (y - yhat)^2.

    Parameters
    ----------
    y : array-like
        Observed target values.
    yhat : array-like
        Predicted target values, one per observation in ``y``.

    Returns
    -------
    numpy.float64
        The sum of squared residuals.

    Notes
    -----
    Inputs are converted to float arrays so that plain lists work and
    unsigned-integer inputs cannot wrap around on subtraction.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)

    # A flat vector (n,) minus a column vector (n, 1) would broadcast to an
    # (n, n) matrix and silently inflate the result. When the two inputs hold
    # the same number of values and differ only by length-1 axes, drop those
    # axes so the subtraction is element-wise.
    if y.shape != yhat.shape and y.size == yhat.size:
        y_flat, yhat_flat = np.squeeze(y), np.squeeze(yhat)
        if y_flat.shape == yhat_flat.shape:
            y, yhat = y_flat, yhat_flat

    return np.sum((y - yhat)**2)
    
def tss(y):
    """Total sum of squares: squared deviations of ``y`` from its own mean, summed."""
    deviations = y - np.mean(y)
    return np.sum(deviations**2)

def R_squared(y,yhat):
    """Coefficient of determination, computed as 1 - RSS/TSS.

    ``y`` are the observed values and ``yhat`` the predictions for them.
    """
    unexplained = rss(y, yhat)
    total = tss(y)
    return 1 - unexplained / total

def adjr2_1(y,yhat,d):
    """Adjusted R^2 from the RSS/TSS form: 1 - (RSS/(n-d-1)) / (TSS/(n-1)).

    Parameters
    ----------
    y : array-like
        Observed target values; ``n`` is taken as ``len(y)``.
    yhat : array-like
        Predicted values for the same observations.
    d : int
        Number of predictors in the model (the extra ``- 1`` in the
        denominator accounts for the intercept).

    Returns
    -------
    float
        The adjusted R^2.

    Raises
    ------
    ValueError
        If ``n - d - 1 <= 0``. The residual degrees of freedom are then zero
        or negative, and the formula would divide by zero or return a
        meaningless value.
    """
    n = len(y)
    resid_dof = n - d - 1
    if resid_dof <= 0:
        raise ValueError(
            "adjusted R^2 needs n - d - 1 > 0 "
            "(got n=%d, d=%d)" % (n, d)
        )
    return 1 - ((rss(y,yhat)/resid_dof)/(tss(y)/(n-1)))

def adjr2_2(y,yhat,d):
    """Adjusted R^2 from the R^2 form: 1 - (1 - R^2)(n - 1) / (n - d - 1).

    Parameters
    ----------
    y : array-like
        Observed target values; ``n`` is taken as ``len(y)``.
    yhat : array-like
        Predicted values for the same observations.
    d : int
        Number of predictors in the model (the extra ``- 1`` in the
        denominator accounts for the intercept).

    Returns
    -------
    float
        The adjusted R^2.

    Raises
    ------
    ValueError
        If ``n - d - 1 <= 0``. The residual degrees of freedom are then zero
        or negative, and the formula would divide by zero or return a
        meaningless value.
    """
    n = len(y)
    resid_dof = n - d - 1
    if resid_dof <= 0:
        raise ValueError(
            "adjusted R^2 needs n - d - 1 > 0 "
            "(got n=%d, d=%d)" % (n, d)
        )
    return 1 - (((1-R_squared(y,yhat))*(n-1))/resid_dof)


In [None]:
# Training-set R^2 of the full model.
R_squared(y=y_train, yhat=yhat)

In [None]:
# Both algebraic forms of adjusted R^2 on the training fit, shown side by side.
(
    adjr2_1(y_train, yhat, d=X.shape[1]),
    adjr2_2(y_train, yhat, d=X.shape[1]),
)

In [None]:
# R^2 of the full model on the held-out rows, for comparison with the
# training-set value.
yhat_test = model.predict(X_test)
R_squared(y=y_test, yhat=yhat_test)

    
#### Akaike Information Criterion (AIC)

* Value only meaningful in comparison to other models
* Lowest AIC is the best

$$AIC = -2\log L + 2d$$ 

L: maximized value of the likelihood function (the likelihood evaluated at the maximum likelihood estimates)  
d: number of predictors(include intercept)  

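* For OLS Linear Regression (Gaussian errors; additive constants that are identical for every model fitted to the same data are dropped)

$$AIC = n\log(RSS/n) + 2d$$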

n: number of observations  
d: number of predictors(include intercept)  
RSS: Residual Sum of Squares 



In [None]:
def aic(y,yhat,d):
    """AIC for a least-squares fit, computed as n*log(RSS/n) + 2*d.

    ``y`` are the observed values, ``yhat`` the predictions, and ``d`` the
    parameter count used in the penalty term. ``n`` is ``len(y)``.
    """
    n_obs = len(y)
    fit_term = n_obs * np.log(rss(y, yhat) / n_obs)
    penalty = 2 * d
    return fit_term + penalty

In [None]:
# AIC of the full model on the training data.
# X.shape[1] counts predictor columns only; counting the intercept as well
# would add the same constant to every model, so the ranking is unaffected.
aic(y_train, yhat, d=X.shape[1])

In [None]:
# AIC of the two-predictor model on the training data (lower is better).
aic(y_train2, yhat2, d=X_train2.shape[1])

#### Bayesian Information Criterion (BIC)

* Value only meaningful in comparison to other models
* Lowest BIC is the best
* Generally a heavier penalty than AIC for more predictors (the per-predictor penalty $\log(n)$ exceeds AIC's $2$ whenever $n \ge 8$)

$$BIC = -2\log(L) + d\log(n)$$

L: maximized value of the likelihood function (the likelihood evaluated at the maximum likelihood estimates)  
n: number of observations  
d: number of predictors(include intercept)  

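* For OLS Linear Regression (Gaussian errors; additive constants that are identical for every model fitted to the same data are dropped)

$$BIC = n\log(RSS/n) + d\log(n)$$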

n: number of observations  
d: number of predictors(include intercept)  
RSS: Residual Sum of Squares 


    

In [None]:
def bic(y,yhat,d):
    """BIC for a least-squares fit, computed as n*log(RSS/n) + d*log(n).

    ``y`` are the observed values, ``yhat`` the predictions, and ``d`` the
    parameter count used in the penalty term. ``n`` is ``len(y)``.
    """
    n_obs = len(y)
    fit_term = n_obs * np.log(rss(y, yhat) / n_obs)
    penalty = d * np.log(n_obs)
    return fit_term + penalty

In [None]:
# BIC of the full model on the training data.
# The column count excludes the intercept; both models have one, so adding
# it would shift both BIC values by the same log(n) and not change the ranking.
bic(y_train, yhat, d=X_train.shape[1])

In [None]:
# BIC of the two-predictor model on the training data (lower is better).
bic(y_train2, yhat2, d=X2.shape[1])

In [None]:
# Refit on the complete dataset (no hold-out) and score that fit with both
# information criteria, using the number of predictor columns as d.
m = LinearRegression()
m.fit(X, y)
y_hat = m.predict(X)
a = aic(y, y_hat, d=X.shape[1])
b = bic(y, y_hat, d=X.shape[1])
(a, b)

### Reference

https://math.stackexchange.com/questions/2093369/bayesian-information-criterion-derivation-for-linear-regression