### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn
seaborn.set()

### Load data

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('datasets/Multiple linear regression.csv')

In [4]:
# Peek at five random rows. A fixed random_state keeps the preview identical
# on every Restart & Run All instead of showing different rows each run.
df.sample(5, random_state=42)

Unnamed: 0,SAT,"Rand 1,2,3",GPA
16,1755,3,3.17
14,1773,2,3.12
74,2021,3,3.61
56,1730,2,3.47
72,1975,1,3.6


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
count,84.0,84.0,84.0
mean,1845.27381,2.059524,3.330238
std,104.530661,0.855192,0.271617
min,1634.0,1.0,2.4
25%,1772.0,1.0,3.19
50%,1846.0,2.0,3.38
75%,1934.0,3.0,3.5025
max,2050.0,3.0,3.81


## Multiple Regression

### Dependent and Independent Variables

In [7]:
# Predictors: the SAT score plus the 'Rand 1,2,3' column; target: GPA.
x = df[['SAT', 'Rand 1,2,3']]
y = df['GPA']

### Standardization

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Scaler that will bring both predictors onto a common scale before regression.
scaler = StandardScaler()

In [10]:
# Learn the per-column scaling statistics from x; nothing is transformed yet.
scaler.fit(x)

In [11]:
# Apply the fitted scaling to the predictors; x itself is left untouched.
x_scaled = scaler.transform(x)

In [50]:
# First five standardized rows (columns follow the order of x: SAT, Rand 1,2,3).
x_scaled[:5]

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974],
       [-1.54247971,  1.10632974],
       [-1.46548748, -0.07002087]])

### Regression Itself

In [13]:
# Ordinary least-squares model, created with default settings.
reg = LinearRegression()

In [14]:
# Fit on the standardized predictors, so the learned weights are per scaled unit.
reg.fit(x_scaled, y)

In [49]:
# Predict GPA for a new student (SAT=2133, Rand=3).
# The observation is wrapped in a DataFrame that reuses the training column
# names: the scaler was fitted on a DataFrame, and handing it a bare list makes
# scikit-learn warn that X does not have valid feature names.
# The new row must pass through the same fitted scaler the model was trained with.
# NOTE: SAT=2133 is above the largest SAT in the data (2050), so this is an
# extrapolation beyond the observed range.
new_student = pd.DataFrame([[2133, 3]], columns=x.columns)
reg.predict(scaler.transform(new_student))



array([3.79822781])

### Coefficients

In [29]:
# One weight per column of x_scaled, in the order SAT, Rand 1,2,3.
# The model was fit on standardized inputs, so each weight is expressed
# per unit of the scaled feature, not per raw SAT point.
reg.coef_

array([ 0.17181389, -0.00703007])

### Intercept

In [30]:
# Intercept of the model fit on the standardized predictors.
# Its value matches the GPA mean reported by df.describe() (3.330238).
reg.intercept_

3.330238095238095

### R-Squared

In [31]:
# R-squared measured on the same data the model was fitted on (in-sample).
reg.score(x_scaled, y)

0.4066811952814283

### Adjusted R-Squared

#### Formula
$R^2_{adj.}=1-(1-R^2)\times\frac{n-1}{n-p-1}$

where $n$ is the number of observations and $p$ is the number of predictors.

In [32]:
def adjusted_RSquared(x,y,reg):
    """Return the adjusted R-squared of a fitted regression evaluated on (x, y).

    Computes ``1 - (1 - R^2) * (n - 1) / (n - p - 1)`` where ``n`` is the number
    of rows of ``x``, ``p`` is the number of columns of ``x`` and ``R^2`` is
    ``reg.score(x, y)``.

    Parameters
    ----------
    x : 2-D array-like exposing ``.shape``
        Predictor matrix, one row per observation and one column per feature.
    y : array-like
        Target values; passed unchanged to ``reg.score``.
    reg : object
        Fitted estimator exposing ``score(x, y)``.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``. The correction factor is undefined in that case
        (division by zero, or a negative denominator that would silently
        produce a meaningless value).
    """
    n_obs = x.shape[0]
    n_features = x.shape[1]

    # Residual degrees of freedom; must be positive for the formula to hold.
    dof = n_obs - n_features - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R-squared is undefined: need n - p - 1 > 0, "
            f"got n={n_obs}, p={n_features}"
        )

    r2 = reg.score(x, y)
    penalty = (n_obs - 1) / dof
    unexplained = 1 - r2

    return 1 - unexplained * penalty

In [33]:
# Adjusted R-squared of the fitted model on the training data.
print(adjusted_RSquared(x_scaled, y, reg))

0.3920313482513401


### Feature Selection (because the Adjusted R-Squared was low)

In [51]:
from sklearn.feature_selection import f_regression

In [52]:
# Univariate F-test of each standardized predictor against GPA.
# The result is a pair of arrays: (F-statistics, p-values), one entry per column.
f_reg = f_regression(x_scaled, y)
f_reg

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [53]:
# Second element of the f_regression result holds the p-values.
p_values = f_reg[1]

In [54]:
# Print each feature label next to its p-value, rounded to four decimals.
x_features = ['SAT', 'RAND 1,2,3']
rounded_p = p_values.round(4)
for i, feature_name in enumerate(x_features):
    print(f'{feature_name} : {rounded_p[i]}')

SAT : 0.0
RAND 1,2,3 : 0.6763


##### Since the p-value 0.676 > 0.05, `Rand 1,2,3` is not statistically significant and can be dropped from the model

### Creating a Summary Table

In [55]:
# Collect the fitted parameters in one table: the intercept ("Bias") first,
# then one weight per predictor in training order. The weights refer to the
# standardized inputs, because reg was fit on x_scaled.
reg_summary = pd.DataFrame({
    'Features': ['Bias', 'SAT', 'RAND 1,2,3'],
    'Weights': [reg.intercept_, reg.coef_[0], reg.coef_[1]],
})

In [56]:
# Display the intercept and per-feature weights side by side.
reg_summary

Unnamed: 0,Features,Weights
0,Bias,3.330238
1,SAT,0.171814
2,"RAND 1,2,3",-0.00703
