# Multiple linear regression 

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

## Load the data

In [3]:
data = pd.read_csv('1.02. Multiple linear regression.csv')
data

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.40,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3
4,1693,2.83,2
...,...,...,...
79,1936,3.71,3
80,1810,3.71,1
81,1987,3.73,3
82,1962,3.76,1


In [4]:
x = data[['SAT','Rand 1,2,3']]
y = data['GPA']

In [5]:
reg = LinearRegression()
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [6]:
#coefficient
reg.coef_

array([ 0.00165354, -0.00826982])

In [7]:
#intercept
reg.intercept_

0.29603261264909486

In [8]:
#R squared value
reg.score(x,y)

0.4066811952814285

## Ajusted R Squared value

$R^2_(adj.) - 1 = (1-R^2)*\frac{n-1}{n-p-1}

In [9]:
x.shape

(84, 2)

In [10]:
r2 = reg.score(x,y)
n = x.shape[0]
p = x.shape[1]
adjusted_r2 = 1 - (1-r2)*(n-1)/(n-p-1)

In [11]:
adjusted_r2

0.39203134825134023

## Feature selection with f-regression 

In [12]:
from sklearn.feature_selection import f_regression

In [13]:
f_regression(x,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

1st array contains F-statistics and 2nd array contains p values 

In [14]:
p_values = f_regression(x,y)[1]
p_values

array([7.19951844e-11, 6.76291372e-01])

In [15]:
p_values.round(3)

array([0.   , 0.676])

## CREATING A SUMMARY TABLE

In [44]:
reg_summary = pd.DataFrame(data=x.columns.values , columns=['Features'])
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [16]:
regs_summ = pd.DataFrame(columns=[['features','SAT']])
regs_summ = regs_summ.append({'features':125,'SAT':58}, ignore_index=True)
regs_summ

SyntaxError: invalid syntax (<ipython-input-16-33424cadc4f2>, line 2)

In [17]:
reg_summary['Coefficients'] = reg.coef_
reg_summary['p-values'] = p_values.round(3)

In [18]:
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


# Example real estate data set 

In [21]:
data = pd.read_csv('real_estate_price_size_year (2).csv')

In [22]:
data

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


In [23]:
x = data[['size','year']]
y = data['price']

In [24]:
reg = LinearRegression()
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
r2 = reg.score(x,y)
n = x.shape[0]
p = x.shape[1]
adjusted_r2 = 1 - (1-r2)*(n-1)/(n-p-1)

In [28]:
from sklearn.feature_selection import f_regression
p_values = f_regression(x,y)[1]

In [33]:
reg_summary = pd.DataFrame(data=x.columns.values , columns=['Features'])
reg_summary['p-value'] = p_values.round(3)
reg_summary['coefficients'] = reg.coef_
reg_summary

Unnamed: 0,Features,p-value,coefficients
0,size,0.0,227.700854
1,year,0.357,2916.785327


# Feature scaling for GPA data

In [19]:
data = pd.read_csv('1.02. Multiple linear regression.csv')
data

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.40,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3
4,1693,2.83,2
...,...,...,...
79,1936,3.71,3
80,1810,3.71,1
81,1987,3.73,3
82,1962,3.76,1


In [20]:
x = data[['SAT','Rand 1,2,3']]
y = data['GPA']

## Standardization

In [21]:
from sklearn.preprocessing import StandardScaler

scaling mechanism 

In [23]:
scaler = StandardScaler()
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

transform the unscaled input using the information contained in the scaler object feature-wise

In [26]:
x_scaled = scaler.transform(x)

Regression with scaled features

In [28]:
reg = LinearRegression()
reg.fit(x_scaled,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [29]:
reg.coef_

array([ 0.17181389, -0.00703007])

In [30]:
reg.intercept_

3.330238095238095

creating summary table

In [35]:
reg_summary = pd.DataFrame([['Bias'],['SAT'],['Rand 1,2,3']], columns = ['Fetures'])

In [36]:
reg_summary['Weights'] = reg.intercept_,reg.coef_[0],reg.coef_[1]
reg_summary

Unnamed: 0,Fetures,Weights
0,Bias,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


Making predictions with the standard coefficients ( weights )

In [38]:
new_data = pd.DataFrame(data=[[1700,2],[1800,1]], columns = ['SAT','Rand 1,2,3'])
new_data

Unnamed: 0,SAT,"Rand 1,2,3"
0,1700,2
1,1800,1


In [39]:
reg.predict(new_data)

array([295.39979563, 312.58821497])

the new data frame should be scaled to arrange in the same way and also standarized the same way

In [40]:
new_data_scaled = scaler.transform(new_data)
new_data_scaled

array([[-1.39811928, -0.07002087],
       [-0.43571643, -1.24637147]])

In [41]:
reg.predict(new_data_scaled)

array([3.09051403, 3.26413803])

## What if we removed the "random 1,2,3" varible 

In [42]:
reg_simple = LinearRegression()
x_simple_matrix = x_scaled[:,0].reshape(-1,1)
reg_simple.fit(x_simple_matrix,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [43]:
reg_simple.predict(new_data_scaled[:,0].reshape(-1,1))

array([3.08970998, 3.25527879])

# Feature scaling for real estate data 

In [45]:
data = pd.read_csv('real_estate_price_size_year (2).csv')
data

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


In [46]:
x = data[['size','year']]
y = data['price']

In [48]:
scaler = StandardScaler()
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [49]:
x_scaled = scaler.transform(x)

In [50]:
reg = LinearRegression()
reg.fit(x_scaled,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [51]:
reg_summary = pd.DataFrame([['Bias'],['size'],['year']], columns = ['Fetures'])
reg_summary['Weights'] = reg.intercept_,reg.coef_[0],reg.coef_[1]
reg_summary

Unnamed: 0,Fetures,Weights
0,Bias,292289.47016
1,size,67501.576142
2,year,13724.397082


In [60]:
new_data = pd.DataFrame(data=[[750,2009],[520.98,2007]], columns = ['size','year'])

In [61]:
new_data_scaled = scaler.transform(new_data)
new_data_scaled

array([[-0.34752816, -0.76509206],
       [-1.12007382, -1.1901432 ]])

In [62]:
reg.predict(new_data_scaled)

array([258330.34465995, 200348.72442148])

## P values 

In [63]:
from sklearn.feature_selection import f_regression

In [64]:
f_regression(x_scaled,y)
p_values = f_regression(x,y)[1]
p_values.round(3)

array([0.   , 0.357])

## summary table

In [66]:
reg_summary = pd.DataFrame(data=x.columns.values , columns=['Features'])
reg_summary['coefficients'] = reg.coef_
reg_summary['p_values'] = p_values.round(3)
reg_summary

Unnamed: 0,Features,coefficients,p_values
0,size,67501.576142,0.0
1,year,13724.397082,0.357


It seems that 'Year' is not event significant, therefore we should remove it from the model.

Note that this dataset is extremely clean and probably artificially created, therefore standardization does not really bring any value to it.