# Multiple Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression


# Load the Data

In [2]:
# Read the housing CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='house.csv')
data.head(n=5)

Unnamed: 0,loyer,surface
0,1330,37
1,1400,32
2,904,26
3,955,30
4,2545,70


# Add a column of random values (1, 2 or 3)

In [4]:
# Add a pure-noise predictor drawn uniformly from {1, 2, 3}.
# A dedicated, seeded generator makes this column -- and therefore every
# coefficient, score and p-value derived from it -- identical on every
# "Restart Kernel -> Run All" instead of changing with each execution.
rng = np.random.default_rng(seed=42)
data['random123'] = rng.integers(low=1, high=4, size=data.shape[0])  # `high` is exclusive
data


Unnamed: 0,loyer,surface,random123
0,1330,37,3
1,1400,32,1
2,904,26,2
3,955,30,1
4,2545,70,2
...,...,...,...
540,1490,48,3
541,2020,58,2
542,2050,70,2
543,1220,42,1


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

Unnamed: 0,loyer,surface,random123
count,545.0,545.0,545.0
mean,2307.502752,63.236697,1.992661
std,2201.364353,48.716475,0.815713
min,488.0,11.0,1.0
25%,1255.0,32.0,1.0
50%,1795.0,50.0,2.0
75%,2600.0,76.0,3.0
max,25000.0,415.0,3.0


# Create the multiple linear regression

# Declare the dependent and independent variables

In [6]:
# Target (dependent variable): the 'surface' column.
y = data.loc[:, 'surface']
# Predictors (independent variables): 'loyer' plus the random noise column.
x = data.loc[:, ['loyer', 'random123']]

# Regression itself

In [7]:
# Fit a linear regression of y on the columns of x. fit() returns the
# estimator itself, so construction and training can be chained; the bare
# name on the last line keeps the estimator repr as the cell output.
reg = LinearRegression().fit(x, y)
reg

LinearRegression()

In [8]:
# Fitted coefficients, one per column of x, in the same order as the columns.
reg.coef_

array([ 0.02006587, -0.98773259])

In [9]:
# Fitted intercept (constant term) of the regression.
reg.intercept_

18.90285373706675

# R-squared

In [10]:
# Coefficient of determination (R^2), evaluated on the same x and y used for fitting.
reg.score(x, y)

0.8223731936447282

# Formula for Adjusted R^2
$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

n is the number of observations and p is the number of predictors

In [11]:
# (number of observations, number of predictors) -- the n and p of the adjusted R^2 formula.
x.shape

(545, 2)

In [14]:
# Adjusted R^2: R^2 penalised for the number of predictors in the model.
# n = number of observations (rows of x), p = number of predictors (columns of x).
n, p = x.shape
r2 = reg.score(x, y)
# Same left-to-right evaluation order as the formula: (1 - R^2) * (n - 1) / (n - p - 1).
shrinkage = (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2 = 1 - shrinkage
adjusted_r2

0.8217177441747825

# Feature Selection

f_regression creates simple linear regressions of each feature and the dependent variable

If the p-value of an independent variable is greater than 0.05, there is no statistically significant evidence that the variable affects the dependent variable, so it is a candidate for removal

In [15]:
from sklearn.feature_selection import f_regression

In [16]:
# Returns a pair of arrays: F-statistics and their p-values, one entry per column of x.
f_regression(x, y)

(array([2.50927084e+03, 1.25073260e-01]),
 array([9.99301800e-206, 7.23733235e-001]))

The first array contains the F-statistics of the features and the second contains the corresponding p-values

In [22]:
# f_regression returns (F-statistics, p-values); only the p-values are used afterwards.
# The first entry of p_values belongs to the first column of x, the second
# entry to the second column, and so on.
f_statistics, p_values = f_regression(x, y)
p_values

array([9.99301800e-206, 7.23733235e-001])

In [23]:
# Round the p-values to three decimal places for readability.
np.round(p_values, decimals=3)

array([0.   , 0.724])

#Since the p-value of the second column in x is greater than 0.05, we can get rid of it

These are the univariate p-values obtained from simple linear models.
They do not reflect the interconnection of the features in our multiple linear regression.

# Creating a summary table

In [27]:
# Start the summary table with one row per predictor, holding the column names of x.
reg_summary = pd.DataFrame({'Features': x.columns.values})
reg_summary

Unnamed: 0,Features
0,loyer
1,random123


In [28]:
# Attach, per predictor, the coefficient from the fitted multiple regression
# and the univariate p-value from f_regression (rounded to three decimals).
reg_summary = reg_summary.assign(**{
    'Coefficients': reg.coef_,
    'p-values': p_values.round(3),
})
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,loyer,0.020066,0.0
1,random123,-0.987733,0.724


P-values are one of the best ways to determine whether a variable is redundant, but they provide no information whatsoever about HOW USEFUL a variable is.