## Multiple linear regression

In [1]:
# Import the relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import os
import seaborn as sns
# We can override the default matplotlib styles with those of Seaborn
sns.set()

In [2]:
# Load the data
df = pd.read_csv(os.path.join(os.path.pardir,'data','raw','1.02. Multiple linear regression.csv'))

In [3]:
# This method gives us very nice descriptive statistics. We don't need this as of now, but will later on!
df.describe()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
count,84.0,84.0,84.0
mean,1845.27381,3.330238,2.059524
std,104.530661,0.271617,0.855192
min,1634.0,2.4,1.0
25%,1772.0,3.19,1.0
50%,1846.0,3.38,2.0
75%,1934.0,3.5025,3.0
max,2050.0,3.81,3.0


### Declare the independent and dependent variables

In [9]:
# There are two independent variables: 'SAT' and 'Rand 1,2,3'
x = df[['SAT','Rand 1,2,3']]

# and a single depended variable: 'GPA'
y = df['GPA']

### The regression itself

In [10]:
# We start by creating a linear regression object
reg = LinearRegression()

# The whole learning process boils down to fitting the regression
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [11]:
# Getting the coefficients of the regression
reg.coef_
# Note that the output is an array

array([ 0.00165354, -0.00826982])

In [None]:
# Getting the intercept of the regression
reg.intercept_
# Note that the result is a float as we usually expect a single value

#### Calculating the R-Squared
- used to measure the goodness of fit
- i.e it's a universal measure to evaluate how well linear regression fair and compare
- reg.score(x,y) returns the R-squared of a linear regression

In [13]:
reg.score(x,y)

0.4066811952814285

#### Formula for Adjusted R^2

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [14]:
x.shape

(84, 2)

In [15]:
r2 = reg.score(x,y)
n = x.shape[0]
p = x.shape[1]

adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
adjusted_r2

0.39203134825134023

- [x] The adjusted R-squared < R-squared, therefore, one or more of the predictors have less or no explanatory power

## Creating a summary table

In [16]:
# Let's create a new data frame with the names of the features
reg_summary = pd.DataFrame(data = x.columns.values, columns=['Features'])
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [17]:
# Then we create and fill a second column, called 'Coefficients' with the coefficients of the regression
reg_summary ['Coefficients'] = reg.coef_
# Finally, we add the p-values we just calculated
reg_summary ['p-values'] = p_values.round(3)

NameError: name 'p_values' is not defined

In [None]:
# Now we've got a pretty clean summary, which can help us make an informed decision about the inclusion of the variables 
reg_summary