## Feature selection through Standardization

### Import the relevant libraries

In [1]:
# All third-party imports live in this one cell so a fresh kernel has every
# dependency available before any other cell runs.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Configuration comes after the imports: apply seaborn's default plot styling.
sns.set()

### Load the data

In [2]:
# Load the dataset from a CSV located one directory above this notebook.
data = pd.read_csv("../1.02.+Multiple+linear+regression.csv")

In [3]:
# Preview the first rows to confirm the three columns (SAT, Rand 1,2,3, GPA) loaded.
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


In [4]:
# Summary statistics per column. SAT values are in the thousands while Rand 1,2,3
# ranges over 1-3, so the raw features sit on very different scales.
data.describe()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
count,84.0,84.0,84.0
mean,1845.27381,2.059524,3.330238
std,104.530661,0.855192,0.271617
min,1634.0,1.0,2.4
25%,1772.0,1.0,3.19
50%,1846.0,2.0,3.38
75%,1934.0,3.0,3.5025
max,2050.0,3.0,3.81


In [5]:
# Pairwise correlations between the columns: SAT correlates with GPA (~0.64),
# while Rand 1,2,3 is close to uncorrelated with both SAT and GPA.
data.corr()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
SAT,1.0,-0.031722,0.637184
"Rand 1,2,3",-0.031722,1.0,-0.046224
GPA,0.637184,-0.046224,1.0


## Create the multiple linear regression

### Declare the dependent and independent variables

In [6]:
# Inputs: the two candidate features (SAT first, then Rand 1,2,3).
x = data.loc[:, ['SAT', 'Rand 1,2,3']]
# Target: the GPA column, as a 1-D Series.
y = data.loc[:, 'GPA']

### Standardization

In [7]:
from sklearn.preprocessing import StandardScaler

The scaler standardizes each feature by subtracting its mean and dividing by its standard deviation

In [8]:
# Create the scaler with its default settings; nothing is computed until fit() is called.
scaler = StandardScaler()

fit() calculates and stores the mean and standard deviation of each feature

In [9]:
# fit() learns the per-feature mean and standard deviation from x and returns the
# scaler itself, so the configuration can be displayed in the same expression.
scaler.fit(x).get_params()

{'copy': True, 'with_mean': True, 'with_std': True}

transform() transforms the unscaled inputs feature-wise, using the information (mean and standard deviation) stored in the scaler object

In [10]:
# Standardize the inputs with the statistics learned by scaler.fit(x).
x_scaled = scaler.transform(x)

In [11]:
# Inspect the standardized inputs (column 0 = SAT, column 1 = Rand 1,2,3).
# NOTE(review): this displays the entire array (84 rows); a slice such as
# x_scaled[:5] would keep the notebook output short.
x_scaled

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974],
       [-1.54247971,  1.10632974],
       [-1.46548748, -0.07002087],
       [-1.68684014, -1.24637147],
       [-0.78218146, -0.07002087],
       [-0.78218146, -1.24637147],
       [-0.51270866, -0.07002087],
       [ 0.04548499,  1.10632974],
       [-1.06127829,  1.10632974],
       [-0.67631715, -0.07002087],
       [-1.06127829, -1.24637147],
       [-1.28263094,  1.10632974],
       [-0.6955652 , -0.07002087],
       [ 0.25721362, -0.07002087],
       [-0.86879772,  1.10632974],
       [-1.64834403, -0.07002087],
       [-0.03150724,  1.10632974],
       [-0.57045283,  1.10632974],
       [-0.81105355,  1.10632974],
       [-1.18639066,  1.10632974],
       [-1.75420834,  1.10632974],
       [-1.52323165, -1.24637147],
       [ 1.23886453, -1.24637147],
       [-0.18549169, -1.24637147],
       [-0.5608288 , -1.24637147],
       [-0.23361183,  1.10632974],
       [ 1.68156984,

## Regression with scaled features

In [14]:
# Train the multiple regression on the standardized features. fit() returns the
# estimator, so construction and training are chained into one statement.
reg = LinearRegression().fit(x_scaled, y)
# Display the estimator's configuration.
reg.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [15]:
# Coefficients in the same order as the columns of x_scaled: [SAT, Rand 1,2,3].
# Because the inputs were standardized, their magnitudes are directly comparable.
reg.coef_

array([ 0.17181389, -0.00703007])

In [16]:
# Intercept (bias) of the model. It matches the mean GPA reported by
# data.describe() (3.330238), as expected when every input is mean-centred.
reg.intercept_

3.330238095238095

In [17]:
# R-squared of the model, evaluated on the same (training) data it was fitted on.
reg.score(x_scaled,y)

0.4066811952814283

## Creating a summary table

In [20]:
# Build the summary straight from the fitted model instead of hard-coding the
# feature labels and coefficient indices, so the table cannot drift out of sync
# if the columns of x are added, removed, or reordered.
# Row 0 is the intercept ("Bias"); each following row pairs a column of x with
# its coefficient, in the order the model was trained on.
reg_summary = pd.DataFrame({
    'Feature': ['Bias', *x.columns],
    'Weights': [reg.intercept_, *reg.coef_],
})

In [21]:
# Display the weights table: one row for the bias, then one per feature.
reg_summary

Unnamed: 0,Feature,Weights
0,Bias,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


The larger a weight's absolute value, the greater that feature's impact on the predicted outcome.\
SAT contributes the most, while Rand 1,2,3 contributes almost nothing.

### Making predictions with the standardized coefficients (weights)

In [22]:
# Two unseen observations, written column by column and in the same column
# order as the training inputs: (SAT=1700, Rand=2) and (SAT=1800, Rand=1).
new_data = pd.DataFrame({
    'SAT': [1700, 1800],
    'Rand 1,2,3': [2, 1],
})
new_data

Unnamed: 0,SAT,"Rand 1,2,3"
0,1700,2
1,1800,1


In [25]:
# NOTE(review): new_data is passed here in raw units (SAT around 1700-1800), but
# reg was fitted on x_scaled. The resulting values (~295 and ~313) are therefore
# not valid GPAs. Presumably kept to show why new inputs must first go through
# scaler.transform - confirm, or remove this cell.
reg.predict(new_data)



array([295.39979563, 312.58821497])

In [26]:
# Standardize the new observations with the scaler that was fitted on x (not a
# new scaler), so they are expressed on the same scale the model was trained on.
new_data_scaled = scaler.transform(new_data)
new_data_scaled

array([[-1.39811928, -0.07002087],
       [-0.43571643, -1.24637147]])

In [27]:
# Predict from the standardized inputs; the results (~3.09 and ~3.26) fall inside
# the GPA range observed in the data (2.4 to 3.81).
reg.predict(new_data_scaled)

array([3.09051403, 3.26413803])

## What if we removed the Rand 1,2,3 variable?

In [29]:
# Keep only the standardized SAT feature (column 0). Indexing with a list keeps
# the result two-dimensional, i.e. one column with one row per observation.
x_simple_matrix = x_scaled[:, [0]]
# Fit a single-feature regression on SAT alone and show its configuration.
reg_simple = LinearRegression().fit(x_simple_matrix, y)
reg_simple.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [39]:
# Inspect the single-column (standardized SAT) input matrix.
# NOTE(review): this displays every row; a slice such as x_simple_matrix[:5]
# would keep the notebook output short.
x_simple_matrix

array([[-1.26338288],
       [-1.74458431],
       [-0.82067757],
       [-1.54247971],
       [-1.46548748],
       [-1.68684014],
       [-0.78218146],
       [-0.78218146],
       [-0.51270866],
       [ 0.04548499],
       [-1.06127829],
       [-0.67631715],
       [-1.06127829],
       [-1.28263094],
       [-0.6955652 ],
       [ 0.25721362],
       [-0.86879772],
       [-1.64834403],
       [-0.03150724],
       [-0.57045283],
       [-0.81105355],
       [-1.18639066],
       [-1.75420834],
       [-1.52323165],
       [ 1.23886453],
       [-0.18549169],
       [-0.5608288 ],
       [-0.23361183],
       [ 1.68156984],
       [-0.4934606 ],
       [-0.73406132],
       [ 0.85390339],
       [-0.67631715],
       [ 0.09360513],
       [ 0.33420585],
       [ 0.03586096],
       [-0.35872421],
       [ 1.04638396],
       [-0.65706909],
       [-0.13737155],
       [ 0.18984542],
       [ 0.04548499],
       [ 1.1618723 ],
       [-1.37887123],
       [ 1.39284898],
       [ 0

In [41]:
# Predict with the SAT-only model, feeding it just the standardized SAT column
# of the new observations (list indexing keeps the input two-dimensional).
# The results are close to those of the two-feature model (~3.09 and ~3.26).
reg_simple.predict(new_data_scaled[:, [0]])

array([3.08970998, 3.25527879])