In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

In [2]:
# Load the fish measurements; the path is relative to the notebook's working directory.
data = pd.read_csv('Fish.csv')

In [3]:
data.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [4]:
data.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0
mean,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,273.0,25.2,27.3,29.4,7.786,4.2485
75%,650.0,32.7,35.5,39.65,12.3659,5.5845
max,1650.0,59.0,63.4,68.0,18.957,8.142


## Creating the Multiple Linear Regression

In [5]:
# Integer code for each species name, numbered from 1 in the order listed.
# The numbering is arbitrary: it does not express any ordering between species.
species_dict = dict(zip(
    ['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
    range(1, 8),
))

In [6]:
# Predictors: the six numeric measurement columns, in this fixed order.
x = data.loc[:, ['Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']]
# Target: species name translated to its integer code; any name missing
# from species_dict would come out as NaN.
y = data.Species.map(species_dict)

## Standardising the variables so they can be compared

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Scaler used to standardise the feature columns before fitting the model.
scaler = StandardScaler()

In [9]:
# Fit the scaler on x; this only learns the scaling parameters, x itself is not transformed here.
scaler.fit(x)

StandardScaler()

In [10]:
# Apply the fitted scaling. The result is a NumPy array, so the column names of x are
# not carried over; column i of x_scaled corresponds to x.columns[i].
x_scaled = scaler.transform(x)

In [11]:
# NOTE(review): this echoes the whole (159, 6) array; a slice such as x_scaled[:5]
# would be enough for a sanity check and keeps the notebook output short.
x_scaled

array([[-4.38072172e-01, -3.05788578e-01, -2.82303007e-01,
        -1.06020232e-01,  5.96578670e-01, -2.36528948e-01],
       [-3.03562184e-01, -2.25507242e-01, -1.98053663e-01,
        -2.33668373e-03,  8.21260549e-01, -6.65789457e-02],
       [-1.63447613e-01, -2.35542409e-01, -1.79331587e-01,
        -1.09769794e-02,  7.97341291e-01,  1.65793169e-01],
       [-9.89949100e-02,  5.30159764e-03,  5.46943678e-02,
         1.96390116e-01,  8.79771455e-01,  2.26210031e-02],
       [ 8.87586153e-02,  2.53719316e-02,  5.46943678e-02,
         2.39591594e-01,  8.12834979e-01,  4.26371272e-01],
       [ 1.44804444e-01,  5.54774324e-02,  1.20221635e-01,
         3.00073664e-01,  1.08395111e+00,  3.03431249e-01],
       [ 2.84919015e-01,  5.54774324e-02,  1.20221635e-01,
         2.82793073e-01,  1.21901769e+00,  5.12357880e-01],
       [-2.33330416e-02,  1.35758768e-01,  1.48304750e-01,
         3.25994551e-01,  8.65728838e-01,  1.62163285e-01],
       [ 1.44804444e-01,  1.35758768e-01,  1.483

In [12]:
data.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [13]:
# Ordinary least-squares fit of the species code on the standardised features.
# NOTE(review): y holds arbitrary integer labels (1-7) for a nominal category, so
# this regression treats species as if they were ordered and evenly spaced.
# A classifier would be the usual choice for this target - confirm intent.
reg = LinearRegression()
reg.fit(X=x_scaled, y=y)

LinearRegression()

In [14]:
reg.coef_

array([  0.85476284,   5.614399  ,   8.82652719, -14.37576717,
        -0.03693506,  -1.31416116])

In [15]:
reg.intercept_

3.8805031446540883

In [16]:
reg.score(x_scaled,y)

0.8352561425737633

In [17]:
x_scaled.shape

(159, 6)

In [18]:
# Adjusted R^2 penalises R^2 for the number of predictors:
#   adjusted = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
r2 = reg.score(x_scaled, y)

# n = number of observations (rows), p = number of predictors (columns).
n, p = x.shape

adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

0.8287530955700961

## Feature Selection - which variables are useful/useless

In [19]:
from sklearn.feature_selection import f_regression

In [20]:
f_regression(x,y)

(array([  5.06491515,   1.62375473,   2.23314555,   7.20627896,
        148.30843301,  17.32301142]),
 array([2.58037135e-02, 2.04451965e-01, 1.37085757e-01, 8.04482266e-03,
        1.92028875e-24, 5.18160895e-05]))

In [21]:
# f_regression returns a pair (F-statistics, p-values), one entry per column of x.
# Per the scikit-learn docs these are univariate tests: each feature is scored
# against y on its own, not jointly with the other features.
f_stats, p_values = f_regression(x, y)
p_values

array([2.58037135e-02, 2.04451965e-01, 1.37085757e-01, 8.04482266e-03,
       1.92028875e-24, 5.18160895e-05])

In [22]:
p_values.round(3)

array([0.026, 0.204, 0.137, 0.008, 0.   , 0.   ])

## Summary Table with P-values

In [23]:
# Start the summary table with one row per predictor, in the same column
# order that was used to build x_scaled and fit the model.
reg_summary = pd.DataFrame({'Features': x.columns.values})
reg_summary

Unnamed: 0,Features
0,Weight
1,Length1
2,Length2
3,Length3
4,Height
5,Width


In [24]:
# Coefficients come from the joint fit on the standardised features; the
# p-values come from f_regression, so the two columns answer different questions.
reg_summary['Coefficients'] = reg.coef_
# Rounded to 3 decimals for display, so very small p-values show up as 0.0.
reg_summary['P-values'] = np.round(p_values, 3)

In [25]:
reg_summary

Unnamed: 0,Features,Coefficients,P-values
0,Weight,0.854763,0.026
1,Length1,5.614399,0.204
2,Length2,8.826527,0.137
3,Length3,-14.375767,0.008
4,Height,-0.036935,0.0
5,Width,-1.314161,0.0


##### Variables with P-values higher than 0.05 are redundant
##### Variables with P-values below 0.05 are useful

##### Length1 and Length2 are redundant variables
##### Weight and Length3 are relatively significant
##### However, we can see that Height and Width are the most significant variables for predicting the fish species