## Building the Regression Model

In [2]:
# Import Pandas Library, used for data manipulation
# Import matplotlib, used to plot our data
# Import nump for mathemtical operations
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Import our fish data and store it in the variable fish_data
fish_data = pd.read_csv("D:\ProjectData\Fish.csv") 
# Display first few rows of data
fish_data.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [3]:
# renaming columns
renamed_columns = ['Species','Weight', 'Vertical_length','Diagonal_length','Cross_length', 'Height','Width']

fish_data.columns = renamed_columns

# view changes
fish_data.columns

Index(['Species', 'Weight', 'Vertical_length', 'Diagonal_length',
       'Cross_length', 'Height', 'Width'],
      dtype='object')

In [4]:
# one hot encode species feature
fish_data = pd.get_dummies(fish_data)

# view changes
fish_data.head()

Unnamed: 0,Weight,Vertical_length,Diagonal_length,Cross_length,Height,Width,Species_Bream,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish
0,242.0,23.2,25.4,30.0,11.52,4.02,1,0,0,0,0,0,0
1,290.0,24.0,26.3,31.2,12.48,4.3056,1,0,0,0,0,0,0
2,340.0,23.9,26.5,31.1,12.3778,4.6961,1,0,0,0,0,0,0
3,363.0,26.3,29.0,33.5,12.73,4.4555,1,0,0,0,0,0,0
4,430.0,26.5,29.0,34.0,12.444,5.134,1,0,0,0,0,0,0


In [5]:
# view shape of dataframe
fish_data.shape

(159, 13)

In [6]:
# define input data and target variable

# input data
X = fish_data.drop(['Weight'], 1)

# target variable
y = fish_data.Weight

# split data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

  after removing the cwd from sys.path.


In [8]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

## Evaluating the regression model

In [9]:
y_pred = model.predict(X_test)

In [10]:
y_pred

array([  22.83330062,   23.41273287,  187.29483799,  310.45293863,
        208.87406165,  781.04848248,  -52.19610215,  253.55251651,
        261.83735593, 1151.75626825,  599.51515345,  830.9367517 ,
        548.24025022,  140.03721841,  691.31264199,  836.13483002,
       1021.39702132,  286.88161176,  231.66639764,  587.73721669,
         -2.27001926,  592.30651037,  509.16439696,  490.44424842])

In [13]:
# mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

5828.016723991241

In [14]:
# root mean squared error
mean_squared_error(y_test, y_pred, squared = False)

76.3414482701975

In [15]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9565882607957022

In [16]:
# define variables for adjusted r2 score
r2 = r2_score(y_test, y_pred)
n = len(y_test)
k = len(X_test.columns)

# calculate adjusted r2 score
adj_r2_score = 1-(((1-r2)*(n-1))/(n-k-1))

adj_r2_score

0.9092299998455593

### Adding features

In [17]:
from sklearn.preprocessing import PolynomialFeatures

# transform data to include polynomial terms to third degree
poly = PolynomialFeatures(degree = 3)
X_degree3 = poly.fit_transform(X)

In [18]:
# check number of features, this is given as the number of columns of our transformed data
X_degree3.shape

(159, 455)

In [19]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X_degree3, y, test_size=0.15, random_state=42)

# fit model
degree3_model = LinearRegression()
degree3_model.fit(X_train, y_train)

# produce set of predictions
y_pred = degree3_model.predict(X_test)

In [20]:
r2_score(y_test, y_pred)

-10153.238814236409

In [21]:
# produce set of predictions from training data X_train
y_train_pred = degree3_model.predict(X_train)

r2_score(y_train, y_train_pred)

0.9999260622078815

### Improving our model

In [22]:
# transform data to include polynomial terms to third degree
poly = PolynomialFeatures(interaction_only = True)
X_interaction = poly.fit_transform(X)

In [23]:
# check number of features, this is given as the number of columns of our transformed data
X_interaction.shape

(159, 79)

In [24]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X_interaction, y, test_size=0.15, random_state=42)

# fit model
interaction_model = LinearRegression()
interaction_model.fit(X_train, y_train)

# produce set of predictions
y_pred = interaction_model.predict(X_test)

In [25]:
r2_score(y_test, y_pred)

0.9643044936811065

In [26]:
# produce set of predictions from training data X_train
y_train_pred = interaction_model.predict(X_train)

r2_score(y_train, y_train_pred)

0.9827157781069666

In [46]:
# define variables for adjusted r2 score
r2 = r2_score(y_test, y_pred)
n = len(y_test)
k = len(X_test[0])

# calculate adjusted r2 score
adj_r2_score = 1-(((1-r2)*(n-1))/(n-k-1))

adj_r2_score

0.9368464118973423

In [31]:
n

24

In [33]:
k

79