# Regression with Abalone Dataset


## Import Libraries


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [4]:
import sweetviz as sv



## Import data

In [5]:
data = pd.read_csv('abalone.csv')

In [6]:
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


## Exploratory Data Anaysis (EDA)

In [7]:
# Generate a report
report = sv.analyze(data)
report.show_html('sweetviz_report.html')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)


Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Feature engineering


In [8]:
encoded_data = pd.get_dummies(data)
encoded_data.head

<bound method NDFrame.head of       Length  Diameter  Height  Whole weight  Shucked weight  Viscera weight  \
0      0.455     0.365   0.095        0.5140          0.2245          0.1010   
1      0.350     0.265   0.090        0.2255          0.0995          0.0485   
2      0.530     0.420   0.135        0.6770          0.2565          0.1415   
3      0.440     0.365   0.125        0.5160          0.2155          0.1140   
4      0.330     0.255   0.080        0.2050          0.0895          0.0395   
...      ...       ...     ...           ...             ...             ...   
4172   0.565     0.450   0.165        0.8870          0.3700          0.2390   
4173   0.590     0.440   0.135        0.9660          0.4390          0.2145   
4174   0.600     0.475   0.205        1.1760          0.5255          0.2875   
4175   0.625     0.485   0.150        1.0945          0.5310          0.2610   
4176   0.710     0.555   0.195        1.9485          0.9455          0.3765   

      She

## Modelling

## Split dataset

In [9]:
training_set = encoded_data.iloc[:4080, :]
validation_set = encoded_data.iloc[4080:,:]
print(training_set.shape)
print(validation_set.shape)

(4080, 11)
(97, 11)


### Baseline model: multiple linear regression

In [10]:
training_set

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,False,False,True
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,False,False,True
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,True,False,False
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,False,False,True
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
4075,0.545,0.450,0.150,0.7805,0.3795,0.1625,0.2160,8,False,False,True
4076,0.545,0.430,0.140,0.7720,0.2890,0.1900,0.2615,8,False,True,False
4077,0.550,0.435,0.125,0.7410,0.3480,0.1585,0.2060,9,False,True,False
4078,0.550,0.430,0.180,0.8265,0.4405,0.1590,0.2250,10,False,False,True


In [11]:
X = training_set[['Length','Height']]
y = training_set['Rings'].values.reshape(-1,1)

X_val = validation_set[['Length','Height']]
y_val = validation_set['Rings'].values.reshape(-1,1)


In [12]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression

In [14]:
lin_reg = LinearRegression()
# Train the model
lin_reg.fit(X_train, y_train)


In [15]:
# Test RMSE
lin_reg_pred = lin_reg.predict(X_test)
test_rmse = mean_squared_error(y_test,lin_reg_pred, squared = False)
print(f'Test RMSE:{test_rmse}')

# Validation RMSE
lin_reg_pred_val = lin_reg.predict(X_val)
val_rmse = mean_squared_error(y_val,lin_reg_pred_val, squared = False)
print(f'Validation RMSE:{val_rmse}')

Test RMSE:2.5984004074765825
Validation RMSE:1.5321754955062656




In [20]:
training_set

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,False,False,True
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,False,False,True
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,True,False,False
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,False,False,True
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
4075,0.545,0.450,0.150,0.7805,0.3795,0.1625,0.2160,8,False,False,True
4076,0.545,0.430,0.140,0.7720,0.2890,0.1900,0.2615,8,False,True,False
4077,0.550,0.435,0.125,0.7410,0.3480,0.1585,0.2060,9,False,True,False
4078,0.550,0.430,0.180,0.8265,0.4405,0.1590,0.2250,10,False,False,True


### Ridge Regression

In [39]:

# Create a Ridge Regression model
ridge_reg = Ridge(alpha=1.0) 

# Train the model
ridge_reg.fit(X_train, y_train)

In [40]:
# Test RMSE
ridge_reg_pred = ridge_reg.predict(X_test)
test_rmse_ridge = mean_squared_error(y_test,ridge_reg_pred, squared = False)
print(f'Test RMSE:{test_rmse_ridge}')

Test RMSE:2.6116923776529712




In [41]:
# Validation RMSE
ridge_reg_pred_val = ridge_reg.predict(X_val)
val_rmse_ridge = mean_squared_error(y_val,ridge_reg_pred_val, squared = False)
print(f'Validation RMSE:{val_rmse_ridge}')

Validation RMSE:1.5313084318431962


