In [1]:
#Imports

import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Data Processing

In [2]:
df = pd.read_csv("abalone.data", names=["sex", "length", "diameter", "height", "whole weight", "shucked weight",
                                       "viscera weight", "shell weight", "rings"])

df.head()

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


Since our target is age, we must convert the rings column to age by adding 1.5 to every entry, since abalone show age similarly to trees.

In [3]:
df["age"] = df["rings"] + 1.5
df = df.drop(columns = "rings")

df.head()

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,16.5
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,8.5
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,10.5
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,11.5
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,8.5


In [4]:
sexArr = df["sex"].to_numpy()
numRows = len(sexArr)

sexMat = np.zeros((numRows, 3), dtype=int)
for i in range(numRows):
    char = sexArr[i]
    if char == 'M':
        sexMat[i][0] = 1
    elif char == 'F':
        sexMat[i][1] = 1
    else:
        sexMat[i][2] = 1

sexdf = pd.DataFrame(data = sexMat, columns = ["M", "F", "I"])

df = pd.concat([sexdf, df], axis = 1, sort = False)
df = df.drop(columns = "sex")

df.head()

Unnamed: 0,M,F,I,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,age
0,1,0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,16.5
1,1,0,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,8.5
2,0,1,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,10.5
3,1,0,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,11.5
4,0,0,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,8.5


In [5]:
# TODO: Perform EDA

df.describe()

Unnamed: 0,M,F,I,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,age
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.365813,0.312904,0.321283,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,11.433684
std,0.481715,0.463731,0.467025,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.0,0.0,0.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,2.5
25%,0.0,0.0,0.0,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,9.5
50%,0.0,0.0,0.0,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,10.5
75%,1.0,1.0,1.0,0.615,0.48,0.165,1.153,0.502,0.253,0.329,12.5
max,1.0,1.0,1.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,30.5


In [6]:
df = df.sample(frac=1)

y = df['age'].to_numpy()
X = df.drop(columns='age').to_numpy()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
scaler = StandardScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
'''
param_grid = {'alpha': [0.1, 0.01, 0.001], 'learning_rate': ["constant", "optimal", "invscaling"], 
              'l1_ratio': [1, 0.5, 0.2, 0], 'max_iter':[100, 400, 1000, 10000],'eta0': [0.01, 0.001],
              'loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']}
'''

param_grid = {'alpha': [0.01, 0.001, .00001], 'learning_rate': ["optimal"], 
              'l1_ratio': [1, 0.2, 0], 'max_iter':[400, 1000, 10000],'eta0': [0.001, .0001],
              'loss': ['squared_loss']}

sgd = SGDRegressor()

sgd_cv = GridSearchCV(sgd, param_grid, scoring='neg_mean_squared_error', cv=10, verbose=2, n_jobs=-1)
sgd_cv.fit(X_train, y_train)


params_optimal_sgd = sgd_cv.best_params_

print("Best Score (negative mean squared error): %f" % sgd_cv.best_score_)
print("Optimal Hyperparameter Values: ", params_optimal_sgd)
print("\n")

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   26.2s


Best Score (negative mean squared error): -5.241217
Optimal Hyperparameter Values:  {'alpha': 0.001, 'eta0': 0.001, 'l1_ratio': 1, 'learning_rate': 'optimal', 'loss': 'squared_loss', 'max_iter': 10000}




[Parallel(n_jobs=-1)]: Done 525 out of 540 | elapsed:   35.0s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:   35.1s finished


In [11]:
lin_reg_sgd = SGDRegressor(**params_optimal_sgd)

lin_reg_sgd.fit(X_train, y_train)

y_test_predicted = lin_reg_sgd.predict(X_test)

test_mse_polynomial = mean_squared_error(y_test, y_test_predicted)

print("Mean squared error: %.2f"
      % test_mse_polynomial)

# Explained variance score: 1 is perfect prediction

test_r2_polynomial = r2_score(y_test, y_test_predicted)
print("Coefficient of determination r^2 variance score [1 is perfect prediction]: %.2f" 
      % test_r2_polynomial)

Mean squared error: 4.87
Coefficient of determination r^2 variance score [1 is perfect prediction]: 0.51
