A1. Load dataset

In [3]:
import pandas as pd
import numpy as np

# The file is read with header=None, so the column labels are supplied
# explicitly, in the order the fields appear in each row.
column_names = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "Whole_weight",
    "Shucked_weight",
    "Viscera_weight",
    "Shell_weight",
    "Rings",
]

data = pd.read_csv("abalone.data", names=column_names, header=None)

# Sanity summary: how many rows were read and which labels were assigned.
print("Number of rows:", data.shape[0])
print("Column names:", list(data.columns))

# Last expression of the cell, so the first 5 rows are rendered as a table.
data.head()


Number of rows: 4177
Column names: ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings']


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
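# what is input: numeric physical measurements of each abalone (lengths and weights); Sex is the only categorical column
# what is output: Rings, which is converted to Age (Rings + 1.5) and used as the prediction target
# why output is numeric: Rings is an integer count of shell growth rings, so predicting it (or Age) is a regression problem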


A2. Convert target

In [4]:
# Regression target: the ring count shifted by a constant 1.5 (stored as a new column).
data["Age"] = 1.5 + data["Rings"]

A3. Choose features

In [6]:
# Predictor columns; their order here fixes the column order of X.
features = ["Length", "Diameter", "Shell_weight"]

# Plain NumPy arrays for the hand-written model: X has one column per feature,
# y is reshaped into a single-column 2-D array so it lines up with X @ w.
X = data[features].to_numpy()
y = data["Age"].to_numpy().reshape(-1, 1)

In [None]:
# Feature 1: Length → overall body growth indicator
# Feature 2: Diameter → shape/thickness dimension
# Feature 3: Shell_weight → accumulated long-term growth
# Combining dimensions + shell mass captures maturity better than soft weights.

A4. Train-test split

In [7]:
# Sequential 80/20 split: the first 80% of rows (in file order) are used for
# training and the remaining rows for testing. No shuffling is performed.
N = X.shape[0]
split = int(N * 0.8)

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (3341, 3)
Test shape: (836, 3)


A5. Normalize inputs

In [8]:
# Per-feature statistics are computed from the training rows only, so no
# information from the test rows enters the scaling.
mean, std = X_train.mean(axis=0), X_train.std(axis=0)

# The same shift and scale are applied to both splits.
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

In [None]:
# why normalization is needed for learning:
# ensures stable gradients and prevents one feature dominating updates.

Part B: Define the model

In [9]:
def forward(X, w, b):
    """Linear model prediction.

    Returns ``X @ w + b``: the matrix product of the inputs ``X`` with the
    weights ``w``, with the bias ``b`` added (broadcast) to every entry.
    In this notebook ``X`` is (n_samples, n_features), ``w`` is
    (n_features, 1) and ``b`` is a scalar, giving an (n_samples, 1) result.
    """
    return X @ w + b

# Initialize parameters
# A dedicated, seeded generator makes the starting weights -- and therefore
# every loss value printed during training -- identical on each
# Restart & Run All. Change SEED to try a different initialisation.
SEED = 42
rng = np.random.default_rng(SEED)

d = X_train.shape[1]                     # one weight per input feature
w = rng.standard_normal((d, 1)) * 0.01   # small random start, column vector (d, 1)
b = 0.0                                  # scalar bias, starts at zero

print("X shape:", X_train.shape)
print("w shape:", w.shape)
print("b shape:", np.array(b).shape)

X shape: (3341, 3)
w shape: (3, 1)
b shape: ()


In [10]:
# parameters are: w (weights), b (bias)
# number of parameters: 3 weights + 1 bias = 4


Part C: MSE Loss

In [11]:
def mse(y, y_hat):
    """Mean squared error between targets ``y`` and predictions ``y_hat``.

    The element-wise differences are squared and averaged over all elements.
    """
    residual = y - y_hat
    return np.mean(residual * residual)

In [12]:
# why square: penalizes large errors more
# expensive mistakes: large deviations dominate loss

Part D: Gradients

In [13]:
def grad_w(X, y, y_hat):
    """Gradient of the MSE loss with respect to the weights.

    Computes ``(2/N) * X.T @ (y_hat - y)`` where ``N = len(y)``. With ``X`` of
    shape (N, d) and column-vector targets/predictions the result is (d, 1).
    """
    scale = 2 / len(y)
    residual = y_hat - y
    # Scale X.T first, then multiply: same evaluation order as
    # the left-to-right expression (2/N) * X.T @ residual.
    return (scale * X.T) @ residual

def grad_b(y, y_hat):
    """Gradient of the MSE loss with respect to the bias.

    Computes ``(2/N) * sum(y_hat - y)`` where ``N = len(y)``; the sum runs
    over every element, so the result is a scalar.
    """
    total_residual = np.sum(y_hat - y)
    return (2 / len(y)) * total_residual

In [14]:
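# gradient means: the vector of partial derivatives of the loss with respect to the parameters; it points in the direction of steepest increase of the loss
# subtracting gradient reduces loss because a small step in the opposite direction moves the parameters toward lower loss
# meaning of large gradient: the loss is very sensitive to that parameter, so a small change in it changes the loss a lot
# effect of too-large learning rate: updates overshoot the minimum, so the loss oscillates or diverges instead of decreasing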

Part E: Training Loop

In [15]:

lr = 0.01
epochs = 500

print("\nTraining started...\n")

for epoch in range(epochs):
    # Forward pass and loss under the current (pre-update) parameters.
    y_hat = forward(X_train, w, b)
    loss = mse(y_train, y_hat)

    # Progress report every 50 epochs; the value shown is the loss measured
    # before this epoch's parameter update.
    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

    # Both gradients are evaluated from the same y_hat before either
    # parameter changes.
    dW, db = grad_w(X_train, y_train, y_hat), grad_b(y_train, y_hat)

    # Plain gradient-descent step; w and b are rebound to new values rather
    # than modified in place.
    w, b = w - lr * dW, b - lr * db


Training started...

Epoch 0, Loss: 144.4143
Epoch 50, Loss: 24.6099
Epoch 100, Loss: 9.2003
Epoch 150, Loss: 7.1077
Epoch 200, Loss: 6.7909
Epoch 250, Loss: 6.7183
Epoch 300, Loss: 6.6847
Epoch 350, Loss: 6.6615
Epoch 400, Loss: 6.6437
Epoch 450, Loss: 6.6296


In [None]:
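# Initial expectation: loss should decrease gradually
# Revised expectation after training: with lr = 0.01 the loss falls quickly at first (144.41 -> 9.20 within the first 100 epochs) and then flattens out near 6.63, i.e. it decreases steadily when the learning rate is reasonable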

Part F: Evaluation

In [16]:
# Predictions on the held-out rows using the trained parameters.
y_test_hat = forward(X_test, w, b)

# Test metrics: mean squared error and mean absolute error.
test_mse = mse(y_test, y_test_hat)
test_mae = np.abs(y_test - y_test_hat).mean()

print("\nTest MSE:", test_mse)
print("Test MAE:", test_mae)

# Side-by-side view of the first five test rows.
print("\n5 Sample Predictions:")
for i in range(5):
    true_age, pred_age = y_test[i, 0], y_test_hat[i, 0]
    abs_error = abs(true_age - pred_age)
    print(f"True: {true_age:.2f}, Pred: {pred_age:.2f}, Abs Error: {abs_error:.2f}")


Test MSE: 5.0947805273710065
Test MAE: 1.7325825577309466

5 Sample Predictions:
True: 13.50, Pred: 10.82, Abs Error: 2.68
True: 15.50, Pred: 9.69, Abs Error: 5.81
True: 14.50, Pred: 10.11, Abs Error: 4.39
True: 14.50, Pred: 11.11, Abs Error: 3.39
True: 13.50, Pred: 11.35, Abs Error: 2.15


In [None]:
# systematic errors: very young and very old abalones tend to have larger errors
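# observed bias: the linear model underestimates older abalones (e.g. true 15.50 vs predicted 9.69 in the samples above), likely because growth slows with age, so size and shell weight stop increasing in step with the ring count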