In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the raw Abalone data file on the UCI Machine Learning Repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"

In [3]:
# Column labels in file order. They are supplied here via `names=` rather
# than being read from the file itself.
cols = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "WholeWeight",
    "ShuckedWeight",
    "VisceraWeight",
    "ShellWeight",
    "Rings",
]

# Fetch the CSV from `url` and parse it into a DataFrame with the labels above.
df = pd.read_csv(url, names=cols)

In [4]:
# Sanity check of the load: row count, column labels, first few rows.
print(f"Rows: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(df.head())

Rows: 4177
Columns: ['Sex', 'Length', 'Diameter', 'Height', 'WholeWeight', 'ShuckedWeight', 'VisceraWeight', 'ShellWeight', 'Rings']
  Sex  Length  Diameter  Height  WholeWeight  ShuckedWeight  VisceraWeight  \
0   M   0.455     0.365   0.095       0.5140         0.2245         0.1010   
1   M   0.350     0.265   0.090       0.2255         0.0995         0.0485   
2   F   0.530     0.420   0.135       0.6770         0.2565         0.1415   
3   M   0.440     0.365   0.125       0.5160         0.2155         0.1140   
4   I   0.330     0.255   0.080       0.2050         0.0895         0.0395   

   ShellWeight  Rings  
0        0.150     15  
1        0.070      7  
2        0.210      9  
3        0.155     10  
4        0.055      7  


In [5]:
# Checkpoint:
# what is input: physical measurements of abalone (length, weight etc.)
# what is output: Rings (age proxy)
# why output is numeric: rings count is a number (regression problem)

In [7]:
# Regression target: the ring count shifted by a constant 1.5.
# NOTE(review): presumably the dataset's "age in years = rings + 1.5"
# convention — confirm against the dataset description.
df["y"] = df["Rings"].add(1.5)

In [8]:
# Three of the numeric measurements are used as model inputs.
features = ["Length", "Diameter", "WholeWeight"]

# Design matrix (one row per sample) and the target as a single column.
X = df[features].to_numpy()
y = df["y"].to_numpy().reshape(-1, 1)

In [9]:
# Justification:
# Feature 1: Length → overall body size relates to age
# Feature 2: Diameter → shell growth indicator
# Feature 3: WholeWeight → heavier abalones are usually older

In [10]:
# Sequential 80/20 split: the first 80% of rows are used for training and
# the remaining rows for testing. Rows keep their file order (no shuffling).
n = len(X)
split = int(0.8 * n)

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [11]:
# Report the shape of each piece of the split.
for label, part in [
    ("Train X", X_train),
    ("Train y", y_train),
    ("Test X", X_test),
    ("Test y", y_test),
]:
    print(f"{label}:", part.shape)

Train X: (3341, 3)
Train y: (3341, 1)
Test X: (836, 3)
Test y: (836, 1)


In [12]:
# Per-feature statistics taken over the training rows only (axis 0), so the
# test rows do not influence the scaling parameters.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

In [13]:
# Standardize both splits with the training statistics.
# NOTE: this overwrites X_train / X_test, so running the cell a second time
# would re-scale already-scaled data; re-run from the split cell instead.
X_train, X_test = (X_train - mean) / std, (X_test - mean) / std

In [14]:
# Checkpoint:
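# why normalization is needed for learning:
# it puts all features on a comparable scale, so a single learning rate suits
# every weight; this keeps gradient steps stable and speeds up convergence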

In [15]:
def forward(X, w, b, verbose=True):
    """Compute the linear model's predictions ``X @ w + b``.

    Parameters
    ----------
    X : array
        Input matrix; it is matrix-multiplied with ``w``.
    w : array
        Weights.
    b : scalar or array
        Bias, added (with broadcasting) to ``X @ w``.
    verbose : bool, default True
        When true, print the shapes of ``X``, ``w``, ``b`` and the result.
        This is the original behavior. Pass ``False`` to call the function
        silently, e.g. inside a training loop.

    Returns
    -------
    array
        The predictions ``X @ w + b``.
    """
    y_hat = X @ w + b

    if verbose:
        # Shape report, useful for checking that the dimensions line up.
        print("X:", X.shape)
        print("w:", w.shape)
        print("b:", np.shape(b))
        print("y_hat:", y_hat.shape)

    # Checkpoint:
    # parameters are: w and b
    # number of parameters: 3 weights + 1 bias = 4

    return y_hat

In [16]:
def mse(y, y_hat):
    """Mean squared error between targets ``y`` and predictions ``y_hat``.

    Parameters
    ----------
    y : array-like
        True values.
    y_hat : array-like
        Predicted values.

    Returns
    -------
    numpy floating scalar
        The mean of the squared element-wise differences.

    Notes
    -----
    If the two inputs differ only by length-1 axes (for example ``(N,)``
    versus ``(N, 1)``), ``y_hat`` is reshaped to ``y``'s shape before
    subtracting. Without this, NumPy broadcasting would expand the
    difference to an ``(N, N)`` matrix and silently return a wrong loss.
    Any other shape combination is left to normal broadcasting rules.
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)

    # Align column-vector vs. flat-vector inputs instead of broadcasting them
    # against each other.
    if y.shape != y_hat.shape and np.squeeze(y).shape == np.squeeze(y_hat).shape:
        y_hat = y_hat.reshape(y.shape)

    loss = np.mean((y - y_hat)**2)

    # Checkpoint:
    # why square: penalizes large errors more
    # what mistakes are expensive: large deviations

    return loss

In [17]:
# Checkpoint:
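# what gradient means in words: the vector of slopes of the loss with respect
#   to each parameter; it points in the direction in which the loss increases fastest
# why subtracting gradient reduces loss: a small step against that direction
#   moves the parameters downhill, so the loss decreases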

In [18]:
def grad_w(X, y, y_hat):
    """Gradient of the MSE loss with respect to the weights.

    Evaluates ``(2 / N) * X.T @ (y_hat - y)`` where ``N = len(y)``.
    """
    residual = y_hat - y
    scale = 2 / len(y)
    # Scale X.T first, then multiply by the residual (same order as the formula above).
    return (scale * X.T) @ residual

In [19]:
def grad_b(y, y_hat):
    """Gradient of the MSE loss with respect to the bias.

    Evaluates ``(2 / N) * sum(y_hat - y)`` where ``N = len(y)``.
    """
    residual = y_hat - y
    scale = 2 / len(y)
    return scale * np.sum(residual)

In [20]:
# Checkpoint:
# meaning of large gradient: loss changes rapidly
# effect of too-large learning rate: divergence/oscillation

In [22]:
# Number of input features (columns of the training matrix).
d = X_train.shape[1]

# Seeded generator so that Restart & Run All reproduces the same initial
# weights, and therefore the same loss curve, on every run.
rng = np.random.default_rng(42)

# initialize w (small random values), shape (d, 1)
w = rng.standard_normal((d, 1)) * 0.01

# initialize b (zero)
b = 0.0

# choose learning rate and epochs
lr = 0.01
epochs = 500

# Initial expectation:
# loss should go down gradually

# Full-batch gradient descent: every epoch uses all training rows.
for epoch in range(epochs):

    # 1) forward pass
    y_hat = X_train @ w + b

    # 2) compute loss (measured before this epoch's update)
    loss = mse(y_train, y_hat)

    # 3) compute gradients
    dW = grad_w(X_train, y_train, y_hat)
    db = grad_b(y_train, y_hat)

    # 4) update: step against the gradient
    w -= lr * dW
    b -= lr * db

    # Log every 50th epoch to keep the output short.
    if epoch % 50 == 0:
        print(epoch, loss)

0 144.33094785503434
50 25.146850577533243
100 9.81280502320147
150 7.776139629272984
200 7.501971582242397
250 7.461749648716772
300 7.452738089166425
350 7.4480307428574735
400 7.444042934732362
450 7.440285405542608


In [23]:
# Revised expectation after training:
# loss decreases smoothly and stabilizes

In [24]:
# Score the trained parameters on the held-out rows.
y_pred = X_test @ w + b

abs_errors = np.abs(y_test - y_pred)
test_mse = mse(y_test, y_pred)
test_mae = abs_errors.mean()

print("\nTest MSE:", test_mse)
print("Test MAE:", test_mae)


Test MSE: 5.364065792524271
Test MAE: 1.8102354149670565


In [25]:
# Show true value, prediction and absolute error for the first five test rows.
print("\nSample predictions:")
for i in range(5):
    true, pred = y_test[i, 0], y_pred[i, 0]
    print(true, pred, abs(true - pred))


Sample predictions:
13.5 11.023120825408867 2.476879174591133
15.5 9.943823304074062 5.556176695925938
14.5 10.440358683817514 4.059641316182486
14.5 10.862528935517995 3.637471064482005
13.5 10.883935700044454 2.6160642999555463


In [26]:
# Checkpoint:
# systematic errors: older abalones are underpredicted — all five samples above
#   (true values 13.5–15.5) are predicted roughly 2.5–5.6 too low
# observed bias: model pulls predictions toward the mean age