# Programming Assignment 2 - PRML
### UBIT: haudipud  
### UB ID: 50599797


## Section 1: Binary Classification (30 marks)
- Dataset: `abalone.csv`
- Task: Predict if abalone is older than 10 years (rings > 8.5)
- Model: Logistic Regression (from `prml.linear`)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load data
df = pd.read_csv('abalone.csv')
df['age'] = df['rings'] + 1.5
df['binary_label'] = (df['age'] > 10).astype(int)

# Drop unused columns
X = df.drop(columns=['rings', 'age', 'binary_label'])
y = df['binary_label']

# One-hot encoding for 'class' column
if X['class'].dtype == object:
    X = pd.get_dummies(X, columns=['class'], drop_first=True)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [2]:
# Logistic Regression from PRML
from prml.linear import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.classify(X_test)

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {acc:.4f}\n")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

AttributeError: `np.Inf` was removed in the NumPy 2.0 release. Use `np.inf` instead.

## Section 2: Multi-class Classification (30 marks)
- Dataset: `abalone.csv`
- Task: Predict age group: young (<=8), adult (9–11), old (>=12)
- Model: Softmax Regression (from `prml.linear`)


In [None]:
# Create age group classes: 0 = young (<=8), 1 = adult (9–11), 2 = old (>=12)
df['age_group'] = pd.cut(df['rings'], bins=[0, 8, 11, df['rings'].max()], labels=[0,1,2]).astype(int)
X_multi = df.drop(columns=['rings', 'age', 'binary_label', 'age_group'])
y_multi = df['age_group']

# One-hot encoding
if X_multi['class'].dtype == object:
    X_multi = pd.get_dummies(X_multi, columns=['class'], drop_first=True)

# Scale features
X_multi_scaled = scaler.fit_transform(X_multi)

# Train-test split
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_multi_scaled, y_multi, test_size=0.2, random_state=42)


In [None]:
from prml.linear import SoftmaxRegression

model_multi = SoftmaxRegression()
model_multi.fit(X_train_m, y_train_m)
y_pred_m = model_multi.classify(X_test_m)

acc_m = accuracy_score(y_test_m, y_pred_m)
cm_m = confusion_matrix(y_test_m, y_pred_m)
report_m = classification_report(y_test_m, y_pred_m)

print(f"Accuracy: {acc_m:.4f}\n")
print("Confusion Matrix:\n", cm_m)
print("\nClassification Report:\n", report_m)

## Section 3: Neural Networks (20 marks)
- Dataset: `bonus.csv`
- Task: Predict wine `quality` using a neural network
- Model: Feedforward Neural Network (PyTorch)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Load dataset
bonus_df = pd.read_csv('bonus.csv')
X = bonus_df.drop(columns=['quality'])
y = bonus_df['quality']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_test = torch.tensor(y_test.values, dtype=torch.long)

# Define the model
class FeedforwardNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.net(x)

# Instantiate model
input_dim = X_train.shape[1]
hidden_dim = 32
output_dim = len(y.unique())
model = FeedforwardNN(input_dim, hidden_dim, output_dim)

# Train the model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
epochs = 30

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_test).argmax(dim=1)
    acc = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy: {acc:.4f}")
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred))

## Section 4: Convolutional Neural Networks (20 marks)
- Dataset: Image folders (`train/` and `test/`)
- Classes: `adidas`, `converse`, `nike`
- Task: Classify images using a CNN
- Model: PyTorch CNN with Conv2D + ReLU + MaxPool + FC layers


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# Define transforms
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])

# Load datasets from folders
train_dataset = datasets.ImageFolder(root='train', transform=transform)
test_dataset = datasets.ImageFolder(root='test', transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define CNN
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(32 * 16 * 16, 100),
            nn.ReLU(),
            nn.Linear(100, 3)
        )

    def forward(self, x):
        x = self.conv(x)
        x = self.fc(x)
        return x

# Train the CNN
model = CNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss:.4f}")

# Evaluate on test set
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=train_dataset.classes))

## Section 5: Bonus - Multi-class Classification using Sklearn (10 marks)
- Dataset: `bonus.csv`
- Task: Predict wine `quality` using any `sklearn` classifier
- Model: RandomForestClassifier


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load and split data
bonus_df = pd.read_csv('bonus.csv')
X = bonus_df.drop(columns=['quality'])
y = bonus_df['quality']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"Accuracy: {acc:.4f}\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))