In [57]:
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import loadmat
import torch
import torch.nn as nn

In [58]:
mnist = loadmat('mnist_49_3000.mat')
X = mnist['x']
y = mnist['y'][0]
X_train, X_test, y_train, y_test = train_test_split(X.T, y, test_size=1000, random_state=42)
print(X_train.shape)
print(y_train.shape)

(2000, 784)
(2000,)


In [59]:
def converter(inp):
    if inp > 0:
        return 1
    else:
        return 0

y_train_mod = [converter(val) for val in y_train]
y_test_mod  = [converter(val) for val in y_test]

In [78]:
class LogisticRegression(nn.Module):
    def __init__(self, x_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(x_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))
    
    def predict(self, x, threshold):
        return self.forward(x) >= threshold

class LogisticRegressionGradientDescent:
    def __init__(self, lr=0.01, lamb=10, num_iter=1000, x_dim=785, fit_intercept=True):
        self.lr = lr
        self.lamb = lamb
        self.num_iter = num_iter
        self.x_dim = x_dim
        self.fit_intercept = fit_intercept
        self.model = LogisticRegression(x_dim)
        
    def __add_intercept(self, x):
        intercept = np.ones((x.shape[0], 1))
        return np.concatenate((intercept, x), axis=1)
    
    def fit(self, x, y):
        if self.fit_intercept:
            x = self.__add_intercept(x)
        
        x = torch.tensor(x).float()
        y = torch.tensor(y).float()
        
        criterion = nn.BCELoss()
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)
        
        print("Training Error:")
        for epoch in range(self.num_iter):
            # Predict and get weights
            optimizer.zero_grad()
            y_pred = self.model.forward(x)
            theta = self.model.linear.weight
            
            # Calculate loss using the function from the 4.1 in homework document
            regularization_offset = self.lamb * (torch.dot(theta[:, 0], theta[:, 0]))
            loss = criterion(y_pred, y[:, None]) + regularization_offset
            loss.backward()
            optimizer.step()
            
            if epoch % 100 == 0:
                print(f"Epoch {epoch}/{num_iter}: {float(loss)}")
        
        print(f"Train Error: {float(loss)}")
    
    def predict(self, x, threshold):
        if self.fit_intercept:
            x = self.__add_intercept(x)
        x = torch.tensor(x).float()
        preds = self.model.forward(x)
        return preds >= threshold
        

In [80]:
# Gradient descent parameters
learning_rates = [0.1, 0.5, 1]
lambs = [0.01, 0.1, 0.5, 0.9, 1]
thresholds = [0.25, 0.4, 0.5, 0.6, 0.75]
num_iter = 1000

max_correct = 0
best_params = []

# Initialize the model, loss function, and optimizer
for learning_rate in learning_rates:
    print(f"Learning Rate: {learning_rate}")
    for lamb in lambs:
        print(f"lambda: {lamb}")
        model = LogisticRegressionGradientDescent(lr=learning_rate, lamb=lamb, num_iter=num_iter, x_dim=(X_train.shape[1] + 1))
        model.fit(X_train, y_train_mod)
        
        print("Thresholds:")
        for threshold in thresholds:
            preds = model.predict(X_test, threshold)
            predictions = []
            for pred in preds:
                if pred:
                    predictions.append(1)
                else:
                    predictions.append(0)
            
            correct = 0
            total = 0
            for i, val in enumerate(predictions):
                total += 1
                if val == y_test_mod[i]:
                    correct += 1
            if correct > max_correct:
                max_correct = correct
                best_params = [learning_rate, lamb, threshold]
            print(f"{threshold}: Correctly predicted: {correct}/{total}")
        print("")
    print("")
print(f"Best Parameters: lr={best_params[0]}, lambda={best_params[1]}, threshold={best_params[2]}")
print(f"Best Performance: {max_correct}/{total}")

Learning Rate: 0.1
lambda: 0.01
Training Error:
Epoch 0/1000: 0.6980333924293518
Epoch 100/1000: 0.1914200484752655
Epoch 200/1000: 0.1479988545179367
Epoch 300/1000: 0.12901455163955688
Epoch 400/1000: 0.1176883652806282
Epoch 500/1000: 0.10990528762340546
Epoch 600/1000: 0.10409972816705704
Epoch 700/1000: 0.09952925145626068
Epoch 800/1000: 0.09579091519117355
Epoch 900/1000: 0.09264496713876724
Train Error: 0.08996408432722092
Thresholds:
0.25: Correctly predicted: 959/1000
0.4: Correctly predicted: 972/1000
0.5: Correctly predicted: 971/1000
0.6: Correctly predicted: 973/1000
0.75: Correctly predicted: 953/1000

lambda: 0.1
Training Error:
Epoch 0/1000: 0.6915158033370972
Epoch 100/1000: 0.1915685534477234
Epoch 200/1000: 0.14848464727401733
Epoch 300/1000: 0.1295381635427475
Epoch 400/1000: 0.11820440739393234
Epoch 500/1000: 0.11040231585502625
Epoch 600/1000: 0.10457470268011093
Epoch 700/1000: 0.09998181462287903
Epoch 800/1000: 0.0962216854095459
Epoch 900/1000: 0.09305496513

In [83]:
# Train the model again using the best parameters found in the grid sweep above
model = LogisticRegressionGradientDescent(lr=best_params[0], lamb=best_params[1], num_iter=num_iter, x_dim=X_train.shape[1] + 1)
%time model.fit(X_train, y_train_mod)

# Generate predictions for the test set
preds = model.predict(X_test, best_params[2])
predictions = []
for pred in preds:
    if pred:
        predictions.append(1)
    else:
        predictions.append(0)

correct = 0
total = 0
for i, val in enumerate(predictions):
    total += 1
    if val == y_test_mod[i]:
        correct += 1

print(f"Accuracy: {correct}/{total}")



Training Error:
Epoch 0/1000: 0.6867411732673645
Epoch 100/1000: 0.08304082602262497
Epoch 200/1000: 0.07120714336633682
Epoch 300/1000: 0.06420852243900299
Epoch 400/1000: 0.059210795909166336
Epoch 500/1000: 0.055330995470285416
Epoch 600/1000: 0.052162718027830124
Epoch 700/1000: 0.04948648810386658
Epoch 800/1000: 0.04717110097408295
Epoch 900/1000: 0.04513220489025116
Train Error: 0.04332975298166275
CPU times: total: 5.27 s
Wall time: 1.33 s
Accuracy: 974/1000
