HW4 Pytorch: Classification, Autoencoders, Word Embedding, Image Features, LSTM

PROBLEM 1: Setup a tensor library [Optional, no credit]

A) Set up your favorite tensor-based library for deep learning, such as PyTorch or TensorFlow, and familiarize yourself with its basic usage. If using PyTorch, you can test that it is installed properly by importing it in Python, e.g. `import torch; print(torch.__version__)`.

B) Train a simple feed-forward neural network on the MNIST dataset with 80/20 train and test splits and report results



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import torch.nn.functional as F

# Device selection: use CUDA when PyTorch reports it as available, else CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Training hyperparameters
batch_size, learning_rate, momentum = 64, 0.01, 0.9
epochs = 5  # Increase for better accuracy

# Preprocessing: convert each image to a tensor, then normalise it with
# mean 0.1307 and std 0.3081.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# The whole MNIST "train" partition (downloaded to ./data on first use).
full_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Random 80/20 split of that partition into train and held-out test subsets.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, test_size],
)

# Batch iterators: only the training data is reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Neural Net Model
class CNN(nn.Module):
    """Small convolutional classifier for 1-channel 28x28 images, 10 classes.

    Two 3x3 convolutions (each followed by ReLU and 2x2 max pooling) feed a
    three-layer fully connected head. ``forward`` returns log-probabilities,
    so the matching loss is ``nn.NLLLoss``.
    """

    def __init__(self):
        super().__init__()
        # padding=1 with a 3x3 kernel keeps the spatial size unchanged.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.dropout = nn.Dropout2d(0.25)
        # Two 2x2 poolings shrink 28x28 to 7x7, with 64 channels out of conv2.
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        # Convolutional feature extractor: conv -> ReLU -> 2x2 max pool, twice.
        features = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2)
        features = F.max_pool2d(F.relu(self.conv2(features)), kernel_size=2)
        features = self.dropout(features)

        # Collapse everything except the batch dimension.
        flat = features.flatten(start_dim=1)

        # Fully connected classifier head.
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.log_softmax(logits, dim=1)


# Build the network and move its parameters to the selected device.
model = CNN()
model = model.to(device)

# Optimizer and Loss: SGD with Nesterov momentum, paired with NLLLoss because
# the network already outputs log-probabilities.
optimizer = optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=momentum,
    nesterov=True,
)
criterion = nn.NLLLoss()

# Training loop: one full pass over train_loader per epoch.
for epoch in range(1, epochs + 1):
    model.train()  # training mode, so the model's dropout layer is active
    total_loss = 0
    for data, target in train_loader:
        data = data.to(device)
        target = target.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        optimizer.step()

        # Accumulates the SUM of per-batch losses (not an average).
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# Evaluation on the held-out split: eval mode, and no gradient tracking.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        # The predicted class is the index of the largest log-probability.
        pred = model(data).argmax(dim=1)
        correct += pred.eq(target).sum().item()
        total += target.size(0)

accuracy = 100. * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

Epoch 1, Loss: 219.8138
Epoch 2, Loss: 52.8084
Epoch 3, Loss: 36.8860
Epoch 4, Loss: 29.0962
Epoch 5, Loss: 22.8169
Test Accuracy: 98.77%


PROBLEM 2 : NNet supervised classification with tuned word vectors

Train a neural network on a sizeable subset of 20NG (say, at least 5 categories)


Setup and preprocessing:

In [None]:
from sklearn.datasets import fetch_20newsgroups
from gensim.utils import simple_preprocess

# The five newsgroups used as class labels.
categories = [
    'comp.graphics',
    'sci.space',
    'rec.sport.baseball',
    'talk.politics.misc',
    'soc.religion.christian',
]

# Fetch the training portion of 20 Newsgroups restricted to those categories.
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
texts = newsgroups.data
labels = newsgroups.target

In [None]:
# Tokenise every document with gensim's simple_preprocess (one token list per doc).
tokenized_texts = list(map(simple_preprocess, texts))

Load GloVe and create Vocabulary

In [None]:
from collections import Counter

# Build the vocabulary from the corpus: count every token over all documents.
word_counts = Counter(token for tokens in tokenized_texts for token in tokens)

# Ids 0 and 1 are reserved for the special tokens, so real words start at 2.
# The special tokens are inserted last, which keeps the dict's key order.
vocab = {token: token_id for token_id, token in enumerate(word_counts, start=2)}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

In [None]:
import numpy as np

# Load the pretrained GloVe vectors into a {word: float32 vector} lookup.
embedding_dim = 100
glove_path = 'glove.6B.100d.txt'
embeddings_index = {}

with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line holds the word followed by its whitespace-separated coefficients.
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [None]:
# Create the embedding matrix: one row per vocabulary id.
# Every row starts as N(0, 1) noise; rows whose word has a GloVe vector are
# then overwritten, so only words missing from GloVe keep a random vector.
embedding_matrix = np.random.normal(loc=0, scale=1, size=(len(vocab), embedding_dim))
for token, row in vocab.items():
    pretrained = embeddings_index.get(token)
    if pretrained is not None:
        embedding_matrix[row] = pretrained

Converting Docs to Indices

In [None]:
def doc2ind(doc):
    """Map a list of tokens to vocabulary ids, using the <UNK> id for unknown tokens."""
    unk_id = vocab['<UNK>']
    return [vocab.get(token, unk_id) for token in doc]

indexed_docs = list(map(doc2ind, tokenized_texts))

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

# Right-pad every document with the <PAD> id up to the length of the longest
# one, producing a single (num_docs, max_len) tensor.
doc_tensors = [torch.tensor(doc) for doc in indexed_docs]
padded_docs = pad_sequence(
    doc_tensors,
    batch_first=True,
    padding_value=vocab['<PAD>'],
)
labels_tensor = torch.tensor(labels)

Create Dataset and Dataloader

In [None]:
from torch.utils.data import TensorDataset, DataLoader, random_split

# Pair each padded document with its label, then split 80/20 at random.
dataset = TensorDataset(padded_docs, labels_tensor)
train_len = int(0.8 * len(dataset))
val_len = len(dataset) - train_len
train_ds, val_ds = random_split(dataset, [train_len, val_len])

# Only the training batches are shuffled.
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=64)

Define our Neural Network

In [None]:
from torch import nn

# Wrap the matrix in an embedding layer; freeze=True keeps the vectors fixed
# during training.
pretrained_weights = torch.tensor(embedding_matrix, dtype=torch.float32)
embedding_layer = nn.Embedding.from_pretrained(pretrained_weights, freeze=True)

class TextClassifier(nn.Module):
    """Bag-of-embeddings document classifier.

    Token embeddings are averaged into one document vector, which is passed
    through a single hidden layer to produce class scores.

    Args:
        embedding_layer: an ``nn.Embedding`` mapping token ids to vectors.
        hidden_dim: width of the hidden layer.
        num_classes: number of output classes.
        pad_idx: token id used for padding; those positions are excluded
            from the average. Pass ``None`` to average over every position.

    ``forward`` returns raw, unnormalised logits of shape
    ``(batch, num_classes)``. That is the input ``nn.CrossEntropyLoss``
    expects, because it applies log-softmax internally; feeding it softmax
    probabilities would normalise twice and flatten the gradients. Apply
    ``torch.softmax(logits, dim=1)`` explicitly if probabilities are needed.
    ``argmax`` predictions are the same either way.
    """

    def __init__(self, embedding_layer, hidden_dim, num_classes, pad_idx=0):
        super().__init__()
        self.embedding = embedding_layer
        self.pad_idx = pad_idx
        # Take the input width from the layer itself rather than from a global.
        self.fc1 = nn.Linear(embedding_layer.embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits for a ``(batch, seq_len)`` tensor of token ids."""
        embedded = self.embedding(x)
        if self.pad_idx is None:
            pooled = embedded.mean(dim=1)
        else:
            # Masked mean: average only over real tokens, so heavily padded
            # (short) documents are not dominated by the pad vector.
            mask = (x != self.pad_idx).unsqueeze(-1).to(embedded.dtype)
            # clamp avoids division by zero for a document that is all padding.
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled = (embedded * mask).sum(dim=1) / token_counts
        hidden = torch.relu(self.fc1(pooled))
        return self.fc2(hidden)

# One output unit per selected newsgroup category.
model = TextClassifier(
    embedding_layer,
    hidden_dim=128,
    num_classes=len(categories),
)

Train the model

In [None]:
# Adam optimiser over the model's parameters, with cross-entropy as the loss.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop: five passes over the training batches.
for epoch in range(5):
    model.train()
    total_loss = 0
    for x_batch, y_batch in train_dl:
        optimizer.zero_grad()
        loss = loss_fn(model(x_batch), y_batch)
        loss.backward()
        optimizer.step()
        # Sum of per-batch losses for this epoch (not an average).
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Evaluate accuracy

In [None]:
# Validation accuracy: eval mode, no gradient tracking.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for x_batch, y_batch in val_dl:
        # The predicted class is the index of the highest score in each row.
        predicted = model(x_batch).argmax(dim=1)
        correct += predicted.eq(y_batch).sum().item()
        total += y_batch.size(0)
print(f"Accuracy: {100 * correct / total:.2f}%")

In [None]:
# Fine-tuning the embeddings.
# Unfreezing on its own changes nothing: the vectors only move if training
# continues afterwards. So make the embedding matrix trainable and run a few
# more epochs on the training batches.
model.embedding.weight.requires_grad = True

# A fresh optimizer with a smaller learning rate than the initial training
# run, so the pretrained vectors are adjusted gradually.
finetune_optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
finetune_loss_fn = nn.CrossEntropyLoss()

for epoch in range(3):
    model.train()
    total_loss = 0
    for x_batch, y_batch in train_dl:
        finetune_optimizer.zero_grad()
        loss = finetune_loss_fn(model(x_batch), y_batch)
        loss.backward()
        finetune_optimizer.step()
        # Sum of per-batch losses for this epoch (not an average).
        total_loss += loss.item()
    print(f"Fine-tune epoch {epoch+1}, Loss: {total_loss:.4f}")

Visualization of embeddings using TSNE

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Take the embedding rows of the first 500 vocabulary entries (dict order).
sample_idx = list(vocab.values())[:500]
embedding_weights = model.embedding.weight.data
sample_vectors = embedding_weights[sample_idx].cpu().numpy()

# Project the sampled vectors down to two dimensions with t-SNE.
tsne = TSNE(n_components=2)
reduced = tsne.fit_transform(sample_vectors)

# Scatter plot of the 2-D projection.
xs, ys = reduced[:, 0], reduced[:, 1]
plt.scatter(xs, ys)
plt.title("t-SNE of Word Embeddings (Post-Fine-Tuning)")
plt.show()

PROBLEM 3 [Optional, no credit]: Autoencoders

You can pick your own text to fine-tune word vectors, as long as it is reasonable in size and very domain-specific (compared to general English). Suggestions:
- Alice in Wonderland
- Sonnets
- specific categories (labels) from the 20NG or Reuters datasets
- use your favorite specific text (like a book, or project)


PROBLEM 4 [Optional, no credit]: Autoencoders

For each one of the datasets MNIST, 20NG (required) and SPAMBASE, FASHION (optional), run an autoencoder in PyTorch with a chosen hidden layer size K (try K = 5, 10, 20, 100, 200; what is the smallest K that works?).

Load the data with dataloader https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

Construct an Autoencoder with the following architecture :

Two linear layers with in-features matching the dimensions of the input and out-features matching the size of K

Two linear layers with in-features matching K and out-features matching the dimensions of the input.

Define a forward pass with relu

Code a train loop with number of epochs as 10.

Define loss and Optimizer (Adam)

Train the model

use gpu if available

use mean-squared error loss

create a model from Autoencoder class load it to the specified device, either gpu or cpu

Verify the obtained re-encoding of data (the new feature representation) in several ways:

repeat a classification train/test task, or a clustering task
examine the new pairwise distances dist(i,j) against the old distances obtained with the original features (sample 100 pairs of related words)
examine the top 20 neighbors (by new distance) set overlap with the old neighbors, per datapoint
for images, rebuild the image from the output layer and draw it to look at it


PROBLEM 5 [Optional, no credit]: Image Feature Extraction

Run a Convolutional Neural Network in pytorch to extract image features. In practice the network usually does both the feature extraction and the supervised task (classification) in one pipeline.


PROBLEM 6 [Optional, no credit]: LSTM for text

Run a Recurrent Neural Network/LSTM in PyTorch to model word dependencies/order in text. It can be used for translation, next-word prediction, event detection, etc.
