# EECE568 - Assignment 2

### Mount on Google Drive


In [31]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

In [32]:
# %cd gdrive/My Drive/EECE568_Assignment2/

### Imports

In [33]:
import time
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

# import standard PyTorch modules
import torch
import torch.nn as nn
import torch.optim as optim

# import torchvision module to handle image manipulation
import torchvision
import torchvision.transforms as transforms
print("Running on torch.__version__ = ", torch.__version__)

torch.backends.cudnn.deterministic = True

# utils
import os
from tqdm import tqdm
tqdm.pandas()
from collections import Counter

Running on torch.__version__ =  2.1.0


### Helper Functions

In [34]:
# a function to move tensors from the CPU to the GPU and vice versa
def dict_to_device(orig, device):
    new = {}
    for k,v in orig.items():
        new[k] = v.to(device)
    return new

# a function to make gray-scale images the same shape as color images
def gray_to_color(x):
    return x.repeat(3, 1, 1)

# a function to plot a batch of images together
def plot_images(img, ax):
    img = torchvision.utils.make_grid(img)
    npimg = img.numpy()
    ax.imshow(np.transpose(npimg, (1, 2, 0)))

## Q1

### Dataset and Dataloader

In [35]:
# Use standard MNIST dataset
class MyDataset(torchvision.datasets.MNIST):
    def __init__(self, *args, debug=False, **kwargs):
        super().__init__(*args, **kwargs)
        self.debug = debug

    def __getitem__(self, idx):
        data = super().__getitem__(idx)
        img = data[0]
        label = data[1]
        return {'image': img, 'label': label}

    def __len__(self):
        return super().__len__()

dataset = MyDataset(
    root = './',
    train = True,
    download = True,
    transform = transforms.Compose([
        transforms.ToTensor(),
        gray_to_color,
    ]),
    debug=True,
)

loader = torch.utils.data.DataLoader(dataset, batch_size=128, num_workers=0)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz


34.0%

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MyDataset/raw/train-images-idx3-ubyte.gz


100.0%


Extracting ./MyDataset/raw/train-images-idx3-ubyte.gz to ./MyDataset/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz


100.0%

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MyDataset/raw/train-labels-idx1-ubyte.gz
Extracting ./MyDataset/raw/train-labels-idx1-ubyte.gz to ./MyDataset/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz



100.0%

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MyDataset/raw/t10k-images-idx3-ubyte.gz
Extracting ./MyDataset/raw/t10k-images-idx3-ubyte.gz to ./MyDataset/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz



100.0%

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MyDataset/raw/t10k-labels-idx1-ubyte.gz
Extracting ./MyDataset/raw/t10k-labels-idx1-ubyte.gz to ./MyDataset/raw





### ToDo: train, validation, test split

The standard MNIST dataset does not provide a validation set. Use 20% of the training data as the validation set.

The standard MNIST dataset has a test set, and you can download it similar to downloading the train set, only by setting the train label to 'False'. Use the test set only for final the evaluation.

In [36]:
# ToDo: load the test set


In [37]:
# ToDo: split the training data into train and validation, and define the dataloaders
train_loader = None
val_loader = None
test_loader = None

In [38]:
# ToDo: print the size of train, validation, and test sets


### Network

In [39]:
# an example of using available models in PyTorch
class MyResNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet18 = torchvision.models.resnet18(num_classes=10)

    def forward(self, input_dict):
        pred_label = self.resnet18(input_dict['image'])
        return {'label': pred_label}

# a simple CNN model, implemented from the scratch
class MyNetwork(nn.Module):
    def __init__(self):
        super(MyNetwork, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=1)
        self.fc = nn.Linear(32*26*26, 10)

    def forward(self, input_dict):
        x = self.conv1(input_dict['image'])
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        pred_label = nn.functional.log_softmax(x, dim=1)

        return {'label': pred_label}

# You can choose either of the defined networks, or define your own neural net
network = MyNetwork().to('cuda')
print(network)

AssertionError: Torch not compiled with CUDA enabled

### Training

In [None]:
# training loop
%matplotlib inline
from IPython import display

# prepare plotting
fig = plt.figure(figsize=(20, 5), dpi= 80, facecolor='w', edgecolor='k')
axes = fig.subplots(1,3)

optimizer = torch.optim.Adam(network.parameters(), lr=0.001)

num_epochs = 5
train_losses = []
val_losses = []

for e in range(num_epochs):
    train_iter = iter(train_loader)
    network.train()
    for i in range(len(train_loader)):
        batch_cpu = next(train_iter)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)
        pred_cpu = dict_to_device(pred, 'cpu')

        # calculate the loss and backward the gradient
        loss = nn.CrossEntropyLoss()(pred['label'], batch_gpu['label'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

        if i%100==0:
            axes[0].cla()
            axes[1].cla()

            # plot some sample image inputs
            plot_images(batch_cpu['image'][0:1], ax=axes[0])
            axes[0].legend()
            axes[0].set_title('sample input')

            # plot the training error on a log plot
            axes[1].plot(train_losses, label='loss')
            axes[1].set_yscale('log')
            axes[1].set_title('Training loss')
            axes[1].set_xlabel('number of gradient iterations')
            axes[1].legend()

            # clear output window and diplay updated figure
            display.clear_output(wait=True)
            display.display(plt.gcf())
            print("Training epoch {}, iteration {} of {} ({} %), loss={}".format(e, i, len(train_loader), 100*i//len(train_loader), train_losses[-1]))

    val_iter = iter(val_loader)
    network.eval()
    for i in range(len(val_loader)):
        batch_cpu = next(val_iter)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)
        pred_cpu = dict_to_device(pred, 'cpu')

        # calculate the loss
        with torch.no_grad():
            loss = nn.CrossEntropyLoss()(pred['label'], batch_gpu['label'])
            val_losses.append(loss.item())

        if i%10==0:
            axes[2].cla()

            # plot the validation error on a log plot
            axes[2].plot(val_losses, label='loss')
            axes[2].set_yscale('log')
            axes[2].set_title('Validation loss')
            axes[2].set_xlabel('number of gradient iterations')
            axes[2].legend()

            # clear output window and diplay updated figure
            display.clear_output(wait=True)
            display.display(plt.gcf())
            print("Validation epoch {}, iteration {} of {} ({} %), loss={}".format(e, i, len(val_loader), 100*i//len(val_loader), val_losses[-1]))
plt.close('all')

### ToDo: Evaluation

Here we want to report the accuracy of the network's predictions. We have defined a function named 'get_accuracy' that returns the accuracy of the network on its input data.

Here, we assumed that the network returns class probabilities as output. If your network returns class indices, you need to change this evaluation function as well.

Print the accuracy of your network for the train, validation, and test sets.

In [None]:
def get_accuracy(network, data_loader):
    network.eval()
    iterator = iter(data_loader)
    correct = 0
    total = 0
    for i in range(len(iterator)):
        batch_cpu = next(iterator)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)['label'].argmax(dim=1, keepdim=True)
        correct += pred.eq(batch_gpu['label'].view_as(pred)).sum().item()
        total += pred.shape[0]

    return correct / total

In [None]:
# ToDo: report accuracy on train, validation, and test sets


In [None]:
# ToDo: Plot the input images and output of your network for a few samples in the test set


## Q2

### Dataset and Dataloader

In [None]:
# Use standard MNIST dataset
class AnomalyDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
        self.digits = {0:[], 1:[], 2:[], 3:[], 4:[]}
        for idx in range(len(self.data)):
            digit = self.data[idx][1]
            if digit in self.digits.keys():
                self.digits[digit].append(idx)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        norm, anomaly = random.sample(self.digits.keys(), k=2)
        anomaly_loc, = random.sample([0, 1, 2], k=1)
        norm1_idx, norm2_idx = random.sample(self.digits[norm], k=2)
        anomaly_idx, = random.sample(self.digits[anomaly], k=1)
        if anomaly_loc == 0:
            img0 = self.data[anomaly_idx][0]
            img1 = self.data[norm1_idx][0]
            img2 = self.data[norm2_idx][0]
        elif anomaly_loc == 1:
            img0 = self.data[norm1_idx][0]
            img1 = self.data[anomaly_idx][0]
            img2 = self.data[norm2_idx][0]
        elif anomaly_loc == 2:
            img0 = self.data[norm1_idx][0]
            img1 = self.data[norm2_idx][0]
            img2 = self.data[anomaly_idx][0]

        return {'img0': img0, 'img1': img1, 'img2': img2, 'index': anomaly_loc}

dataset = AnomalyDataset(torchvision.datasets.MNIST(
    root = './',
    train = True,
    download = True,
    transform = transforms.Compose([
        transforms.ToTensor(),
        gray_to_color,
    ])
))

loader = torch.utils.data.DataLoader(dataset, batch_size=128, num_workers=0)

In [None]:
print('Selected digits and their frequencies:')
for key, value in loader.dataset.digits.items():
    print(key, len(value))

### ToDo: train, validation, test split

Use 20% of the training data as the validation set.

The standard MNIST dataset has a test set, and you can download it similar to downloading the train set, only by setting the train label to 'False'. Use the test set only for final the evaluation.

In [None]:
# ToDo: load the test set


In [None]:
# ToDo: define the dataloaders
train_loader = None
val_loader = None
test_loader = None

In [None]:
# ToDo: print the size of train, validation, and test sets


### ToDo: Network

Implement a neural network that takes three images as input and returns the index of the image with the different digit.

Your network should take a python dictionary as input and extract the input images from it. Your network should return a python dictionaly containing a key named 'index'.

In [None]:
# ToDo: Implement your neural network from scratch
class MyNetwork(nn.Module):
    def __init__(self):
        super(MyNetwork, self).__init__()

        # ToDo: code here


    def forward(self, input_dict):
        img0 = input_dict['img0']
        img1 = input_dict['img1']
        img2 = input_dict['img2']

        # ToDo: code here

        # Hint: use padding in conv layers to adjust the dimensions
        # Hint: max-pooling and ReLU layers can be useful
        # Hint: choose a suitable activation function for the last layer


        return {'index': pred_index}

network = MyNetwork().to('cuda')
print(network)

### ToDo: Training

In [None]:
# ToDo: define a suitable loss function
def my_loss_function(predicted_index, target_index):

  # code here

  return loss

In [None]:
# training loop
%matplotlib inline
from IPython import display

# prepare plotting
fig = plt.figure(figsize=(20, 5), dpi= 80, facecolor='w', edgecolor='k')
axes = fig.subplots(1,3)

optimizer = torch.optim.Adam(network.parameters(), lr=0.001)

num_epochs = 5
train_losses = []
val_losses = []

for e in range(num_epochs):
    train_iter = iter(train_loader)
    network.train()
    for i in range(len(train_loader)):
        batch_cpu = next(train_iter)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)
        pred_cpu = dict_to_device(pred, 'cpu')

        # calculate the loss and backward the gradient
        loss = my_loss_function(pred['index'], batch_gpu['index'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

        if i%100==0:
            axes[0].cla()
            axes[1].cla()

            # plot some sample image inputs
            plot_images(torch.cat((batch_cpu['img0'][0:1], batch_cpu['img1'][0:1], batch_cpu['img2'][0:1]), 0), ax=axes[0])
            axes[0].legend()
            axes[0].set_title('sample input')

            # plot the training error on a log plot
            axes[1].plot(train_losses, label='loss')
            axes[1].set_yscale('log')
            axes[1].set_title('Training loss')
            axes[1].set_xlabel('number of gradient iterations')
            axes[1].legend()

            # clear output window and diplay updated figure
            display.clear_output(wait=True)
            display.display(plt.gcf())
            print("Training epoch {}, iteration {} of {} ({} %), loss={}".format(e, i, len(train_loader), 100*i//len(train_loader), train_losses[-1]))

    val_iter = iter(val_loader)
    network.eval()
    for i in range(len(val_loader)):
        batch_cpu = next(val_iter)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)
        pred_cpu = dict_to_device(pred, 'cpu')

        # calculate the loss
        with torch.no_grad():
            loss = my_loss_function(pred['index'], batch_gpu['index'])
            val_losses.append(loss.item())

        if i%10==0:
            axes[2].cla()

            # plot the validation error on a log plot
            axes[2].plot(val_losses, label='loss')
            axes[2].set_yscale('log')
            axes[2].set_title('Validation loss')
            axes[2].set_xlabel('number of gradient iterations')
            axes[2].legend()

            # clear output window and diplay updated figure
            display.clear_output(wait=True)
            display.display(plt.gcf())
            print("Validation epoch {}, iteration {} of {} ({} %), loss={}".format(e, i, len(val_loader), 100*i//len(val_loader), val_losses[-1]))
plt.close('all')

### ToDo: Evaluation

Here we want to report the accuracy of the network's predictions. We have defined a function named 'get_accuracy' that returns the accuracy of the network on its input data.

Here, we assumed that the network returns class probabilities as output. If your network returns class indices, you need to change this evaluation function as well.

Print the accuracy of your network for the train, validation, and test sets.

In [None]:
def get_accuracy(network, data_loader):
    network.eval()
    iterator = iter(data_loader)
    correct = 0
    total = 0
    for i in range(len(iterator)):
        batch_cpu = next(iterator)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)['index'].argmax(dim=1, keepdim=True)
        correct += pred.eq(batch_gpu['index'].view_as(pred)).sum().item()
        total += pred.shape[0]

    return correct / total

In [None]:
# ToDo: report accuracy on train, validation, and test sets


In [None]:
# ToDo: Plot the input images and output of your network for a few samples in the test set


## Q3

### Load and Tokenize Data

In [None]:
data = pd.read_csv('./imdb_processed.csv')
data.head()

In [None]:
# obtain list of words
words = ' '.join(data.processed.values).split()

# check our list
words[30:40]

In [None]:
# build vocabulary
counter = Counter(words)
vocab = sorted(counter, key=counter.get, reverse=True)
token2word = dict(enumerate(vocab, 1))
token2word[0] = '<PAD>'
word2token = {word: id for id, word in token2word.items()}

In [None]:
# tokenize reviews
reviews = data.processed.values
reviews_tokenized = [[word2token[word] for word in review.split()] for review in tqdm(reviews)]

# padding sequences
def pad_features(reviews, pad_id, seq_length):
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)
    for i, row in enumerate(reviews):
        features[i, :len(row)] = np.array(row)[:seq_length]

    return features

features = pad_features(reviews_tokenized, pad_id=word2token['<PAD>'], seq_length=256)

print('number of reviews:', len(reviews_tokenized))
print('seq_length:', len(features[0]))

# print first-5 words of first 3 reviews
print('\n first-five words of the first-three reviews:')
print('===============')
features[:3, :5]

### Dataloader

In [None]:
# we use 70%, 15%, 15% for train, validation, and test sets
train_size = .7
val_size = .5
labels = data.label.to_numpy()

# make train set
split_id = int(len(features) * train_size)
train_x, remain_x = features[:split_id], features[split_id:]
train_y, remain_y = labels[:split_id], labels[split_id:]

# make val and test set
split_val_id = int(len(remain_x) * val_size)
val_x, test_x = remain_x[:split_val_id], remain_x[split_val_id:]
val_y, test_y = remain_y[:split_val_id], remain_y[split_val_id:]

# print out the shape
print('Train set: {}'.format(train_x.shape))
print('Validation set: {}'.format(val_x.shape))
print('Test set: {}'.format(test_x.shape))

In [None]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, reviews, labels):
        self.data = torch.utils.data.TensorDataset(torch.from_numpy(reviews), torch.from_numpy(labels).float())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        review = self.data[index][0]
        label = self.data[index][1]

        return {'review': review, 'label': label}


train_set = MyDataset(train_x, train_y)
val_set = MyDataset(val_x, val_y)
test_set = MyDataset(test_x, test_y)

train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, batch_size=128)
val_loader = torch.utils.data.DataLoader(val_set, shuffle=True, batch_size=128)
test_loader = torch.utils.data.DataLoader(test_set, shuffle=True, batch_size=128)

### ToDo: Network

Implement a neural network based on Long short-term memory (LSTM) that takes a series of words (a review) as input and returns the label of the review indicating whether it is positive (1) or negative (0).

Your network should take a python dictionary as input and extract the input sentences from it. Your network should return a python dictionaly containing a key named 'label'.

In [None]:
# ToDo: Implement your neural network
class SentimentLSTM(nn.Module):
    def __init__(self):
        super(SentimentLSTM, self).__init__()

        # ToDo: code here

        # Hint: start with an embedding layer
        # Hint: using dropout might be useful
        # Hint: choose a suitable activation function for the last layer


    def forward(self, input_dict):
        x = input_dict['review']

        # ToDo: code here

        return {'label': pred_label}

network = SentimentLSTM().to('cuda')
print(network)

### ToDo: Training

In [None]:
# ToDo: define a suitable loss function
def my_loss_function(predicted_label, target_label):

  # code here

  return loss

In [None]:
# training loop
%matplotlib inline
from IPython import display

# prepare plotting
fig = plt.figure(figsize=(20, 5), dpi= 80, facecolor='w', edgecolor='k')
axes = fig.subplots(1,2)

optimizer = torch.optim.Adam(network.parameters(), lr=0.001)

num_epochs = 10
train_losses = []
val_losses = []

for e in range(num_epochs):
    train_iter = iter(train_loader)
    network.train()
    for i in range(len(train_loader)):
        batch_cpu = next(train_iter)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)
        pred_cpu = dict_to_device(pred, 'cpu')

        # calculate the loss and backward the gradient
        loss = my_loss_function(pred['label'], batch_gpu['label'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

        if i%100==0:
            axes[0].cla()

            # plot the training error on a log plot
            axes[0].plot(train_losses, label='loss')
            axes[0].set_yscale('log')
            axes[0].set_title('Training loss')
            axes[0].set_xlabel('number of gradient iterations')
            axes[0].legend()

            # clear output window and diplay updated figure
            display.clear_output(wait=True)
            display.display(plt.gcf())
            print("Training epoch {}, iteration {} of {} ({} %), loss={}".format(e, i, len(train_loader), 100*i//len(train_loader), train_losses[-1]))

    val_iter = iter(val_loader)
    network.eval()
    for i in range(len(val_loader)):
        batch_cpu = next(val_iter)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)
        pred_cpu = dict_to_device(pred, 'cpu')

        # calculate the loss
        with torch.no_grad():
            loss = my_loss_function(pred['label'], batch_gpu['label'])
            val_losses.append(loss.item())

        if i%10==0:
            axes[1].cla()

            # plot the validation error on a log plot
            axes[1].plot(val_losses, label='loss')
            axes[1].set_yscale('log')
            axes[1].set_title('Validation loss')
            axes[1].set_xlabel('number of gradient iterations')
            axes[1].legend()

            # clear output window and diplay updated figure
            display.clear_output(wait=True)
            display.display(plt.gcf())
            print("Validation epoch {}, iteration {} of {} ({} %), loss={}".format(e, i, len(val_loader), 100*i//len(val_loader), val_losses[-1]))
plt.close('all')

### ToDo: Evaluation

Here we want to report the accuracy of the network's predictions. We have defined a function named 'get_accuracy' that returns the accuracy of the network on its input data.

Here, we assumed that the network returns class probabilities as output. If your network returns class indices, you might need to change this evaluation function as well.

Print the accuracy of your network for the train, validation, and test sets.

In [None]:
def get_accuracy(network, data_loader):
    network.eval()
    iterator = iter(data_loader)
    correct = 0
    total = 0
    for i in range(len(iterator)):
        batch_cpu = next(iterator)
        batch_gpu = dict_to_device(batch_cpu, 'cuda')
        pred = network(batch_gpu)['label']
        binary_pred = torch.where(pred < 0.5, 0, 1).squeeze(-1)
        correct += binary_pred.eq(batch_gpu['label']).sum().item()
        total += binary_pred.shape[0]

    return correct / total

In [None]:
# ToDo: report accuracy on train, validation, and test sets


In [None]:
# ToDo: print the input review and output of your network for one positive
#       example and one negative example in the test set
