In [None]:
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
import gensim.downloader as api
from gensim.models import KeyedVectors
from tqdm import tqdm
import random

In [None]:
# Making the runs deterministic: seed every random number generator in use
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

# Download necessary components
# stored in ~/nltk_data
# "punkt_tab" is the tokenizer resource that nltk.word_tokenize loads in recent
# NLTK releases; "punkt" is kept so that older releases keep working as well.
nltk.download(["stopwords", "punkt", "punkt_tab"])
# ~13 mb stored in ~/gensim-data
data = api.load("20-newsgroups")
# ~1.6 gb, this takes some time!
embedding = api.load("word2vec-google-news-300")

# Defining the Dataset

In [None]:
class NewsgroupsDataset(Dataset):
    """20 Newsgroups dataset that represents every document by one averaged word vector.

    Each document is tokenised, stopwords and out-of-vocabulary tokens are
    dropped, and the embedding vectors of the remaining tokens are averaged.
    """

    def __init__(self, data: list, labels: dict, embedding: KeyedVectors):
        """
        data: list of documents; each document is indexed with the keys
            "data" (raw text) and "topic" (label name)
        labels: mapping from topic name to class index
        embedding: word vectors used to embed the tokens
        """
        super().__init__()
        self.data = data
        self.labels = labels
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.embedding = embedding
        self.embedding_dim = self.embedding.vector_size

    def __len__(self):
        """Returns the size of the dataset"""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Returns a data point (feature tensor and label tensor) given an index"""
        text = self.data[idx]["data"]
        text = self.preprocess(text)
        text = torch.from_numpy(text).float()  # network inputs need to be float

        label = self.data[idx]["topic"]
        label = self.labels[label]
        label = torch.tensor(label).long()  # label is not a continuous value but a class index

        return text, label

    def preprocess(self, text):
        """Processes raw text into neural network input.

        Returns a float32 numpy array of length ``embedding_dim`` holding the
        mean embedding of all tokens that are neither stopwords nor missing
        from the embedding vocabulary. If no token survives the filter, the
        zero vector is returned instead of dividing by zero (which would yield
        a NaN vector and poison the loss and the gradients).
        """
        tokens = nltk.word_tokenize(text)
        count = 0

        features = np.zeros(self.embedding_dim, dtype=np.float32)
        for token in tokens:
            # continue if token is stopword or missing in vocabulary
            if token in self.stopwords or token not in self.embedding:
                continue

            count += 1
            features += self.embedding[token]

        if count == 0:
            # empty document or nothing usable: keep the all-zero vector
            return features

        return features / count

# Defining the Model structure

In [None]:
class NewsgroupsModel(nn.Module):
    """Fully connected classifier for 20 Newsgroups.

    Two hidden layers (2048 and 256 units) with ReLU activations, followed by
    a linear output layer that produces one logit per class (20 classes).
    """

    def __init__(self, input_size=300):
        super().__init__()

        # layer widths
        self.input_size = input_size
        self.hidden_1_size = 2048
        self.hidden_2_size = 256
        self.num_classes = 20

        # nn.Linear bundles a weight matrix and a bias vector; the layers are
        # created from input to output
        self.fc1 = nn.Linear(in_features=self.input_size, out_features=self.hidden_1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=self.hidden_1_size, out_features=self.hidden_2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(in_features=self.hidden_2_size, out_features=self.num_classes)

        # Xavier-uniform initialisation of every weight matrix, first to last
        for linear in (self.fc1, self.fc2, self.fc3):
            nn.init.xavier_uniform_(linear.weight)

    def forward(self, x):
        """Maps a batch of input vectors to a batch of class logits."""
        hidden_1 = self.relu1(self.fc1(x))
        hidden_2 = self.relu2(self.fc2(hidden_1))
        # no softmax here: the model returns raw logits
        logits = self.fc3(hidden_2)
        return logits

In [None]:
class NewsgroupsModelDropout(nn.Module):
    """Feedforward Neural Network for 20 Newsgroups with dropout on both hidden layers.

    Same architecture as the plain feedforward model (2048 and 256 hidden
    units, 20 output logits); in addition, dropout is applied to the output of
    the first and the second linear layer.

    input_size: dimensionality of the input vectors
    dropout_p: probability of dropping a unit (default 0.1)
    """

    def __init__(self, input_size=300, dropout_p=0.1):
        super().__init__()

        self.input_size = input_size
        # alias kept for backward compatibility with code reading `.input`
        self.input = input_size
        self.hidden_1_size = 2048
        self.hidden_2_size = 256
        self.num_classes = 20

        # nn.Linear is a feedforward layer, i.e. it holds a weight matrix and a bias vector
        self.fc1 = nn.Linear(self.input_size, self.hidden_1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(self.hidden_1_size, self.hidden_2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(self.hidden_2_size, self.num_classes)

        # One dropout module shared by both hidden layers (it has no parameters)
        self.dropout = nn.Dropout(p=dropout_p)

        # weight initialisation
        torch.nn.init.xavier_uniform_(self.fc1.weight)
        torch.nn.init.xavier_uniform_(self.fc2.weight)
        torch.nn.init.xavier_uniform_(self.fc3.weight)

    def forward(self, x):
        """Maps a batch of input vectors to a batch of class logits."""
        # dropout is applied to the pre-activations of each hidden layer
        x = self.dropout(self.fc1(x))
        x = self.relu1(x)
        x = self.dropout(self.fc2(x))
        x = self.relu2(x)
        x = self.fc3(x)  # => logits

        # no softmax here: the model returns raw logits

        return x

In [None]:
class NewsgroupsModelLowLevel(nn.Module):
    """Feedforward network for 20 Newsgroups built from raw weight and bias tensors.

    The three affine layers are written out by hand as ``x @ W + b`` with
    explicitly registered parameters instead of relying on ``nn.Linear``.
    """

    def __init__(self, input_size=300):
        super().__init__()

        # layer widths
        self.input_size = input_size
        self.hidden_1_size = 2048
        self.hidden_2_size = 256
        self.num_classes = 20

        # nn.Parameter registers the tensor with the module and marks it as
        # trainable; all values are drawn from a standard normal distribution
        self.W1 = nn.Parameter(torch.randn(self.input_size, self.hidden_1_size))
        self.b1 = nn.Parameter(torch.randn(1, self.hidden_1_size))
        self.relu1 = nn.ReLU()
        self.W2 = nn.Parameter(torch.randn(self.hidden_1_size, self.hidden_2_size))
        self.b2 = nn.Parameter(torch.randn(1, self.hidden_2_size))
        self.relu2 = nn.ReLU()
        self.W3 = nn.Parameter(torch.randn(self.hidden_2_size, self.num_classes))
        self.b3 = nn.Parameter(torch.randn(1, self.num_classes))

    def forward(self, x):
        """Maps a batch of input vectors to a batch of class logits."""
        # first hidden layer: affine map followed by ReLU
        hidden_1 = self.relu1(torch.matmul(x, self.W1) + self.b1)
        # second hidden layer
        hidden_2 = self.relu2(torch.matmul(hidden_1, self.W2) + self.b2)
        # output layer; no softmax, the raw logits are returned
        logits = torch.matmul(hidden_2, self.W3) + self.b3
        return logits

# Loading the train and test sets

In [None]:
train_data, test_data = [], []
topics = set()

# split data into train and test set, collecting every topic name on the way
for document in data:
    topics.add(document["topic"])
    if document["set"] == "train":
        train_data.append(document)
    else:
        test_data.append(document)

# assign indices to labels
# The topics are sorted first: the iteration order of a set of strings is not
# stable between interpreter runs, so enumerating the raw set would give a
# different topic -> index mapping on every kernel restart.
labels = {label: index for index, label in enumerate(sorted(topics))}

train_dataset = NewsgroupsDataset(train_data, labels, embedding)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=2)

test_dataset = NewsgroupsDataset(test_data, labels, embedding)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=2)

# Initialising the model

In [None]:
# Pick the architecture to train: exactly one of these lines should be active.
# The commented-out lines are the alternative models defined in this notebook.
model = NewsgroupsModel()
# model = NewsgroupsModelDropout()
# model = NewsgroupsModelLowLevel()

# Defining the loss function and optimisation algorithm

In [None]:
# Loss: cross-entropy, computed directly on the logits returned by the model
criterion = nn.CrossEntropyLoss()
# Optimiser: Adam over all parameters of the model
optimiser = optim.Adam(params=model.parameters(), lr=1e-3)
# Alternative with weight decay:
# optimiser = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.05)

# Metrics to check the performance

We want to check how the model performs on the train and test datasets during and after training.
Therefore, we build a small helper that calculates the accuracy of the network's predictions.

Because the data is processed in batches, the helper accumulates the counts over all batches before computing the overall accuracy.

In [None]:
class Accuracy:
    """A class to keep track of the accuracy across batches.

    Call ``update`` once per batch, ``compute`` to read the accuracy over
    everything seen so far, and ``reset`` before starting a new measurement.
    """

    def __init__(self):
        self.correct = 0  # number of correctly classified samples so far
        self.total = 0    # number of samples seen so far

    def reset(self):
        """Resets the internal state"""
        self.correct = 0
        self.total = 0

    def update(self, output, labels):
        """
        Updates the internal state to later compute the overall accuracy

        output: the output of the network for a batch, one row of class
            scores per sample
        labels: the target labels (class indices), one per sample
        """
        # index of the highest score per row = predicted class index/label
        predicted = output.argmax(dim=1)

        self.total += labels.size(0)
        self.correct += (predicted == labels).sum().item()  # .item() gets the number, not the tensor

    def compute(self):
        """Returns the fraction of correct predictions (0.0 if nothing has been seen yet)"""
        if self.total == 0:
            # avoid a ZeroDivisionError when compute() is called before update()
            return 0.0
        return self.correct / self.total


accuracy = Accuracy()

# Training loop

We loop over the training dataset multiple times (every full iteration is called an *epoch*).
For every batch in the dataset, we calculate the loss of the network output, calculate the gradients by using Autograd's automatic gradient calculation, and update the network parameters using the Adam optimiser we initialised before.

In [None]:
# make sure the model is in training mode
model.train()

for epoch in range(10):  # loop over the dataset multiple times
    print("Starting epoch {}".format(epoch+1))

    total = 0           # number of samples seen in this epoch
    running_loss = 0.0  # sum of the per-sample losses in this epoch

    # to make a beautiful progress bar
    loader = tqdm(train_loader)
    # The batch is unpacked into names that do not overwrite the notebook-level
    # `data` (the raw dataset) and `labels` (the topic -> index mapping).
    for inputs, targets in loader:
        # zero the parameter gradients (else, they are accumulated)
        optimiser.zero_grad()

        # forward the data through the network
        outputs = model(inputs)
        # calculate the loss given the output of the network and the target labels
        loss = criterion(outputs, targets)
        # calculate the gradients of the network w.r.t. its parameters
        loss.backward()
        # Let the optimiser take an optimization step using the calculated gradients
        optimiser.step()

        # `loss` is the mean over the batch, so it is weighted by the batch
        # size before accumulating; .item() extracts a plain Python float so
        # that no tensors are accumulated across iterations.
        batch_size = outputs.size(0)
        running_loss += loss.item() * batch_size
        total += batch_size

        # average loss per sample over the epoch so far
        loader.set_description("loss: {:.5f}".format(running_loss / total))

print("Finished Training")

# Testing

We can now use the test set to run inference with our model.
We can output the resulting predictions or use them to test how well our model generalizes.

In [None]:
# Switch the model to evaluation mode before measuring accuracy
# (this turns dropout off when the dropout variant of the model is used).
model.eval()

## Training Accuracy

In [None]:
accuracy.reset()

# No gradients are needed for evaluation, so autograd's bookkeeping is
# switched off for the forward passes below.
with torch.no_grad():
    # The batch is unpacked into names that do not overwrite the notebook-level
    # `data` (the raw dataset) and `labels` (the topic -> index mapping).
    for inputs, targets in tqdm(train_loader):
        # forward the data through the network
        outputs = model(inputs)

        accuracy.update(outputs, targets)

print("Accuracy: {:.2f}%".format(100 * accuracy.compute()))

## Test Accuracy

In [None]:
accuracy.reset()

# No gradients are needed for evaluation, so autograd's bookkeeping is
# switched off for the forward passes below.
with torch.no_grad():
    # The batch is unpacked into names that do not overwrite the notebook-level
    # `data` (the raw dataset) and `labels` (the topic -> index mapping).
    for inputs, targets in tqdm(test_loader):  # now the test_loader
        # forward the data through the network
        outputs = model(inputs)

        accuracy.update(outputs, targets)

print("Accuracy: {:.2f}%".format(100 * accuracy.compute()))