# MLP for Text Classification

Multilayer Perceptron is the most traditional types of deep learning, the architecture is simple: every node in previous layer is connected to every node in the next layer.

Each perceptron unit has an input (x), an output (y), and three “knobs”: a set of weights (w), a bias (b), and an activation function (f). The weights and the bias are learned from the data, and the activation function is handpicked depending on the network designer’s intuition of the network and its target outputs. Mathematically, we can express this as follows:

y = f ( wx + b )

Essentially, a perceptron is a composition of a linear and a nonlinear function. The linear expression wx+b is also known as an affine transform.

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [105]:
class MultilayerPerceptron(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Args:
            input_dim (int): the size of the input vectors
            hidden_dim (int): the output size of the first Linear layer
            output_dim (int): the output size of the second Linear layer
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the MLP
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (batch, input_dim)
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, output_dim)
        """
        intermediate = F.relu(self.fc1(x_in))
        output = self.fc2(intermediate)
        
        if apply_softmax:
            output = F.softmax(output, dim=1)
        return output

## Data process

In [2]:
def parse_line(line):
    """
    Parse one line in the file
    return: set of words, class
    """
    l = line.strip().split()
    tag = l[0]
    word_set = set([w.split(':')[0] for w in l[1:]])
    return word_set, tag

def parse_file(filename):
    """
    Parse file
    return: X - word set list, y - class list
    """
    X, y = [], []
    with open(filename, 'r') as f:
        for line in f.readlines():
            word_set, target = parse_line(line)
            X.append(word_set)
            y.append(target)
    return X, y

In [3]:
training_data = 'data/train.txt'
test_data = 'data/test.txt'

X_train, y_train = parse_file(training_data)
X_test, y_test = parse_file(test_data)

In [18]:
vocab = set.union(*X_train)
vocab.add('<UNK>')
id_to_token = {idx: token for idx, token in enumerate(sorted(vocab))}
token_to_id = {token: idx for idx, token in id_to_token.items()}
assert len(vocab) == len(id_to_token) == len(token_to_id)

In [19]:
id_to_label = {idx: label for idx, label in enumerate(sorted(set(y_train)))}
label_to_id = {label: idx for idx, label in id_to_label.items()}
assert len(id_to_label) == len(label_to_id)

## One-hot encoding

In [21]:
def vectorize(convert_dict, X):
    one_hot_matrix_size = (len(X), len(convert_dict))
    one_hot_matrix = np.zeros(one_hot_matrix_size, dtype=np.float32)
                               
    for position_index, news in enumerate(X):
        for word in news:
            word_index = convert_dict.get(word, convert_dict['<UNK>'])
            one_hot_matrix[position_index][word_index] = 1
        
    return one_hot_matrix

In [22]:
X_train_onehot = vectorize(token_to_id, X_train)
X_test_onehot = vectorize(token_to_id, X_test)

In [29]:
y_train_cat = [label_to_id[y] for y in y_train]
y_test_cat = [label_to_id[y] for y in y_test]

In [28]:
class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return {'news': self.X[idx], 'label': self.y[idx]}

In [84]:
news_dataset = NewsDataset(X_train_onehot, y_train_cat)
news_dataloader = DataLoader(news_dataset, batch_size=32, shuffle=True)

In [56]:
for d in news_dataloader:
    print(d['news'].shape)
    print(d['label'].shape)
    break

torch.Size([32, 32847])
torch.Size([32])


In [106]:
batch_size = 32
input_dim = len(vocab)
hidden_dim = 500
output_dim = 3

mlp = MultilayerPerceptron(input_dim, hidden_dim, output_dim)
print(mlp)

MultilayerPerceptron(
  (fc1): Linear(in_features=32847, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=3, bias=True)
)


In [107]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.01)

## Training Stage

In [108]:
for _ in range(5):
    for i, data in enumerate(news_dataloader):
        news = data['news']
        label = data['label']
        # Step 1: Clear the gradients
        mlp.zero_grad()
        optimizer.zero_grad()
        # Step 2: Compute the forward pass of the model
        output = mlp(news, apply_softmax=False)
        # Step 3: Compute the loss value that we wish to optimize
        loss = criterion(output, label)
        # Step 4: Propagate the loss signal backward
        loss.backward()
        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()

## Evaluation

In [109]:
loss = 0
acc = 0
news_dataset_eval = NewsDataset(X_test_onehot, y_test_cat)
news_dataloader_eval = DataLoader(news_dataset_eval, batch_size=32, shuffle=True)
for data in news_dataloader_eval:
    news = data['news']
    label = data['label']
    with torch.no_grad():
        output = mlp(news, apply_softmax=False)
        loss = criterion(output, label)
        loss += loss.item()
        acc += (output.argmax(1) == label).sum().item()

print('avg loss', loss / len(news_dataset_eval))
print('accuracy', acc / len(news_dataset_eval))

avg loss tensor(0.0034)
accuracy 0.8733333333333333
