In [1]:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import re
import matplotlib.pyplot as plt
import random
from sklearn import metrics
from pdb import set_trace as st
import spacy
import pandas as pd
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
train = pd.read_csv('train.csv')
X_train = train['Text']
y_train = train['Verdict']
train_set = list(zip(
    X_train, 
    map(
        lambda y: y+1,
        y_train
    )
))

random.shuffle(train_set)

val_set = train_set[:2000]
train_set = train_set[2000:]


In [3]:
train_set_label_counts = [0, 0, 0]

for datum in train_set:
    train_set_label_counts[datum[1]] += 1

class_weights = np.array([10000, 10000, 10000]) / train_set_label_counts
class_weights = torch.FloatTensor(class_weights)


In [4]:
class NeuralNet(nn.Module):
    def __init__(self, input_dims):
        super(NeuralNet, self).__init__()
        self.nlp = spacy.load("en_core_web_lg")
        self.layer1 = nn.Linear(input_dims, 10)
        self.layer2 = nn.Linear(10, 3)

    def forward(self, x):
        x = self.process_strings(x)
        x = self.layer1(x)
        x = nn.LeakyReLU()(x)
        x = self.layer2(x)
        x = nn.Softmax(dim=3)(x)
        return x[0][0]
    
    def process_strings(self, x):
        return torch.FloatTensor([[list(map(
            lambda string: self.nlp(str(string)).vector, # extra typecast to str to avoid np.str_ problems
            x
        ))]])


In [5]:
def validate(model, val_set, criterion):
    val_x, val_y = np.transpose(val_set)
    val_y = list(map(int, val_y))
    val_outputs = neural_net(val_x)
    loss = criterion(val_outputs, torch.LongTensor(val_y))
    print(f'CE Loss: {loss.item()}')
    val_output_labels = list(map(
        lambda v_o: np.argmax(v_o.detach().numpy()),
        val_outputs
    ))
    macro_f1 = metrics.f1_score(val_y, val_output_labels, average='macro')
    print(f'Macro F1: {macro_f1}')
    return macro_f1

In [6]:
FORCE_OVERFIT = False

if FORCE_OVERFIT:
    so_called_train_set = train_set[:4]
    epochs = 100
else:
    so_called_train_set = train_set
    epochs = 3

neural_net = NeuralNet(300)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(neural_net.parameters(), lr=0.001, weight_decay=0.00001) # weight_decay also has L2 normalization
train_loader = torch.utils.data.DataLoader(so_called_train_set, batch_size=4, shuffle=True, num_workers=4)

for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        x, labels = data
        optimizer.zero_grad()
        outputs = neural_net(x)
        loss = criterion(outputs, torch.LongTensor(labels))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
    print(f'Epoch {epoch}')
    validate(neural_net, val_set, criterion)

[1,  2000] loss: 0.933
[1,  4000] loss: 0.870
Epoch 0
CE Loss: 0.9077911376953125
Macro F1: 0.6315272633466836
[2,  2000] loss: 0.857
[2,  4000] loss: 0.854
Epoch 1
CE Loss: 0.8828085064888
Macro F1: 0.6417817525921833
[3,  2000] loss: 0.843
[3,  4000] loss: 0.842
Epoch 2
CE Loss: 0.9084600806236267
Macro F1: 0.6414550634893191


In [7]:
test_set = pd.read_csv('test.csv')
test_x = test_set['Text']
test_outputs = neural_net(test_x)
test_output_labels = list(map(
    lambda distrib: np.argmax(distrib.detach().numpy()) - 1,
    test_outputs
))
test_set['Verdict'] = pd.Series(test_output_labels)
test_set.drop(columns=['Text'], inplace=True)
test_set.to_csv('A0184415E.csv', index=False)