In [3]:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import nltk.corpus
import re
import matplotlib.pyplot as plt
import random
from sklearn import metrics
from pdb import set_trace as st
import spacy
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:

nlp = spacy.load("en_core_web_lg")  # make sure to use larger package!
stopwords = nltk.corpus.stopwords.words('english')
'''
stopwords = stopwords + [
    "n't",
    'not',
    'mr',
    'mr.'
]
'''

'\nstopwords = stopwords + [\n    "n\'t",\n    \'not\',\n    \'mr\',\n    \'mr.\'\n]\n'

In [5]:
import csv

with open('train.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f, dialect='excel')
    train_set = list(reader)

train_set = list(map(
    lambda x: [int(x[0]), x[1], int(x[2])],
    train_set[1:]
))

random.shuffle(train_set)

for datum in train_set:
    if datum[2] == -1:
        datum.append(np.array([1,0,0]))
    elif datum[2] == 0:
        datum.append(np.array([0,1,0]))
    else:
        datum.append(np.array([0,0,1]))

val_set = train_set[:2000]
train_set = train_set[2000:]

with open('test.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f, dialect='excel')
    test_set = list(reader)

test_set = list(map(
    lambda x: (int(x[0]), x[1]),
    test_set[1:]
))

In [6]:
train_set_label_counts = np.sum(
    np.array(list(map(
        lambda x: x[3],
        train_set
    ))),
    axis=0
)
class_weights = np.array([10000, 10000, 10000]) / train_set_label_counts
class_weights = torch.FloatTensor(class_weights)


In [7]:
# TODO: dropout?

class NeuralNet(nn.Module):
    def __init__(self, input_dims):
        super(NeuralNet, self).__init__()
        self.layer1 = nn.Linear(input_dims, 10)
        self.layer2 = nn.Linear(10, 3)

    # Called on each input
    # Computes the outputs (and next hidden state)
    def forward(self, x):
        x = self.layer1(x)
        x = nn.LeakyReLU()(x)
        x = self.layer2(x)
        x = nn.Softmax(dim=3)(x)
        return x[0][0]


In [8]:
neural_net = NeuralNet(300)
neural_net(torch.Tensor([[[nlp('president think say').vector]]]))

tensor([[0.2782, 0.3745, 0.3473]], grad_fn=<SelectBackward>)

In [9]:
neural_net(torch.Tensor([[[nlp(train_set[0][1]).vector]]])), train_set[0][2]

(tensor([[0.2681, 0.3854, 0.3465]], grad_fn=<SelectBackward>), 0)

In [None]:
FORCE_OVERFIT = False

if FORCE_OVERFIT:
    so_called_train_set = train_set[:4]
    epochs = 100
else:
    so_called_train_set = train_set
    epochs = 50

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(neural_net.parameters(), lr=0.001, weight_decay=0.00001) # weight_decay also has L2 normalization
train_loader = torch.utils.data.DataLoader(so_called_train_set, batch_size=4, shuffle=True, num_workers=4)

f1s = []

for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        _, x, labels, _ = data
        
        x = torch.FloatTensor([[list(map(
            lambda string: nlp(string).vector,
            x
        ))]])
        
        labels = list(map(
            lambda x: x+1,
            labels
        ))
        
        optimizer.zero_grad()
        outputs = neural_net(x)
        loss = criterion(outputs, torch.LongTensor(labels))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
    print(f'Epoch {epoch}')
    f1s.append(validate(neural_net, val_set))

In [None]:
plt.plot(f1s)

In [11]:
criterion = nn.CrossEntropyLoss()

def validate(model, val_set):
    val_x = []
    val_y_labels = []
    for val_datum in val_set:
        val_x.append(val_datum[1])
        val_y_labels.append(val_datum[2]+1)
    val_x = torch.FloatTensor([[list(map(
        lambda string: nlp(string).vector,
        val_x
    ))]])
    val_outputs = neural_net(val_x)
    loss = criterion(val_outputs, torch.LongTensor(val_y_labels))
    print(f'CE Loss: {loss.item()}')
    
    val_output_labels = list(map(
        lambda v_o: np.argmax(v_o.detach().numpy()),
        val_outputs
    ))
    macro_f1 = metrics.f1_score(val_y_labels, val_output_labels, average='macro')
    print(f'Macro F1: {macro_f1}')
    return macro_f1

validate(neural_net, val_set)

CE Loss: 1.1313353776931763
Macro F1: 0.0671462829736211


0.0671462829736211

In [16]:
test_x = list(map(
    lambda x: x[1],
    test_set
))

test_x = torch.FloatTensor([[list(map(
    lambda string: nlp(string).vector,
    test_x
))]])

test_outputs = neural_net(test_x)

test_output_labels = list(map(
    lambda v_o: np.argmax(v_o.detach().numpy()),
    test_outputs
))

In [19]:
import pandas as pd
test = pd.read_csv('test.csv')
test['Verdict'] = pd.Series(val_output_labels)
test.drop(columns=['Text'], inplace=True)
test.to_csv('A0184415E.csv', index=False)