In [1]:
!pip install -r requirements.txt



In [2]:
do_train = False

In [3]:
import torch
import random
import numpy as np
import regex

RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
def split_train_val_test(df, props=[.9, .1]):
    train_df, val_df = None, None
    
    train_size = int(props[0] * len(df))
    val_size =  train_size + int(props[1] * len(df))
    train_df = df.iloc[0:train_size]
    val_df = df.iloc[train_size:]
    return train_df, val_df


In [5]:
import gensim.downloader as api

def download_embeddings(fasttetxt):
    # https://fasttext.cc/docs/en/english-vectors.html
    if fasttetxt:
      wv = api.load("fasttext-wiki-news-subwords-300")
    else:
      
      wv = api.load("word2vec-google-news-300")
      print("\nLoading complete!\n" +
            "Vocabulary size: {}".format(len(wv.vocab)))
    return wv


In [6]:
# Opening and preprocessing input file
import gensim.models
import pandas as pd
import nltk
nltk.download('punkt')
from tqdm import tqdm
from preprocess import clean_text

data = pd.read_pickle('our_train.pkl')
test_df = pd.read_pickle('our_test.pkl')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# to convert authors into numbers
author_to_number = {
    'EAP': 0,
    'HPL': 1,
    'MWS': 2
    
}

# lowercase, removing punctuation and tookenize sentences. Converting labels to int
for i in range(len(data)):
    data['text'].iloc[i] = nltk.word_tokenize(regex.sub(r'[^\w\s]', '',data['text'].iloc[i].lower()))
    data['author'].iloc[i] = author_to_number[data['author'].iloc[i]]
data.sample(frac=1)
for i in range(len(test_df)):
    test_df['text'].iloc[i] = nltk.word_tokenize(regex.sub(r'[^\w\s]', '',test_df['text'].iloc[i].lower()))
    test_df['author'].iloc[i] = author_to_number[test_df['author'].iloc[i]]
test_df.sample(frac=1)
from dataset import *
# Splitting dataset and generating vocab
train_df, val_df = split_train_val_test(data)
train_vocab, reversed_vocab = generate_vocab_map(train_df)
# print(train_df)
val_df.head()
test_df.head()

[nltk_data] Downloading package punkt to /home/andre/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,id,text,author
17613,id08561,"[a, lamp, which, had, been, accidentally, left...",0
17614,id01432,"[i, gave, to, each, heroine, of, whom, i, read...",2
17615,id22037,"[he, got, in, communication, with, dr, houghto...",1
17616,id22330,"[the, trees, of, the, frequent, forest, belts,...",1
17617,id26151,"[i, then, moved, forward, and, a, murmuring, s...",2


In [7]:
# Use downloaded pretrained embeddings or train our own
DOWNLOAD = True
# Use fastext or word2vec
FASTTEXT = True
WINDOW_SIZE = 5

EMBEDDING_DIM = 300
HIDDEN_DIM = 128
NUM_LAYERS = 1
BIDIRECTIONAL = True


In [8]:
# Downloading or generating word2vec embeddings

if DOWNLOAD:
    model = download_embeddings(FASTTEXT)
else:
    if FASTTEXT:
        model = gensim.models.FastText(sentences=train_df['text'], size=EMBEDDING_DIM, window=WINDOW_SIZE)
    else:
        model = gensim.models.Word2Vec(sentences=train_df['text'], size=EMBEDDING_DIM, window=WINDOW_SIZE)
                        

In [9]:
from dataset import HeadlineDataset
from torch.utils.data import RandomSampler

train_dataset = HeadlineDataset(train_vocab, train_df,model.wv, FASTTEXT)
val_dataset = HeadlineDataset(train_vocab, val_df,model.wv, FASTTEXT)
test_dataset = HeadlineDataset(train_vocab, test_df,model.wv, FASTTEXT)

# Pytorch random samplers
train_sampler = RandomSampler(train_dataset)
val_sampler = RandomSampler(val_dataset)
test_sampler = RandomSampler(test_dataset)

  train_dataset = HeadlineDataset(train_vocab, train_df,model.wv, FASTTEXT)
  val_dataset = HeadlineDataset(train_vocab, val_df,model.wv, FASTTEXT)
  test_dataset = HeadlineDataset(train_vocab, test_df,model.wv, FASTTEXT)


In [10]:
from torch.utils.data import DataLoader
from dataset import collate_fn
BATCH_SIZE = 16
# Creating data iterators
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

for x, y in test_iterator:
    print(x,y)
    break

tensor([[[-0.0304,  0.0262,  0.0459,  ..., -0.0693, -0.0009, -0.0220],
         [ 0.0213, -0.0121,  0.0186,  ..., -0.0256, -0.0538, -0.0028],
         [ 0.0096,  0.0141,  0.0412,  ..., -0.0355,  0.0058,  0.0138],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0532, -0.0129,  0.0272,  ...,  0.0447,  0.0646,  0.0044],
         [ 0.0242,  0.0048,  0.0184,  ..., -0.0350,  0.0184,  0.0035],
         [-0.0417, -0.0532,  0.1002,  ..., -0.0111,  0.0370, -0.0189],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0242,  0.0048,  0.0184,  ..., -0.0350,  0.0184,  0.0035],
         [-0.0673, -0.0321,  0.1381,  ...,  0

  tokenized_word_tensor = torch.Tensor(tmp)


### Modeling

In [11]:
from models import ClassificationModel

model = ClassificationModel(len(train_vocab),embedding_dim=EMBEDDING_DIM,hidden_dim = HIDDEN_DIM,num_layers = NUM_LAYERS,bidirectional = BIDIRECTIONAL)

model.to(device)

ClassificationModel(
  (LSTM): LSTM(300, 128, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=256, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
)

In [12]:
from torch.optim import AdamW

criterion, optimizer = torch.nn.CrossEntropyLoss(), torch.optim.Adam(model.parameters(), lr=0.001)

# Testing and Evaluation

In [13]:
# returns the total loss calculated from criterion
def train_loop(model, criterion, iterator):
    model.train()
    total_loss = 0
    
    for x, y in tqdm(iterator):
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()

        prediction = model(x)
        prediction = torch.squeeze(prediction)
        # y = y.round()
        y = y.long()
        

 
        loss = criterion(prediction,y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss

# returns:
# - true: a Python boolean array of all the ground truth values 
#         taken from the dataset iterator
# - pred: a Python boolean array of all model predictions. 
def val_loop(model, criterion, iterator):
    true, pred = [], []
    for x, y in tqdm(iterator):
        x = x.to(device)
        y = y.to(device)
    
        preds = model(x)
        preds.to(device)
        preds = torch.squeeze(preds)
        for i_batch in range(len(y)):
            true.append(y[i_batch])
            pred.append(torch.argmax(preds[i_batch]))
            
    return true, pred


In [14]:
# Initial testing
from sklearn.metrics import f1_score, accuracy_score

from eval_utils import binary_macro_f1, accuracy
true, pred = val_loop(model, criterion, val_iterator)
true = [x.item() for x in true]
pred = [x.item() for x in pred]
print(f1_score(true, pred, average='weighted'))
print(accuracy_score(true, pred))


100%|██████████| 111/111 [00:02<00:00, 48.97it/s]


0.22915720847032175
0.3904653802497162


### Training the model
Do not run this for testing

In [15]:
if do_train:
    TOTAL_EPOCHS = 7
    for epoch in range(TOTAL_EPOCHS):
        train_loss = train_loop(model, criterion, train_iterator)
        true, pred = val_loop(model, criterion, val_iterator)
        true = [x.item() for x in true]
        pred = [x.item() for x in pred]
        print(f"EPOCH: {epoch}")
        print(f"TRAIN LOSS: {train_loss}")
        print(f"VAL F-1: {f1_score(true, pred, average='weighted')}")
        print(f"VAL ACC: {accuracy_score(true, pred)}")
    file = open('downloaded_fasttext.model', 'w+')    
    torch.save(model.state_dict(), f'downloaded_fasttext.model')


In [16]:
# Loading saved model
model.load_state_dict(torch.load('downloaded_fasttext.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [17]:
# Testing results
true, pred = val_loop(model, criterion, test_iterator)
true = [x.item() for x in true]
pred = [x.item() for x in pred]
print(f"TEST F-1: {f1_score(true, pred, average='weighted')}")
print(f"TEST ACC: {accuracy_score(true, pred)}")

100%|██████████| 123/123 [00:02<00:00, 46.37it/s]


TEST F-1: 0.745655030258186
TEST ACC: 0.7451678535096643
