In [1]:
import torch
import random
import numpy as np
import regex

RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def split_train_val_test(df, props=[.8, .1, .1]):
    assert round(sum(props), 2) == 1 and len(props) >= 2
    train_df, test_df, val_df = None, None, None

    train_size = int(props[0] * len(df))
    val_size =  train_size + int(props[1] * len(df))
    test_size =val_size + int(props[2] * len(df)) 
    train_df = df.iloc[0:train_size]
    val_df = df.iloc[train_size:val_size]
    test_df = df.iloc[val_size:test_size]
    
    return train_df, val_df, test_df

In [3]:
import gensim.downloader as api

def download_embeddings(fasttetxt):
    # https://fasttext.cc/docs/en/english-vectors.html
    if fasttetxt:
      wv = api.load("fasttext-wiki-news-subwords-300")
    else:
      
      wv = api.load("word2vec-google-news-300")
      print("\nLoading complete!\n" +
            "Vocabulary size: {}".format(len(wv.vocab)))
    return wv


In [4]:
# Opening and preprocessing input file
import gensim.models
import pandas as pd
import nltk
nltk.download('punkt')
from tqdm import tqdm
from src.preprocess import clean_text

data = pd.read_csv('train.csv', quotechar='"')
data.sample(frac=1)


# to convert authors into numbers
author_to_number = {
    'EAP': 0,
    'HPL': 1,
    'MWS': 2
    
}

# lowercase, removing punctuation and tookenize sentences. Converting labels to int
training_text = ""
for i in range(len(data)):

    data['text'][i] = nltk.word_tokenize(regex.sub(r'[^\w\s]', '',data['text'][i].lower()))
    data['author'][i] = author_to_number[data['author'][i]]

print(data[0:10])
print(len(data))

from src.dataset import *

# Splitting dataset and generating vocab
train_df, val_df, test_df = split_train_val_test(data)
train_vocab, reversed_vocab = generate_vocab_map(train_df)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
        id                                               text author
0  id26305  [this, process, however, afforded, me, no, mea...      0
1  id17569  [it, never, once, occurred, to, me, that, the,...      1
2  id11008  [in, his, left, hand, was, a, gold, snuff, box...      0
3  id27763  [how, lovely, is, spring, as, we, looked, from...      2
4  id12958  [finding, nothing, else, not, even, gold, the,...      1
5  id22965  [a, youth, passed, in, solitude, my, best, yea...      2
6  id09674  [the, astronomer, perhaps, at, this, point, to...      0
7  id13515  [the, surcingle, hung, in, ribands, from, my, ...      0
8  id19322  [i, knew, that, you, could, not, say, to, your...      0
9  id00912  [i, confess, that, neither, the, structure, of...      2
19579


In [5]:
DOWNLOAD = True
# Use fastext or word2vec
FASTTEXT = False
WINDOW_SIZE = 7

EMBEDDING_DIM = 300
HIDDEN_DIM = 512
NUM_LAYERS = 2
BIDIRECTIONAL = True


In [6]:
# Downloading or generating word2vec embeddings

if DOWNLOAD:
    model = download_embeddings(FASTTEXT)
else:
    if FASTTEXT:
        model = gensim.models.FastText(sentences=train_df['text'], size=EMBEDDING_DIM, window=WINDOW_SIZE)
    else:
        model = gensim.models.Word2Vec(sentences=train_df['text'], size=EMBEDDING_DIM, window=WINDOW_SIZE)
                        


Loading complete!
Vocabulary size: 3000000


In [7]:
from src.dataset import HeadlineDataset
from torch.utils.data import RandomSampler

train_dataset = HeadlineDataset(train_vocab, train_df,model.wv, FASTTEXT)
val_dataset = HeadlineDataset(train_vocab, val_df,model.wv, FASTTEXT)
test_dataset = HeadlineDataset(train_vocab, test_df,model.wv, FASTTEXT)

# Now that we're wrapping our dataframes in PyTorch datsets, we can make use of PyTorch Random Samplers.
train_sampler = RandomSampler(train_dataset)
val_sampler = RandomSampler(val_dataset)
test_sampler = RandomSampler(test_dataset)

  after removing the cwd from sys.path.
  """
  


In [8]:
from torch.utils.data import DataLoader
from src.dataset import collate_fn
BATCH_SIZE = 16
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

for x, y in test_iterator:
    print(x,y)
    break

  tokenized_word_tensor = torch.Tensor(tmp)


tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.1797, -0.0913, -0.1553,  ..., -0.1143, -0.0378, -0.1514],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.2256, -0.0195,  0.0908,  ...,  0.0282, -0.1777, -0.0060],
         [ 0.0771, -0.1396,  0.1445,  ..., -0.0845,  0.2002, -0.3145],
         [-0.0332, -0.0327, -0.0598,  ..., -0.0058,  0.1299, -0.0209],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0884, -0.0317, -0.1226,  ...,  0.0234,  0.2480, -0.1177],
         [-0.0581,  0.0581,  0.0133,  ..., -0

### Modeling

In [9]:
from src.models import ClassificationModel

model = ClassificationModel(len(train_vocab),embedding_dim=EMBEDDING_DIM,hidden_dim = HIDDEN_DIM,num_layers = NUM_LAYERS,bidirectional = BIDIRECTIONAL)

model.to(device)

ClassificationModel(
  (LSTM): LSTM(300, 512, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
)

In the following cell, **instantiate the model with some hyperparameters, and select an appropriate loss function and optimizer.** 

Hint: we already use sigmoid in our model. What loss functions are availible for binary classification? Feel free to look at PyTorch docs for help!

In [10]:
from torch.optim import AdamW

criterion, optimizer = torch.nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

### Part 3: Training and Evaluation [10 Points]
The final part of this HW involves training the model, and evaluating it at each epoch. **Fill out the train and test loops below.**

In [11]:
# returns the total loss calculated from criterion
def train_loop(model, criterion, iterator):
    model.train()
    total_loss = 0
    
    for x, y in tqdm(iterator):
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()

        prediction = model(x)
        prediction = torch.squeeze(prediction)
        # y = y.round()
        # y = y.long()
        

 
        loss = criterion(prediction,y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss

# returns:
# - true: a Python boolean array of all the ground truth values 
#         taken from the dataset iterator
# - pred: a Python boolean array of all model predictions. 
def val_loop(model, criterion, iterator):
    true, pred = [], []
    for x, y in tqdm(iterator):
        x = x.to(device)
        y = y.to(device)
    
        preds = model(x)
        preds.to(device)
        preds = torch.squeeze(preds)
        for i_batch in range(len(y)):
            true.append(y[i_batch])
            pred.append(torch.argmax(preds[i_batch]))
            
    return true, pred


In [12]:
# To test your eval implementation, let's see how well the untrained model does on our dev dataset.
# It should do pretty poorly.
from src.eval_utils import binary_macro_f1, accuracy
true, pred = val_loop(model, criterion, val_iterator)
# print(binary_macro_f1(true, pred))
# print(accuracy(true, pred))


100%|██████████| 123/123 [00:05<00:00, 24.06it/s]


### Actually training the model

In [13]:
TOTAL_EPOCHS = 20
for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(model, criterion, train_iterator)
    true, pred = val_loop(model, criterion, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL F-1: {binary_macro_f1(true, pred)}")
    print(f"VAL ACC: {accuracy(true, pred)}")


100%|██████████| 979/979 [02:32<00:00,  6.44it/s]
100%|██████████| 123/123 [00:04<00:00, 24.79it/s]


EPOCH: 0
TRAIN LOSS: 1000.0313270688057
VAL F-1: 0.5770874829168147
VAL ACC: 0.5769034236075626


100%|██████████| 979/979 [02:31<00:00,  6.48it/s]
100%|██████████| 123/123 [00:04<00:00, 25.67it/s]


EPOCH: 1
TRAIN LOSS: 942.1972339749336
VAL F-1: 0.5936830166164149
VAL ACC: 0.6050076647930506


100%|██████████| 979/979 [02:29<00:00,  6.53it/s]
100%|██████████| 123/123 [00:04<00:00, 24.76it/s]


EPOCH: 2
TRAIN LOSS: 800.6480236947536
VAL F-1: 0.6722938120897061
VAL ACC: 0.6729688298415942


100%|██████████| 979/979 [02:31<00:00,  6.48it/s]
100%|██████████| 123/123 [00:04<00:00, 25.47it/s]


EPOCH: 3
TRAIN LOSS: 716.2159390002489
VAL F-1: 0.6607763272435196
VAL ACC: 0.6642820643842616


100%|██████████| 979/979 [02:25<00:00,  6.71it/s]
100%|██████████| 123/123 [00:04<00:00, 26.26it/s]


EPOCH: 4
TRAIN LOSS: 662.8433213979006
VAL F-1: 0.720984600801737
VAL ACC: 0.7199795605518651


100%|██████████| 979/979 [02:23<00:00,  6.84it/s]
100%|██████████| 123/123 [00:04<00:00, 26.61it/s]


EPOCH: 5
TRAIN LOSS: 608.9507778435946
VAL F-1: 0.7146105859991924
VAL ACC: 0.7199795605518651


100%|██████████| 979/979 [02:24<00:00,  6.76it/s]
100%|██████████| 123/123 [00:04<00:00, 26.26it/s]


EPOCH: 6
TRAIN LOSS: 564.5167116299272
VAL F-1: 0.7204696578638852
VAL ACC: 0.7210015329586101


100%|██████████| 979/979 [02:23<00:00,  6.83it/s]
100%|██████████| 123/123 [00:04<00:00, 26.52it/s]


EPOCH: 7
TRAIN LOSS: 522.6278914809227
VAL F-1: 0.7400267545922364
VAL ACC: 0.740929994890138


100%|██████████| 979/979 [02:22<00:00,  6.87it/s]
100%|██████████| 123/123 [00:04<00:00, 26.15it/s]


EPOCH: 8
TRAIN LOSS: 497.16423062235117
VAL F-1: 0.7414763474366738
VAL ACC: 0.740929994890138


100%|██████████| 979/979 [02:22<00:00,  6.87it/s]
100%|██████████| 123/123 [00:04<00:00, 26.38it/s]


EPOCH: 9
TRAIN LOSS: 456.0402592010796
VAL F-1: 0.7321923313649958
VAL ACC: 0.7342871742462953


100%|██████████| 979/979 [02:22<00:00,  6.85it/s]
100%|██████████| 123/123 [00:04<00:00, 25.64it/s]


EPOCH: 10
TRAIN LOSS: 425.5233007967472
VAL F-1: 0.7256376154888545
VAL ACC: 0.7250894225855902


100%|██████████| 979/979 [02:19<00:00,  7.01it/s]
100%|██████████| 123/123 [00:04<00:00, 26.78it/s]


EPOCH: 11
TRAIN LOSS: 381.4862082824111
VAL F-1: 0.7327042146224879
VAL ACC: 0.7317322432294328


100%|██████████| 979/979 [02:20<00:00,  6.97it/s]
100%|██████████| 123/123 [00:04<00:00, 25.88it/s]


EPOCH: 12
TRAIN LOSS: 344.6356770209968
VAL F-1: 0.7183600285381923
VAL ACC: 0.7215125191619827


100%|██████████| 979/979 [02:20<00:00,  6.95it/s]
100%|██████████| 123/123 [00:04<00:00, 27.31it/s]


EPOCH: 13
TRAIN LOSS: 305.57615879084915
VAL F-1: 0.72762929513329
VAL ACC: 0.7271333673990802


100%|██████████| 979/979 [02:20<00:00,  6.97it/s]
100%|██████████| 123/123 [00:04<00:00, 25.82it/s]


EPOCH: 14
TRAIN LOSS: 279.14132490567863
VAL F-1: 0.7362752574501735
VAL ACC: 0.7363311190597854


100%|██████████| 979/979 [02:22<00:00,  6.87it/s]
100%|██████████| 123/123 [00:04<00:00, 25.64it/s]


EPOCH: 15
TRAIN LOSS: 234.77847710438073
VAL F-1: 0.7405425515214948
VAL ACC: 0.740929994890138


100%|██████████| 979/979 [02:23<00:00,  6.82it/s]
100%|██████████| 123/123 [00:04<00:00, 26.01it/s]


EPOCH: 16
TRAIN LOSS: 195.717875294853
VAL F-1: 0.733681870773233
VAL ACC: 0.7337761880429229


100%|██████████| 979/979 [02:28<00:00,  6.58it/s]
100%|██████████| 123/123 [00:04<00:00, 25.52it/s]


EPOCH: 17
TRAIN LOSS: 175.09464340494014
VAL F-1: 0.7343720155354642
VAL ACC: 0.7363311190597854


100%|██████████| 979/979 [02:26<00:00,  6.69it/s]
100%|██████████| 123/123 [00:04<00:00, 25.93it/s]


EPOCH: 18
TRAIN LOSS: 145.63777079596184
VAL F-1: 0.7105345217775607
VAL ACC: 0.711803781297905


100%|██████████| 979/979 [02:26<00:00,  6.68it/s]
100%|██████████| 123/123 [00:04<00:00, 25.51it/s]


EPOCH: 19
TRAIN LOSS: 134.48625293577788
VAL F-1: 0.7373006118930997
VAL ACC: 0.7383750638732755


We can also look at the models performance on the held-out test set, using the same val_loop we wrote earlier.

In [14]:
true, pred = val_loop(model, criterion, test_iterator)
print(f"TEST F-1: {binary_macro_f1(true, pred)}")
print(f"TEST ACC: {accuracy(true, pred)}")

100%|██████████| 123/123 [00:05<00:00, 24.17it/s]


TEST F-1: 0.7150672020517449
TEST ACC: 0.717935615738375
