### Part 1. Loading and Preprocessing Data 
The following cell loads the OnionOrNot dataset, and tokenizes each data item

In [1]:
# DO NOT MODIFY #
import torch
import random
import numpy as np
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# this is how we select a GPU if it's avalible on your computer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def split_train_val_test(df, props=[.8, .1, .1]):
    assert round(sum(props), 2) == 1 and len(props) >= 2
    train_df, test_df, val_df = None, None, None
    
    ## YOUR CODE STARTS HERE (~3-5 lines of code) ##
    # hint: you can use df.iloc to slice into specific indexes or ranges.
    train_size = int(props[0] * len(df))
    val_size =  train_size + int(props[1] * len(df))
    test_size =val_size + int(props[2] * len(df)) 
    train_df = df.iloc[0:train_size]
    val_df = df.iloc[train_size:val_size]
    test_df = df.iloc[val_size:test_size]
    

    ## YOUR CODE ENDS HERE ##
    
    return train_df, val_df, test_df

In [3]:
import gensim.models
import pandas as pd
import nltk
from tqdm import tqdm
from src.preprocess import clean_text

data = pd.read_csv('train.csv', quotechar='"')
data.sample(frac=1)


# to convert authors into numbers
author_to_number = {
    'EAP': 0,
    'HPL': 1,
    'MWS': 2
    
}

# lowercase and tookenize sentences and converting authors to int
training_text = ""
for i in range(len(data)):

    data['text'][i] = nltk.word_tokenize(data['text'][i].lower())
    data['author'][i] = author_to_number[data['author'][i]]


# print(train_df[0:100])



In [4]:
model = gensim.models.Word2Vec(sentences=data['text'], 
                 window=5, 
                 size=300,  
                 min_count=1
                 )

model.wv.get_vector('i')

array([-0.35372332, -0.21892434, -0.42669693, -0.8065087 , -0.35411844,
       -0.7134083 , -0.81613445,  1.2479382 , -0.0961616 , -0.27998012,
        0.20446391,  0.7864846 ,  0.1925291 ,  0.07181507, -0.8196775 ,
        0.37739325,  0.43254086, -0.9081421 , -0.35610607, -0.9284587 ,
       -0.11410665,  0.81164134, -0.467723  ,  0.12861249,  0.3100781 ,
        1.2033657 , -0.23873782, -0.5529341 ,  1.0321962 ,  0.30944267,
        1.431958  , -0.1352361 ,  0.27077287, -0.22875233, -0.1075282 ,
        1.1086254 , -0.5252576 , -0.8342886 , -0.10383235,  0.2144221 ,
        0.12922513,  1.0267302 ,  0.01532739, -0.84992987,  0.3042318 ,
        1.8826039 ,  0.41099328, -0.25291282, -0.2030294 ,  0.18681571,
        0.24375334,  0.60119206,  0.6150842 , -0.08565544, -0.27811393,
        0.30948603,  0.33192647,  0.7676214 ,  0.3884883 , -0.46885395,
        0.20916714,  0.4344627 , -0.01126379,  0.30733675,  0.11881754,
        0.04564675, -1.0912914 ,  0.0648623 , -1.1331195 , -0.56

In [5]:
from src.dataset import *
embeddings = []
# for i in range(len(data)):
#     embeddings_sentence = []
#     for word in data['text'][i]:
#         embeddings_sentence.append(model.wv.get_vector(word))
#     embeddings.append(embeddings_sentence)
# data.insert(3,'emb',embeddings)

train_df, val_df, test_df = split_train_val_test(data)
train_vocab, reversed_vocab = generate_vocab_map(train_df)

print(train_df[0:10])


        id                                               text author
0  id26305  [this, process, ,, however, ,, afforded, me, n...      0
1  id17569  [it, never, once, occurred, to, me, that, the,...      1
2  id11008  [in, his, left, hand, was, a, gold, snuff, box...      0
3  id27763  [how, lovely, is, spring, as, we, looked, from...      2
4  id12958  [finding, nothing, else, ,, not, even, gold, ,...      1
5  id22965  [a, youth, passed, in, solitude, ,, my, best, ...      2
6  id09674  [the, astronomer, ,, perhaps, ,, at, this, poi...      0
7  id13515  [the, surcingle, hung, in, ribands, from, my, ...      0
8  id19322  [i, knew, that, you, could, not, say, to, your...      0
9  id00912  [i, confess, that, neither, the, structure, of...      2


Here's what the dataset looks like. You can index into specific rows with pandas, and try to guess some of these yourself :)

Now that we've loaded this dataset, we need to split the data into train, validation, and test sets. We also need to create a vocab map for words in our Onion dataset, which will map tokens to numbers. This will be useful later, since torch models can only use tensors of sequences of numbers as inputs. **Go to src/dataset.py, and fill out split_train_val_test, generate_vocab_map**

PyTorch has custom Datset Classes that have very useful extentions. **Go to src/dataset.py, and fill out the HeadlineDataset class.** Refer to PyTorch documentation on Dataset Classes for help.

We can now use PyTorch DataLoaders to batch our data for us. **Go to src/dataset.py, and fill out collate_fn.** Refer to PyTorch documentation on Dataloaders for help.

In [6]:
from src.dataset import HeadlineDataset
from torch.utils.data import RandomSampler
#print(train_df)

train_dataset = HeadlineDataset(train_vocab, train_df,model.wv)
val_dataset = HeadlineDataset(train_vocab, val_df,model.wv)
test_dataset = HeadlineDataset(train_vocab, test_df,model.wv)

# Now that we're wrapping our dataframes in PyTorch datsets, we can make use of PyTorch Random Samplers.
train_sampler = RandomSampler(train_dataset)
val_sampler = RandomSampler(val_dataset)
test_sampler = RandomSampler(test_dataset)

In [7]:
from torch.utils.data import DataLoader
from src.dataset import collate_fn
BATCH_SIZE = 16
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)


In [8]:
# # Use this to test your collate_fn implementation.

# # You can look at the shapes of x and y or put print 
# # statements in collate_fn while running this snippet

for x, y in test_iterator:
    print(x,y)
    break

# embeddings = []
# for i in range(len(train_iterator.dataset.df)):
#     embeddings_sentence = []
#     for word in train_iterator.dataset.df['text'][i]:
#         embeddings_sentence.append(model.wv.get_vector(word))
#     embeddings.append(embeddings_sentence)
# data.insert(3,'emb',embeddings)

tensor([[[-1.0413e-02, -1.8419e-01, -4.6175e-01,  ..., -2.9385e-01,
          -4.0682e-01, -4.6937e-01],
         [ 8.5743e-02, -7.8340e-02, -1.0163e-01,  ...,  7.3698e-02,
          -2.0163e-01,  2.9778e-02],
         [ 3.2238e-01, -2.6410e-01, -7.1182e-01,  ..., -1.3495e-01,
          -1.9901e-01, -2.3080e-01],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[-3.5372e-01, -2.1892e-01, -4.2670e-01,  ...,  3.4975e-01,
          -9.3366e-01,  2.8156e-01],
         [-1.4319e-01, -1.8945e-01, -2.4934e-01,  ...,  2.5600e-01,
          -1.0182e+00,  1.6828e-01],
         [-1.4680e-01, -1.0427e-02, -2.5578e-01,  ...,  1.6077e-01,
          -1.0524e+00, -5.0740e-02],
         ...,
         [ 0.0000e+00,  0

### Part 2: Modeling [10 pts]
Let's move to modeling, now that we have dataset iterators that batch our data for us. **Go to src/model.py, and follow the instructions in the file to create a basic neural network. Then, create your model using the class, and define hyperparameters.** 

In [9]:
from src.models import ClassificationModel
model = None
### YOUR CODE GOES HERE (1 line of code) ###
model = ClassificationModel(len(train_vocab),embedding_dim=32,hidden_dim = 32,num_layers = 1,bidirectional = True)

# model.to(device)
# # 
### YOUR CODE ENDS HERE ###

In the following cell, **instantiate the model with some hyperparameters, and select an appropriate loss function and optimizer.** 

Hint: we already use sigmoid in our model. What loss functions are availible for binary classification? Feel free to look at PyTorch docs for help!

In [10]:
from torch.optim import AdamW

criterion, optimizer = None, None
### YOUR CODE GOES HERE ###
criterion, optimizer = torch.nn.CrossEntropyLoss(), torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

### YOUR CODE ENDS HERE ###

### Part 3: Training and Evaluation [10 Points]
The final part of this HW involves training the model, and evaluating it at each epoch. **Fill out the train and test loops below.**

In [11]:
# returns the total loss calculated from criterion
def train_loop(model, criterion, iterator):
    model.train()
    total_loss = 0
    
    for x, y in tqdm(iterator):
        optimizer.zero_grad()
        # x = x.to(device)
        # y = y.to(device)
        ### YOUR CODE STARTS HERE (~6 lines of code) ###
        prediction = model(x)
        prediction = torch.squeeze(prediction,0)
        y = y.round()
        y = y.long()

 
        loss = criterion(prediction,y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # scheduler.step()
        ### YOUR CODE ENDS HERE ###
    return total_loss

# returns:
# - true: a Python boolean array of all the ground truth values 
#         taken from the dataset iterator
# - pred: a Python boolean array of all model predictions. 
def val_loop(model, criterion, iterator):
    true, pred = [], []
    ### YOUR CODE STARTS HERE (~8 lines of code) ###
    for x, y in tqdm(iterator):
        # x = x.to(device)
        # y = y.to(device)
        # print("x",x)
        # print("y",y)  
    
        preds = model(x)
        preds = torch.flatten(preds)
        for i_batch in range(len(y)):
            true.append(y[i_batch])
            pred.append(torch.argmax(preds[i_batch]))
            
            


    ### YOUR CODE ENDS HERE ###
    return true, pred


We also need evaluation metrics that tell us how well our model is doing on the validation set at each epoch. **Complete the functions in src/eval.py.**

In [12]:
# To test your eval implementation, let's see how well the untrained model does on our dev dataset.
# It should do pretty poorly.
from src.eval_utils import binary_macro_f1, accuracy
true, pred = val_loop(model, criterion, val_iterator)
# print(binary_macro_f1(true, pred))
# print(accuracy(true, pred))


100%|██████████| 123/123 [00:03<00:00, 34.96it/s]


### Part 4: Actually training the model [1 point]
Watch your model train :D You should be able to achieve a validation F-1 score of at least .8 if everything went correctly. **Feel free to adjust the number of epochs to prevent overfitting or underfitting.**

In [13]:
TOTAL_EPOCHS = 7
for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(model, criterion, train_iterator)
    true, pred = val_loop(model, criterion, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL F-1: {binary_macro_f1(true, pred)}")
    print(f"VAL ACC: {accuracy(true, pred)}")


100%|██████████| 979/979 [01:37<00:00,  9.99it/s]
100%|██████████| 123/123 [00:03<00:00, 31.04it/s]


EPOCH: 0
TRAIN LOSS: 1077.0252841711044


ZeroDivisionError: float division by zero

We can also look at the models performance on the held-out test set, using the same val_loop we wrote earlier.

In [None]:
true, pred = val_loop(model, criterion, test_iterator)
print(f"TEST F-1: {binary_macro_f1(true, pred)}")
print(f"TEST ACC: {accuracy(true, pred)}")

100%|██████████| 123/123 [00:03<00:00, 31.93it/s]


ZeroDivisionError: division by zero