In [1]:
from google.colab import drive
drive.mount('/content/drive')

! pip install boto3

Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.24.20-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 4.5 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 5.6 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.28.0,>=1.27.20
  Downloading botocore-1.27.20-py3-none-any.whl (8.9 MB)
[K     |████████████████████████████████| 8.9 MB 31.3 MB/s 
[?25hCollecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 62.5 MB/s 
Installing collected packages: urllib3, jmespath, botocore, s3transfer, boto3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urll

In [2]:
import pandas as pd
import numpy as np
import sys
import os
import ast  # Used to read byte literals
import boto3
import io
import spacy
from tqdm.notebook import tqdm
import re

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import accuracy_score, f1_score




In [3]:
class TweetDataset(Dataset):
    def __init__(self, data, vocab, max_len):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tokens = row['tokens']

        # Front paddings
        X = torch.zeros(self.max_len)
        for i, token in enumerate(tokens):
            X[self.max_len - len(tokens) + i] = self.vocab.get(token, 1)

        y = torch.tensor(row['label']).long()

        return X.long(), y

In [4]:
class RNN(nn.Module):
    def __init__(self, dict_length, embedding_size, hidden_size):
        super(RNN, self).__init__()
        # padding index turns off gradient for unknown tokens
        self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        
        # RNN doesn't care about length of sequence
        # RNN does care about the size of the word embedding
        # hidden size dictates dimension of output of RNN
        self.rnn = nn.RNN(input_size=embedding_size, hidden_size=hidden_size, batch_first=True)
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 83)
        self.relu = nn.ReLU()
        # PyTorch RNN outputs a sequence of same length as input
        # For many to one, we can either use the final hidden state OR
        # slap a linear layer on the output, taking in all the hidden states
        
    def forward(self, x):
        x = self.word_emb(x)
        
        # RNN layer outputs a tuple, the output and the final hidden state
        # taking the final hidden state as output
        out, hidden = self.rnn(x)
        x = self.linear1(hidden)
        x = self.relu(x)
        x = self.linear2(x)
        x = torch.squeeze(x.softmax(dim=2))
        
        return x

In [5]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for x, y in tqdm(dataloader):
        x,y = x.to(device), y.to(device)
        y_pred = model(x)
        loss = lossFun(y_pred, y)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss

def one_pass_acc(model, dataloader, num_points):
    model.eval()

    y_list = []
    y_pred_list = []
    
    for x, y in tqdm(dataloader):  
        x = x.to(device) 
        y_pred = model(x).argmax(dim=1)
        y_list += y.tolist()
        y_pred_list += y_pred.tolist()

    acc = accuracy_score(y_list, y_pred_list)
    f1 = f1_score(y_list, y_pred_list, average='weighted')

    return acc,f1

In [6]:
path = 'drive/MyDrive/Colab Notebooks/MSDS631/'
train_ds = torch.load(path+'train_wdtk_sm_ds.pt')
test_ds = torch.load(path+'test_wdtk_sm_ds.pt')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [7]:
train_dl = DataLoader(train_ds, batch_size=10000, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=10000, shuffle=True)

In [8]:
lossFun = nn.CrossEntropyLoss()

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
  rnn_model = RNN(len(train_ds.vocab), embedding_size=50, hidden_size=50).to(device)
  optimizer = optim.Adam(rnn_model.parameters(), lr = 0.01)
  
  print('Epoch: ', epoch)
  
  loss = one_pass(rnn_model, train_dl, optimizer, lossFun)
  print('Loss: ', loss)
  
  acc,f1 = one_pass_acc(rnn_model, test_dl, len(train_ds))
  print('Accuracy: ', acc)
  print('F1_Score: ', f1)



  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/67 [00:00<?, ?it/s]

Loss:  4.239255940736229


  0%|          | 0/34 [00:00<?, ?it/s]

Accuracy:  0.22858754282491436
F1_Score:  0.10133293390605193
Epoch:  1


  0%|          | 0/67 [00:00<?, ?it/s]

Loss:  4.239050587611412


  0%|          | 0/34 [00:00<?, ?it/s]

Accuracy:  0.2286505426989146
F1_Score:  0.10148939358798359
Epoch:  2


  0%|          | 0/67 [00:00<?, ?it/s]

Loss:  4.237472064459502


  0%|          | 0/34 [00:00<?, ?it/s]

Accuracy:  0.2276755446489107
F1_Score:  0.1007449835273397
Epoch:  3


  0%|          | 0/67 [00:00<?, ?it/s]

Loss:  4.241669882589312


  0%|          | 0/34 [00:00<?, ?it/s]

Accuracy:  0.2279485441029118
F1_Score:  0.1009379041271551
Epoch:  4


  0%|          | 0/67 [00:00<?, ?it/s]

Loss:  4.238000827049142


  0%|          | 0/34 [00:00<?, ?it/s]

Accuracy:  0.23626152747694504
F1_Score:  0.10609112574231441
