This notebook implements a simple recurrent neural network on Supreme Court oral arguments. The model attempts to predict the outcome of cases. It is a binary classification problem.

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the shared project
# folder used by the later cells is readable.
from google.colab import drive # run on the first execution in a fresh Colab session
drive.mount('/content/drive') # run on the first execution in a fresh Colab session
!ls "/content/drive/Shareddrives/Advanced ML Project Spring 2021"

Mounted at /content/drive
 01b_cnn_offline.ipynb			  'Mid-quarter presentation.gslides'
'1 - Simple Sentiment Analysis.ipynb'	   naive_bayes.ipynb
'2 - Upgraded Sentiment Analysis.ipynb'   'Nathan Lit Review.gdoc'
 charlie_midterm_presentation_backup.mp4   notebooks
'Data sources.gdoc'			  'Project To-Do List.gdoc'
'Final presentation.gslides'		  'Proposal ideas.gdoc'
'Final report.gdoc'			  'Proposal (Rough Draft).gdoc'
 intermediate_data			   raw_data
 LSTM.ipynb				   scraped_data


In [None]:
import os
import urllib.request, json
import numpy as np
import pandas as pd
import torch

# Root of the shared Google Drive folder; the data-loading cell reads its CSVs
# from the "intermediate_data" sub-folder underneath it.
SHARED_DRIVE_BASE_FOLDER = "/content/drive/Shareddrives/Advanced ML Project Spring 2021/"
# NOTE(review): LABEL_VAR and TEXT_VAR are not referenced by any cell visible in
# this notebook; presumably they name columns of the source data -- confirm or remove.
LABEL_VAR = 'partyWinning' # presumably the alternative label column is 'caseDisposition'
TEXT_VAR = 'text'
# NOTE(review): the iterators built in the data-loading cell do not use this
# constant (they use a batch size of 2) -- confirm which value is intended.
# BATCH_SIZE = 64 # batch size for training
BATCH_SIZE = 8 # batch size for training

# Disabled: loading of the 2017-2018 tuples JSON file (kept for reference only).
# with open(SHARED_DRIVE_BASE_FOLDER + "intermediate_data/tuples_2017-2018.json") as fp:
#   tuple_list_2017_2018_data = json.load(fp)
# len(tuple_list_2017_2018_data)

In [None]:
import warnings   # being significantly revamped
import random
import numpy as np
import torch as T
import torchtext as tt
from torchtext.legacy import data

# Pick the compute device once: the GPU when CUDA is available, otherwise the CPU.
# The iterators and the model/criterion are later placed on this device.
# device = T.device("cpu")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cuda


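The cell below loads the pre-split training, validation, and test CSV files into torchtext Dataset objects, lowercases and tokenizes the text, builds the vocabulary from the training set, and creates batch iterators for each split. Note that stop words are not removed.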

In [None]:
warnings.filterwarnings("ignore")  # silence library warnings, which otherwise flood the output

# Seed every RNG in use so the batch shuffling below is repeatable across runs.
random.seed(2)
torch.manual_seed(1)
np.random.seed(1)

# Cell-level settings.
DATA_DIR = os.path.join(SHARED_DRIVE_BASE_FOLDER, "intermediate_data")
ITER_BATCH_SIZE = 2     # NOTE(review): original value kept; the config cell's BATCH_SIZE is not used here
N_PREVIEW_TOKENS = 50   # tokens of the sample transcript to print
N_PREVIEW_BATCHES = 3   # training batches to print as a sanity check

print("\nBegin torchtext from raw data demo ")

print("\nCreating RAW, TEXT, LABEL Field objects ")
RAW = data.RawField()  # the 'id' column is passed through without processing
TEXT = data.Field(sequential=True,
  # Distinct markers: the original used '' for both, which made the
  # start-of-sequence and end-of-sequence tokens the same vocabulary entry.
  init_token='<sos>',  # start of sequence
  eos_token='<eos>',   # end of sequence
  lower=True,
  tokenize=tt.data.utils.get_tokenizer("basic_english"),)
LABEL = data.Field(sequential=False,
  use_vocab=False,     # labels are read directly as numbers, not mapped through a vocab
  unk_token=None,
  is_target=True,
  dtype=torch.float)   # float targets, as required by BCEWithLogitsLoss later on

print("\nSplitting into train, valid, test ")
# The three CSVs are already split on disk; each has a header row and the
# columns id, label, text (in that order).
(train_obj, valid_obj, test_obj) = \
  data.TabularDataset.splits(
  path=DATA_DIR,
  train="train_balanced_2010-2018.csv",
  validation="validate_balanced_2010-2018.csv",
  test="test_balanced_2010-2018.csv",
  format='csv',
  skip_header=True,
  fields=[('id', RAW), ('label', LABEL), ('text', TEXT)])

# A full transcript is very long, so only its first tokens are shown.
print(f"\nThe first {N_PREVIEW_TOKENS} tokens of the \'text\' field for item [2] are: ")
print(train_obj[2].text[:N_PREVIEW_TOKENS])

print("\nCreating vocabulary object ")
# Only tokens that occur at least 50 times in the training set get their own index.
TEXT.build_vocab(train_obj, min_freq=50)
print("The idx of \'good\' is ",
  TEXT.vocab.stoi['good'])
print("The string value of 8 is ",
  TEXT.vocab.itos[8])

# LABEL has use_vocab=False, so this vocab is not used to encode labels; it is
# built only so the distinct label values can be inspected here and later.
LABEL.build_vocab(train_obj)
print("LABEL.vocab.stoi is " + str(LABEL.vocab.stoi))

print("\nCreating a train BucketIterator ")


def make_bucket_iterator(dataset):
  """Return a shuffling BucketIterator over ``dataset`` that yields batches on ``device``.

  All three splits share the same settings, so they are built through this
  single helper instead of three copy-pasted constructor calls.
  """
  return data.BucketIterator(
    dataset=dataset,
    batch_size=ITER_BATCH_SIZE,
    # The examples expose 'id', 'label' and 'text'; the previous key read a
    # non-existent 'review' attribute and would fail once sorting is enabled.
    sort_key=lambda example: len(example.text),
    shuffle=True,
    device=device)


# Construction order (train, valid, test) is kept from the original cell.
train_iter = make_bucket_iterator(train_obj)
valid_iter = make_bucket_iterator(valid_obj)
test_iter = make_bucket_iterator(test_obj)

print(f"\nIterating train data (batch_size={ITER_BATCH_SIZE}) ")
# Show only a few batches; printing the whole epoch buries the notebook in output.
for batch_idx, item in enumerate(train_iter):
  if batch_idx >= N_PREVIEW_BATCHES:
    break
  print("\n=====\n")
  print(item.id)
  print(item.text)
  print(item.label)

print("\nEnd of demo ")


Begin torchtext from raw data demo 

Creating RAW, TEXT, LABEL Field objects 

Splitting into train, valid, test 

The 'text' field for item [2] is: 
['well', 'hear', 'argument', 'next', 'in', 'case', '121497', 'kellogg', 'brown', 'root', 'services', 'v', 'united', 'states', 'ex', 'rel', 'benjamin', 'carter', 'mr', 'elwood', 'mr', 'chief', 'justice', 'and', 'may', 'it', 'please', 'the', 'court', 'by', 'clearing', 'the', 'way', 'for', 'relator', 'to', 'file', 'a', 'fifth', 'identical', 'false', 'claims', 'act', 'complaint', 'against', 'kbr', 'raising', 'allegations', 'the', 'government', 'had', 'long', 'known', 'from', 'other', 'sources', 'the', 'court', 'below', 'erred', 'in', 'two', 'respects', 'first', 'the', 'plain', 'text', 'and', 'and', 'history', 'of', 'the', 'wartime', 'suspension', 'of', 'limitations', 'act', 'confirmed', 'that', 'it', 'applies', 'exclusively', 'to', 'crimes', 'the', 'language', 'of', 'the', 'provision', 'tolls', 'limitations', 'tolls', 'limitations', 'periods

In [None]:
# len(TEXT.vocab) counts every index in the vocabulary, including the special
# tokens (<unk>, <pad>, and the init/eos markers) in addition to real words.
print("The size of this vocabulary is {}".format(len(TEXT.vocab)))
!ls

The size of this vocabulary is 4578
drive  sample_data


In [None]:
# Count examples from the dataset itself: len(train_iter) is the number of
# batches per epoch, not the number of examples.
print(f'Number of training examples: {len(train_obj)}')

Number of training examples: 227


Again, we'll view how many examples are in each split.

In [None]:
# Count examples from the datasets themselves: len() of an iterator is the
# number of batches per epoch, not the number of examples.
print(f'Number of training examples: {len(train_obj)}')
print(f'Number of validation examples: {len(valid_obj)}')
print(f'Number of testing examples: {len(test_obj)}')

Number of training examples: 227
Number of validation examples: 82
Number of testing examples: 72


In [None]:
# TEXT's vocabulary was built from the training split with min_freq=50.
# LABEL is declared with use_vocab=False, so its vocabulary is informational
# only: it lists the distinct label values found in the training split.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 4578
Unique tokens in LABEL vocabulary: 2


In [None]:
# The 20 most frequent training tokens with their raw counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 313375), ('that', 165361), ('to', 142242), ('of', 111527), ('and', 110603), ('a', 104173), ('is', 97071), ('in', 87910), ('it', 72871), ('i', 70492), ('you', 66947), ('this', 45852), ('not', 41830), ('have', 37049), ('be', 34691), ('but', 34654), ('if', 33050), ('what', 32345), ('we', 31433), ('was', 30999)]


In [None]:
# First ten entries of the index-to-token list (index 0 first).
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '', 'the', 'that', 'to', 'of', 'and', 'a', 'is']


Below is a simple implementation of an RNN. Thanks to Ben Trevett's tutorial on sentiment analysis with RNNs for the guidance here. https://github.com/bentrevett/pytorch-sentiment-analysis

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Recurrent text classifier: token embedding -> ``nn.RNN`` -> linear head.

    Args:
        input_dim: number of rows in the embedding table (the vocabulary size).
        embedding_dim: width of each token embedding.
        hidden_dim: width of the recurrent hidden state.
        output_dim: number of values produced per input sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        # Sub-modules are registered in the order embedding, rnn, fc.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map a batch of token-index sequences to one output row per sequence.

        Args:
            text: integer tensor laid out as [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output_dim].
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # all_states: [sent len, batch size, hid dim]
        # last_state: [1, batch size, hid dim]
        all_states, last_state = self.rnn(token_vectors)

        # Drop the leading layer axis -> [batch size, hid dim]
        final_hidden = last_state.squeeze(0)

        # Sanity check: the last time step of the output is the final hidden state.
        assert torch.equal(all_states[-1], final_hidden)

        return self.fc(final_hidden)

We now create an instance of our RNN class. 

The input dimension is the dimension of the one-hot vectors, which is equal to the vocabulary size. 

The embedding dimension is the size of the dense word vectors. This is usually around 50-250 dimensions, but depends on the size of the vocabulary.

The hidden dimension is the size of the hidden states. This is usually around 100-500 dimensions, but it also depends on factors such as the vocabulary size, the size of the dense vectors, and the complexity of the task.

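The output dimension is usually the number of classes. With only 2 classes, however, a single scalar is enough, so the output can be 1-dimensional. Here that scalar is a raw logit (an unbounded real number); `BCEWithLogitsLoss` applies the sigmoid that maps it to a probability between 0 and 1.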

In [None]:
# Model hyperparameters, passed positionally to RNN(input_dim, embedding_dim,
# hidden_dim, output_dim).
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary index
EMBEDDING_DIM = 100          # width of each token embedding
HIDDEN_DIM = 256             # width of the RNN hidden state
OUTPUT_DIM = 1               # one output value per sequence (binary task)

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [None]:
def count_parameters(model):
    """Return the number of trainable parameters in ``model``.

    The docstring is now indented with the body; the original mixed 2- and
    4-space indentation, which raises ``IndentationError`` when the cell runs.

    Args:
        model: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        int: total element count over all parameters with ``requires_grad=True``;
        frozen parameters are excluded.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Report the model size; the ":," format spec adds thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 549,705 trainable parameters


## Train the Model

In [None]:
import torch.optim as optim

# Stochastic gradient descent over all model parameters with a learning rate of 1e-3.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [None]:
# Binary cross-entropy computed on raw logits: the loss applies the sigmoid
# itself, so the model's output layer has no activation.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Place the model and the loss on the same device the iterators put batches on.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of a batch classified correctly (8 right out of 10 gives 0.8, not 8).

    ``preds`` holds raw logits; they are passed through a sigmoid and rounded to
    the nearest integer before being compared element-wise with the labels ``y``.
    The result is a 0-dim tensor.
    """
    # logits -> probabilities -> hard 0./1. decisions
    hard_preds = torch.sigmoid(preds).round()
    # cast the boolean matches to float so they can be summed and divided
    hits = torch.eq(hard_preds, y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss and accuracy.

    The docstring is now indented with the body; the original mixed 2- and
    4-space indentation, which raises ``IndentationError`` when the cell runs.

    Args:
        model: module called as ``model(batch.text)``; dimension 1 of its output
            is squeezed away before the loss is computed.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called once
            per batch.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        tuple: ``(mean_loss, mean_accuracy)``, each the plain average of the
        per-batch values (not weighted by batch size).

    Raises:
        ZeroDivisionError: if ``iterator`` has length 0.
    """
    epoch_loss = 0
    epoch_acc = 0

    # Put the model in training mode.
    model.train()

    for batch in iterator:

        # Gradients accumulate across backward() calls, so clear them per batch.
        optimizer.zero_grad()

        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)

        loss.backward()

        optimizer.step()

        # .item() converts the 0-dim tensors to Python floats.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: module called as ``model(batch.text)``; dimension 1 of its output
            is squeezed away before scoring.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        tuple: ``(mean_loss, mean_accuracy)``, each the plain average of the
        per-batch values.
    """
    # Put the model in evaluation mode.
    model.eval()

    batch_losses = []
    batch_accs = []

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            batch_losses.append(criterion(logits, batch.label).item())
            batch_accs.append(binary_accuracy(logits, batch.label).item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the time elapsed between two timestamps into whole minutes and seconds.

    The docstring is now indented with the body; the original mixed 2- and
    4-space indentation, which raises ``IndentationError`` when the cell runs.

    Args:
        start_time: start timestamp in seconds (e.g. from ``time.time()``).
        end_time: end timestamp in seconds.

    Returns:
        tuple: ``(elapsed_mins, elapsed_secs)`` as ints; fractional seconds are
        truncated.
    """
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    # Seconds left over after removing the whole minutes.
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Lowest validation loss seen so far; starting at +inf guarantees that the
# first epoch writes a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One pass over the training split, then score the validation split.
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    # The reported epoch time covers both the training and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the epoch with the lowest validation loss; the
    # file is written to the current working directory and overwritten each time.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # epoch is zero-based, so add 1 for display.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 30s
	Train Loss: 0.696 | Train Acc: 52.86%
	 Val. Loss: 0.705 |  Val. Acc: 45.12%
Epoch: 02 | Epoch Time: 1m 30s
	Train Loss: 0.697 | Train Acc: 49.12%
	 Val. Loss: 0.695 |  Val. Acc: 50.61%
Epoch: 03 | Epoch Time: 1m 30s
	Train Loss: 0.694 | Train Acc: 51.98%
	 Val. Loss: 0.706 |  Val. Acc: 43.90%
Epoch: 04 | Epoch Time: 1m 30s
	Train Loss: 0.697 | Train Acc: 48.68%
	 Val. Loss: 0.695 |  Val. Acc: 50.61%
Epoch: 05 | Epoch Time: 1m 30s
	Train Loss: 0.696 | Train Acc: 51.32%
	 Val. Loss: 0.701 |  Val. Acc: 45.73%


In [None]:
# Restore the checkpoint saved by the training loop (the epoch with the lowest
# validation loss) before scoring, rather than using the last epoch's weights.
model.load_state_dict(torch.load('tut1-model.pt'))

# Score the held-out test split.
test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.691 | Test Acc: 51.39%


In [None]:
# Bare expression: the notebook displays the module's layer summary.
model

RNN(
  (embedding): Embedding(4578, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)