## Models and Functions ##

In [2]:
# Import Libraries
import csv
import re
import os
import string
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
# Download File
if not os.path.exists("semeval-2020-task-7-dataset"):
  !wget https://cs.rochester.edu/u/nhossain/semeval-2020-task-7-dataset.zip
  !unzip semeval-2020-task-7-dataset.zip

--2020-09-18 00:27:37--  https://cs.rochester.edu/u/nhossain/semeval-2020-task-7-dataset.zip
Resolving cs.rochester.edu (cs.rochester.edu)... 192.5.53.208
Connecting to cs.rochester.edu (cs.rochester.edu)|192.5.53.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1621456 (1.5M) [application/zip]
Saving to: ‘semeval-2020-task-7-dataset.zip’


2020-09-18 00:27:38 (3.62 MB/s) - ‘semeval-2020-task-7-dataset.zip’ saved [1621456/1621456]

Archive:  semeval-2020-task-7-dataset.zip
   creating: semeval-2020-task-7-dataset/
  inflating: semeval-2020-task-7-dataset/.DS_Store  
   creating: semeval-2020-task-7-dataset/subtask-1/
  inflating: semeval-2020-task-7-dataset/subtask-1/train_funlines.csv  
  inflating: semeval-2020-task-7-dataset/subtask-1/.DS_Store  
  inflating: semeval-2020-task-7-dataset/subtask-1/test.csv  
  inflating: semeval-2020-task-7-dataset/subtask-1/dev.csv  
 extracting: semeval-2020-task-7-dataset/subtask-1/baseline.zip  
  inflating: semeval-2

In [4]:
# Utility Functions

# Preprocess function
def get_tokenized_corpus(corpus):
  punctuation = string.punctuation.replace('*', '')
  tokenized_corpus = []
  
  for sentence in corpus:
    sentence = "".join([i for i in sentence if i not in punctuation]) # Punctuation removal
    sentence = sentence.lower() # convert to lowercase
    tokenized_sentence = []
    for token in sentence.split(' '): 
      tokenized_sentence.append(token)
    tokenized_corpus.append(tokenized_sentence)
  return tokenized_corpus

def get_word2idx(tokenized_corpus):
  vocabulary = []
  for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)
  
  word2idx = {w: idx+1 for (idx, w) in enumerate(vocabulary)}
  # we reserve the 0 index for the padding token
  word2idx['<pad>'] = 0
 
  return word2idx

def get_model_inputs(tokenized_corpus, word2idx, labels):
  # we index our sentences
  vectorized_sents = [[word2idx[tok] for tok in sent if tok in word2idx] for sent in tokenized_corpus]

  # Sentence lengths
  sent_lengths = [len(sent) for sent in vectorized_sents]

  # Get maximum length
  max_len = max(sent_lengths)
  
  # we create a tensor of a fixed size filled with zeroes for padding
  sent_tensor = torch.zeros((len(vectorized_sents), max_len)).long()

  # we fill it with our vectorized sentences 
  for idx, (sent, sentlen) in enumerate(zip(vectorized_sents, sent_lengths)):
    sent_tensor[idx, :sentlen] = torch.LongTensor(sent)

  # Label tensor
  label_tensor = torch.FloatTensor(labels)
  
  return sent_tensor, label_tensor

# Define the test input function
def get_test_inputs(tokenized_corpus, word2idx):
  # we index our sentences
  vectorized_sents = [[word2idx[tok] for tok in sent if tok in word2idx] for sent in tokenized_corpus]

  # Sentence lengths
  sent_lengths = [len(sent) for sent in vectorized_sents]

  # Get maximum length
  max_len = max(sent_lengths)
  
  # we create a tensor of a fixed size filled with zeroes for padding
  sent_tensor = torch.zeros((len(vectorized_sents), max_len)).long()

  # we fill it with our vectorized sentences 
  for idx, (sent, sentlen) in enumerate(zip(vectorized_sents, sent_lengths)):
    sent_tensor[idx, :sentlen] = torch.LongTensor(sent)

  return sent_tensor

In [5]:
# RNN model
class RNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 bidirectional, dropout, pad_idx, variant):

        super().__init__()

        self.variant = variant
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        # One single unidirectional RNN, bidirectional RNN or LSTM layer.
        if self.variant == 'rnn':
          self.rnn = nn.RNN(embedding_dim,
                            hidden_dim,
                            batch_first=True,
                            bidirectional=bidirectional,
                            num_layers=1)
        elif self.variant == 'lstm':
          self.rnn = nn.LSTM(embedding_dim, 
                               hidden_dim, 
                               batch_first=True)
          
        if self.bidirectional and self.variant != 'lstm':
            linear_hidden_in = hidden_dim * 2
        else:
            linear_hidden_in = hidden_dim
            

        # The output (linear) layer
        self.fc = nn.Linear(linear_hidden_in, output_dim)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):

        # Apply dropout to the embedding layer
        embedded = self.dropout(self.embedding(text))
        # Store the returned values from the RNN layer
        all_hidden, last_hidden = self.rnn(embedded)

        if self.variant == "lstm":
          last_hidden = last_hidden[0]
        elif self.bidirectional:
            # Concat the final forward (hidden[0,:,:]) and backward (hidden[1,:,:]) hidden layers
            last_hidden = torch.cat((last_hidden[0, :, :], last_hidden[1, :, :]), dim=-1)
        else:
          last_hidden = last_hidden.squeeze(0)

        # Output redictions.
        logits = self.fc(self.dropout(last_hidden))
        
        return logits

## Subtask 1 ##

In [6]:
# Download File
if not os.path.exists("semeval-2020-task-7-dataset"):
  !wget https://cs.rochester.edu/u/nhossain/semeval-2020-task-7-dataset.zip
  !unzip semeval-2020-task-7-dataset.zip

id = set()
corpus = []
grade = []

with open('semeval-2020-task-7-dataset/subtask-1/train.csv', newline='') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    if not row['id'] in id:
      edit = row['edit']
      corpus.append(re.sub('<[a-zA-z0-9]*/>', edit, row['original'])) # Substitute original tag with edits
      grade.append(float(row['meanGrade']))
      id.add(row['id'])

with open('semeval-2020-task-7-dataset/subtask-1/train_funlines.csv', newline='') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    if not row['id'] in id:   
      edit = row['edit']
      corpus.append(re.sub('<[a-zA-z0-9]*/>', edit, row['original'])) # Substitute original tag with edits
      grade.append(float(row['meanGrade']))
      id.add(row['id'])

print("Result: ")
print("Corpus size: " + str(len(corpus)))

Result: 
Corpus size: 17900


In [7]:
# Preprocess Corpus
tokenized_corpus = get_tokenized_corpus(corpus)
print(len(tokenized_corpus))
word2idx = get_word2idx(tokenized_corpus)
train_sent_tensor, train_label_tensor = get_model_inputs(tokenized_corpus, word2idx, grade)

print(f'Vocabulary size: {len(word2idx)}')
print('Training set tensor:')
print(train_sent_tensor.size())
print(train_sent_tensor[0])
print('Training Label tensor:')
print(train_label_tensor.size())

17900
Vocabulary size: 17103
Training set tensor:
torch.Size([17900, 28])
tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
Training Label tensor:
torch.Size([17900])


In [10]:
# Hyperparameters
INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 100
HIDDEN_DIM = 300
OUTPUT_DIM = 1
BIDIRECTIONAL = False
DROPOUT = 0.3
PAD_IDX = 0
EPOCHS = 50

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX,
            'lstm')

In [11]:
# Print the model
print(model)

# we use the stochastic gradient descent (SGD) optimizer
# optimizer = optim.SGD(model.parameters(), lr=LRATE)
optimizer = optim.Adam(model.parameters())

# Use RMSE for loss
eps = 1e-6
MSE = nn.MSELoss()
loss_fn = lambda predictions, target : torch.sqrt(MSE(predictions, target) + eps) #RMSE
#loss_fn = nn.MSELoss()

# Input and label tensors
feature = train_sent_tensor
target = train_label_tensor

################
# Start training
################
print(f'Will train for {EPOCHS} epochs')
loss_hist = []

with tqdm(range(1, EPOCHS + 1)) as pbar:
  for epoch in pbar:
    # to ensure the dropout (explained later) is "turned on" while training
    # good practice to include even if do not use here
    model.train()
    
    # we zero the gradients as they are not removed automatically
    optimizer.zero_grad()
    
    # squeeze is needed as the predictions will have the shape (batch size, 1)
    # and we need to remove the dimension of size 1
    predictions = model(feature).squeeze()

    # Compute the loss
    loss = loss_fn(predictions, target)
    # Q. Compute here the MSE loss
    train_loss = loss.item()

    # calculate the gradient of each parameter
    loss.backward()

    # update the parameters using the gradients and optimizer algorithm 
    optimizer.step()
    
    pbar.set_description(f'Epoch: {epoch:02}')
    print(f'Epoch: {epoch:02}, Train Loss: {train_loss:.3f}')
    loss_hist.append(train_loss)


RNN(
  (embedding): Embedding(17103, 100, padding_idx=0)
  (rnn): LSTM(100, 300, batch_first=True)
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)
Will train for 50 epochs


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Epoch: 01, Train Loss: 1.231
Epoch: 02, Train Loss: 1.190
Epoch: 03, Train Loss: 1.147
Epoch: 04, Train Loss: 1.100
Epoch: 05, Train Loss: 1.043
Epoch: 06, Train Loss: 0.963
Epoch: 07, Train Loss: 0.829
Epoch: 08, Train Loss: 0.670
Epoch: 09, Train Loss: 0.710
Epoch: 10, Train Loss: 0.628
Epoch: 11, Train Loss: 0.660
Epoch: 12, Train Loss: 0.659
Epoch: 13, Train Loss: 0.632
Epoch: 14, Train Loss: 0.602
Epoch: 15, Train Loss: 0.607
Epoch: 16, Train Loss: 0.631
Epoch: 17, Train Loss: 0.617
Epoch: 18, Train Loss: 0.599
Epoch: 19, Train Loss: 0.603
Epoch: 20, Train Loss: 0.610
Epoch: 21, Train Loss: 0.614
Epoch: 22, Train Loss: 0.612
Epoch: 23, Train Loss: 0.606
Epoch: 24, Train Loss: 0.600
Epoch: 25, Train Loss: 0.600
Epoch: 26, Train Loss: 0.604
Epoch: 27, Train Loss: 0.607
Epoch: 28, Train Loss: 0.605
Epoch: 29, Train Loss: 0.600
Epoch: 30, Train Loss: 0.598
Epoch: 31, Train Loss: 0.600
Epoch: 32, Train Loss: 0.601
Epoch: 33, Train Loss: 0.602
Epoch: 34, Train Loss: 0.601
Epoch: 35, Tra

In [37]:
# Load test file
id = []
original = []

with open('semeval-2020-task-7-dataset/subtask-1/test.csv', 'r') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    id.append(row['id'])

    edit = row['edit']
    original.append(re.sub('<[a-zA-z0-9]*/>', edit, row['original']))

print("Result: ")
print("Test Corpus size: " + str(len(original)))

Result: 
Test Corpus size: 3024


In [38]:
model.eval()

tokenized_original = get_tokenized_corpus(original)
print(len(tokenized_original))

sent_tensor = get_test_inputs(tokenized_original, word2idx)

print(f'Vocabulary size: {len(word2idx)}')
print('Test set tensor:')
print(sent_tensor.size())


with torch.no_grad():
  pred = model(sent_tensor).squeeze().tolist()

print(len(pred))

3024
Vocabulary size: 17103
Test set tensor:
torch.Size([3024, 26])
3024


In [39]:
# Write the prediction to csv file
if os.path.exists('task-1-output.csv'):
  os.remove('task-1-output.csv')

with open('task-1-output.csv', 'w') as outf:
  writer = csv.writer(outf)
  writer.writerow(['id', 'pred'])
  for i in range(len(id)):
    writer.writerow([id[i], pred[i]])


## Subtask 2 ##

In [None]:
id = set()
corpus = []
grade = []

with open('semeval-2020-task-7-dataset/subtask-2/train.csv', newline='') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    ids = row['id'].split('-')

    if not ids[0] in id:
      edit1 = row['edit1']
      corpus.append(re.sub('<[a-zA-z0-9]*/>', edit1, row['original1'])) # Substitute original tag with edits
      grade.append(float(row['meanGrade1']))
      id.add(ids[0])

    if not ids[1] in id:
      edit2 = row['edit2']
      corpus.append(re.sub('<[a-zA-z0-9]*/>', edit2, row['original2'])) # Substitute original tag with edits
      grade.append(float(row['meanGrade2']))
      id.add(ids[1])

with open('semeval-2020-task-7-dataset/subtask-2/train_funlines.csv', newline='') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    ids = row['id'].split('-')

    if not ids[0] in id:
      edit1 = row['edit1']
      corpus.append(re.sub('<[a-zA-z0-9]*/>', edit1, row['original1'])) # Substitute original tag with edits
      grade.append(float(row['meanGrade1']))
      id.add(ids[0])

    if not ids[1] in id:
      edit2 = row['edit2']
      corpus.append(re.sub('<[a-zA-z0-9]*/>', edit2, row['original2'])) # Substitute original tag with edits
      grade.append(float(row['meanGrade2']))
      id.add(ids[1])

print("Result: ")
print("Corpus size: " + str(len(corpus)))

Result: 
Corpus size: 13049


In [None]:
# Preprocess Corpus
tokenized_corpus = get_tokenized_corpus(corpus)
print(len(tokenized_corpus))
word2idx = get_word2idx(tokenized_corpus)
train_sent_tensor, train_label_tensor = get_model_inputs(tokenized_corpus, word2idx, grade)

print(f'Vocabulary size: {len(word2idx)}')
print('Training set tensor:')
print(train_sent_tensor.size())
print(train_sent_tensor[0])
print('Training Label tensor:')
print(train_label_tensor.size())

13049
Vocabulary size: 13453
Training set tensor:
torch.Size([13049, 28])
tensor([ 1,  2,  3,  1,  4,  5,  6,  7,  8,  1,  9, 10, 11,  1,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
Training Label tensor:
torch.Size([13049])


In [None]:
# Hyperparameters
INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 64
HIDDEN_DIM = 128
OUTPUT_DIM = 1
BIDIRECTIONAL = False
DROPOUT = 0.1
PAD_IDX = 0
EPOCHS = 50

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX,
            'lstm')

In [None]:
# Print the model
print(model)

# we use the Adam optimizer
optimizer = optim.Adam(model.parameters())

# Use MSE for loss
loss_fn = nn.MSELoss()

# Input and label tensors
feature = train_sent_tensor
target = train_label_tensor

################
# Start training
################
print(f'Will train for {EPOCHS} epochs')
loss_hist = []

with tqdm(range(1, EPOCHS + 1)) as pbar:
  for epoch in pbar:
    # to ensure the dropout (explained later) is "turned on" while training
    # good practice to include even if do not use here
    model.train()
    
    # we zero the gradients as they are not removed automatically
    optimizer.zero_grad()
    
    # squeeze is needed as the predictions will have the shape (batch size, 1)
    # and we need to remove the dimension of size 1
    predictions = model(feature).squeeze()

    # Compute the loss
    loss = loss_fn(predictions, target)
    # Q. Compute here the MSE loss
    train_loss = loss.item()

    # calculate the gradient of each parameter
    loss.backward()

    # update the parameters using the gradients and optimizer algorithm 
    optimizer.step()
    
    pbar.set_description(f'Epoch: {epoch:02}')
    print(f'Epoch: {epoch:02}, Train Loss: {train_loss:.3f}')
    loss_hist.append(train_loss)


RNN(
  (embedding): Embedding(13453, 64, padding_idx=0)
  (rnn): LSTM(64, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
Will train for 50 epochs


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Epoch: 01, Train Loss: 1.230
Epoch: 02, Train Loss: 1.188
Epoch: 03, Train Loss: 1.147
Epoch: 04, Train Loss: 1.107
Epoch: 05, Train Loss: 1.066
Epoch: 06, Train Loss: 1.024
Epoch: 07, Train Loss: 0.980
Epoch: 08, Train Loss: 0.933
Epoch: 09, Train Loss: 0.882
Epoch: 10, Train Loss: 0.826
Epoch: 11, Train Loss: 0.761
Epoch: 12, Train Loss: 0.685
Epoch: 13, Train Loss: 0.593
Epoch: 14, Train Loss: 0.482
Epoch: 15, Train Loss: 0.388
Epoch: 16, Train Loss: 0.506
Epoch: 17, Train Loss: 0.488
Epoch: 18, Train Loss: 0.405
Epoch: 19, Train Loss: 0.368
Epoch: 20, Train Loss: 0.373
Epoch: 21, Train Loss: 0.387
Epoch: 22, Train Loss: 0.399
Epoch: 23, Train Loss: 0.402
Epoch: 24, Train Loss: 0.399
Epoch: 25, Train Loss: 0.392
Epoch: 26, Train Loss: 0.381
Epoch: 27, Train Loss: 0.370
Epoch: 28, Train Loss: 0.359
Epoch: 29, Train Loss: 0.355
Epoch: 30, Train Loss: 0.355
Epoch: 31, Train Loss: 0.360
Epoch: 32, Train Loss: 0.366
Epoch: 33, Train Loss: 0.368
Epoch: 34, Train Loss: 0.367
Epoch: 35, Tra

In [None]:
# Load test file
id = []
original1 = []
original2 = []

with open('semeval-2020-task-7-dataset/subtask-2/test.csv', 'r') as csvfile:
  reader = csv.DictReader(csvfile)
  for row in reader:
    id.append(row['id'])

    edit1 = row['edit1']
    original1.append(re.sub('<[a-zA-z0-9]*/>', edit1, row['original1']))
    edit2 = row['edit2']
    original2.append(re.sub('<[a-zA-z0-9]*/>', edit2, row['original2']))

print("Result: ")
print("Dev1 Corpus size: " + str(len(original1)))
print("Dev2 Corpus size: " + str(len(original2)))

Result: 
Dev1 Corpus size: 2960
Dev2 Corpus size: 2960


In [None]:
model.eval()

tokenized_original1 = get_tokenized_corpus(original1)
tokenized_original2 = get_tokenized_corpus(original2)
print(len(tokenized_original1))
print(len(tokenized_original2))

orgn1_sent_tensor = get_test_inputs(tokenized_original1, word2idx)
orgn2_sent_tensor = get_test_inputs(tokenized_original2, word2idx)

print(f'Vocabulary size: {len(word2idx)}')
print('Dev set tensor:')
print(orgn1_sent_tensor.size())
print(orgn2_sent_tensor.size())


with torch.no_grad():
  pred_org1 = model(orgn1_sent_tensor).squeeze(1)
  pred_org2 = model(orgn2_sent_tensor).squeeze(1)

print(pred_org1[0:5])
print(pred_org2[0:5])

2960
2960
Vocabulary size: 13453
Dev set tensor:
torch.Size([2960, 25])
torch.Size([2960, 25])
tensor([[[1.0458],
         [1.0458],
         [1.0484],
         ...,
         [0.9114],
         [0.9114],
         [0.8937]]])
tensor([[[1.0484],
         [1.0482],
         [1.0482],
         ...,
         [0.8937],
         [0.8894],
         [0.8894]]])


In [None]:
# Write the prediction to csv file
pred = []
for i in range(len(pred_org1[0])):
  pred.append(0 if pred_org1[0][i][0].item() == pred_org2[0][i][0].item() else (1 if pred_org1[0][i][0].item() > pred_org2[0][i][0].item() else 2))

!unzip semeval-2020-task-7-dataset/subtask-2/baseline.zip
  
with open('task-2-output.csv', 'w') as outf:
  writer = csv.writer(outf)
  writer.writerow(['id', 'pred'])
  for i in range(len(id)):
    writer.writerow([id[i], pred[i]])


Archive:  semeval-2020-task-7-dataset/subtask-2/baseline.zip
  inflating: task-2-output.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._task-2-output.csv  
