In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev sentence pairs, and unlabeled test sentence pairs, into lists.

In [2]:
import csv

In [3]:
train, dev, test = [], [], []

In [4]:
with open('./drive/MyDrive/HW4_upload/data/pnli_train.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        train.append(x)
print (len(train))
print (train[:3])

5983
[['Sometimes do exercise.', 'A person typically desire healthy life.', '1'], ['Who eats junk foods.', 'A person typically desire healthy life.', '0'], ['A person is sick.', 'A person typically desire healthy life.', '1']]


In [5]:
with open('./drive/MyDrive/HW4_upload/data/pnli_dev.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        dev.append(x)
print (len(dev))
print (dev[:3])

1055
[['A person is looking for accuracy.', 'A person typically desires accurate results.', '1'], ['A person does not care for accuracy.', 'A person typically desires accurate results.', '0'], ['The person double checks their data.', 'A person typically desires accurate results.', '1']]


In [6]:
with open('./drive/MyDrive/HW4_upload/data/pnli_test_unlabeled.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[0] and x[1] will be the sentence pairs.
        test.append(x)
print (len(test))
print (test[:3])

4850
[['The people want to have a romantic and pleasant feel.', 'People typically does desire to smell violets.'], ['The contract is to buy products from you.', 'Getting contract typically cause to make money or spend money.'], ['Train station is closed.', 'Line can typically be used to move train along tracks.']]


### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

In [7]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []

In [8]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

## Import the required modules

In [9]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import numpy as np
import pandas as pd

In [10]:
torch.cuda.empty_cache()

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
## Convert to Dataframe
train_df = pd.DataFrame(train, columns=['sent1','sent2','label'])
val_df = pd.DataFrame(dev, columns=['sent1','sent2','label'])
test_df = pd.DataFrame(test, columns=['sent1','sent2'])

In [13]:
# Get the Roberta Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

def tokenize_and_dataloader(df, batch_size, test=True, shuffle=False):
  sent1 = df['sent1'].to_list()
  sent2 = df['sent2'].to_list()
  if not test:
    # If test is false, label should be added
    label = df['label'].apply(lambda x: int(x)).to_list()
    y = torch.tensor(label)

  result = []
  segments = []
  attention_masks = []

  for s1, s2 in zip(sent1, sent2):
    # Tokenize first sentence
    s1_tokenize = tokenizer.encode(s1)

    # Tokenize second sentence
    s2_tokenize = tokenizer.encode(s2)

    # Add class tokens and seperator token
    tokenized_sentence = [tokenizer.cls_token_id] + s1_tokenize + [tokenizer.sep_token_id] + s2_tokenize + [tokenizer.sep_token_id]

    result.append(torch.tensor(tokenized_sentence))
    
  # Pad sequences for same length (of length 52)
  result = pad_sequence(result, batch_first=True)
  
  # If the dataset doesn't have or want labels, it excludes it
  if not test:
    data = TensorDataset(result, y)
  else:
    data = TensorDataset(result)

  # create a dataloader of specific batch_size
  data_loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle)
  

  return data_loader

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [14]:
# Get the model and set the number of labels to 2
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model = model.to(device)

# Get the AdamW optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Tokenize and get the train and validation dataloader
train_loader = tokenize_and_dataloader(train_df, 32, False, True)
dev_loader = tokenize_and_dataloader(val_df, 32)

# Store the validation labels seperatedly to test during evaluation epoch
label = val_df['label'].apply(lambda x: int(x)).to_list()
y_val = torch.tensor(label)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

## Training

In [15]:
import time
import sys
from tqdm import tqdm, trange
t = trange(10, leave=True)

# Run it for 10 epochs
for e in range(10):
  model.train()
  for idx, (sentence, y) in enumerate(train_loader):
    optimizer.zero_grad()
    l, pred = model(sentence.to(device), labels=y.to(device)).values()
    y_pred = pred.to(device)
    y_pred = torch.tensor(y_pred).argmax(dim=1)

    # Calculate the accuracy
    acc = torch.sum(y.to(device)==y_pred)/float(y_pred.size(0))
    t.set_description(f"Epoch:({e+1}/10) | Accuracy: {acc.item()} | Loss: {l.item()}")
    l.backward()
    optimizer.step()
  
  # Evaluation using Validation dataset
  model.eval()
  test_samples = 0
  acc = 0
  results = []
  with torch.no_grad():
    for idx, sentence in enumerate(dev_loader):
      optimizer.zero_grad()
      pred = model(sentence[0].to(device))['logits']
      y_pred = pred.to(device)
      y_pred = torch.tensor(y_pred).argmax(dim=1)
      results.extend(y_pred.tolist())
      test_samples += len(y_pred)
  
    # Overall accuracy on the validation set.
    acc = float(torch.sum(torch.tensor(results)==y_val))/float(test_samples)
    print(f"\n Evaluation: Accuracy: {acc} | Loss: {l.item()}")

  if sys.path[0] == '':
Epoch:(2/10) | Accuracy: 0.8125 | Loss: 0.39797770977020264:   0%|          | 0/10 [00:29<?, ?it/s]


 Evaluation: Accuracy: 0.776303317535545 | Loss: 0.6723870038986206


Epoch:(3/10) | Accuracy: 0.875 | Loss: 0.3619672656059265:   0%|          | 0/10 [00:58<?, ?it/s]


 Evaluation: Accuracy: 0.8454976303317535 | Loss: 0.4033188223838806


Epoch:(4/10) | Accuracy: 0.90625 | Loss: 0.25574225187301636:   0%|          | 0/10 [01:27<?, ?it/s]


 Evaluation: Accuracy: 0.8473933649289099 | Loss: 0.19692257046699524


Epoch:(5/10) | Accuracy: 0.90625 | Loss: 0.18021129071712494:   0%|          | 0/10 [01:55<?, ?it/s]


 Evaluation: Accuracy: 0.8606635071090047 | Loss: 0.4784910976886749


Epoch:(6/10) | Accuracy: 1.0 | Loss: 0.062124285846948624:   0%|          | 0/10 [02:24<?, ?it/s] 


 Evaluation: Accuracy: 0.8511848341232228 | Loss: 0.1262306571006775


Epoch:(7/10) | Accuracy: 0.96875 | Loss: 0.0623679980635643:   0%|          | 0/10 [02:53<?, ?it/s] 


 Evaluation: Accuracy: 0.8654028436018958 | Loss: 0.17752647399902344


Epoch:(8/10) | Accuracy: 1.0 | Loss: 0.04164998233318329:   0%|          | 0/10 [03:22<?, ?it/s] 


 Evaluation: Accuracy: 0.8654028436018958 | Loss: 0.04985838383436203


Epoch:(9/10) | Accuracy: 0.96875 | Loss: 0.039852939546108246:   0%|          | 0/10 [03:51<?, ?it/s]


 Evaluation: Accuracy: 0.8568720379146919 | Loss: 0.15744751691818237


Epoch:(10/10) | Accuracy: 1.0 | Loss: 0.018276507034897804:   0%|          | 0/10 [04:20<?, ?it/s]  


 Evaluation: Accuracy: 0.8663507109004739 | Loss: 0.010458248667418957


Epoch:(10/10) | Accuracy: 0.9677419066429138 | Loss: 0.14713221788406372:   0%|          | 0/10 [04:47<?, ?it/s]


 Evaluation: Accuracy: 0.8710900473933649 | Loss: 0.14713221788406372


## Prediction on the Test Set

In [16]:
# Get the tokenized test dataloader
test_loader = tokenize_and_dataloader(test_df, 32)

In [17]:
results = []
with torch.no_grad():
    for batch_idx, sentence in enumerate(test_loader):
      optimizer.zero_grad()
      pred = model(sentence[0].to(device))['logits']
      y_pred = pred.to(device)
      y_pred = torch.tensor(y_pred).argmax(dim=1)
      results.extend(y_pred.tolist())
      test_samples += len(y_pred)

  import sys


### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [18]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 4850)

In [19]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [20]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions_latest.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')