In [1]:
# Enable the widgetsnbextension notebook extension via the Jupyter CLI
# (shell escape; runs outside the Python kernel).
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
!pip freeze

aiohttp @ file:///home/conda/feedstock_root/build_artifacts/aiohttp_1649013149994/work
aiosignal @ file:///home/conda/feedstock_root/build_artifacts/aiosignal_1636093929600/work
alembic @ file:///home/conda/feedstock_root/build_artifacts/alembic_1657813896088/work
altair @ file:///home/conda/feedstock_root/build_artifacts/altair_1640799865332/work
anyio @ file:///home/conda/feedstock_root/build_artifacts/anyio_1660053721269/work/dist
argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1640817743617/work
argon2-cffi-bindings @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi-bindings_1649500328244/work
astroid @ file:///home/conda/feedstock_root/build_artifacts/astroid_1655142257854/work
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1618968359944/work
async-generator==1.10
async-timeout @ file:///home/conda/feedstock_root/build_artifacts/async-timeout_1640026696943/work
attrs @ file:///home/conda/feedstock_root/build_artifac

In [3]:
import pandas as pd
import os

# Load Arguments Dataset
# The arguments and their labels are stored as two tab-separated files
# (with a header row) inside the same data folder.
data_folder = './data/'
arguments_file = 'arguments-training.tsv'
labels_file = 'labels-training.tsv'

arguments_path = os.path.join(data_folder, arguments_file)
labels_path = os.path.join(data_folder, labels_file)

arguments_train_df = pd.read_csv(arguments_path, sep='\t', header=0, encoding='utf-8')
labels_train_df = pd.read_csv(labels_path, sep='\t', header=0, encoding='utf-8')

# Quick visual sanity check of both frames.
print(arguments_train_df)
print(labels_train_df)

     Argument ID                      Conclusion       Stance  \
0         A01001  Entrapment should be legalized  in favor of   
1         A01002     We should ban human cloning  in favor of   
2         A01003      We should abandon marriage      against   
3         A01004       We should ban naturopathy      against   
4         A01005         We should ban fast food  in favor of   
...          ...                             ...          ...   
5215      D27096    Nepotism exists in Bollywood      against   
5216      D27097    Nepotism exists in Bollywood  in favor of   
5217      D27098         India is safe for women  in favor of   
5218      D27099         India is safe for women  in favor of   
5219      D27100         India is safe for women      against   

                                                Premise  
0     if entrapment can serve to more easily capture...  
1     we should ban human cloning as it will only ca...  
2     marriage is the ultimate commitment to 

In [5]:
from datasets import Dataset
import datasets
from sklearn.model_selection import train_test_split

# Combine the columns of the arguments dataframe into a single text field to give to BERT

# Inputs: 
# an argument df from the source data (ArgumentId, Conclusion, Stance, Premise). 
# Labels df from file. 
# Name of label that will be trained on.

# Returns: df with a single column of arguments that is Conclusion: Conclusion, Stance: stance, Premise: Premise 
# along with the labels
def setup_train_df(arguments_df, labels_df, target_label):
    """Build a two-column (text, label) frame for single-label training.

    The 'Conclusion', 'Stance' and 'Premise' columns of ``arguments_df`` are
    concatenated into one string per row of the form
    ``Conclusion: <c>, Stance: <s>, Premise: <p>``.

    Args:
        arguments_df: DataFrame with 'Conclusion', 'Stance' and 'Premise' columns.
            It is NOT modified (the previous implementation added a 'text'
            column to the caller's frame as a side effect).
        labels_df: DataFrame holding one column per label.
        target_label: Name of the column in ``labels_df`` to train on.

    Returns:
        A new DataFrame with columns 'text' and 'label', indexed like
        ``arguments_df``. Labels are matched to texts by index, as before.

    Raises:
        KeyError: if a required column or ``target_label`` is missing.
    """
    text = (
        'Conclusion: ' + arguments_df['Conclusion']
        + ', Stance: ' + arguments_df['Stance']
        + ', Premise: ' + arguments_df['Premise']
    )
    # Build the result from the derived Series so the input frame stays untouched.
    resp = text.to_frame(name='text')
    # Column assignment aligns the label Series on the index of `resp`.
    resp['label'] = labels_df[target_label]
    return resp

# This is where the specific value label is selected.
target_label = 'Achievement'

# Fixed seed so the train/test split (and everything derived from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

train = setup_train_df(arguments_train_df, labels_train_df, target_label)
# Hold out 20% of the rows; `train` is rebound to the remaining 80%.
train, test = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
dataset = datasets.DatasetDict({"train":Dataset.from_pandas(train),"test":Dataset.from_pandas(test)})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4176
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1044
    })
})


In [5]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Relies on the module-level ``tokenizer``; truncation is enabled.
    Returns whatever the tokenizer returns for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Do some huggingface/transformers setup
# The tokenizer must exist before `dataset.map` runs, because
# `preprocess_function` looks it up as a global.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize both splits in batches and show the resulting features.
tokenized_arguments = dataset.map(preprocess_function, batched=True)
print(tokenized_arguments)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 4176
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1044
    })
})


In [10]:
from typing import List
from transformers import AutoTokenizer


class BatchTokenizer:
  """Thin wrapper that tokenizes a batch of strings into padded PyTorch tensors.

  Args:
    model_name: Checkpoint passed to ``AutoTokenizer.from_pretrained``.
      Defaults to "distilbert-base-uncased" (the previously hard-coded value).
    max_length: Maximum sequence length; longer inputs are truncated.
      Defaults to 256 (the previously hard-coded value).
  """

  def __init__(self, model_name: str = "distilbert-base-uncased", max_length: int = 256) -> None:
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.max_length = max_length

  def get_sep_token(self):
    """Return the separator token of the wrapped tokenizer."""
    return self.tokenizer.sep_token

  def __call__(self, batch: List[str]):
    """Tokenize ``batch`` and return the tokenizer's encoding.

    Sequences are padded (``padding=True``), truncated to ``self.max_length``,
    returned as PyTorch tensors, and token type ids are omitted.
    """
    return self.tokenizer(
        batch,
        padding=True,
        return_token_type_ids=False,
        return_tensors='pt',
        max_length=self.max_length,
        truncation=True
    )

In [11]:
from torch.utils.data.dataset import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch 

# Tokenize the whole training split in one call and wrap it in a DataLoader.
t = BatchTokenizer()

train_split = dataset['train']
train_text = train_split['text']
train_labels = torch.tensor(train_split['label'])

train_encoded = t(train_text)
train_inp_ids = train_encoded['input_ids']
train_masks = train_encoded['attention_mask']

# Tensor order (input ids, attention mask, labels) is what training_loop indexes into.
train_set = TensorDataset(train_inp_ids, train_masks, train_labels)

train_dataloader = DataLoader(train_set, sampler=RandomSampler(train_set), batch_size=16)

#we can use the whole training dataset, validation dataset seems to also be available
#test dataset will be publicly available after the submission deadline 

In [12]:
import random
from typing import Dict
import numpy as np 

def predict(model: torch.nn.Module, map: Dict) -> List:
  """Return the arg-max class index for every example in a batch.

  Args:
    model: Module called as ``model(**map)``; the first element of its
      output is taken to be the logits (one row per example).
    map: Keyword arguments forwarded to the model.

  Returns:
    A list with one predicted class index per example, including for a
    batch that contains a single example.
  """
  # Inference only: no need to build the autograd graph.
  with torch.no_grad():
    out = model(**map)
  logits = out[0].detach().cpu()
  # argmax over dim=1 already yields a 1-D tensor. The former .squeeze()
  # collapsed a single-example batch to a 0-d array, which list() cannot iterate.
  return list(torch.argmax(logits, dim=1).reshape(-1).numpy())

def precision(predicted_labels, true_labels, which_label=1):
  """Precision for ``which_label``: correct predictions of the label divided by
  all predictions of the label. Returns 0. when the label is never predicted."""
  predicted_positive = np.array(predicted_labels) == which_label
  actual_positive = np.array(true_labels) == which_label
  n_predicted = np.sum(predicted_positive)
  if not n_predicted:
    return 0.
  true_positives = np.sum(np.logical_and(predicted_positive, actual_positive))
  return true_positives/n_predicted

def recall(predicted_labels, true_labels, which_label=1):
  """Recall for ``which_label``: correct predictions of the label divided by
  all true occurrences of the label. Returns 0. when the label never occurs."""
  predicted_positive = np.array(predicted_labels) == which_label
  actual_positive = np.array(true_labels) == which_label
  n_actual = np.sum(actual_positive)
  if not n_actual:
    return 0.
  true_positives = np.sum(np.logical_and(predicted_positive, actual_positive))
  return true_positives/n_actual

def f1_score(predicted_labels, true_labels, which_label):
  """Harmonic mean of precision and recall for ``which_label``.

  Returns 0. when either precision or recall is zero.
  """
  prec = precision(predicted_labels, true_labels, which_label=which_label)
  rec = recall(predicted_labels, true_labels, which_label=which_label)
  if not (prec and rec):
    return 0.
  return 2*prec*rec/(prec+rec)

def macro_f1(predicted_labels: List[int], true_labels: List[int], possible_labels: List[int]):
  """Unweighted mean of the per-label F1 scores over ``possible_labels``."""
  per_label_scores = []
  for label in possible_labels:
    per_label_scores.append(f1_score(predicted_labels, true_labels, label))
  return sum(per_label_scores) / len(per_label_scores)



# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def training_loop(num_epochs, train_features, dev_features, optimizer, model):
  """Train ``model`` for ``num_epochs`` epochs, evaluating on dev data after each.

  Args:
    num_epochs: Number of passes over ``train_features``.
    train_features: Iterable of batches; each batch is indexed as
      (input ids, attention mask, labels).
    dev_features: Iterable of batches with the same three-element layout.
    optimizer: Optimizer whose ``step()`` is called once per training batch.
    model: Module called with ``input_ids``, ``attention_mask`` and (during
      training) ``labels``; the first element of its training output is
      used as the loss.

  Prints the mean training loss and the dev macro-F1 over labels [0, 1] for
  every epoch. The model is left in eval mode when the function returns.
  """
  print("Training...")
  for i in range(num_epochs):
    losses = []
    model.train()
    for features in train_features:

      batch_inputs = {
          'input_ids': features[0].to(device),
          'attention_mask': features[1].to(device),
          'labels': features[2].to(device)
      }

      model.zero_grad()

      out = model(**batch_inputs)

      loss = out[0]

      losses.append(loss.item())

      loss.backward()

      optimizer.step()

    # Guard against an empty training iterable instead of dividing by zero.
    mean_loss = sum(losses)/len(losses) if losses else float("nan")
    print(f"epoch {i}, loss: {mean_loss}")
    print("Evaluating dev...")

    all_preds = []
    all_labels = []

    # Evaluate with dropout disabled and without tracking gradients.
    model.eval()
    with torch.no_grad():
      for features in dev_features:

        input_ids, attention_mask, labels = (tensor.to(device) for tensor in features)

        eval_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

        # extend (not append): the metrics expect flat per-example lists, and
        # per-batch sublists become ragged when the last batch is smaller.
        all_preds.extend(predict(model, eval_inputs))
        all_labels.extend(labels.cpu().tolist())

    dev_f1 = macro_f1(all_preds, all_labels, [0, 1])
    print(f"Dev F1 {dev_f1}")
    print("-------------------------------------------------------")

In [6]:
from pynvml import *

# Running this model on GPU https://huggingface.co/docs/transformers/perf_train_gpu_one
# Takes some magic.
# Windows instructions:
# nvidia-smi should work from cmd
# Ended up doing this https://github.com/wookayin/gpustat/issues/90#issuecomment-753591406
# The dll name is nvml.dll

def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, in MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            previously hard-coded device.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with a shutdown so repeated calls do not leak
        # NVML initialisations.
        nvmlShutdown()


def print_summary(result):
    """Print runtime and throughput from ``result.metrics``, then GPU memory use.

    Reads the 'train_runtime' and 'train_samples_per_second' entries of
    ``result.metrics``.
    """
    metrics = result.metrics
    print(f"Time: {metrics['train_runtime']:.2f}")
    print(f"Samples/second: {metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [None]:
from transformers import AutoModelForSequenceClassification
epochs = 6

LR = 0.00001

# Build a dev dataloader from the held-out split, mirroring how the training
# dataloader was built. Order does not matter for evaluation, so iterate sequentially.
dev_text = dataset['test']['text']
dev_labels = torch.tensor(dataset['test']['label'])
dev_encoded = t(dev_text)
dev_set = TensorDataset(dev_encoded['input_ids'], dev_encoded['attention_mask'], dev_labels)
validation_dataloader = DataLoader(
    dev_set,
    sampler=SequentialSampler(dev_set),
    batch_size=16
)

# NOTE(review): the model choice was still open in the original cell. This uses the
# checkpoint matching the tokenizer, with two classes to match the [0, 1]
# labels scored by training_loop. Confirm before relying on results.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
).to(device)

optimizer = torch.optim.Adam(model.parameters(), LR)

# training_loop requires all five arguments; the previous call passed only two.
training_loop(
    epochs,
    train_dataloader,
    validation_dataloader,
    optimizer,
    model,
)

In [None]:
#evaluate test 