In [17]:
# Enable the ipywidgets notebook extension (widgetsnbextension) via the jupyter CLI.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [1]:
# List the installed packages and their pinned versions as reported by pip.
!pip freeze

absl-py==1.3.0
accelerate==0.15.0
aiohttp==3.8.3
aiosignal==1.3.1
astunparse==1.6.3
async-timeout==4.0.2
asynctest==0.13.0
attrs==22.1.0
backcall==0.2.0
cachetools==5.2.0
certifi==2022.9.24
charset-normalizer==2.1.1
colorama==0.4.6
cycler==0.11.0
datasets==2.7.1
debugpy==1.6.3
decorator==5.1.1
dill==0.3.6
entrypoints==0.4
evaluate==0.3.0
filelock==3.8.0
flatbuffers==22.11.23
fonttools==4.38.0
frozenlist==1.3.3
fsspec==2022.11.0
gast==0.4.0
google-auth==2.15.0
google-auth-oauthlib==0.4.6
google-pasta==0.2.0
grpcio==1.51.1
h5py==3.7.0
htmlmin==0.1.12
huggingface-hub==0.11.1
idna==3.4
ImageHash==4.3.1
importlib-metadata==5.1.0
ipykernel==6.16.2
ipython==7.34.0
ipywidgets==8.0.2
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jupyter_client==7.4.7
jupyter_core==4.11.2
jupyterlab-widgets==3.0.3
keras==2.10.0
Keras-Preprocessing==1.1.2
kiwisolver==1.4.4
libclang==14.0.6
Markdown==3.4.1
MarkupSafe==2.1.1
matplotlib==3.5.3
matplotlib-inline==0.1.6
multidict==6.0.3
multimethod==1.9
multiprocess==0.70.



In [2]:
import pandas as pd
import os

# Locate the training arguments and their labels inside the data folder.
data_folder = './data/'
arguments_file = 'arguments-training.tsv'
labels_file = 'labels-training.tsv'
arguments_path = os.path.join(data_folder, arguments_file)
labels_path = os.path.join(data_folder, labels_file)

# Both files are UTF-8, tab-separated, with the column names on the first row.
tsv_options = dict(encoding='utf-8', sep='\t', header=0)
arguments_train_df = pd.read_csv(arguments_path, **tsv_options)
labels_train_df = pd.read_csv(labels_path, **tsv_options)

# Show a text preview of each frame, arguments first and labels second.
for loaded_frame in (arguments_train_df, labels_train_df):
    print(loaded_frame)

     Argument ID                      Conclusion       Stance  \
0         A01001  Entrapment should be legalized  in favor of   
1         A01002     We should ban human cloning  in favor of   
2         A01003      We should abandon marriage      against   
3         A01004       We should ban naturopathy      against   
4         A01005         We should ban fast food  in favor of   
...          ...                             ...          ...   
5215      D27096    Nepotism exists in Bollywood      against   
5216      D27097    Nepotism exists in Bollywood  in favor of   
5217      D27098         India is safe for women  in favor of   
5218      D27099         India is safe for women  in favor of   
5219      D27100         India is safe for women      against   

                                                Premise  
0     if entrapment can serve to more easily capture...  
1     we should ban human cloning as it will only ca...  
2     marriage is the ultimate commitment to 

In [3]:
from datasets import Dataset
import datasets
from sklearn.model_selection import train_test_split

# Combine the columns in arguments into a single text field to give to BERT

# Inputs: 
# an argument df from the source data (ArgumentId, Conclusion, Stance, Premise). 
# Labels df from file. 
# Name of label that will be trained on.

# Returns: df with a single column of arguments that is Conclusion: Conclusion, Stance: stance, Premise: Premise 
# along with the labels
def setup_train_df(arguments_df, labels_df, target_label):
    """Build the single-text-column training frame for one target label.

    The ``Conclusion``, ``Stance`` and ``Premise`` columns of ``arguments_df``
    are concatenated into one string per row of the form
    ``'Conclusion: <c>, Stance: <s>, Premise: <p>'``.

    Args:
        arguments_df: DataFrame with ``Conclusion``, ``Stance`` and ``Premise``
            columns. It is NOT modified by this function.
        labels_df: DataFrame holding one column per label.
        target_label: Name of the column in ``labels_df`` to use as the label.

    Returns:
        A new DataFrame with columns ``text`` and ``label`` that shares the
        index of ``arguments_df``.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    # Build the text as a standalone Series so the caller's frame does not
    # gain a 'text' column as a side effect.
    text = (
        'Conclusion: ' + arguments_df['Conclusion']
        + ', Stance: ' + arguments_df['Stance']
        + ', Premise: ' + arguments_df['Premise']
    )
    resp = text.to_frame(name='text')

    id_column = 'Argument ID'
    can_join_on_id = (
        id_column in arguments_df.columns
        and id_column in labels_df.columns
        and labels_df[id_column].is_unique
    )
    if can_join_on_id:
        # Match labels to arguments by their shared key so that a labels file
        # in a different row order cannot silently mislabel the arguments.
        label_by_id = labels_df.set_index(id_column)[target_label]
        resp['label'] = arguments_df[id_column].map(label_by_id)
    else:
        # No usable key: fall back to aligning the two frames on their index.
        resp['label'] = labels_df[target_label]
    return resp

# This is where the specific value label is selected.
target_label = 'Achievement'

# Fixed seed so the train/test split, and every metric computed on it, is the
# same after a kernel restart.
split_seed = 42

# Keep the full labelled frame under its own name instead of reusing `train`
# for both the complete data and the training split.
labeled_df = setup_train_df(arguments_train_df, labels_train_df, target_label)
train, test = train_test_split(labeled_df, test_size=0.2, random_state=split_seed)
dataset = datasets.DatasetDict({
    "train": Dataset.from_pandas(train),
    "test": Dataset.from_pandas(test),
})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4176
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1044
    })
})


In [4]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the global tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Hugging Face / transformers setup: tokenizer, padding collator, tokenized splits.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split in batches and show the resulting features.
tokenized_arguments = dataset.map(preprocess_function, batched=True)
print(tokenized_arguments)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 4176
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1044
    })
})


In [5]:
from pynvml import *

# Running this model on GPU https://huggingface.co/docs/transformers/perf_train_gpu_one
# Takes some magic.
# Windows instructions:
# nvidia-smi should work from cmd
# Ended up doing this https://github.com/wookayin/gpustat/issues/90#issuecomment-753591406
# The dll name is nvml.dll

def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one NVIDIA GPU, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which is
            the device this helper always inspected before the parameter
            existed.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # `info.used` is divided by 1024**2 before printing.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Balance the nvmlInit() above so repeated calls from the notebook do
        # not leave NVML initialisations outstanding, even if the query fails.
        nvmlShutdown()


def print_summary(result):
    """Print runtime and throughput from ``result.metrics``, then GPU memory use.

    ``result`` must expose a ``metrics`` mapping containing the keys
    ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    report_lines = (
        ("Time: ", 'train_runtime'),
        ("Samples/second: ", 'train_samples_per_second'),
    )
    # Look up and print one metric at a time, in the listed order.
    for prefix, metric_key in report_lines:
        print(f"{prefix}{result.metrics[metric_key]:.2f}")
    print_gpu_utilization()

In [6]:
import numpy as np
import evaluate

# Accuracy scorer, loaded once and reused by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score predictions against references with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    for each example is the arg-max over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted_classes = np.asarray(logits).argmax(axis=-1)
    scores = metric.compute(predictions=predicted_classes, references=labels)
    return scores

In [7]:
# Do training here
# This is based on this guide: https://huggingface.co/docs/transformers/tasks/sequence_classification
# TODO: Split train/test data. Optimizer. Fine tuning. Get the training to run. Evaluation.
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# For to("cuda") to work here, the GPU/CUDA version of torch needs to be installed https://pytorch.org/get-started/locally/
# Load DistilBERT with a 2-class classification head and move it to the GPU.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)
model = model.to("cuda")
print_gpu_utilization()

# Hyperparameters for the fine-tuning run, gathered in one place.
training_config = {
    "output_dir": "./results",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 5,
    "weight_decay": 0.01,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_config)

# Wire the model, data splits, tokenizer, collator and metric into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_arguments["train"],
    eval_dataset=tokenized_arguments["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

GPU memory occupied: 1557 MB.


In [None]:
# Run the fine-tuning loop on the Trainer configured in the previous cell.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4176
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1305
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
