In [1]:
import csv
import json

import numpy as np
import pandas as pd
from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification

import torch

from tqdm import tqdm, trange
# from dataset_loader import load

In [2]:
train_df = pd.read_csv('all_train.csv')

In [3]:
train_df.head()

Unnamed: 0,question_id,text,label
0,17634,The show is based on forensic anthropology and...,0
1,14061,One of the reasons it * continues * is that su...,0
2,7023,When a person receives a donor organ transplan...,1
3,7345,"You * could * call them islands , but the whol...",0
4,14200,Without going into lots of biology : There are...,0


### Preprocess

In [5]:
all_dataset = load_dataset('csv', data_files={'train': 'train_HC3.csv', 'dev': 'test_wiki.csv'})
all_dataset

Found cached dataset csv (/home/haoquanz/.cache/huggingface/datasets/csv/default-36806a2d38c3d79b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 68344
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 60000
    })
})

#### The following code modifies the train and test csv to the same format

In [9]:
test_df = pd.read_csv('test_wiki.csv')

In [10]:
test_df.head()

Unnamed: 0,text,label
0,Sekhukhune I (Matsebe; circa 1814 – 13 August ...,0
1,Mount Washington is a peak in the Olympic Mou...,0
2,Acer hillsi is an extinct maple species in the...,0
3,Derrick George Sherwin (16 April 1936 – 9 Dece...,1
4,The Windows shell is the graphical user interf...,1


In [11]:
train_df.head()

Unnamed: 0,question_id,text,label
0,17634,The show is based on forensic anthropology and...,0
1,14061,One of the reasons it * continues * is that su...,0
2,7023,When a person receives a donor organ transplan...,1
3,7345,"You * could * call them islands , but the whol...",0
4,14200,Without going into lots of biology : There are...,0


In [17]:
train_HC3 = train_df.drop('question_id', axis = 1)
train_HC3.to_csv('train_HC3.csv', index = False)

In [18]:
df = pd.read_csv('train_HC3.csv')
df.head()

Unnamed: 0,text,label
0,The show is based on forensic anthropology and...,0
1,One of the reasons it * continues * is that su...,0
2,When a person receives a donor organ transplan...,1
3,"You * could * call them islands , but the whol...",0
4,Without going into lots of biology : There are...,0


### HuggingFace

In [6]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_all = all_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at /home/haoquanz/.cache/huggingface/datasets/csv/default-36806a2d38c3d79b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-a7a386dff6695c11.arrow


Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [7]:
model_name = "microsoft/MiniLM-L12-H384-uncased"

output_dir = '.'

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    learning_rate=8e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    do_eval=True,
    seed=1234,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    logging_dir=output_dir + 'logs/',
    label_smoothing_factor=0.1,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_all["train"],
    eval_dataset=tokenized_all["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.204,0.64629


TrainOutput(global_step=8543, training_loss=0.2204715601945529, metrics={'train_runtime': 938.4103, 'train_samples_per_second': 72.83, 'train_steps_per_second': 9.104, 'total_flos': 3140747957532864.0, 'train_loss': 0.2204715601945529, 'epoch': 1.0})

In [8]:
trainer.evaluate()

dev_logits = trainer.predict(tokenized_all["dev"])
print(dev_logits.predictions.shape, dev_logits.label_ids.shape)
dev_preds = np.argmax(dev_logits.predictions, axis=-1)
precision_recall_fscore_support(dev_logits.label_ids, dev_preds, average='binary')

(60000, 2) (60000,)


(0.7503353184183701, 0.9294852623533713, 0.8303572753855517, None)

In [9]:
fpr, tpr, threshold = roc_curve(np.array(dev_logits.label_ids), dev_preds)
auc(fpr, tpr)

0.8091436075367352