In [1]:
!pip install --quiet datasets evaluate transformers[sentencepiece]

[0m

## Importing Libraries

In [2]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AdamW, AutoModelForSequenceClassification, get_scheduler
import evaluate
from tqdm.auto import tqdm

## Data Preprocessing

In [3]:
raw_datasets = load_dataset("paws", "labeled_final")
# checkpoint = "domenicrosati/deberta-v3-large-finetuned-paws-paraphrase-detector" (already fine tuned on paws)
checkpoint = ''
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading builder script:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading and preparing dataset paws/labeled_final (download: 4.47 MiB, generated: 15.45 MiB, post-processed: Unknown size, total: 19.92 MiB) to /root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338...


Downloading data:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset paws downloaded and prepared to /root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/400 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [4]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "id"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [5]:
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/1.62G [00:00<?, ?B/s]



In [7]:
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.957, 'f1': 0.9520891364902506}

## Inference

In [9]:
from transformers import pipeline
# model = model.to("cpu")
classifier = pipeline("sentiment-analysis", model = model, tokenizer = tokenizer, device = 0)

In [14]:
#confirming the inference approach by comparing f1 and accuracy
actual = raw_datasets['validation']['label']
predictions = []
for example in raw_datasets['validation']:
    sentence_pair = '[CLS] ' + example['sentence1'] + ' [SEP] ' + example['sentence2'] + ' [SEP]'
    if classifier(sentence_pair)[0]['label'] == 'LABEL_1':
        predictions.append(1) 
    else:
        predictions.append(0)

In [19]:
from sklearn.metrics import f1_score, accuracy_score
print(f"F1 Score: {f1_score(actual, predictions)}")
print(f"Accuracy: {accuracy_score(actual, predictions)}")

F1 Score: 0.952195664257921
Accuracy: 0.957


## Uploading Model to Hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
path = "fine-tuned-paws"
save_dir = "paws"
model.push_to_hub(path)
tokenizer.push_to_hub(path)