https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners

In [1]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

# Checkpoint identifier; the same name is reused later when the model is loaded.
model_nm = "microsoft/deberta-v3-small"
# No `use_fast` argument is passed here, so AutoTokenizer applies its own default.
# Add `use_fast=False` to this call to avoid using the fast tokenizer.
tokz = AutoTokenizer.from_pretrained(model_nm)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training CSV; the path is relative, so it depends on the notebook's working directory.
df = pd.read_csv("../data/nlp_trainer_dataset.csv")
# Wrap the frame as a `Dataset`; later cells read its "input" column and rename its "score" column.
ds = Dataset.from_pandas(df)

In [3]:
def tok_func(x):
    """Tokenize the "input" field of `x` with the notebook-level tokenizer `tokz`."""
    texts = x["input"]
    return tokz(texts)


# Apply tok_func across the whole dataset, in batches (batched=True).
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/884 [00:00<?, ? examples/s]

Map: 100%|██████████| 884/884 [00:00<00:00, 27384.80 examples/s]


In [4]:
# Rename the target column to "labels" — assumed to be the column name the
# Trainer reads targets from (TODO confirm for the installed transformers version).
# The rename is guarded so that re-running this cell is a no-op instead of
# failing because "score" no longer exists after the first run.
if 'score' in tok_ds.column_names:
    tok_ds = tok_ds.rename_columns({'score': 'labels'})
assert 'labels' in tok_ds.column_names, "expected a 'score' or 'labels' column in tok_ds"

In [5]:
# Load the evaluation rows (relative path, resolved against the working directory).
eval_df = pd.read_csv("../data/nlp_test_kaggle.csv")
# Build the "input" text column that tok_func reads, from the context/target/anchor columns.
eval_df['input'] = (
    'TEXT1: ' + eval_df['context']
    + '; TEXT2: ' + eval_df['target']
    + '; ANC1: ' + eval_df['anchor']
)
# Convert to a Dataset and tokenize it with the same tok_func used for the training data.
eval_ds = (
    Dataset.from_pandas(eval_df)
    .map(tok_func, batched=True)
)

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map: 100%|██████████| 36/36 [00:00<00:00, 9597.95 examples/s]


In [6]:
# Split the tokenized data into 'train' and 'test' parts (test_size=0.25);
# the fixed seed keeps the split the same on every run.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)

In [13]:
# Hyperparameters read by the TrainingArguments call further down in this cell:
# per-device train batch size, number of epochs, learning rate.
bs, epochs, lr = 128, 4, 8e-5

def corr(x, y):
    """Return the Pearson correlation coefficient between `x` and `y`.

    Both inputs are flattened to 1-D first, so a column vector of shape (n, 1)
    can be compared directly with a flat vector of shape (n,). np.corrcoef
    treats each row of a 2-D input as a separate variable, so passing an
    (n, 1) array unflattened raises or yields NaN. For inputs that are
    already 1-D the result is unchanged.
    """
    return np.corrcoef(np.ravel(x), np.ravel(y))[0][1]


def corr_d(eval_pred):
    """Metric callback: unpack `eval_pred` into two arrays and report their Pearson r.

    Returns a dict with the single key 'pearson'.

    NOTE(review): with a num_labels=1 model the predictions presumably arrive
    as an (n, 1) array while the labels are (n,) — confirm; `corr` flattens
    both so either layout works.
    """
    return {'pearson': corr(*eval_pred)}

# NOTE(review): the Trainer cell in this notebook recorded
# "fp16 mixed precision requires a GPU (not 'mps')" even though fp16=False is
# set here. Presumably an earlier fp16=True attempt in the same kernel left
# ACCELERATE_MIXED_PRECISION=fp16 in the process environment, and that value is
# inherited when neither fp16 nor bf16 is requested — confirm against the
# installed transformers/accelerate versions. Clearing it makes the explicit
# fp16=False below authoritative; if the error persists, restart the kernel
# and run all cells from the top.
os.environ.pop("ACCELERATE_MIXED_PRECISION", None)

args = TrainingArguments(
    output_dir='outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=False,  # the recorded error shows fp16 is rejected on the 'mps' device
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,  # evaluation uses twice the training batch size
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

In [14]:
# Seed torch's RNG before loading. The load message for this checkpoint lists the
# classifier and pooler weights as newly initialized, so without a seed those
# weights (and therefore the training results) differ from run to run.
torch.manual_seed(42)
# num_labels=1 gives the head a single output (presumably used as a regression score — confirm).
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Wire the model, training configuration, tokenizer, metric and data splits into a Trainer.
# NOTE(review): the recorded output of this cell is
# "ValueError: fp16 mixed precision requires a GPU (not 'mps')" although `args`
# is built with fp16=False — presumably stale state from an earlier fp16=True
# run in the same kernel. Restart the kernel and run all cells to confirm.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokz,
    compute_metrics=corr_d,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
)

ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
# Run training; the trailing `;` keeps the call's return value from being echoed as cell output.
trainer.train();

In [None]:
# Predict on the tokenized evaluation set and keep only the `.predictions` array, cast to float.
preds = trainer.predict(eval_ds).predictions.astype(float)

In [16]:
# Diagnostic cell: print whether torch reports CUDA as available (recorded output: False).
import torch

print(torch.cuda.is_available())

False
