---------------
**Author**: Gunnvant 

**Description**: Basic pipeline for text classification training

------------------

In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the CSV (path relative to the working directory) with the `datasets` CSV
# loader; later cells access the rows under the 'train' key.
raw_dataset = load_dataset(path="csv", data_files="../dataset.csv")

In [3]:
# Hold out 20% of the rows. train_test_split shuffles before splitting, so the
# seed is pinned to get the same partition after every kernel restart.
train_test_dataset = raw_dataset['train'].train_test_split(test_size=0.20, seed=42)

In [4]:
# Split the 20% hold-out a second time: 90% lands in test_valid['train'] and
# 10% in test_valid['test']. Seeded so the partition is reproducible.
test_valid = train_test_dataset['test'].train_test_split(test_size=0.1, seed=42)

In [5]:
from datasets import DatasetDict

# Collect the three splits under the names used by the rest of the notebook.
# Mind the crossover: the 90% part of the second split (test_valid['train'])
# is published as 'test', and the 10% part (test_valid['test']) as 'valid'.
train_test_valid = DatasetDict(
    dict(
        train=train_test_dataset['train'],
        test=test_valid['train'],
        valid=test_valid['test'],
    )
)

In [6]:
# Peek at the first three training rows.
train_test_valid['train'][:3]

{'text': ["Shaved my girlfriend's cat. think she'll take the hint?",
  "Rare video shows insane waterslide at new jersey's deadliest theme park",
  'I got a puppy for my daughter... good swap if you ask me.'],
 'humor': [True, False, True]}

In [7]:
def get_label(example):
    """Add an integer ``label`` field derived from the row's ``humor`` value.

    The row is updated in place (``label = int(humor)``) and the same object
    is returned, which is the shape ``map`` expects from a per-row function.
    """
    is_humorous = example['humor']
    example['label'] = int(is_humorous)
    return example

In [8]:
# Run get_label over every row of every split to add the integer 'label' column.
data_with_label = train_test_valid.map(function=get_label)

Map: 100%|██████████████████████████████████████████████████████| 160000/160000 [00:07<00:00, 22024.11 examples/s]
Map: 100%|████████████████████████████████████████████████████████| 36000/36000 [00:01<00:00, 19029.64 examples/s]
Map: 100%|██████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 23286.29 examples/s]


In [9]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

# Checkpoint identifier passed to both the tokenizer and the model loaders.
ckpt = "distilbert-base-uncased"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(ckpt)

# Class-name mappings handed to the model: index 0 is "NotHumour", index 1 is
# "Humour". The reverse mapping is derived so the two can never disagree.
id2label = {0: "NotHumour", 1: "Humour"}
label2id = {name: index for index, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    ckpt,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def preprocess_function(examples):
    """Tokenise the ``text`` field of ``examples`` with the notebook's tokenizer.

    Truncation is enabled; no padding is requested here. Returns whatever the
    tokenizer call returns.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [12]:
from transformers import DataCollatorWithPadding

# Padding is left to the collator at batch-assembly time; preprocess_function
# only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# batched=True passes preprocess_function a batch of rows per call (its
# `examples["text"]` is then a list of strings), so the tokenizer handles many
# texts at once instead of being invoked once per row.
data_with_label = data_with_label.map(preprocess_function, batched=True)

Map: 100%|███████████████████████████████████████████████████████| 160000/160000 [00:24<00:00, 6612.69 examples/s]
Map: 100%|█████████████████████████████████████████████████████████| 36000/36000 [00:05<00:00, 7147.28 examples/s]
Map: 100%|███████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 7412.43 examples/s]


In [13]:
## Set up the evaluation metric
import evaluate

# Accuracy metric object; kept under the name `acc`.
acc = evaluate.load("accuracy")

In [14]:
import numpy as np
def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Parameters
    ----------
    eval_pred : pair ``(predictions, labels)``
        ``predictions`` holds per-class scores with the classes along axis 1;
        ``labels`` holds the reference class ids.

    Returns
    -------
    The result of ``acc.compute(...)`` for the arg-max predictions.
    """
    predictions, labels = eval_pred
    # Collapse the per-class scores to the index of the highest-scoring class.
    predictions = np.argmax(predictions, axis=1)
    # The metric object is bound to `acc` in the metric setup cell; the previous
    # reference to `accuracy` was never defined and raised NameError on the
    # first evaluation.
    return acc.compute(predictions=predictions, references=labels)

In [15]:
## Training arguments
from transformers import Trainer, TrainingArguments

In [20]:
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir="humour_detector",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [21]:
trainer = Trainer(
    # Model and run configuration.
    model=model,
    args=training_args,
    # Data: train on the 'train' split, evaluate on the 'valid' split.
    train_dataset=data_with_label["train"],
    eval_dataset=data_with_label["valid"],
    # Batch assembly and scoring.
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer built in the previous cell; the call's
# return value is left as the cell's displayed output.
trainer.train()