In [None]:
!pip install unlimited-classifier

### Importing libraries

In [None]:
from datasets import load_dataset
from unlimited_classifier import TextClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm

### Loading datasets

In [None]:
# "dair-ai/emotion" dataset
emotion_dataset = load_dataset("dair-ai/emotion")

# Evaluation subset: the first `n` rows of the test split.
n = 1000
test_dataset = emotion_dataset['test'].select(range(n))

# Class names taken from the "label" feature, plus an index -> name lookup.
classes = test_dataset.features["label"].names
idx2class = dict(enumerate(classes))

# Few-shot training pool: the first len(classes) * N rows after a seeded
# shuffle (a random sample of that size, not a per-class stratified draw).
N = 8
train_dataset = (
    emotion_dataset['train']
    .shuffle(seed=41)
    .select(range(len(classes) * N))
)


### Loading a model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint shared by the tokenizer and the seq2seq model.
# model_name = 'google/flan-t5-base'
model_name = 'knowledgator/flan-t5-large-for-classification'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


### Finetuning

You can run the model in a zero-shot setting, fine-tune it on a few examples, or provide those examples in the prompt.

**Skip this if you don't want to fine-tune the model**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq

# Instruction prefix that preprocess_function prepends to every training text.
# It is defined in this cell so that fine-tuning runs on a fresh kernel; keep
# it identical to the prompt handed to the classifier so that training and
# inference inputs have the same form.
prompt = """Classify the following text and return just the single emotion name that represents it.

Text:"""


def preprocess_function(examples):
    """Tokenize a batch of rows into seq2seq model inputs.

    Args:
        examples: Batch mapping with a "text" list and a "label" list whose
            entries are keys of ``idx2class``.

    Returns:
        The tokenizer output for the prompt-prefixed texts, with an added
        "labels" entry holding the token ids of each row's class name.
    """
    inputs = [prompt + str(doc) for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=64, truncation=True)

    target_names = [idx2class[label_idx] for label_idx in examples["label"]]
    # NOTE(review): as_target_tokenizer is deprecated in recent transformers
    # releases in favour of tokenizer(text_target=...); confirm the installed
    # version before switching.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_names, max_length=8, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Hold out 10% of the few-shot rows for evaluation; the split is seeded so
# that reruns train and evaluate on the same rows.
dataset = train_dataset.train_test_split(test_size=0.1, seed=41)

# The raw columns are dropped through `remove_columns` instead of deleting
# keys from the batch inside the mapped function.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)


data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="models/classifier_t5",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    save_steps = 300,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    fp16=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


### Initializing classifier

In [None]:
# Instruction prefix placed in front of every text that is classified.
prompt = """Classify the following text and return just the single emotion name that represents it.

Text:"""

# Hand over the in-memory `model` and `tokenizer` objects rather than the
# checkpoint name: built from the name alone, the classifier would not see
# any fine-tuning applied to `model` earlier in the notebook.
classifier = TextClassifier(
    labels=classes,
    model=model,
    tokenizer=tokenizer,
    device='cuda:0',
    num_beams=1,
    prompt=prompt
)

### Testing

In [None]:
batch_size = 5

# One entry per test row, in dataset order.
results = []
for start in tqdm(range(0, len(test_dataset), batch_size)):
    examples = test_dataset[start:start+batch_size]
    # Classify the texts of the current slice; without this assignment the
    # loop would depend on a `texts` variable left over from another cell.
    texts = examples['text']
    output = classifier.invoke_batch(texts)
    # Keep only the first candidate returned for each text.
    predicts = [res[0] for res in output]
    results += predicts


In [None]:
from sklearn.metrics import classification_report

# Invert the index -> name lookup so predicted names map back to label ids.
class2idx = dict(zip(idx2class.values(), idx2class.keys()))

# Element 0 of each stored result is used as the predicted class name.
predicts = [class2idx[item[0]] for item in results]
labels = test_dataset['label']

print(
    classification_report(
        labels,
        predicts,
        target_names=classes,
        digits=4,
    )
)