This notebook follows the tutorial [Building a Text Classification Model using DistilBERT](https://medium.com/@prakashram1327/building-a-text-classification-model-using-distilbert-703c1409696c).

In [1]:
# Import necessary libraries
import re

import pandas as pd
import torch
from sklearn import preprocessing

# Identifier used to pick the data sub-directory (presumably the LLM that produced
# the scored prompts -- confirm against the data layout).
model = "gpt-3.5-turbo-1106"

# Build both CSV paths from that identifier, then read the two splits.
train_path, test_path = f"../data/{model}/train.csv", f"../data/{model}/test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [2]:
class TextCleaner():
    """Normalise raw text before it is tokenised.

    The cleaner is stateless; a single instance can be shared across
    DataFrames (e.g. via ``Series.apply(cleaner.clean_text)``).
    """

    def __init__(self):
        pass

    def clean_text(self, text):
        """Return a lower-cased, markup-free, whitespace-normalised copy of ``text``.

        Steps, in order: lower-case, drop ``<...>`` tags, drop ``http...`` URLs,
        drop every character that is not a letter, digit or whitespace, then
        collapse whitespace runs to a single space and trim the ends.

        Args:
            text: The value to clean. Normally a ``str``; ``None`` and float NaN
                (how pandas represents a missing cell) are treated as empty
                text, and any other non-string value is converted with ``str()``.

        Returns:
            The cleaned string (``""`` for missing input).
        """
        if not isinstance(text, str):
            # Missing cells arrive as None or NaN; NaN is the only float that
            # is not equal to itself. Previously these raised AttributeError
            # on ``.lower()``.
            if text is None or (isinstance(text, float) and text != text):
                return ""
            text = str(text)

        text = text.lower()
        text = re.sub(r'<.*?>', '', text)           # HTML-style tags
        text = re.sub(r'http\S+', '', text)         # URLs up to the next whitespace
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # punctuation / symbols
        text = re.sub(r"\s+", " ", text).strip()    # collapse and trim whitespace
        return text

# One shared cleaner; add a 'cleaned_text' column derived from 'prompt' to each split.
cleaner = TextCleaner()
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['prompt'].apply(cleaner.clean_text)

In [3]:
score_encoder = preprocessing.LabelEncoder()
# Fit the score -> class-id mapping on the training split only.
train_df['label'] = score_encoder.fit_transform(train_df['score'].tolist()) # Turns 0, 0.25, 0.5, 0.75, 1 into 0, 1, 2, 3, 4
# Re-use the fitted mapping for the test split. Re-fitting here (fit_transform) would
# silently renumber the classes if the test split were missing a score value, making
# test labels disagree with the ids the model is trained on. transform() instead
# raises if the test split contains a score never seen in training.
test_df['label'] = score_encoder.transform(test_df['score'].tolist())

In [72]:
from datasets import Dataset
from transformers import AutoTokenizer

# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Wrap each pandas DataFrame in a `datasets.Dataset` (tokenisation happens afterwards).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

def tokenize_data(examples):
    """Tokenise the ``cleaned_text`` field of a batch with the module-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["cleaned_text"]
    return tokenizer(texts, truncation=True)

# Tokenise both splits in batches (train first, then test).
tokenized_train, tokenized_test = (
    split.map(tokenize_data, batched=True)
    for split in (train_dataset, test_dataset)
)

Map: 100%|██████████| 24964/24964 [00:02<00:00, 12216.95 examples/s]
Map: 100%|██████████| 10699/10699 [00:00<00:00, 13750.18 examples/s]


In [76]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Load pre-trained DistilBERT with a 5-way sequence-classification head
# (the label column encodes five score classes, 0-4).
# NOTE: this rebinds `model`, which the first cell uses as the data-directory name.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

# Prepare data collator for padding sequences (tokenisation above applies no padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Was 8e-4, which is far above the usual fine-tuning range for BERT-style models
    # (roughly 2e-5 to 5e-5). The predictions recorded later in this notebook have
    # identical logits for every input and a single predicted class, which is
    # consistent with training having collapsed at that rate.
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",     # evaluate on eval_dataset after every epoch
    logging_strategy="epoch"   # log once per epoch
)

# Define Trainer object for training the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the trained model (plain string: the original f-string had no placeholders)
trainer.save_model('distilbert')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 45/2343 [17:45<15:06:27, 23.67s/it]
  2%|▏         | 46/2343 [16:25<13:39:46, 21.41s/it]
  0%|          | 0/2343 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [87]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Reload the fine-tuned classifier from its saved checkpoint directory.
trained_model = AutoModelForSequenceClassification.from_pretrained(
    "../distilbert_classifier/3epochs",
    num_labels=5,
)

In [88]:
# Trainer wrapped around the reloaded checkpoint. No train_dataset is supplied;
# the TrainingArguments, tokenizer and collator defined earlier are reused.
eval_trainer_kwargs = dict(
    model=trained_model,
    args=training_args,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Trainer(**eval_trainer_kwargs)

In [None]:
# Evaluate the model: the result of trainer.evaluate() is kept in `eval_results`
# (a later cell dumps it to JSON) and printed here.
eval_results = trainer.evaluate()
print(eval_results)

In [64]:
# Run the model over the tokenised test split; `preds` exposes the raw
# `.predictions` and the reference `.label_ids` used in the cells below.
preds = trainer.predict(test_dataset=tokenized_test)

100%|██████████| 1338/1338 [01:17<00:00, 17.37it/s]


In [70]:
# Count how often each distinct row of predictions occurs (one column per class).
logits_df = pd.DataFrame(preds.predictions)
logits_df.value_counts()

0         1          2          3          4       
0.250063  -1.906416  -1.569271  -0.606630  0.833336    3948
                                           0.833336    2700
                                           0.833336    1286
                                           0.833336     723
                                           0.833336     386
                                           0.833336     317
                                           0.833336     246
                                           0.833336     154
                                -0.606631  0.833336     117
                                -0.606630  0.833336     106
                                           0.833336      91
                                           0.833336      90
                                           0.833336      72
                                           0.833336      62
                                           0.833336      44
                                           0.833

In [65]:
# Predicted class = column index of the largest value in each prediction row.
pred_labels = preds.predictions.argmax(axis=1)
pred_label_counts = pd.Series(pred_labels).value_counts()
print(pred_label_counts)

# Fraction of rows whose predicted class equals the reference label.
is_correct = pred_labels == preds.label_ids
accuracy = is_correct.mean()
print(f'Accuracy: {accuracy}')

# To sanity-check the reference labels, compare
# pd.Series(preds.label_ids).value_counts() with test_df['label'].value_counts().

4    10699
Name: count, dtype: int64
Accuracy: 0.49116739882231986


In [39]:
import json

# Persist the evaluation metrics next to the notebook as JSON.
with open("eval_results.json", "w") as metrics_file:
    json.dump(eval_results, fp=metrics_file)

In [97]:
input_text = "Writes sometimes always the same"
# The classifier is fed `cleaned_text` for train/test, so apply the same
# normalisation here to avoid a preprocessing mismatch at inference time.
model_input_text = cleaner.clean_text(input_text)
inputs = tokenizer(model_input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
inputs = {k: v.to('cpu') for k, v in inputs.items()}  # Move inputs to CPU
trained_model.to('cpu')  # Ensure model is on CPU
trained_model.eval()  # Inference mode for layers such as dropout
# No gradients are needed for a forward-only pass; without this the output
# carries an autograd graph (visible as `grad_fn=...` in the printed logits).
with torch.no_grad():
    outputs = trained_model(**inputs)  # Use the model's __call__ method
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.2501, -1.9064, -1.5693, -0.6066,  0.8333]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [95]:
# Display the current tokenizer output dict (input_ids and attention_mask tensors).
inputs

{'input_ids': tensor([[ 101, 7009, 2025, 2467, 1996, 2168,  102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}