In [2]:
#pip install transformers datasets scikit-learn torch

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("datasets/cleanedDataSecondModel.csv")

# Rename for consistency (optional)
df = df[['quote', 'category_encoded']]
df.columns = ['text', 'label']

# Split into train and test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)


In [4]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Tokenize text
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


In [5]:
import torch

class QuoteDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        } | {'labels': torch.tensor(self.labels[idx])}

train_dataset = QuoteDataset(train_encodings, train_labels)
test_dataset = QuoteDataset(test_encodings, test_labels)


In [6]:
#pip install transformers[torch]


In [7]:
#pip install accelerate>=0.26.0

In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

num_labels = len(set(df['label']))
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="./roberta_results",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir="./roberta_logs",
    logging_steps=10
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train
trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Make predictions
preds = trainer.predict(test_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)

# Print evaluation
print(classification_report(test_labels, pred_labels))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         2
           2       0.25      1.00      0.40         1
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         1
           6       0.20      0.50      0.29         6
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.50      0.50      0.50         4
          12       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.08      1.00      0.14         1
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          20       0.50      0.50      0.50         4
          21       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def predict_quote_category(text, model, tokenizer):
    # Tokenize the input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Make prediction
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_label = torch.argmax(probs, dim=1).item()

    # Get category name from label ID
    return label_mapping[predicted_label], probs.detach().numpy()


In [None]:
quote = "Believe in yourself and all that you are."
predicted_category, confidence_scores = predict_quote_category(quote, model, tokenizer)

print("Predicted Category:", predicted_category)
print("Confidence Scores:", confidence_scores)


NameError: name 'label_mapping' is not defined