In [1]:
#pip install transformers datasets scikit-learn torch

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("datasets/cleanedDataSecondModel_merged.csv")
df = df[['quote', 'general_label_encoded']]
df.columns = ['text', 'label']

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)


In [2]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
import torch

class QuoteDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        } | {'labels': torch.tensor(self.labels[idx])}

train_dataset = QuoteDataset(train_encodings, train_labels)
test_dataset = QuoteDataset(test_encodings, test_labels)


In [5]:
# pip install transformers[torch]


In [None]:
# pip install accelerate>=0.26.0

In [5]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

num_labels = len(set(df['label']))
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels) #roberta

# Training arguments
training_args = TrainingArguments(
    output_dir="./roberta_results", #ganti roberta
    num_train_epochs=4, #4 trllu berat
    per_device_train_batch_size=8, #8 trllu berat
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir="./roberta_logs",
    logging_steps=10
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.5952,1.530409
2,1.1137,1.184999
3,0.8445,1.079269
4,0.4546,1.185668


TrainOutput(global_step=400, training_loss=1.0350451064109802, metrics={'train_runtime': 1018.6843, 'train_samples_per_second': 3.141, 'train_steps_per_second': 0.393, 'total_flos': 169383824985600.0, 'train_loss': 1.0350451064109802, 'epoch': 4.0})

In [6]:
import numpy as np
from sklearn.metrics import classification_report

preds = trainer.predict(test_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)
print(classification_report(test_labels, pred_labels))


              precision    recall  f1-score   support

           0       0.67      0.67      0.67        15
           1       0.80      0.50      0.62         8
           2       0.40      0.40      0.40        42
           3       0.56      0.69      0.62        29
           4       0.78      0.73      0.76        49
           5       0.71      0.70      0.71        57

    accuracy                           0.64       200
   macro avg       0.65      0.62      0.63       200
weighted avg       0.64      0.64      0.64       200



In [8]:
#predict
test_quote = "Success is not final, failure is not fatal"

inputs = tokenizer(test_quote, return_tensors="pt", truncation=True, padding=True, max_length=128)

device = next(model.parameters()).device 
inputs = {key: val.to(device) for key, val in inputs.items()}

model.eval()

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()

print(f"\nPredicted label for: \"{test_quote}\" → {predicted_label}")



Predicted label for: "Success is not final, failure is not fatal" → 5


5 -> Self & Inner Growth