In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

df = pd.read_csv("TOSDR_labeled.csv")

label_map = {"bad": 0, "neutral": 1}
df['Point'] = df['Point'].map(label_map)
df = df.dropna()

train_texts, val_texts, train_labels, val_labels = train_test_split(df['QouteText'].values,
                                                                    df['Point'].values,
                                                                    test_size=0.23,
                                                                    random_state=42)

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_len = 128
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=max_len)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=max_len)

train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels))


batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


num_epochs = 3
for epoch in range(num_epochs):
    train_loss = 0
    for batch in tqdm(train_loader):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()


    model.eval()
    val_preds = []
    val_labels = []
    for batch in val_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1]}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        val_preds.extend(preds)
        val_labels.extend(batch[2].cpu().numpy())

    val_accuracy = np.mean(np.array(val_preds) == np.array(val_labels))
    print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss / len(train_loader)}, Validation Accuracy: {val_accuracy}')

val_preds = np.array(val_preds)
val_labels = np.array(val_labels)
print(classification_report(val_labels, val_preds, target_names=label_map.keys()))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 70/70 [36:07<00:00, 30.96s/it]  


Epoch 1/3, Training Loss: 0.36362067120415825, Validation Accuracy: 0.8647014361300076


100%|██████████| 70/70 [19:50<00:00, 17.01s/it]


Epoch 2/3, Training Loss: 0.23114497544510024, Validation Accuracy: 0.8964474678760394


100%|██████████| 70/70 [20:31<00:00, 17.59s/it]


Epoch 3/3, Training Loss: 0.1440897567995957, Validation Accuracy: 0.9115646258503401
              precision    recall  f1-score   support

         bad       0.69      0.74      0.72       199
     neutral       0.95      0.94      0.95      1124

    accuracy                           0.91      1323
   macro avg       0.82      0.84      0.83      1323
weighted avg       0.91      0.91      0.91      1323



In [25]:
def predict_sentence(sentence, model, tokenizer, device):

    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=max_len)
    

    inputs = {key: value.to(device) for key, value in inputs.items()}
    

    with torch.no_grad():
        outputs = model(**inputs)
    

    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()
    
    return predicted_label


sentence_to_predict = "warranty"


predicted_label = predict_sentence(sentence_to_predict, model, tokenizer, device)


label_map_reverse = {v: k for k, v in label_map.items()}
predicted_class = label_map_reverse[predicted_label]

print("Predicted class for the sentence:", predicted_class)


Predicted class for the sentence: neutral


In [None]:
model_save_path = "roberta_model"

model.save_pretrained(model_save_path)
