In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

df = pd.read_csv("TOSDR_labeled.csv")

label_map = {"bad": 0, "neutral": 1}
df['Point'] = df['Point'].map(label_map)
df = df.dropna()

train_texts, val_texts, train_labels, val_labels = train_test_split(df['QouteText'].values,
                                                                    df['Point'].values,
                                                                    test_size=0.23,
                                                                    random_state=42)

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_len = 128
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=max_len)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=max_len)

train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels))


batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


num_epochs = 3
for epoch in range(num_epochs):
    train_loss = 0
    for batch in tqdm(train_loader):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()


    model.eval()
    val_preds = []
    val_labels = []
    bad_confidences = []  # Confidence scores for the "bad" class
    for batch in val_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1]}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        bad_confidence = probs[:, 0].detach().cpu().numpy()  # Confidence scores for the "bad" class
        bad_confidences.extend(bad_confidence)
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        val_preds.extend(preds)
        val_labels.extend(batch[2].cpu().numpy())

    val_accuracy = np.mean(np.array(val_preds) == np.array(val_labels))
    print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss / len(train_loader)}, Validation Accuracy: {val_accuracy}')

val_preds = np.array(val_preds)
val_labels = np.array(val_labels)
print(classification_report(val_labels, val_preds, target_names=label_map.keys()))

# Additional analysis using bad_confidences
# You can use bad_confidences to analyze the intensity of badness for sentences labeled as "bad"
# bad_indices = np.where(val_labels == 0)[0]  # Indices of sentences labeled as "bad"
# for idx in bad_indices:
#     print(f"Sentence: {val_texts[idx]}, Badness Confidence: {bad_confidences[idx]}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 70/70 [16:54<00:00, 14.49s/it]


Epoch 1/3, Training Loss: 0.3963406349931444, Validation Accuracy: 0.8495842781557067


100%|██████████| 70/70 [14:43<00:00, 12.62s/it]


Epoch 2/3, Training Loss: 0.2938976890274457, Validation Accuracy: 0.8790627362055934


100%|██████████| 70/70 [14:46<00:00, 12.66s/it]


Epoch 3/3, Training Loss: 0.2129213680114065, Validation Accuracy: 0.8858654572940288
              precision    recall  f1-score   support

         bad       0.89      0.28      0.42       199
     neutral       0.89      0.99      0.94      1124

    accuracy                           0.89      1323
   macro avg       0.89      0.64      0.68      1323
weighted avg       0.89      0.89      0.86      1323



In [2]:
bad_indices = np.where(val_labels == 0)[0]  # Indices of sentences labeled as "bad"
for idx in bad_indices[:10]:
    print(f"Badness Confidence: {bad_confidences[idx]}")

Badness Confidence: 0.7146732211112976
Badness Confidence: 0.042185228317976
Badness Confidence: 0.28201979398727417
Badness Confidence: 0.5453076958656311
Badness Confidence: 0.33847421407699585
Badness Confidence: 0.3652246296405792
Badness Confidence: 0.0923534408211708
Badness Confidence: 0.5594250559806824
Badness Confidence: 0.8760133981704712
Badness Confidence: 0.30464163422584534


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

def predict_sentence(sentence, model, tokenizer, device, max_len=128):
    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=max_len)
    
    # Move tensors to the appropriate device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Interpret the output to get the predicted label and confidence scores
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(logits, dim=1).item()
    confidence_score = probs[:, predicted_label].item()
    
    return predicted_label, confidence_score

# Example sentence to predict
sentence_to_predict = "Please note that if you request the erasure of your personal information: We may retain some of your personal information as necessary for our legitimate business interests, such as fraud detection and prevention and enhancing safety.For example, if we suspend an Airbnb Account for fraud or safety reasons, we may retain certain information from that Airbnb Account to prevent that Member from opening a new Airbnb Account in the future.We may retain and use your personal information to the extent necessary to comply with our legal obligations.For example, Airbnb and Airbnb Payments may keep some of your information for tax, legal reporting and auditing obligations." 


# Load the saved model and tokenizer
model_save_path = "roberta_model"
loaded_model = RobertaForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Predict using the loaded model
predicted_label, confidence_score = predict_sentence(sentence_to_predict, loaded_model, loaded_tokenizer, device)

label_map = {"bad": 0, "neutral": 1}

# Map the predicted label to the original class label
label_map_reverse = {v: k for k, v in label_map.items()}
predicted_class = label_map_reverse[predicted_label]

print("Predicted class for the sentence:", predicted_class)
print("Risk Level:", confidence_score)


Predicted class for the sentence: bad
Risk Level: 0.8488851189613342


In [4]:
model_save_path = "roberta_model2"

model.save_pretrained(model_save_path)