In [None]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

In [None]:
# Load the input table from CSV into a DataFrame; later cells read and
# overwrite its "body" column and add prediction columns to it.
data = pd.read_csv(filepath_or_buffer="results/reddit_prep.csv")

In [None]:
# Load model, tokenizer, and configuration.
# All three are resolved from the same checkpoint name so they stay in sync.
# `config` is kept separately because the inference cell uses its `id2label`
# mapping to turn predicted class indices into label strings.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"  # plain string: no placeholders, so no f-prefix
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)


In [5]:
# Normalise the text column before tokenisation: missing values become the
# empty string and every remaining value is cast to `str`.
data["body"] = (
    data["body"]
    .fillna("")
    .astype(str)
)

In [15]:
# Batch inference: run the classifier over data["body"] in fixed-size batches,
# collecting one predicted label and one probability vector per row.
labels, scores = [], []
batch_size     = 32

# Use the GPU when one is available, otherwise fall back to CPU so the cell
# still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The model must live on the same device as the encoded inputs; no earlier
# visible cell moves it, so do it here (idempotent on re-run).
model.to(device)
model.eval()  # inference mode for layers such as dropout

for start in tqdm(range(0, len(data), batch_size)):
    batch_texts = data["body"].iloc[start:start + batch_size].tolist()

    # Pad to the longest text in the batch; truncate anything over 512 tokens.
    encoded = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Convert logits to per-class probabilities on the CPU, then pick the
    # highest-probability class for each row.
    probs   = softmax(logits.cpu().numpy(), axis=1)
    preds   = probs.argmax(axis=1)

    labels.extend([config.id2label[i] for i in preds])
    # One 1-D probability vector per input row is appended to `scores`.
    scores.extend(probs)

100%|██████████| 46614/46614 [2:20:59<00:00,  5.51it/s]  


In [16]:
# Attach predictions to the DataFrame.
# `labels` holds one label string per row; `scores` holds one probability
# vector per row, stacked here into a 2-D matrix (rows x classes).
data["pred_label"] = labels
score_mat = np.vstack(scores)
# The row-wise maximum is the probability of the predicted (argmax) class.
data["pred_score"] = np.max(score_mat, axis=1)

In [None]:
# Preview the first five rows, including the new prediction columns.
data.head(n=5)