In [1]:
# %%
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import pandas as pd

# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained("../data/fine_tuned_bert_imdb").to(device)
tokenizer = BertTokenizer.from_pretrained("../data/fine_tuned_bert_imdb")
model.eval()

# Load plain text file where each line is a review
with open("../data/reviews.txt", "r", encoding="utf-8") as f:
    reviews = [line.strip() for line in f if line.strip()]

# %%
batch_size = 32
all_predictions = []
all_confidences = []

for i in tqdm(range(0, len(reviews), batch_size), desc="Running inference"):
    batch = reviews[i:i+batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        preds = torch.argmax(probs, dim=1)
        confs = torch.max(probs, dim=1).values

    all_predictions.extend(preds.cpu().tolist())
    all_confidences.extend(confs.cpu().tolist())

# Map predictions to label names if applicable (0=negative, 1=positive)
label_map = {0: "negative", 1: "positive"}
predicted_labels = [label_map[p] for p in all_predictions]

# Save results to CSV
df_out = pd.DataFrame({
    "review": reviews,
    "predicted_sentiment": predicted_labels,
    "confidence": all_confidences
})
df_out.to_csv("../data/review_predictions.csv", index=False)


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../data/fine_tuned_bert_imdb'. Use `repo_type` argument if needed.