In [None]:
import json
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Source directory holding the scraped Reddit JSON files, and the directory
# that receives the same files enriched with sentiment fields.
INPUT_FOLDER = Path("radical/aqcuisition_fase-scrapers/Reddit/data")
OUTPUT_FOLDER = Path("radical/analyse_fase-kenmerken_onderzoek/Data/reddit_jsons_with_sentiment")

# Create the output directory (and any missing parents) up front.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Checkpoint identifier handed to both `from_pretrained` calls below.
MODEL_NAME = "tabularisai/multilingual-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# `.eval()` returns the module itself, so the model is loaded and switched to
# evaluation mode in one expression.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).eval()

# Class index (as produced by argmax over the logits) -> readable label.
sentiment_map = dict(enumerate((
    "Very Negative",
    "Negative",
    "Neutral",
    "Positive",
    "Very Positive",
)))



3.0.0


Unnamed: 0,post,comments,scraped_at
0,{'title': 'Mamdani is Islamifying New York and...,"[{'author': 'Agreeable-Turnip8509', 'body': 'T...",2026-02-06 21:15:33.610537
1,{'title': 'I support our law enforcement which...,"[{'author': 'AutoModerator', 'body': 'The OP h...",2026-02-06 21:17:06.288766
2,{'title': 'Wounder how long the ice thing will...,[],2026-02-06 21:20:06.433393
3,{'title': 'Racist anti ICE rioters harass Hisp...,[],2026-02-06 21:20:47.465233
4,{'title': 'Sen. Eric Schmitt shows how Public ...,[],2026-02-06 21:21:55.064818


In [None]:
def predict_batched(texts, batch_size=16, max_length=256):
    """Classify texts with the module-level sentiment model, one batch at a time.

    Relies on the module-level ``tokenizer``, ``model`` and ``sentiment_map``.

    Args:
        texts: Sequence of items to classify; every item is converted with ``str``.
        batch_size: Number of texts tokenized and sent through the model per step.
        max_length: Token limit passed to the tokenizer; longer texts are truncated.

    Returns:
        A ``(labels, confs)`` tuple of two lists aligned with ``texts``.
        ``labels`` holds the ``sentiment_map`` entry of the highest-probability
        class, ``confs`` the softmax probability of that class as a float.
        An empty ``texts`` yields ``([], [])``.
    """
    # Two separate accumulators; unpacking a single empty list into two names
    # would raise ValueError before any work is done.
    labels, confs = [], []

    for start in range(0, len(texts), batch_size):
        batch = [str(x) for x in texts[start:start + batch_size]]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # One reduction per batch gives both the winning class and its
        # probability for every row.
        top_probs, top_idx = probs.max(dim=-1)
        labels.extend(sentiment_map[i] for i in top_idx.tolist())
        confs.extend(top_probs.tolist())

        # Drop the per-batch tensors before the next iteration.
        del inputs, outputs, probs, top_probs, top_idx

    return labels, confs

In [None]:
def _collect_targets(record):
    """Gather every text in one scraped record plus a setter for its result.

    Returns two parallel lists ``(targets, setters)``: the texts to classify
    and, for each text, a callable ``setter(label, confidence)`` that writes
    the prediction into the dict the text was read from.
    """
    targets = []
    setters = []

    def add(owner, text_key, label_key, conf_key):
        # Queue owner[text_key] if it is present and non-empty.
        text = owner.get(text_key)
        if text:
            targets.append(text)
            # `owner` is bound as a default so each setter keeps its own dict.
            setters.append(
                lambda lab, conf, owner=owner:
                    owner.update({label_key: lab, conf_key: conf})
            )

    # A missing or null "post" is treated as an empty post instead of crashing.
    post = record.get("post") or {}
    add(post, "title", "sentiment", "confidence")
    add(post, "text", "text_sentiment", "text_confidence")

    def walk_comments(comment_list):
        # Depth-first: a comment is queued before its replies.
        for c in comment_list or []:
            add(c, "body", "sentiment", "confidence")
            walk_comments(c.get("replies", []))

    walk_comments(record.get("comments", []))
    return targets, setters


# Sorted so files are processed (and logged) in a reproducible order.
for json_file in sorted(INPUT_FOLDER.glob("*.json")):

    with open(json_file, "r", encoding="utf-8") as f:
        record = json.load(f)

    targets, setters = _collect_targets(record)

    # --- compute sentiment for all collected texts in one batched pass
    if targets:
        labels, confs = predict_batched(targets)

        # Setters are aligned with targets, so zip pairs each prediction
        # with the dict it belongs to.
        for setter, lab, conf in zip(setters, labels, confs):
            setter(lab, conf)

    # --- save the enriched record under the same file name
    output_path = OUTPUT_FOLDER / json_file.name
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(record, f, ensure_ascii=False, indent=2)

    print(f"✅ Klaar: {json_file.name}")

print("🎯 Alles verwerkt.")