In [7]:
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Read the scraped YouTube results; lines=True parses the file as one JSON
# record per line.
df1 = pd.read_json(
    path_or_buf='radical/aqcuisition_fase-scrapers/Youtube/output/yt_results_uniq_with_comments.jsonl',
    lines=True,
)

# Quick sanity check on how many video records were read.
print("Aantal video records:", df1.shape[0])

Aantal video records: 527


In [8]:
# Checkpoint identifier handed to both from_pretrained() calls below.
MODEL_NAME = "tabularisai/multilingual-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The notebook only runs inference, so switch to eval mode right after loading.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).eval()

# Predicted class index -> human-readable sentiment label.
sentiment_map = dict(enumerate([
    "Very Negative",
    "Negative",
    "Neutral",
    "Positive",
    "Very Positive",
]))

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

In [9]:
def _as_list(value):
    """Return ``value`` when it is a list or tuple, otherwise an empty list.

    A field that is null in the source data cannot be iterated (it shows up
    as ``None`` or a float NaN), so anything that is not a real sequence is
    treated as "no items".
    """
    return value if isinstance(value, (list, tuple)) else []


def _comment_row(video_id, item, is_reply):
    """Build one flat record from a comment or reply dict."""
    text = item.get("text")
    return {
        "video_id": video_id,
        "published_at": item.get("published_at"),
        "author": item.get("author"),
        # dict.get's default only covers a missing key; a null text is mapped
        # to "" as well so the later astype(str) cannot turn it into a
        # placeholder value.
        "text": "" if text is None else text,
        "is_reply": is_reply,
    }


# Flatten every video's comments and their replies into one row per message.
rows = []

for _, video in df1.iterrows():
    video_id = video.get("video_id")

    # Series.get only falls back to its default when the column is absent,
    # so a present-but-null cell still has to be normalised before iterating.
    for c in _as_list(video.get("comment_list", [])):
        rows.append(_comment_row(video_id, c, is_reply=False))

        for r in _as_list(c.get("replies", [])):
            rows.append(_comment_row(video_id, r, is_reply=True))

# Explicit columns keep the "text" column available even when no comment was
# found, so the cast below cannot fail on an empty frame.
comments_df = pd.DataFrame(
    rows,
    columns=["video_id", "published_at", "author", "text", "is_reply"],
)
comments_df["text"] = comments_df["text"].astype(str)

print("Totaal aantal comments:", len(comments_df))

Totaal aantal comments: 85920


In [10]:
def predict_batched(texts, batch_size=16, max_length=256):
    """Classify texts in fixed-size batches and return labels with confidences.

    Args:
        texts: Sequence of strings; it is sliced into chunks of ``batch_size``.
        batch_size: Number of texts tokenized and scored per forward pass.
        max_length: Forwarded to the tokenizer as ``max_length`` (truncation
            is enabled).

    Returns:
        A ``(labels, confs)`` tuple of two lists aligned with ``texts``: the
        ``sentiment_map`` label of the highest-probability class, and that
        class's softmax probability as a Python float.
    """
    labels, confs = [], []

    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        # Inference only, so skip gradient tracking for the forward pass.
        with torch.no_grad():
            result = model(**encoded)

        probs = torch.softmax(result.logits, dim=-1)

        # Best class and its probability for every row of the batch at once.
        top_probs, top_classes = probs.max(dim=-1)
        labels.extend(sentiment_map[k] for k in top_classes.tolist())
        confs.extend(top_probs.tolist())

        # Drop the per-batch tensors before the next iteration.
        del encoded, result, probs, top_probs, top_classes

    return labels, confs


# Score every comment text in one call; both lists come back in row order.
labels, confs = predict_batched(
    texts=comments_df["text"].tolist(),
    batch_size=16,   # lower to 8 if RAM becomes a problem
    max_length=256,  # lower to 128 for long comments
)

# Attach the predictions to the frame as two new columns.
comments_df["sentiment"] = labels
comments_df["confidence"] = confs

In [15]:
# Only a cell's last expression is rendered automatically, so the preview is
# passed to display() explicitly (Jupyter makes display available without an
# import); a bare head() call here would have its result discarded.
display(comments_df.head())
comments_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 85920 entries, 0 to 85919
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   video_id      85920 non-null  str    
 1   published_at  85920 non-null  str    
 2   author        85920 non-null  str    
 3   text          85920 non-null  str    
 4   is_reply      85920 non-null  bool   
 5   sentiment     85920 non-null  str    
 6   confidence    85920 non-null  float64
dtypes: bool(1), float64(1), str(5)
memory usage: 4.0 MB


In [16]:
# Persist the labelled comments; orient='records' with lines=True writes one
# JSON object per row, one per line.
comments_df.to_json(
    path_or_buf='radical/analyse_fase-kenmerken_onderzoek/Data/yt_comments_sentiment-labeld.json',
    orient='records',
    lines=True,
)