The following code applies the classifier created in notebook 7 (Creating the topical classifier) to the YouTube metadata tables created in notebook 4 (YouTube data cleaning), so that each video is assigned exactly one topic.
Parts of this code were generated with the help of ChatGPT and altered for the specific needs of this study.

In [1]:
import pandas as pd
# Read the two per-channel tables that are about to be classified.
# The order of the paths matches the order of the names on the left.
BILD, DW = (
    pd.read_csv(csv_path)
    for csv_path in (
        "BILD_ready_for_classification.csv",
        "DW_ready_for_classification.csv",
    )
)

In [5]:
import os, sys
# Environment diagnostics: report the working directory, whether a local
# `torch.py` file or `torch/` directory sits next to the notebook, the first
# import search path, and the first 50 directory entries.
# NOTE(review): presumably added to rule out a local file shadowing the
# installed torch package -- confirm, and drop this cell if no longer needed.
for caption, observed in (
    ("cwd:", os.getcwd()),
    ("torch.py exists here?", os.path.exists("torch.py")),
    ("torch/ dir exists here?", os.path.isdir("torch")),
    ("first sys.path entry:", sys.path[0]),
    ("files in cwd:", sorted(os.listdir("."))[:50]),
):
    print(caption, observed)


cwd: /content
torch.py exists here? False
torch/ dir exists here? False
first sys.path entry: /content
files in cwd: ['.config', '.ipynb_checkpoints', 'BILD_ready_for_classification.csv', 'DW_ready_for_classification.csv', 'final_model', 'sample_data']


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directory holding the saved classifier and its tokenizer.
MODEL_DIR = "final_model"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# Move the weights to the chosen device and switch to evaluation mode.
model.to(device)
model.eval()

print("Loaded model from:", MODEL_DIR)


Loaded model from: final_model


In [7]:
import numpy as np
from tqdm import tqdm

# Maps the classifier's integer class ids to human-readable topic names.
# The position in the list below IS the class id (0-based), so the order
# must not be changed.
# NOTE(review): presumably mirrors the label encoding used when the model was
# trained in notebook 7 -- confirm against that notebook.
LABEL_ID_TO_CATEGORY = dict(enumerate([
    "Sports",                      # 0
    "Panorama",                    # 1
    "Economy",                     # 2
    "Media / Culture",             # 3
    "Knowledge / Education",       # 4
    "Health / Medicine",           # 5
    "Politics",                    # 6
    "Social Issues",               # 7
    "Cars / Transportation",       # 8
    "Internet",                    # 9
    "Construction / Real Estate",  # 10
    "Environment",                 # 11
    "Tourism / Vacations",         # 12
]))

def add_topic_column(df, text_col="text", batch_size=32, max_length=512,
                     output_col="classified topic"):
    """Return a copy of ``df`` with one predicted topic name per row.

    The texts in ``text_col`` are pushed through the notebook-level ``model``
    and ``tokenizer`` (on ``device``) in batches. For every row the class id
    with the highest logit is looked up in ``LABEL_ID_TO_CATEGORY`` and the
    resulting name is stored in ``output_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the texts to classify. The input frame is not modified.
    text_col : str
        Column containing the input text. Missing values are classified as
        the empty string.
    batch_size : int
        Number of texts per forward pass. Must be positive.
    max_length : int
        Token limit passed to the tokenizer; longer inputs are truncated.
    output_col : str
        Name of the column that receives the topic names (added, or
        overwritten if it already exists).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``output_col`` filled in.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive, or if the model can emit a class
        id that has no entry in ``LABEL_ID_TO_CATEGORY``.
    KeyError
        If ``text_col`` is not a column of ``df``.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Fail fast: verify that every class id the model can output has a topic
    # name *before* spending minutes on inference, instead of hitting a bare
    # KeyError in the final lookup.
    unnamed_ids = sorted(set(range(model.config.num_labels)) - set(LABEL_ID_TO_CATEGORY))
    if unnamed_ids:
        raise ValueError(
            "LABEL_ID_TO_CATEGORY has no topic name for model class id(s): "
            f"{unnamed_ids}"
        )

    df = df.copy()
    texts = df[text_col].fillna("").astype(str).tolist()

    model.eval()
    all_pred_ids = []

    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[start:start + batch_size]

            # Pad to the longest text of this batch only; truncate anything
            # longer than `max_length` tokens.
            encodings = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            ).to(device)

            logits = model(**encodings).logits
            # argmax over the class dimension -> one plain Python int per text.
            all_pred_ids.extend(torch.argmax(logits, dim=-1).cpu().tolist())

    df[output_col] = [LABEL_ID_TO_CATEGORY[pred_id] for pred_id in all_pred_ids]
    return df


# Classify both channel tables (BILD first, then DW).
BILD_with_topics, DW_with_topics = (
    add_topic_column(channel_df, text_col="text") for channel_df in (BILD, DW)
)

# Write each classified table to its own CSV, without the index column.
for classified_df, out_path in (
    (BILD_with_topics, "BILD_with_topics.csv"),
    (DW_with_topics, "DW_with_topics.csv"),
):
    classified_df.to_csv(out_path, index=False)

print("saved: BILD_with_topics.csv & DW_with_topics.csv")


100%|██████████| 434/434 [03:59<00:00,  1.82it/s]
100%|██████████| 94/94 [01:10<00:00,  1.33it/s]


saved: BILD_with_topics.csv & DW_with_topics.csv
