### Pieter

In [1]:
import pandas as pd

# Full, unlabeled Pieter comment pool (not the 150 already-labeled comments)
pieter_pool_csv = "../data/middle/pieter/pieter_combined_clean.csv"
pieter_raw = pd.read_csv(pieter_pool_csv)

# Show which columns the pool provides
pieter_raw.columns


Index(['video_id', 'comment_id', 'reply_to_comment_id', 'original_text',
       'translated_text', 'emoji_only', 'reply_to_original_text'],
      dtype='object')

In [2]:
# Build the labeling pool in one pass; `pieter_raw` itself is left untouched.
pieter_clean = (
    pieter_raw
    .loc[pieter_raw['original_text'].notna()]                   # drop missing text
    .loc[lambda pool: pool['original_text'].str.strip() != ""]  # drop whitespace-only text
    .assign(manual_label_new="")                                # empty label placeholder
    .reset_index(drop=True)                                     # contiguous index for safe slicing later
)

# Report the pool size and preview the first rows
print(f"✅ Pieter pool cleaned with {len(pieter_clean)} comments ready for labeling.")
pieter_clean.head()


✅ Pieter pool cleaned with 4253 comments ready for labeling.


Unnamed: 0,video_id,comment_id,reply_to_comment_id,original_text,translated_text,emoji_only,reply_to_original_text,manual_label_new
0,https://www.tiktok.com/@thepieterkriel/video/7...,7508363880948581125,,I don't think he means Jesus sinned against Go...,I don't think he means Jesus sinned against Go...,,,
1,https://www.tiktok.com/@thepieterkriel/video/7...,7508344569785172744,,"Guys this is not abt religion, Vusisizwe is tr...","Guys this is not abt religion, Vusisizwe is tr...",,,
2,https://www.tiktok.com/@thepieterkriel/video/7...,7509886482505614098,,Book The Obscured Principles by Dorian Caine s...,Book The Obscured Principles by Dorian Caine s...,,,
3,https://www.tiktok.com/@thepieterkriel/video/7...,7508312245500609285,,Let us protect Vusisizwe at all costs,Let us protect Vusisizwe at all costs,,,
4,https://www.tiktok.com/@thepieterkriel/video/7...,7508299963786019640,,Yep. I’ve come to the conclusion that he was p...,Yep. Ive come to the conclusion that he was pr...,,,


In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import os

# === Config ===
SAVE_PATH = "../data/middle/pieter/benchmark_manual_labeled.csv"
categories = ["Chaos", "Ubuntu"]
label_target_per_category = {"Chaos": 264, "Ubuntu": 209}
batch_size = 10
current_index = 0

# === Load cleaned pool
df = pieter_clean.copy()
df['manual_label_new'] = df.get('manual_label_new', '')  # Ensure column exists

# === Load existing labels from save file
# Only a missing file means "nothing labeled yet". Any other problem (unreadable
# CSV, missing column) is allowed to raise instead of being reported as "no file",
# because a later Save would otherwise overwrite the existing labels.
if os.path.exists(SAVE_PATH):
    labeled_df = pd.read_csv(SAVE_PATH)
    # One label per comment_id; empty labels never overwrite the pool's value.
    saved_labels = (
        labeled_df
        .dropna(subset=['manual_label_new'])
        .drop_duplicates(subset='comment_id', keep='last')
        .set_index('comment_id')['manual_label_new']
    )
    # Look labels up by comment_id without re-indexing `df`, so `df` can never be
    # left indexed by comment_id if something fails half-way.
    restored = df['comment_id'].map(saved_labels)
    df['manual_label_new'] = df['manual_label_new'].where(restored.isna(), restored)
    print("✅ Existing labels loaded.")
else:
    print("⚠️ No existing label file found.", SAVE_PATH)

# === Helper: Get only unlabeled rows
def get_unlabeled_subset():
    """Return the rows of the global `df` that still need a label, re-indexed from 0.

    A row counts as done when `manual_label_new` is "Chaos", "Ubuntu" or
    "Discard"; every other value (including the empty placeholder) is returned.
    """
    finished_labels = ["Chaos", "Ubuntu", "Discard"]
    already_labeled = df['manual_label_new'].isin(finished_labels)
    pending = df.loc[~already_labeled]
    return pending.reset_index(drop=True)

unlabeled_df = get_unlabeled_subset()

# === Helper: Count current labels from saved CSV
def get_label_counts_from_csv():
    """Count how often each label occurs in the CSV at SAVE_PATH.

    Returns an empty Series when the file does not exist.
    """
    if not os.path.exists(SAVE_PATH):
        return pd.Series(dtype=int)
    saved_labels = pd.read_csv(SAVE_PATH)['manual_label_new']
    return saved_labels.value_counts()

# === Save Chaos/Ubuntu only
def save_progress():
    """Write the rows of `df` whose label is in `categories` to SAVE_PATH.

    Only the comment id, both text columns and the label are exported; rows
    with any other label value (e.g. "Discard" or empty) are not written.
    """
    export_columns = ['comment_id', 'original_text', 'translated_text', 'manual_label_new']
    has_category = df['manual_label_new'].isin(categories)
    df.loc[has_category, export_columns].to_csv(SAVE_PATH, index=False)
    print("✅ Saved Chaos/Ubuntu labels to:", SAVE_PATH)

# === Show UI
def show_next_batch():
    """Render the current batch of unlabeled comments as a labeling form.

    Clears the cell output and displays, in one VBox:
      * a progress summary per category, using counts read from the CSV at
        SAVE_PATH (so the numbers only move after a save);
      * one card per comment in rows ``current_index`` up to
        ``current_index + batch_size`` of ``unlabeled_df``, each with a label
        dropdown that writes straight into the global ``df``;
      * Back / Save / Next buttons.

    Comment text is HTML-escaped before display because it is raw scraped user
    text, and Next stays on the last batch instead of paging into empty ones.
    """
    global current_index, unlabeled_df
    import html  # local import: only needed to escape comment text below

    clear_output(wait=True)

    label_options = ["", "Chaos", "Ubuntu", "Discard"]

    def as_html(value):
        # Missing values (e.g. no translation) render empty instead of "nan";
        # everything else is escaped so markup inside a comment stays literal.
        return "" if pd.isna(value) else html.escape(str(value))

    def make_handler(idx):
        # Bind the row label now so each dropdown updates its own row of `df`.
        def on_change(change):
            if change['type'] == 'change' and change['name'] == 'value':
                df.at[idx, 'manual_label_new'] = change['new']
        return on_change

    # Show progress
    counts = get_label_counts_from_csv()

    progress_lines = []
    for cat in categories:
        done = counts.get(cat, 0)
        target = label_target_per_category[cat]
        left = max(0, target - done)
        progress_lines.append(f"<b>{cat}</b>: {done} / {target} &nbsp; ({left} left to do)")

    progress_html = widgets.HTML(
        value="<h4>🎯 Labeling Progress</h4>" + "<br>".join(progress_lines)
    )

    # Comment UI blocks
    comment_blocks = []
    end_index = min(current_index + batch_size, len(unlabeled_df))

    for i in range(current_index, end_index):
        row = unlabeled_df.iloc[i]
        # Map the batch row back to its row in `df` (first match on comment_id).
        df_idx = df[df['comment_id'] == row['comment_id']].index[0]

        # Dropdown rejects values outside its options, so anything unexpected
        # (NaN, a label from another scheme) is shown as "not labeled yet".
        stored_label = df.at[df_idx, 'manual_label_new']
        dropdown = widgets.Dropdown(
            options=label_options,
            value=stored_label if stored_label in label_options else "",
            description="Label:",
            layout=widgets.Layout(width='250px')
        )
        dropdown.observe(make_handler(df_idx))

        comment_html = widgets.HTML(
            value=f"""
            <b>Comment ID:</b> {row['comment_id']}<br><br>
            <b>Original:</b><br>{as_html(row['original_text'])}<br><br>
            <b>Translated:</b><br>{as_html(row['translated_text'])}<br>
            """
        )

        card = widgets.VBox([comment_html, dropdown])
        card.layout = widgets.Layout(
            border='1px solid #ddd',
            padding='10px',
            margin='0 0 12px 0'
        )

        comment_blocks.append(card)

    # Navigation buttons
    back_btn = widgets.Button(description="⬅️ Back")
    save_btn = widgets.Button(description="💾 Save")
    next_btn = widgets.Button(description="➡️ Next")

    def on_back(b):
        global current_index
        current_index = max(0, current_index - batch_size)
        show_next_batch()

    def on_next(b):
        global current_index
        # Advance only while another batch exists; otherwise redraw the last one.
        if current_index + batch_size < len(unlabeled_df):
            current_index += batch_size
        show_next_batch()

    def on_save(b):
        save_progress()

    back_btn.on_click(on_back)
    save_btn.on_click(on_save)
    next_btn.on_click(on_next)

    nav = widgets.HBox([back_btn, save_btn, next_btn])
    display(widgets.VBox([progress_html] + comment_blocks + [nav]))


# === Launch
show_next_batch()



VBox(children=(HTML(value='<h4>🎯 Labeling Progress</h4><b>Chaos</b>: 26 / 264 &nbsp; (238 left to do)<br><b>Ub…

### Save 

In [5]:
import pandas as pd

# Re-read Pieter's label file from disk to confirm what was actually saved
saved = pd.read_csv("../data/middle/pieter/benchmark_manual_labeled.csv")
saved_label_counts = saved['manual_label_new'].value_counts()

print(f"✅ Saved file has {len(saved)} rows")
print(saved_label_counts)
saved.head()



✅ Saved file has 72 rows
manual_label_new
Ubuntu    46
Chaos     26
Name: count, dtype: int64


Unnamed: 0,comment_id,original_text,translated_text,manual_label_new
0,7508312245500609285,Let us protect Vusisizwe at all costs,Let us protect Vusisizwe at all costs,Ubuntu
1,7508379942851592968,Listen carefully to what he is saying before c...,Listen carefully to what he is saying before c...,Ubuntu
2,7508295770643923718,Thanks for what you are doing. Society needs y...,Thanks for what you are doing. Society needs y...,Ubuntu
3,7508302538077291270,Society needs more voices ❤️,Society needs more voices,Ubuntu
4,7510218829995246344,This boy God protect you,This boy God protect you,Ubuntu


In [11]:
import pandas as pd

# Locations of the shared benchmark and of Pieter's own label file
main_path = "../notebooksV2/benchmark_manual_labeled.csv"
pieter_path = "../data/middle/pieter/benchmark_manual_labeled.csv"

main_df = pd.read_csv(main_path)
pieter_df = pd.read_csv(pieter_path)

# Main rows come first and Pieter rows second, so keep="last" lets the Pieter
# label win whenever a comment_id appears in both files.
merged_df = (
    pd.concat([main_df, pieter_df])
    .drop_duplicates(subset="comment_id", keep="last")
)

# Overwrite the main benchmark file with the merged result
merged_df.to_csv(main_path, index=False)

# Confirm
print("✅ Merge complete!")
print(f"📝 Total rows in updated benchmark: {len(merged_df)}")
print("🔢 Label distribution:")
print(merged_df['manual_label_new'].value_counts())


✅ Merge complete!
📝 Total rows in updated benchmark: 518
🔢 Label distribution:
manual_label_new
Middle    323
Ubuntu    136
Chaos      59
Name: count, dtype: int64


In [14]:
import pandas as pd

# Load the main benchmark.
# Deliberately NOT stored in `df`: the labeling widgets and save_progress() read
# the global `df`, so rebinding it here would silently swap the labeling pool
# for the benchmark while a labeling UI is still on screen.
benchmark_path = "../notebooksV2/benchmark_manual_labeled.csv"
benchmark_df = pd.read_csv(benchmark_path)

# Define all expected categories
expected_labels = ["Ubuntu", "Chaos", "Middle"]

# Count category occurrences
counts = benchmark_df['manual_label_new'].value_counts()

# Print category counts (0 for a category that does not occur at all)
print("📊 Category counts in final benchmark:")
for label in expected_labels:
    print(f" - {label}: {counts.get(label, 0)}")

# Report any label value outside the expected set
unexpected = set(counts.index) - set(expected_labels)
if unexpected:
    print("\n⚠️ Other unexpected labels found:")
    for u in unexpected:
        print(f" - {u}: {counts[u]}")

📊 Category counts in final benchmark:
 - Ubuntu: 136
 - Chaos: 59
 - Middle: 323


In [16]:
import pandas as pd

# === Config ===
MAIN_LABEL_PATH = "../notebooksV2/benchmark_manual_labeled.csv"
label_target_per_category = {"Chaos": 300, "Ubuntu": 300}
categories = list(label_target_per_category)

# === Load main benchmark and count the labels it already holds
main_df = pd.read_csv(MAIN_LABEL_PATH)
current_counts = main_df['manual_label_new'].value_counts()

# === Determine what's still needed
labels_needed = {}
for cat in categories:
    shortfall = label_target_per_category[cat] - current_counts.get(cat, 0)
    labels_needed[cat] = max(0, shortfall)  # a category past its target needs 0, not a negative number

print("📉 Remaining labels needed for each category:")
for cat, remaining in labels_needed.items():
    print(f" - {cat}: {remaining} left to do")


📉 Remaining labels needed for each category:
 - Chaos: 241 left to do
 - Ubuntu: 164 left to do


### Dr Phillips

In [22]:
import pandas as pd

# Full Dr Phillips comment pool
drphillips_raw = pd.read_csv("../data/chaos/drphillips/drphillips_combined_clean.csv")

# Build the labeling pool in one pass; `drphillips_raw` itself is left untouched.
drphillips_clean = (
    drphillips_raw
    .loc[drphillips_raw['original_text'].notna()]               # drop missing text
    .loc[lambda pool: pool['original_text'].str.strip() != ""]  # drop whitespace-only text
    .assign(manual_label_new="")                                # empty label placeholder
    .reset_index(drop=True)                                     # contiguous index
)

print(f"✅ DrPhillips pool cleaned with {len(drphillips_clean)} comments ready for labeling.")
drphillips_clean.head()


✅ DrPhillips pool cleaned with 680 comments ready for labeling.


Unnamed: 0,video_id,comment_id,reply_to_comment_id,original_text,translated_text,emoji_only,reply_to_original_text,manual_label_new
0,https://www.tiktok.com/@dr_phillips_original/v...,7509239877658395448,,jy gaan my comments weer report maar jou kakeb...,You're going to report my comments again but y...,,,
1,https://www.tiktok.com/@dr_phillips_original/v...,7509395894028288774,,kom try my,Kom Try My,,,
2,https://www.tiktok.com/@dr_phillips_original/v...,7508847163221476101,,Nomer 1 in die Hof ook 👀,Nomer 1 in court too,👀,,
3,https://www.tiktok.com/@dr_phillips_original/v...,7509110164794393349,,Ai ek wens ek was so sterk en so slim soos jy ...,Oh I wish I was so strong and as smart as you ...,😢😢,,
4,https://www.tiktok.com/@dr_phillips_original/v...,7509169950792286981,,Se jy 😂,Say you,😂,,


In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import os

# === Config ===
SAVE_PATH = "../data/chaos/drphillips/benchmark_manual_labeled.csv"
categories = ["Chaos", "Ubuntu"]
label_target_per_category = {"Chaos": 241, "Ubuntu": 164}
batch_size = 10
current_index = 0

# === Load from cleaned pool
df = drphillips_clean.copy()

# === Insert manual label column if not present
df['manual_label_new'] = df.get('manual_label_new', '')

# === Load saved labels if exist
if os.path.exists(SAVE_PATH):
    labeled_df = pd.read_csv(SAVE_PATH)
    # One label per comment_id; empty labels never overwrite the pool's value.
    saved_labels = (
        labeled_df
        .dropna(subset=['manual_label_new'])
        .drop_duplicates(subset='comment_id', keep='last')
        .set_index('comment_id')['manual_label_new']
    )
    # Look labels up by comment_id without re-indexing `df`, so `df` can never be
    # left indexed by comment_id if the saved file turns out to be malformed.
    restored = df['comment_id'].map(saved_labels)
    df['manual_label_new'] = df['manual_label_new'].where(restored.isna(), restored)
    print("✅ Existing labels loaded.")

# === Save function
def save_progress():
    """Write the rows of `df` whose label is in `categories` to SAVE_PATH.

    Only the comment id, both text columns and the label are exported; rows
    with any other label value (e.g. "Discard" or empty) are not written.
    """
    export_columns = ['comment_id', 'original_text', 'translated_text', 'manual_label_new']
    has_category = df['manual_label_new'].isin(categories)
    df.loc[has_category, export_columns].to_csv(SAVE_PATH, index=False)
    print("✅ Saved Chaos/Ubuntu labels to:", SAVE_PATH)

# === Filter for unlabeled
def get_unlabeled_subset():
    """Return the rows of the global `df` that still need a label, re-indexed from 0.

    A row counts as done when `manual_label_new` is "Chaos", "Ubuntu" or
    "Discard"; every other value (including the empty placeholder) is returned.
    """
    finished_labels = ["Chaos", "Ubuntu", "Discard"]
    already_labeled = df['manual_label_new'].isin(finished_labels)
    pending = df.loc[~already_labeled]
    return pending.reset_index(drop=True)

unlabeled_df = get_unlabeled_subset()

# === Show UI
def show_next_batch():
    """Render the current batch of unlabeled comments as a labeling form.

    Clears the cell output and displays, in one VBox:
      * a progress summary per category, using counts read from the CSV at
        SAVE_PATH (so the numbers only move after a save);
      * one card per comment in rows ``current_index`` up to
        ``current_index + batch_size`` of ``unlabeled_df``, each with a label
        dropdown that writes straight into the global ``df``;
      * Back / Save / Next buttons.

    Comment text is HTML-escaped before display because it is raw scraped user
    text, and Next stays on the last batch instead of paging into empty ones.
    """
    global current_index, unlabeled_df
    import html  # local import: only needed to escape comment text below

    clear_output(wait=True)

    label_options = ["", "Chaos", "Ubuntu", "Discard"]

    def as_html(value):
        # Missing values (e.g. no translation) render empty instead of "nan";
        # everything else is escaped so markup inside a comment stays literal.
        return "" if pd.isna(value) else html.escape(str(value))

    def make_handler(idx):
        # Bind the row label now so each dropdown updates its own row of `df`.
        def on_change(change):
            if change['type'] == 'change' and change['name'] == 'value':
                df.at[idx, 'manual_label_new'] = change['new']
        return on_change

    # === Live progress counts from SAVE_PATH
    def get_label_counts_from_csv():
        if os.path.exists(SAVE_PATH):
            temp = pd.read_csv(SAVE_PATH)
            return temp['manual_label_new'].value_counts()
        else:
            return pd.Series(dtype=int)

    counts = get_label_counts_from_csv()

    # === Build progress summary
    progress_lines = []
    for cat in categories:
        done = counts.get(cat, 0)
        target = label_target_per_category[cat]
        left = max(0, target - done)
        progress_lines.append(f"<b>{cat}</b>: {done} / {target} &nbsp; ({left} left to do)")

    progress_html = widgets.HTML(
        value="<h4>🎯 Labeling Progress</h4>" + "<br>".join(progress_lines)
    )

    # === Comment batch
    comment_blocks = []
    end_index = min(current_index + batch_size, len(unlabeled_df))

    for i in range(current_index, end_index):
        row = unlabeled_df.iloc[i]
        # Map the batch row back to its row in `df` (first match on comment_id).
        df_idx = df[df['comment_id'] == row['comment_id']].index[0]

        # Dropdown rejects values outside its options, so anything unexpected
        # (NaN, a label from another scheme) is shown as "not labeled yet".
        stored_label = df.at[df_idx, 'manual_label_new']
        dropdown = widgets.Dropdown(
            options=label_options,
            value=stored_label if stored_label in label_options else "",
            description="Label:",
            layout=widgets.Layout(width='250px')
        )
        dropdown.observe(make_handler(df_idx))

        comment_html = widgets.HTML(
            value=f"""
            <b>Comment ID:</b> {row['comment_id']}<br><br>
            <b>Original:</b><br>{as_html(row['original_text'])}<br><br>
            <b>Translated:</b><br>{as_html(row['translated_text'])}<br>
            """
        )

        card = widgets.VBox([comment_html, dropdown])
        card.layout = widgets.Layout(
            border='1px solid #ddd',
            padding='10px',
            margin='0 0 12px 0'
        )

        comment_blocks.append(card)

    # === Navigation buttons
    back_btn = widgets.Button(description="⬅️ Back")
    save_btn = widgets.Button(description="💾 Save")
    next_btn = widgets.Button(description="➡️ Next")

    def on_back(b):
        global current_index
        current_index = max(0, current_index - batch_size)
        show_next_batch()

    def on_next(b):
        global current_index
        # Advance only while another batch exists; otherwise redraw the last one.
        if current_index + batch_size < len(unlabeled_df):
            current_index += batch_size
        show_next_batch()

    def on_save(b):
        save_progress()

    back_btn.on_click(on_back)
    save_btn.on_click(on_save)
    next_btn.on_click(on_next)

    nav = widgets.HBox([back_btn, save_btn, next_btn])

    # === FINAL DISPLAY
    display(widgets.VBox([progress_html] + comment_blocks + [nav]))


# === Launch
show_next_batch()


VBox(children=(HTML(value='<h4>🎯 Labeling Progress</h4><b>Chaos</b>: 14 / 18 &nbsp; (4 left to do)<br><b>Ubunt…

In [26]:
import pandas as pd

# === Paths
drphillips_path = "../data/chaos/drphillips/benchmark_manual_labeled.csv"
main_path = "../notebooksV2/benchmark_manual_labeled.csv"

# === Load both
main_df = pd.read_csv(main_path)
dr_df = pd.read_csv(drphillips_path)

# === Main rows whose comment_id also occurs in dr_df are dropped, so the
# === DrPhillips version of those comments replaces them
superseded = main_df['comment_id'].isin(dr_df['comment_id'])
merged_df = pd.concat([main_df.loc[~superseded], dr_df], ignore_index=True)

# === Overwrite the main benchmark file with the merged result
merged_df.to_csv(main_path, index=False)
print("✅ DrPhillips labels successfully merged into main.")
print(f"📦 New total: {len(merged_df)} rows")


✅ DrPhillips labels successfully merged into main.
📦 New total: 741 rows


In [27]:
# === Tally each label in `merged_df` (the frame built by the merge cell)
counts = merged_df['manual_label_new'].value_counts()
print("📊 Label counts in MAIN benchmark_manual_labeled.csv:\n", counts, sep="\n")


📊 Label counts in MAIN benchmark_manual_labeled.csv:

manual_label_new
Middle    323
Chaos     282
Ubuntu    136
Name: count, dtype: int64


### Pokkels

In [29]:
import pandas as pd

# Full Pokkels comment pool
pokkels_raw = pd.read_csv("../data/chaos/pokkels/pokkels_combined_clean.csv")

# Build the labeling pool in one pass; `pokkels_raw` itself is left untouched.
pokkels_clean = (
    pokkels_raw
    .loc[pokkels_raw['original_text'].notna()]                  # drop missing text
    .loc[lambda pool: pool['original_text'].str.strip() != ""]  # drop whitespace-only text
    .assign(manual_label_new="")                                # empty label placeholder
    .reset_index(drop=True)                                     # contiguous index
)

print(f"✅ Pokkels pool cleaned with {len(pokkels_clean)} comments ready for labeling.")
pokkels_clean.head()

✅ Pokkels pool cleaned with 1289 comments ready for labeling.


Unnamed: 0,video_id,comment_id,reply_to_comment_id,original_text,translated_text,emoji_only,reply_to_original_text,manual_label_new
0,https://www.tiktok.com/@pokkels_/video/7509108...,7509211577100714758,,You must learn to forgive yourself 💕 You are w...,You must learn to forgive yourself You are wo...,💕🙏🏻,,
1,https://www.tiktok.com/@pokkels_/video/7509108...,7509517276912157445,,So jammer❤️❤️❤️❤️‍🩹❤️‍🩹❤️‍🩹,So jammer,❤❤❤❤🩹❤🩹❤🩹,,
2,https://www.tiktok.com/@pokkels_/video/7509108...,7509369170926535480,,Sooooo soooooo jammer skat,SOOOOO SOOOOOOO JAMMER TAX,,,
3,https://www.tiktok.com/@pokkels_/video/7509108...,7509167046995854086,,so sorry to hear what you have gone through yo...,so sorry to hear what you have gone through yo...,,,
4,https://www.tiktok.com/@pokkels_/video/7509108...,7511975528715305735,,sorry this happened to you. but you r a strong...,sorry this happened to you. but you r a strong...,❤,,


In [None]:
import os
import ipywidgets as widgets
from IPython.display import display, clear_output

# === Config for Pokkels ===
SAVE_PATH = "../data/chaos/pokkels/benchmark_manual_labeled.csv"
categories = ["Chaos", "Ubuntu"]
label_target_per_category = {"Chaos": 18, "Ubuntu": 164}  # ← adjust if needed
batch_size = 10
current_index = 0

# === Load cleaned pool
df = pokkels_clean.copy()
df['manual_label_new'] = df.get('manual_label_new', '')  # Ensure column exists

# === Load existing labels from save file
# Only a missing file means "nothing labeled yet". Any other problem (unreadable
# CSV, missing column) is allowed to raise instead of being reported as "no file",
# because a later Save would otherwise overwrite the existing labels.
if os.path.exists(SAVE_PATH):
    labeled_df = pd.read_csv(SAVE_PATH)
    # One label per comment_id; empty labels never overwrite the pool's value.
    saved_labels = (
        labeled_df
        .dropna(subset=['manual_label_new'])
        .drop_duplicates(subset='comment_id', keep='last')
        .set_index('comment_id')['manual_label_new']
    )
    # Look labels up by comment_id without re-indexing `df`, so `df` can never be
    # left indexed by comment_id if something fails half-way.
    restored = df['comment_id'].map(saved_labels)
    df['manual_label_new'] = df['manual_label_new'].where(restored.isna(), restored)
    print("✅ Existing labels loaded.")
else:
    print("⚠️ No existing label file found.", SAVE_PATH)

# === Helper: Get only unlabeled rows
def get_unlabeled_subset():
    """Return the rows of the global `df` that still need a label, re-indexed from 0.

    A row counts as done when `manual_label_new` is "Chaos", "Ubuntu" or
    "Discard"; every other value (including the empty placeholder) is returned.
    """
    finished_labels = ["Chaos", "Ubuntu", "Discard"]
    already_labeled = df['manual_label_new'].isin(finished_labels)
    pending = df.loc[~already_labeled]
    return pending.reset_index(drop=True)

unlabeled_df = get_unlabeled_subset()

# === Helper: Count current labels from saved CSV
def get_label_counts_from_csv():
    """Count how often each label occurs in the CSV at SAVE_PATH.

    Returns an empty Series when the file does not exist.
    """
    if not os.path.exists(SAVE_PATH):
        return pd.Series(dtype=int)
    saved_labels = pd.read_csv(SAVE_PATH)['manual_label_new']
    return saved_labels.value_counts()

# === Save Chaos/Ubuntu only
def save_progress():
    """Write the rows of `df` whose label is in `categories` to SAVE_PATH.

    Only the comment id, both text columns and the label are exported; rows
    with any other label value (e.g. "Discard" or empty) are not written.
    """
    export_columns = ['comment_id', 'original_text', 'translated_text', 'manual_label_new']
    has_category = df['manual_label_new'].isin(categories)
    df.loc[has_category, export_columns].to_csv(SAVE_PATH, index=False)
    print("✅ Saved Chaos/Ubuntu labels to:", SAVE_PATH)

# === Show UI
def show_next_batch():
    """Render the current batch of unlabeled comments as a labeling form.

    Clears the cell output and displays, in one VBox:
      * a progress summary per category, using counts read from the CSV at
        SAVE_PATH (so the numbers only move after a save);
      * one card per comment in rows ``current_index`` up to
        ``current_index + batch_size`` of ``unlabeled_df``, each with a label
        dropdown that writes straight into the global ``df``;
      * Back / Save / Next buttons.

    Comment text is HTML-escaped before display because it is raw scraped user
    text, and Next stays on the last batch instead of paging into empty ones.
    """
    global current_index, unlabeled_df
    import html  # local import: only needed to escape comment text below

    clear_output(wait=True)

    label_options = ["", "Chaos", "Ubuntu", "Discard"]

    def as_html(value):
        # Missing values (e.g. no translation) render empty instead of "nan";
        # everything else is escaped so markup inside a comment stays literal.
        return "" if pd.isna(value) else html.escape(str(value))

    def make_handler(idx):
        # Bind the row label now so each dropdown updates its own row of `df`.
        def on_change(change):
            if change['type'] == 'change' and change['name'] == 'value':
                df.at[idx, 'manual_label_new'] = change['new']
        return on_change

    # Show progress
    counts = get_label_counts_from_csv()
    progress_lines = []
    for cat in categories:
        done = counts.get(cat, 0)
        target = label_target_per_category[cat]
        left = max(0, target - done)
        progress_lines.append(f"<b>{cat}</b>: {done} / {target} &nbsp; ({left} left to do)")

    progress_html = widgets.HTML(
        value="<h4>🎯 Labeling Progress</h4>" + "<br>".join(progress_lines)
    )

    # Comment UI blocks
    comment_blocks = []
    end_index = min(current_index + batch_size, len(unlabeled_df))

    for i in range(current_index, end_index):
        row = unlabeled_df.iloc[i]
        # Map the batch row back to its row in `df` (first match on comment_id).
        df_idx = df[df['comment_id'] == row['comment_id']].index[0]

        # Dropdown rejects values outside its options, so anything unexpected
        # (NaN, a label from another scheme) is shown as "not labeled yet".
        stored_label = df.at[df_idx, 'manual_label_new']
        dropdown = widgets.Dropdown(
            options=label_options,
            value=stored_label if stored_label in label_options else "",
            description="Label:",
            layout=widgets.Layout(width='250px')
        )
        dropdown.observe(make_handler(df_idx))

        comment_html = widgets.HTML(
            value=f"""
            <b>Comment ID:</b> {row['comment_id']}<br><br>
            <b>Original:</b><br>{as_html(row['original_text'])}<br><br>
            <b>Translated:</b><br>{as_html(row['translated_text'])}<br>
            """
        )

        card = widgets.VBox([comment_html, dropdown])
        card.layout = widgets.Layout(
            border='1px solid #ddd',
            padding='10px',
            margin='0 0 12px 0'
        )

        comment_blocks.append(card)

    # Navigation buttons
    back_btn = widgets.Button(description="⬅️ Back")
    save_btn = widgets.Button(description="💾 Save")
    next_btn = widgets.Button(description="➡️ Next")

    def on_back(b):
        global current_index
        current_index = max(0, current_index - batch_size)
        show_next_batch()

    def on_next(b):
        global current_index
        # Advance only while another batch exists; otherwise redraw the last one.
        if current_index + batch_size < len(unlabeled_df):
            current_index += batch_size
        show_next_batch()

    def on_save(b):
        save_progress()

    back_btn.on_click(on_back)
    save_btn.on_click(on_save)
    next_btn.on_click(on_next)

    nav = widgets.HBox([back_btn, save_btn, next_btn])
    display(widgets.VBox([progress_html] + comment_blocks + [nav]))

# === Launch
show_next_batch()


VBox(children=(HTML(value='<h4>🎯 Labeling Progress</h4><b>Chaos</b>: 14 / 18 &nbsp; (4 left to do)<br><b>Ubunt…

In [32]:
import pandas as pd

# === Paths
pokkels_path = "../data/chaos/pokkels/benchmark_manual_labeled.csv"
main_path = "../notebooksV2/benchmark_manual_labeled.csv"

# === Load both (note: `dr_df` holds the Pokkels labels in this cell)
main_df = pd.read_csv(main_path)
dr_df = pd.read_csv(pokkels_path)

# === Main rows whose comment_id also occurs in the Pokkels file are dropped,
# === so the Pokkels version of those comments replaces them
superseded = main_df['comment_id'].isin(dr_df['comment_id'])
merged_df = pd.concat([main_df.loc[~superseded], dr_df], ignore_index=True)

# === Overwrite the main benchmark file with the merged result
merged_df.to_csv(main_path, index=False)
print("✅ Pokkels labels successfully merged into main.")
print(f"📦 New total: {len(merged_df)} rows")


✅ Pokkels labels successfully merged into main.
📦 New total: 814 rows


In [33]:
# === Tally each label in `merged_df` (the frame built by the merge cell)
counts = merged_df['manual_label_new'].value_counts()
print("📊 Label counts in MAIN benchmark_manual_labeled.csv:\n", counts, sep="\n")

📊 Label counts in MAIN benchmark_manual_labeled.csv:

manual_label_new
Middle    323
Chaos     296
Ubuntu    195
Name: count, dtype: int64


### Anton

In [1]:
import pandas as pd

# Full Anton comment pool
anton_raw = pd.read_csv("../data/middle/anton/anton_combined_clean.csv")

# Build the labeling pool in one pass; `anton_raw` itself is left untouched.
anton_clean = (
    anton_raw
    .loc[anton_raw['original_text'].notna()]                    # drop missing text
    .loc[lambda pool: pool['original_text'].str.strip() != ""]  # drop whitespace-only text
    .assign(manual_label_new="")                                # empty label placeholder
    .reset_index(drop=True)                                     # contiguous index
)

print(f"✅ Anton pool cleaned with {len(anton_clean)} comments ready for labeling.")
anton_clean.head()

✅ Anton pool cleaned with 3066 comments ready for labeling.


Unnamed: 0,video_id,comment_id,reply_to_comment_id,original_text,translated_text,emoji_only,reply_to_original_text,manual_label_new
0,https://www.tiktok.com/@anton.taylor/video/744...,7448259702431761158,,Listen carefully before you comment here..he i...,Listen carefully before you comment here..he i...,,,
1,https://www.tiktok.com/@anton.taylor/video/744...,7448213529889112838,,Only legends will understand the message😂😂😂,Only legends will understand the message,😂😂😂,,
2,https://www.tiktok.com/@anton.taylor/video/744...,7448253712659890949,,"🤣🤣the sarcasm is tops 🔥 💯,those who didn't go ...","the sarcasm is tops ,those who didn't go to s...",🤣🤣🔥💯,,
3,https://www.tiktok.com/@anton.taylor/video/744...,7491916422369657655,,🤣🤣🤣At first he's confusing bt along he's speec...,At first he's confusing bt along he's speech h...,🤣🤣🤣,,
4,https://www.tiktok.com/@anton.taylor/video/744...,7465214572354994949,,Jokes that needs Diploma🤣🤣,Jokes that needs Diploma,🤣🤣,,


In [None]:
import os
import ipywidgets as widgets
from IPython.display import display, clear_output

# === Config for Anton ===
SAVE_PATH = "../data/middle/anton/benchmark_manual_labeled.csv"
categories = ["Chaos", "Ubuntu"]
label_target_per_category = {"Chaos": 4, "Ubuntu": 105}  # ← adjust if needed
batch_size = 10
current_index = 0

# === Load cleaned pool
df = anton_clean.copy()
df['manual_label_new'] = df.get('manual_label_new', '')  # Ensure column exists

# === Load existing labels from save file
# Only a missing file means "nothing labeled yet". Any other problem (unreadable
# CSV, missing column) is allowed to raise instead of being reported as "no file",
# because a later Save would otherwise overwrite the existing labels.
if os.path.exists(SAVE_PATH):
    labeled_df = pd.read_csv(SAVE_PATH)
    # One label per comment_id; empty labels never overwrite the pool's value.
    saved_labels = (
        labeled_df
        .dropna(subset=['manual_label_new'])
        .drop_duplicates(subset='comment_id', keep='last')
        .set_index('comment_id')['manual_label_new']
    )
    # Look labels up by comment_id without re-indexing `df`, so `df` can never be
    # left indexed by comment_id if something fails half-way.
    restored = df['comment_id'].map(saved_labels)
    df['manual_label_new'] = df['manual_label_new'].where(restored.isna(), restored)
    print("✅ Existing labels loaded.")
else:
    print("⚠️ No existing label file found.", SAVE_PATH)

# === Helper: Get only unlabeled rows
def get_unlabeled_subset():
    """Return the rows of the global `df` that still need a label, re-indexed from 0.

    A row counts as done when `manual_label_new` is "Chaos", "Ubuntu" or
    "Discard"; every other value (including the empty placeholder) is returned.
    """
    finished_labels = ["Chaos", "Ubuntu", "Discard"]
    already_labeled = df['manual_label_new'].isin(finished_labels)
    pending = df.loc[~already_labeled]
    return pending.reset_index(drop=True)

unlabeled_df = get_unlabeled_subset()

# === Helper: Count current labels from saved CSV
def get_label_counts_from_csv():
    """Count how often each label occurs in the CSV at SAVE_PATH.

    Returns an empty Series when the file does not exist.
    """
    if not os.path.exists(SAVE_PATH):
        return pd.Series(dtype=int)
    saved_labels = pd.read_csv(SAVE_PATH)['manual_label_new']
    return saved_labels.value_counts()

# === Save Chaos/Ubuntu only
def save_progress():
    """Write the rows of `df` whose label is in `categories` to SAVE_PATH.

    Only the comment id, both text columns and the label are exported; rows
    with any other label value (e.g. "Discard" or empty) are not written.
    """
    export_columns = ['comment_id', 'original_text', 'translated_text', 'manual_label_new']
    has_category = df['manual_label_new'].isin(categories)
    df.loc[has_category, export_columns].to_csv(SAVE_PATH, index=False)
    print("✅ Saved Chaos/Ubuntu labels to:", SAVE_PATH)

# === Show UI
def show_next_batch():
    """Render the current batch of unlabeled comments as a labeling form.

    Clears the cell output and displays, in one VBox:
      * a progress summary per category, using counts read from the CSV at
        SAVE_PATH (so the numbers only move after a save);
      * one card per comment in rows ``current_index`` up to
        ``current_index + batch_size`` of ``unlabeled_df``, each with a label
        dropdown that writes straight into the global ``df``;
      * Back / Save / Next buttons.

    Comment text is HTML-escaped before display because it is raw scraped user
    text, and Next stays on the last batch instead of paging into empty ones.
    """
    global current_index, unlabeled_df
    import html  # local import: only needed to escape comment text below

    clear_output(wait=True)

    label_options = ["", "Chaos", "Ubuntu", "Discard"]

    def as_html(value):
        # Missing values (e.g. no translation) render empty instead of "nan";
        # everything else is escaped so markup inside a comment stays literal.
        return "" if pd.isna(value) else html.escape(str(value))

    def make_handler(idx):
        # Bind the row label now so each dropdown updates its own row of `df`.
        def on_change(change):
            if change['type'] == 'change' and change['name'] == 'value':
                df.at[idx, 'manual_label_new'] = change['new']
        return on_change

    # Show progress
    counts = get_label_counts_from_csv()
    progress_lines = []
    for cat in categories:
        done = counts.get(cat, 0)
        target = label_target_per_category[cat]
        left = max(0, target - done)
        progress_lines.append(f"<b>{cat}</b>: {done} / {target} &nbsp; ({left} left to do)")

    progress_html = widgets.HTML(
        value="<h4>🎯 Labeling Progress</h4>" + "<br>".join(progress_lines)
    )

    # Comment UI blocks
    comment_blocks = []
    end_index = min(current_index + batch_size, len(unlabeled_df))

    for i in range(current_index, end_index):
        row = unlabeled_df.iloc[i]
        # Map the batch row back to its row in `df` (first match on comment_id).
        df_idx = df[df['comment_id'] == row['comment_id']].index[0]

        # Dropdown rejects values outside its options, so anything unexpected
        # (NaN, a label from another scheme) is shown as "not labeled yet".
        stored_label = df.at[df_idx, 'manual_label_new']
        dropdown = widgets.Dropdown(
            options=label_options,
            value=stored_label if stored_label in label_options else "",
            description="Label:",
            layout=widgets.Layout(width='250px')
        )
        dropdown.observe(make_handler(df_idx))

        comment_html = widgets.HTML(
            value=f"""
            <b>Comment ID:</b> {row['comment_id']}<br><br>
            <b>Original:</b><br>{as_html(row['original_text'])}<br><br>
            <b>Translated:</b><br>{as_html(row['translated_text'])}<br>
            """
        )

        card = widgets.VBox([comment_html, dropdown])
        card.layout = widgets.Layout(
            border='1px solid #ddd',
            padding='10px',
            margin='0 0 12px 0'
        )

        comment_blocks.append(card)

    # Navigation buttons
    back_btn = widgets.Button(description="⬅️ Back")
    save_btn = widgets.Button(description="💾 Save")
    next_btn = widgets.Button(description="➡️ Next")

    def on_back(b):
        global current_index
        current_index = max(0, current_index - batch_size)
        show_next_batch()

    def on_next(b):
        global current_index
        # Advance only while another batch exists; otherwise redraw the last one.
        if current_index + batch_size < len(unlabeled_df):
            current_index += batch_size
        show_next_batch()

    def on_save(b):
        save_progress()

    back_btn.on_click(on_back)
    save_btn.on_click(on_save)
    next_btn.on_click(on_next)

    nav = widgets.HBox([back_btn, save_btn, next_btn])
    display(widgets.VBox([progress_html] + comment_blocks + [nav]))

# === Launch
show_next_batch()

VBox(children=(HTML(value='<h4>🎯 Labeling Progress</h4><b>Chaos</b>: 6 / 4 &nbsp; (0 left to do)<br><b>Ubuntu<…

In [4]:
import pandas as pd

# === Paths
anton_path = "../data/middle/anton/benchmark_manual_labeled.csv"
main_path = "../notebooksV2/benchmark_manual_labeled.csv"

# === Load both (note: `dr_df` holds the Anton labels in this cell)
main_df = pd.read_csv(main_path)
dr_df = pd.read_csv(anton_path)

# === Main rows whose comment_id also occurs in the Anton file are dropped,
# === so the Anton version of those comments replaces them
superseded = main_df['comment_id'].isin(dr_df['comment_id'])
merged_df = pd.concat([main_df.loc[~superseded], dr_df], ignore_index=True)

# === Overwrite the main benchmark file with the merged result
merged_df.to_csv(main_path, index=False)
print("✅ Anton labels successfully merged into main.")
print(f"📦 New total: {len(merged_df)} rows")

✅ Anton labels successfully merged into main.
📦 New total: 853 rows


In [5]:
# === Tally each label in `merged_df` (the frame built by the merge cell)
counts = merged_df['manual_label_new'].value_counts()
print("📊 Label counts in MAIN benchmark_manual_labeled.csv:\n", counts, sep="\n")

📊 Label counts in MAIN benchmark_manual_labeled.csv:

manual_label_new
Middle    323
Chaos     302
Ubuntu    228
Name: count, dtype: int64


### Dodo

In [1]:
import pandas as pd

# Full Dodo comment pool
# NOTE(review): the preview of this pool shows comment_id as floats
# (e.g. 7.46398e+18) — confirm the ids are not losing precision on load.
dodo_raw = pd.read_csv("../data/ubuntu/dodo/dodo_combined_clean.csv")

# Build the labeling pool in one pass; `dodo_raw` itself is left untouched.
dodo_clean = (
    dodo_raw
    .loc[dodo_raw['original_text'].notna()]                     # drop missing text
    .loc[lambda pool: pool['original_text'].str.strip() != ""]  # drop whitespace-only text
    .assign(manual_label_new="")                                # empty label placeholder
    .reset_index(drop=True)                                     # contiguous index
)

print(f"✅ Dodo pool cleaned with {len(dodo_clean)} comments ready for labeling.")
dodo_clean.head()

✅ Dodo pool cleaned with 1793 comments ready for labeling.


Unnamed: 0,video_id,comment_id,reply_to_comment_id,original_text,translated_text,emoji_only,reply_to_original_text,manual_label_new
0,https://www.tiktok.com/@dodonyokamusic/video/7...,7.46398e+18,,Ons Nederlanders ondersteun jou!,Our Dutch support you!,,,
1,https://www.tiktok.com/@dodonyokamusic/video/7...,7.464051e+18,,i am from belgium and I can understand 80% of ...,i am from belgium and I can understand 80% of ...,,,
2,https://www.tiktok.com/@dodonyokamusic/video/7...,7.463936e+18,,"It is just dutch but less grammar, so that’s w...","It is just dutch but less grammar, so thats wh...",,,
3,https://www.tiktok.com/@dodonyokamusic/video/7...,7.464495e+18,,you going places young man🥰,you going places young man,🥰,,
4,https://www.tiktok.com/@dodonyokamusic/video/7...,7.463915e+18,,In the netherlands we understand almost the wh...,In the netherlands we understand almost the wh...,🔥,,


In [3]:
import os
import ipywidgets as widgets
from IPython.display import display, clear_output

# === Config for Dodo ===
SAVE_PATH = "../data/ubuntu/dodo/benchmark_manual_labeled.csv"
categories = ["Chaos", "Ubuntu"]
label_target_per_category = {"Chaos": 21, "Ubuntu": 95}  # ← adjust if needed
batch_size = 10
current_index = 0

# === Load cleaned pool
df = dodo_clean.copy()
df['manual_label_new'] = df.get('manual_label_new', '')  # Ensure column exists

# === Load existing labels from save file
# Only a missing file means "nothing labeled yet". Any other problem (unreadable
# CSV, missing column) is allowed to raise instead of being reported as "no file",
# because a later Save would otherwise overwrite the existing labels.
if os.path.exists(SAVE_PATH):
    labeled_df = pd.read_csv(SAVE_PATH)
    # One label per comment_id; empty labels never overwrite the pool's value.
    saved_labels = (
        labeled_df
        .dropna(subset=['manual_label_new'])
        .drop_duplicates(subset='comment_id', keep='last')
        .set_index('comment_id')['manual_label_new']
    )
    # Look labels up by comment_id without re-indexing `df`, so `df` can never be
    # left indexed by comment_id if something fails half-way.
    restored = df['comment_id'].map(saved_labels)
    df['manual_label_new'] = df['manual_label_new'].where(restored.isna(), restored)
    print("✅ Existing labels loaded.")
else:
    print("⚠️ No existing label file found.", SAVE_PATH)

# === Helper: Get only unlabeled rows
def get_unlabeled_subset():
    """Return the rows of `df` that still need a decision, re-indexed from 0.

    A row counts as handled once its `manual_label_new` is "Chaos", "Ubuntu"
    or "Discard"; every other value, including the empty placeholder, is
    treated as unlabeled.
    """
    handled_labels = ["Chaos", "Ubuntu", "Discard"]
    already_handled = df['manual_label_new'].isin(handled_labels)
    pending_rows = df.loc[~already_handled]
    return pending_rows.reset_index(drop=True)

# Snapshot of the pending pool, taken once when this cell runs.
unlabeled_df = get_unlabeled_subset()

# === Helper: Count current labels from saved CSV
def get_label_counts_from_csv():
    """Return per-label counts read from the file at SAVE_PATH.

    Counts come from the saved CSV rather than the in-memory `df`, so labels
    picked in the UI only show up here after they have been saved. When the
    file does not exist, an empty Series is returned.
    """
    if not os.path.exists(SAVE_PATH):
        return pd.Series(dtype=int)
    saved_rows = pd.read_csv(SAVE_PATH)
    label_column = saved_rows['manual_label_new']
    return label_column.value_counts()

# === Save Chaos/Ubuntu only
def save_progress():
    """Write every row of `df` labeled with one of `categories` to SAVE_PATH.

    The file is overwritten on each call and holds only the id, the two text
    columns and the label; rows carrying any other label value are left out.
    """
    export_columns = ['comment_id', 'original_text', 'translated_text', 'manual_label_new']
    keep_mask = df['manual_label_new'].isin(categories)
    df.loc[keep_mask, export_columns].to_csv(SAVE_PATH, index=False)
    print("✅ Saved Chaos/Ubuntu labels to:", SAVE_PATH)

# === Show UI
def show_next_batch():
    """Render the current page of unlabeled comments with label dropdowns.

    Displays per-category progress (counted from the saved CSV), one card per
    comment in rows `current_index` to `current_index + batch_size` of
    `unlabeled_df`, and Back / Save / Next buttons. Picking a dropdown value
    writes straight into `df['manual_label_new']`; nothing is written to disk
    until Save is clicked.
    """
    global current_index, unlabeled_df
    # Local import: comment text is user-generated and gets embedded in HTML.
    from html import escape

    clear_output(wait=True)

    label_options = ["", "Chaos", "Ubuntu", "Discard"]

    # Show progress
    counts = get_label_counts_from_csv()
    progress_lines = []
    for cat in categories:
        done = counts.get(cat, 0)
        target = label_target_per_category[cat]
        left = max(0, target - done)
        progress_lines.append(f"<b>{cat}</b>: {done} / {target} &nbsp; ({left} left to do)")

    progress_html = widgets.HTML(
        value="<h4>🎯 Labeling Progress</h4>" + "<br>".join(progress_lines)
    )

    def make_handler(idx):
        """Build a dropdown observer that stores the chosen label in df row `idx`."""
        def on_change(change):
            if change['type'] == 'change' and change['name'] == 'value':
                df.at[idx, 'manual_label_new'] = change['new']
        return on_change

    # Comment UI blocks
    comment_blocks = []
    end_index = min(current_index + batch_size, len(unlabeled_df))

    for i in range(current_index, end_index):
        row = unlabeled_df.iloc[i]
        # unlabeled_df was re-indexed, so map the row back to df via its comment id.
        df_idx = df[df['comment_id'] == row['comment_id']].index[0]

        # Fall back to the blank option if the stored value is not a valid choice
        # (the Dropdown raises on a value outside its options).
        stored_label = df.at[df_idx, 'manual_label_new']
        dropdown = widgets.Dropdown(
            options=label_options,
            value=stored_label if stored_label in label_options else "",
            description="Label:",
            layout=widgets.Layout(width='250px')
        )
        dropdown.observe(make_handler(df_idx))

        # Escape before interpolating so markup inside a comment is shown as
        # text instead of being rendered by the HTML widget.
        safe_id = escape(str(row['comment_id']))
        safe_original = escape(str(row['original_text']))
        safe_translated = escape(str(row['translated_text']))

        comment_html = widgets.HTML(
            value=f"""
            <b>Comment ID:</b> {safe_id}<br><br>
            <b>Original:</b><br>{safe_original}<br><br>
            <b>Translated:</b><br>{safe_translated}<br>
            """
        )

        card = widgets.VBox([comment_html, dropdown])
        card.layout = widgets.Layout(
            border='1px solid #ddd',
            padding='10px',
            margin='0 0 12px 0'
        )

        comment_blocks.append(card)

    if not comment_blocks:
        # Nothing to show on this page (the unlabeled pool is empty).
        comment_blocks.append(widgets.HTML(value="<i>No unlabeled comments to show.</i>"))

    # Navigation buttons
    back_btn = widgets.Button(description="⬅️ Back")
    save_btn = widgets.Button(description="💾 Save")
    next_btn = widgets.Button(description="➡️ Next")

    def on_back(b):
        global current_index
        current_index = max(0, current_index - batch_size)
        show_next_batch()

    def on_next(b):
        global current_index
        # Stay on the last page instead of advancing past the end of the pool.
        if current_index + batch_size < len(unlabeled_df):
            current_index += batch_size
        show_next_batch()

    def on_save(b):
        save_progress()

    back_btn.on_click(on_back)
    save_btn.on_click(on_save)
    next_btn.on_click(on_next)

    nav = widgets.HBox([back_btn, save_btn, next_btn])
    display(widgets.VBox([progress_html] + comment_blocks + [nav]))

# === Launch
show_next_batch()

VBox(children=(HTML(value='<h4>🎯 Labeling Progress</h4><b>Chaos</b>: 23 / 21 &nbsp; (0 left to do)<br><b>Ubunt…

In [5]:
import pandas as pd

# === Paths
dodo_path = "../data/ubuntu/dodo/benchmark_manual_labeled.csv"
main_path = "../notebooksV2/benchmark_manual_labeled.csv"

# === Load both
# comment_id values are ~19-digit integers, which float64 cannot represent
# exactly. If one file stores ids in float form, concatenating it with integer
# ids upcasts the whole column to float64 and rewrites every id in main with
# rounded digits. Reading ids as text keeps each one exactly as stored on disk.
# NOTE(review): the Dodo pool preview shows ids like 7.46398e+18, so those ids
# look already rounded upstream — confirm and fix where the raw pool is read.
id_as_text = {"comment_id": str}
main_df = pd.read_csv(main_path, dtype=id_as_text)
dr_df = pd.read_csv(dodo_path, dtype=id_as_text)  # `dr_df` holds the Dodo labels

# === Remove duplicates from main (if same comment_id exists in dr_df)
# Rows from the Dodo file win over rows already in main with the same id,
# which also makes re-running this cell safe.
merged_df = pd.concat([
    main_df[~main_df['comment_id'].isin(dr_df['comment_id'])],
    dr_df
], ignore_index=True)

# === Save merged result back to main
merged_df.to_csv(main_path, index=False)
print("✅ Dodo labels successfully merged into main.")
print(f"📦 New total: {len(merged_df)} rows")

✅ Dodo labels successfully merged into main.
📦 New total: 935 rows


In [6]:
# === Count labels in merged main
# Tally how many rows carry each label in the frame produced by the merge.
label_column = merged_df['manual_label_new']
counts = label_column.value_counts()
print("📊 Label counts in MAIN benchmark_manual_labeled.csv:\n", counts, sep="\n")

📊 Label counts in MAIN benchmark_manual_labeled.csv:

manual_label_new
Chaos     325
Middle    323
Ubuntu    287
Name: count, dtype: int64


### Yerik

In [1]:
import pandas as pd

# Load the full Yerik comment pool
yerik_raw = pd.read_csv("../data/ubuntu/yerik/yerik_combined_clean.csv")

# Clean and prep: keep rows whose original text is present and not blank,
# add an empty label column, and renumber the rows from 0.
text_column = yerik_raw['original_text']
has_text = text_column.notna() & (text_column.str.strip() != "")
yerik_clean = (
    yerik_raw
    .loc[has_text]
    .assign(manual_label_new="")
    .reset_index(drop=True)
)

print(f"✅ Yerik pool cleaned with {len(yerik_clean)} comments ready for labeling.")
yerik_clean.head()

✅ Yerik pool cleaned with 3237 comments ready for labeling.


Unnamed: 0,video_id,comment_id,reply_to_comment_id,original_text,translated_text,emoji_only,reply_to_original_text,manual_label_new
0,https://www.tiktok.com/@yerik.jomei/video/7390...,7390672468887094021,,Cause am a diabetes ☠️🥲🤣😅🤣,Cause am a diabetes,☠🥲🤣😅🤣,,
1,https://www.tiktok.com/@yerik.jomei/video/7390...,7390703669787230981,,we can use the favourite colour as a guide to ...,we can use the favourite colour as a guide to ...,🤭😭,,
2,https://www.tiktok.com/@yerik.jomei/video/7390...,7390117155025388294,,The end is always beautiful 🥰,The end is always beautiful,🥰,,
3,https://www.tiktok.com/@yerik.jomei/video/7390...,7390322606279738117,,was the backflip part necessary 😂😂😂,was the backflip part necessary,😂😂😂,,
4,https://www.tiktok.com/@yerik.jomei/video/7390...,7390240002872001286,,"“…are yu a cancer, coz um a diabetes”😆😂💔","are yu a cancer, coz um a diabetes",😆😂💔,,


In [None]:
import os
import ipywidgets as widgets
from IPython.display import display, clear_output

# === Config for Yerik ===
SAVE_PATH = "../data/ubuntu/yerik/benchmark_manual_labeled.csv"  # per-creator file holding saved labels
categories = ["Chaos", "Ubuntu"]  # labels that are saved and counted toward the targets
label_target_per_category = {"Chaos": 0, "Ubuntu": 36}  # ← adjust if needed
batch_size = 10  # number of comments rendered per page of the labeling UI
current_index = 0  # offset of the current page within the unlabeled pool

# === Load cleaned pool
df = yerik_clean.copy()
df['manual_label_new'] = df.get('manual_label_new', '')  # Ensure column exists

# === Load existing labels from save file
# Only a missing file means "nothing saved yet". Any other problem (unreadable
# CSV, missing column) is raised instead of being reported as a missing file.
# Labels are restored through a lookup rather than an in-place
# set_index / update / reset_index sequence, so `df` always keeps its
# `comment_id` column, even if restoring fails part-way.
if os.path.exists(SAVE_PATH):
    labeled_df = pd.read_csv(SAVE_PATH)
    saved_labels = (
        labeled_df
        .drop_duplicates("comment_id", keep="last")  # the lookup needs unique ids
        .set_index("comment_id")['manual_label_new']
        .dropna()
    )
    restored = df['comment_id'].map(saved_labels)
    # Keep the current value wherever the save file has no label for that id.
    df['manual_label_new'] = restored.where(restored.notna(), df['manual_label_new'])
    print("✅ Existing labels loaded.")
else:
    print("⚠️ No existing label file found.", SAVE_PATH)

# === Helper: Get only unlabeled rows
def get_unlabeled_subset():
    """Return the rows of `df` that still need a decision, re-indexed from 0.

    Rows whose `manual_label_new` is "Chaos", "Ubuntu" or "Discard" are
    considered done; anything else, including the empty placeholder, is
    returned as still unlabeled.
    """
    done_labels = ["Chaos", "Ubuntu", "Discard"]
    is_done = df['manual_label_new'].isin(done_labels)
    remaining = df.loc[~is_done]
    return remaining.reset_index(drop=True)

# Snapshot of the pending pool, taken once when this cell runs.
unlabeled_df = get_unlabeled_subset()

# === Helper: Count current labels from saved CSV
def get_label_counts_from_csv():
    """Return per-label counts read from the file at SAVE_PATH.

    The counts reflect what is on disk, not the in-memory `df`, so unsaved
    choices are not included. An empty Series is returned when the file does
    not exist.
    """
    if not os.path.exists(SAVE_PATH):
        return pd.Series(dtype=int)
    saved_rows = pd.read_csv(SAVE_PATH)
    label_column = saved_rows['manual_label_new']
    return label_column.value_counts()

# === Save Chaos/Ubuntu only
def save_progress():
    """Write every row of `df` labeled with one of `categories` to SAVE_PATH.

    Each call overwrites the file. Only the id, the two text columns and the
    label are written; rows with any other label value are skipped.
    """
    export_columns = ['comment_id', 'original_text', 'translated_text', 'manual_label_new']
    keep_mask = df['manual_label_new'].isin(categories)
    df.loc[keep_mask, export_columns].to_csv(SAVE_PATH, index=False)
    print("✅ Saved Chaos/Ubuntu labels to:", SAVE_PATH)

# === Show UI
def show_next_batch():
    """Render the current page of unlabeled comments with label dropdowns.

    Displays per-category progress (counted from the saved CSV), one card per
    comment in rows `current_index` to `current_index + batch_size` of
    `unlabeled_df`, and Back / Save / Next buttons. Picking a dropdown value
    writes straight into `df['manual_label_new']`; nothing is written to disk
    until Save is clicked.
    """
    global current_index, unlabeled_df
    # Local import: comment text is user-generated and gets embedded in HTML.
    from html import escape

    clear_output(wait=True)

    label_options = ["", "Chaos", "Ubuntu", "Discard"]

    # Show progress
    counts = get_label_counts_from_csv()
    progress_lines = []
    for cat in categories:
        done = counts.get(cat, 0)
        target = label_target_per_category[cat]
        left = max(0, target - done)
        progress_lines.append(f"<b>{cat}</b>: {done} / {target} &nbsp; ({left} left to do)")

    progress_html = widgets.HTML(
        value="<h4>🎯 Labeling Progress</h4>" + "<br>".join(progress_lines)
    )

    def make_handler(idx):
        """Build a dropdown observer that stores the chosen label in df row `idx`."""
        def on_change(change):
            if change['type'] == 'change' and change['name'] == 'value':
                df.at[idx, 'manual_label_new'] = change['new']
        return on_change

    # Comment UI blocks
    comment_blocks = []
    end_index = min(current_index + batch_size, len(unlabeled_df))

    for i in range(current_index, end_index):
        row = unlabeled_df.iloc[i]
        # unlabeled_df was re-indexed, so map the row back to df via its comment id.
        df_idx = df[df['comment_id'] == row['comment_id']].index[0]

        # Fall back to the blank option if the stored value is not a valid choice
        # (the Dropdown raises on a value outside its options).
        stored_label = df.at[df_idx, 'manual_label_new']
        dropdown = widgets.Dropdown(
            options=label_options,
            value=stored_label if stored_label in label_options else "",
            description="Label:",
            layout=widgets.Layout(width='250px')
        )
        dropdown.observe(make_handler(df_idx))

        # Escape before interpolating so markup inside a comment is shown as
        # text instead of being rendered by the HTML widget.
        safe_id = escape(str(row['comment_id']))
        safe_original = escape(str(row['original_text']))
        safe_translated = escape(str(row['translated_text']))

        comment_html = widgets.HTML(
            value=f"""
            <b>Comment ID:</b> {safe_id}<br><br>
            <b>Original:</b><br>{safe_original}<br><br>
            <b>Translated:</b><br>{safe_translated}<br>
            """
        )

        card = widgets.VBox([comment_html, dropdown])
        card.layout = widgets.Layout(
            border='1px solid #ddd',
            padding='10px',
            margin='0 0 12px 0'
        )

        comment_blocks.append(card)

    if not comment_blocks:
        # Nothing to show on this page (the unlabeled pool is empty).
        comment_blocks.append(widgets.HTML(value="<i>No unlabeled comments to show.</i>"))

    # Navigation buttons
    back_btn = widgets.Button(description="⬅️ Back")
    save_btn = widgets.Button(description="💾 Save")
    next_btn = widgets.Button(description="➡️ Next")

    def on_back(b):
        global current_index
        current_index = max(0, current_index - batch_size)
        show_next_batch()

    def on_next(b):
        global current_index
        # Stay on the last page instead of advancing past the end of the pool.
        if current_index + batch_size < len(unlabeled_df):
            current_index += batch_size
        show_next_batch()

    def on_save(b):
        save_progress()

    back_btn.on_click(on_back)
    save_btn.on_click(on_save)
    next_btn.on_click(on_next)

    nav = widgets.HBox([back_btn, save_btn, next_btn])
    display(widgets.VBox([progress_html] + comment_blocks + [nav]))

# === Launch
show_next_batch()

VBox(children=(HTML(value='<h4>🎯 Labeling Progress</h4><b>Chaos</b>: 0 / 0 &nbsp; (0 left to do)<br><b>Ubuntu<…

In [3]:
import pandas as pd

# === Paths
yerik_path = "../data/ubuntu/yerik/benchmark_manual_labeled.csv"
main_path = "../notebooksV2/benchmark_manual_labeled.csv"

# === Load both
# comment_id values are ~19-digit integers, which float64 cannot represent
# exactly. If either file stores ids in float form, concatenating it with
# integer ids upcasts the whole column to float64 and rewrites every id in
# main with rounded digits. Reading ids as text keeps each one exactly as
# stored on disk.
id_as_text = {"comment_id": str}
main_df = pd.read_csv(main_path, dtype=id_as_text)
dr_df = pd.read_csv(yerik_path, dtype=id_as_text)  # `dr_df` holds the Yerik labels

# === Remove duplicates from main (if same comment_id exists in dr_df)
# Rows from the Yerik file win over rows already in main with the same id,
# which also makes re-running this cell safe.
merged_df = pd.concat([
    main_df[~main_df['comment_id'].isin(dr_df['comment_id'])],
    dr_df
], ignore_index=True)

# === Save merged result back to main
merged_df.to_csv(main_path, index=False)
print("✅ Yerik labels successfully merged into main.")
print(f"📦 New total: {len(merged_df)} rows")

✅ Yerik labels successfully merged into main.
📦 New total: 969 rows


In [4]:
# === Count labels in merged main
# Tally how many rows carry each label in the frame produced by the merge.
label_column = merged_df['manual_label_new']
counts = label_column.value_counts()
print("📊 Label counts in MAIN benchmark_manual_labeled.csv:\n", counts, sep="\n")

📊 Label counts in MAIN benchmark_manual_labeled.csv:

manual_label_new
Chaos     325
Middle    323
Ubuntu    321
Name: count, dtype: int64


In [1]:
import pandas as pd

# === Load your final main benchmark file
main_path = "../notebooksV2/benchmark_manual_labeled.csv"
# Read ids as text: they are ~19-digit integers, and if the column were
# inferred as float64 the ids written to the balanced file would be rounded.
df = pd.read_csv(main_path, dtype={"comment_id": str})

# === Filter valid labels only
valid_labels = ["Chaos", "Ubuntu", "Middle"]
df = df[df['manual_label_new'].isin(valid_labels)]

# === Downsample each label to a common size (at most 320)
# Sampling a fixed 320 raises as soon as one label has fewer rows, so the
# per-label size is capped by the smallest class. With at least 320 rows in
# every class the result is the same as sampling 320 from each.
target_per_label = 320
available = df['manual_label_new'].value_counts().reindex(valid_labels, fill_value=0)
n_per_label = min(target_per_label, int(available.min()))
if n_per_label == 0:
    raise ValueError(f"Cannot balance: at least one label has no rows. Counts: {available.to_dict()}")
if n_per_label < target_per_label:
    print(f"⚠️ Smallest label has only {n_per_label} rows; sampling {n_per_label} per label instead of {target_per_label}.")

balanced_df = pd.concat([
    df[df['manual_label_new'] == label].sample(n=n_per_label, random_state=42)
    for label in valid_labels
])

# === Shuffle for fairness
# Fixed seed keeps the row order reproducible between runs.
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# === Save balanced version
balanced_path = "../notebooksV2/benchmark_manual_labeled_balanced.csv"
balanced_df.to_csv(balanced_path, index=False)

print("✅ Saved balanced CSV to:", balanced_path)
print("\nNew label counts:")
print(balanced_df['manual_label_new'].value_counts())


✅ Saved balanced CSV to: ../notebooksV2/benchmark_manual_labeled_balanced.csv

New label counts:
manual_label_new
Middle    320
Ubuntu    320
Chaos     320
Name: count, dtype: int64
