# Loading precleaned datasets

In [1]:
import polars as pl
import numpy as np

In [2]:
# Chunk 1 of the precleaned data: fields are '~'-separated and double-quoted.
pre_1 = pl.read_csv(
    "Precleaned-Data/precleaned_chunk_1.csv",
    separator="~",
    quote_char='"',
)

In [3]:
# Preview the first rows of the loaded frame.
pre_1.head()

body,subreddit,body_cleaned
str,str,str
"""Brick""","""r-3amjokes""","""Brick"""
"""Ban wave #2 is complete, pleas…","""r-3amjokes""","""Ban wave #2 is complete, pleas…"
"""I'm not an insomniac, but I am…","""r-3amjokes""","""I'm not an insomniac, but I am…"
"""Hear you loud and clear, thank…","""r-3amjokes""","""Hear you loud and clear, thank…"
"""I had no idea this sub had mod…","""r-3amjokes""","""I had no idea this sub had mod…"


In [4]:
def polars_info(df: pl.DataFrame):
    """Print a quick overview of a Polars DataFrame.

    Three sections are written to stdout: the frame's shape, one line per
    column giving its name and dtype, and the frame returned by
    ``df.null_count()``.
    """
    print("Shape:", df.shape)

    print("\nSchema:")
    schema_lines = [f" - {column}: {kind}" for column, kind in df.schema.items()]
    for line in schema_lines:
        print(line)

    print("\nNull Counts:")
    null_counts = df.null_count()
    print(null_counts)


polars_info(pre_1)

Shape: (400000, 3)

Schema:
 - body: String
 - subreddit: String
 - body_cleaned: String

Null Counts:
shape: (1, 3)
┌──────┬───────────┬──────────────┐
│ body ┆ subreddit ┆ body_cleaned │
│ ---  ┆ ---       ┆ ---          │
│ u32  ┆ u32       ┆ u32          │
╞══════╪═══════════╪══════════════╡
│ 0    ┆ 0         ┆ 0            │
└──────┴───────────┴──────────────┘


# Funnelling special characters

In [5]:
import csv
import string
import unicodedata
import emoji

In [6]:
# Location of the CSV file whose cells are scanned for special characters.
csv_file_path = "Precleaned-Data/precleaned_chunk_1.csv"

# One accumulator per character category; each starts as its own empty set.
punctuations_found, emojis_found, special_chars_found = set(), set(), set()

# Collect the emojis in a string (whole emojis, even when they span several characters).
def extract_emojis(text):
    """Return the emojis found in ``text`` as a list of strings.

    Each entry is the ``"emoji"`` value of one match reported by
    ``emoji.emoji_list(text)``, kept in the order that call returns them.
    """
    found = []
    for match in emoji.emoji_list(text):
        found.append(match["emoji"])
    return found

# Scan every cell of the '~'-delimited, double-quoted CSV and sort its characters
# into the three accumulator sets (punctuation, emojis, other special characters).
with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='~', quotechar='"')
    for row in reader:
        for cell in row:
            # Whole emojis, including ones made of several code points.
            emojis = extract_emojis(cell)
            emojis_found.update(emojis)

            # Every code point that is part of some emoji in this cell. Comparing a
            # single character against the list of whole emoji strings only matches
            # one-code-point emojis, so the pieces of multi-code-point emojis
            # (variation selectors, zero-width joiners, skin-tone modifiers,
            # regional indicators) would otherwise be reported as special characters.
            emoji_chars = set("".join(emojis))

            for char in cell:
                if char in string.punctuation:
                    # ASCII punctuation as defined by string.punctuation.
                    punctuations_found.add(char)
                elif not char.isalnum() and not char.isspace() and char not in emoji_chars:
                    # Neither alphanumeric, whitespace, ASCII punctuation, nor emoji.
                    special_chars_found.add(char)

# Report each category: a heading line, then its members sorted and space-separated.
report_sections = (
    ("Punctuation marks found in the CSV:", punctuations_found),
    ("\nEmojis found in the CSV:", emojis_found),
    ("\nOther special characters found in the CSV:", special_chars_found),
)
for heading, collected in report_sections:
    print(heading)
    print(" ".join(sorted(collected)))


Punctuation marks found in the CSV:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~

Emojis found in the CSV:
0️⃣ 1️⃣ 3️⃣ 5️⃣ 6️⃣ 8️⃣ © ©️ ® ‼️ ⁉️ ™ ™️ ↔️ ↖️ ↗️ ↘️ ↙️ ⌚ ⌨ ⏰ ⏱️ ⏲️ ◀️ ☀️ ☄️ ☎️ ☕ ☘️ ☝ ☝️ ☝🏻 ☝🏽 ☝🏿 ☠ ☠️ ☪️ ☮️ ☹ ☹️ ☺ ☺️ ♀ ♀️ ♊ ♥ ♥️ ♻️ ♾️ ♿ ⚒ ⚓ ⚔️ ⚜️ ⚠️ ⚡ ⚪ ⚫ ⚰ ⚰️ ⚱️ ⚽ ⛏️ ⛑ ⛑️ ⛓️ ⛰ ⛱️ ⛳ ⛸️ ⛹️‍♂️ ⛺ ⛽ ✂️ ✅ ✈️ ✊ ✊🏼 ✊🏽 ✊🏾 ✊🏿 ✋ ✋🏻 ✋🏼 ✋🏽 ✌ ✌️ ✌🏻 ✌🏼 ✌🏽 ✌🏿 ✍️ ✏️ ✔ ✔️ ✖️ ✝️ ✡️ ✨ ✴ ❄️ ❌ ❎ ❓ ❔ ❗ ❣️ ❤ ❤️ ❤️‍🔥 ❤️‍🩹 ➗ ➡️ ⤴️ ⤵️ ⬅️ ⬆️ ⬇️ ⬛ ⭐ 🃏 🅰 🅰️ 🅱 🅱️ 🅾 🅾️ 🅿 🆎 🆒 🆗 🇦🇩 🇦🇲 🇦🇷 🇦🇹 🇦🇺 🇧🇪 🇧🇬 🇧🇱 🇧🇷 🇨🇦 🇨🇭 🇨🇮 🇨🇱 🇨🇳 🇨🇷 🇨🇺 🇨🇿 🇩🇪 🇩🇰 🇩🇴 🇪🇸 🇪🇺 🇫🇮 🇫🇷 🇬🇧 🇬🇱 🇬🇷 🇭🇺 🇮🇪 🇮🇱 🇮🇳 🇮🇶 🇮🇷 🇮🇹 🇯🇲 🇯🇵 🇰🇵 🇰🇷 🇱🇧 🇱🇷 🇱🇹 🇲🇨 🇲🇪 🇲🇽 🇲🇾 🇳🇱 🇳🇴 🇳🇿 🇵🇱 🇵🇸 🇵🇹 🇷🇴 🇷🇺 🇸🇪 🇸🇯 🇸🇲 🇹🇩 🇹🇬 🇹🇷 🇺🇦 🇺🇲 🇺🇳 🇺🇸 🇻🇪 🇻🇮 🇻🇳 🇾🇪 🇿🇦 🇿🇼 🌀 🌃 🌅 🌈 🌊 🌌 🌍 🌎 🌓 🌕 🌖 🌙 🌚 🌝 🌞 🌟 🌠 🌥️ 🌧 🌩 🌩️ 🌪 🌪️ 🌬️ 🌭 🌮 🌱 🌲 🌳 🌴 🌵 🌶️ 🌷 🌸 🌹 🌺 🌼 🌽 🌿 🍀 🍁 🍃 🍄 🍅 🍆 🍇 🍈 🍉 🍊 🍋 🍌 🍍 🍎 🍐 🍑 🍒 🍔 🍕 🍖 🍗 🍝 🍞 🍟 🍠 🍤 🍨 🍩 🍪 🍫 🍬 🍭 🍰 🍳 🍴 🍵 🍷 🍸 🍹 🍺 🍻 🍼 🍽️ 🍾 🍿 🎀 🎁 🎂 🎃 🎅 🎆 🎇 🎈 🎉 🎊 🎒 🎓 🎖️ 🎣 🎤 🎥 🎧 🎨 🎩 🎪 🎭 🎮 🎯 🎰 🎱 🎳 🎵 🎶 🎷 🎸 🎺 🎻 🎼 🏁 🏃 🏃‍♀️ 🏃‍♂️ 🏃‍♂️‍➡️ 🏃🏽‍♀️‍➡️ 🏄‍♀️ 🏄🏽‍♀️ 🏅 🏆 🏇 🏈 🏉 🏋 🏌️‍♀️ 🏍 🏍️ 🏖️ 🏞️ 

# API meta-tagging

In [1]:
import os
import time
import polars as pl
from google import genai

# SECURITY: never commit an API key in a notebook. The key is read from the
# GEMINI_API_KEY environment variable instead; the key that used to be hardcoded
# on this line has been exposed and should be revoked and replaced.
_gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this notebook."
    )
client = genai.Client(api_key=_gemini_api_key)

In [4]:
# Load the cleaned comment bodies; fall back to an empty list if anything goes wrong
# while reading the file or extracting the column.
try:
    df = pl.read_csv(
        "Precleaned-Data/precleaned_chunk_1.csv",
        separator="~",
        quote_char='"',
    )
    body_column = df["body_cleaned"]
    texts = body_column.to_list()
    print(f"Loaded {len(texts)} reviews from file.")
except Exception as read_error:
    texts = []
    print(f"Failed to read input CSV: {read_error}")

# Configuration
batch_size = 20  # number of reviews sent to the model per request
total = len(texts)
models_to_try = ["gemini-2.5-flash", "gemini-2.5-pro"]  # tried in this order for each batch
output_file = "labelled_output_chunk_1.csv"
# The output file is opened in append mode, so a header is only needed when no
# earlier run has written anything. An existing but empty file still needs one,
# so check the size as well as existence.
header_written = os.path.exists(output_file) and os.path.getsize(output_file) > 0

# Header the model is told to emit and the fallback written to the output file.
# The prompt asks for "⁓"-separated fields, so the header uses that separator too
# instead of contradicting the instruction with a comma-separated example.
expected_header = '"text"⁓"label"⁓"tone"⁓"context"⁓"contains_slang"⁓"complexity"⁓"causal"⁓"contains_gibberish"'


def _strip_code_fences(text):
    """Return ``text`` without a surrounding Markdown code fence.

    If the first line starts with three backticks it is dropped, together with
    a closing line consisting only of three backticks. Text that does not open
    with a fence is returned unchanged apart from outer whitespace stripping.
    """
    lines = text.strip().split("\n")
    if lines[0].lstrip().startswith("```"):
        lines = lines[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
    return "\n".join(lines).strip()


def _build_prompt(reviews_block):
    """Return the classification prompt with ``reviews_block`` appended as the reviews."""
    return f'''
Classify each review using the schema and rules below.

**Output Format**
- CSV format
- Separator: ⁓
- Quote character: "
- Column order: "text","label","tone","context","contains_slang","complexity","causal","contains_gibberish"

**Column Definitions**
- **text**: Original review (unchanged — preserve punctuation, emojis, and casing).
- **label**: One of "positive", "neutral", or "negative".
- **tone**: One of: "sincere", "sarcastic", or "neutral".
- **context**: Always "comment" or "review".
- **contains_slang**: "true" if slang, vulgarity, emojis with sexual tone, wordplay involving body parts, or innuendo is used; otherwise "false".
- **complexity**: "low", "medium", or "high".
- **causal**: "true" if reasoning, explanation, cause-effect, blame, or justification exists; else "false".
- **contains_gibberish**: "true" if the text contains random characters, non-words, nonsensical phrases, or emoji-only content that lacks structure or meaning; otherwise "false".

**Annotation Rules**
1. Never alter the original review text.
2. Gibberish or emoji-only → `contains_gibberish=true` and `tone=sarcastic`
3. Puns, dirty jokes → `contains_slang=true`, `tone=sarcastic`
4. Emoji-only texts → infer sentiment unless meaningless
5. Sarcasm → `tone=sarcastic`, complexity ≥ "medium"
6. Reasoning or cause → `causal=true`

Start your output with this header:
{expected_header}

Reviews:
{reviews_block}
'''


# NOTE(review): re-running this cell starts again at batch 1 and appends to the same
# output file, so rows from an interrupted run get duplicated. Consider persisting
# the index of the last completed batch and resuming from it.
if not texts:
    print("No texts found to process. Exiting.")
else:
    total_batches = (total + batch_size - 1) // batch_size
    print(f"📦 Starting processing: {total_batches} batches of up to {batch_size} reviews each.")
    failed_batches = []  # batch numbers for which no model returned usable output

    for batch_num, i in enumerate(range(0, total, batch_size), start=1):
        print(f"\nBatch {batch_num}/{total_batches} — Processing reviews {i + 1} to {min(i + batch_size, total)}")
        batch_start_time = time.time()

        # A None entry would make str.join raise TypeError and abort the whole run.
        batch_reviews = [review for review in texts[i:i + batch_size] if review is not None]
        if not batch_reviews:
            print(f"Skipping batch {batch_num}: it contains no text.")
            continue

        prompt = _build_prompt("\n".join(batch_reviews))

        raw_output = ""
        success = False
        model_start_time = time.time()

        # Try each model in order; the first one that returns non-empty text wins.
        for model_name in models_to_try:
            try:
                print(f"🧪 Trying model: {model_name}")
                response = client.models.generate_content(
                    model=model_name,
                    contents=prompt
                )

                if hasattr(response, 'text') and response.text:
                    raw_output = response.text
                elif getattr(response, 'candidates', None):
                    raw_output = response.candidates[0].content.parts[0].text
                else:
                    print(f"⚠️ {model_name} returned no usable content.")
                    continue

                # Drop a Markdown fence around the CSV so it never reaches the file.
                raw_output = _strip_code_fences(raw_output)
                if not raw_output:
                    print(f"⚠️ {model_name} returned no usable content.")
                    continue

                print(f"Model {model_name} succeeded.")
                success = True
                break

            except Exception as e:
                print(f"Error using model {model_name}: {e}")
                continue

        model_duration = round(time.time() - model_start_time, 2)

        if not success:
            print(f"Skipping batch {batch_num} due to total model failure.")
            failed_batches.append(batch_num)
            continue

        print(f"Output generated in {model_duration}s")
        print("Output Preview:\n", raw_output[:300])
        print("-" * 80)

        # Separate the model's header line (if it sent one) from the data rows.
        rows = raw_output.split("\n")
        model_header = None
        if rows[0].startswith('"text"'):
            model_header, rows = rows[0], rows[1:]

        # Writing output to file
        try:
            with open(output_file, "a", encoding="utf-8") as f:
                if not header_written:
                    # Write exactly one header per file: the model's own when it
                    # sent one, otherwise the expected header, so a header-less
                    # first reply cannot leave the file without column names.
                    f.write((model_header or expected_header) + "\n")
                    header_written = True
                if rows:
                    # Skipped for a header-only reply to avoid appending a blank line.
                    f.write("\n".join(rows) + "\n")

            print(f"Batch {batch_num} written to {output_file}")
        except Exception as e:
            print(f"Failed to write to output file: {e}")

        batch_total_time = round(time.time() - batch_start_time, 2)
        print(f"Batch {batch_num} completed in {batch_total_time}s")

        # Pause after every fifth batch, except after the final one.
        if batch_num % 5 == 0 and (i + batch_size) < total:
            print("Sleeping for 5 seconds to prevent overload...")
            time.sleep(5)

    if failed_batches:
        print(f"{len(failed_batches)} batch(es) produced no output and were skipped: {failed_batches}")

📄 Loaded 400000 reviews from file.
📦 Starting processing: 20000 batches of up to 20 reviews each.

🚀 Batch 1/20000 — Processing reviews 1 to 20
🧪 Trying model: gemini-2.5-flash
✅ Model gemini-2.5-flash succeeded.
⏱️ Output generated in 23.47s
🔍 Output Preview:
 "text"⁓"label"⁓"tone"⁓"context"⁓"contains_slang"⁓"complexity"⁓"causal"⁓"contains_gibberish"
"Brick"⁓"neutral"⁓"neutral"⁓"comment"⁓"false"⁓"low"⁓"false"⁓"false"
"Ban wave #2 is complete, please keep reporting the shitheels so i can be entertained by sob stories in modmail. > Bans due to rule #4 tend 
--------------------------------------------------------------------------------
💾 Batch 1 written to labelled_output.csv
✅ Batch 1 completed in 23.47s

🚀 Batch 2/20000 — Processing reviews 21 to 40
🧪 Trying model: gemini-2.5-flash
✅ Model gemini-2.5-flash succeeded.
⏱️ Output generated in 30.22s
🔍 Output Preview:
 "text"⁓"label"⁓"tone"⁓"context"⁓"contains_slang"⁓"complexity"⁓"causal"⁓"contains_gibberish"
"Just getting your wanks in 

KeyboardInterrupt: 