In [1]:
# ✅ SOSNet Cleaning, Translation & Augmentation Pipeline
import os
import re
import torch
import random
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from concurrent.futures import ThreadPoolExecutor

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from slangdict import slangdict

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import malaya
from malaya.augmentation import abstractive

  from .autonotebook import tqdm as notebook_tqdm
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))


In [2]:
# === Step 1: Load SOSNet files ===
sosnet_dir = "C:/Users/MMU/Downloads/FYP_Prototype/FYP_Prototype/model/IEEE Big Data 2020 Cyberbullying Dataset"
file_map = {
    "8000age.txt": "age",
    "8000ethnicity.txt": "ethnicity",
    "8000gender.txt": "gender",
    "8000religion.txt": "religion",
    "8000other.txt": "other_cyberbullying",
    "8000notcb.txt": "not_cyberbullying"
}
data = []
for fname, label in file_map.items():
    with open(os.path.join(sosnet_dir, fname), "r", encoding="utf-8") as f:
        tweets = f.read().splitlines()
        data.extend([(tweet.strip(), label) for tweet in tweets if tweet.strip()])
df = pd.DataFrame(data, columns=["tweet_text", "cyberbullying_type"])

In [3]:
def normalize_slang(text):
    words = text.split()
    return " ".join([slangdict.get(w, w) for w in words])

def is_low_quality(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return True
    if len(text.split()) < 3:
        return True
    if len(re.findall(r'[a-zA-Z0-9]', text)) / max(len(text), 1) < 0.3:
        return True
    if not re.search(r'[a-zA-Z]', text):
        return True
    return False

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number'],
    annotate=set(),
    fix_html=True,
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=False,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons]
)

df["normalized_text"] = df["tweet_text"].apply(lambda t: normalize_slang(t.lower()))
df["is_low_quality"] = df["normalized_text"].apply(is_low_quality)
df_filtered = df[df["is_low_quality"] == False].copy()

print("✅ BEFORE Filtering:\n", df["cyberbullying_type"].value_counts())
print("✅ AFTER Filtering:\n", df_filtered["cyberbullying_type"].value_counts())

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...
✅ BEFORE Filtering:
 cyberbullying_type
other_cyberbullying    8006
gender                 8004
not_cyberbullying      8001
ethnicity              8000
religion               8000
age                    7999
Name: count, dtype: int64
✅ AFTER Filtering:
 cyberbullying_type
religion               7999
age                    7996
ethnicity              7996
gender                 7931
not_cyberbullying      7772
other_cyberbullying    7592
Name: count, dtype: int64


In [4]:
from tqdm import tqdm  # make sure this is imported

model_name = "mesolitica/translation-t5-base-standard-bahasa-cased"
trans_tokenizer = AutoTokenizer.from_pretrained(model_name)
trans_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trans_model = trans_model.to(device)

def split_text(text, chunk_size=100):
    sentences = sent_tokenize(text)
    chunks, current = [], ""
    for s in sentences:
        if len(current) + len(s) <= chunk_size:
            current += " " + s
        else:
            chunks.append(current.strip())
            current = s
    if current: chunks.append(current.strip())
    return chunks

def safe_translate_text(text, model, tokenizer, max_tokens=256, batch_size=64):
    try:
        chunks = split_text(str(text))
        if not chunks:
            return ""
        translated_chunks = []
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            inputs = tokenizer([f"terjemah ke Melayu: {chunk}" for chunk in batch],
                               return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            inputs.pop("token_type_ids", None)
            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=max_tokens, no_repeat_ngram_size=2)
            translated_chunks.extend([tokenizer.decode(t, skip_special_tokens=True) for t in outputs])
        return " ".join(translated_chunks)
    except Exception as e:
        print("[Translation Error]", str(e)[:100], "| Text:", text[:80])
        return ""

def fast_batch_translate(texts, model, tokenizer, max_tokens=256, batch_size=128):
    try:
        inputs = tokenizer([f"terjemah ke Melayu: {t}" for t in texts],
                           return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        inputs.pop("token_type_ids", None)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_tokens, no_repeat_ngram_size=2)
        return [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]
    except Exception as e:
        print("[Batch Translation Error]", str(e))
        return ["" for _ in texts]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# 🔍 Preview 10 translations
test_df = df_filtered.sample(5, random_state=42)
for orig in test_df["tweet_text"]:
    cleaned = " ".join(text_processor.pre_process_doc(orig))
    translated = safe_translate_text(cleaned, trans_model, trans_tokenizer)
    print(f"🔸 Original:   {orig}")
    print(f"🧽 Cleaned:    {cleaned}")
    print(f"🌐 Translated: {translated}\n{'-' * 80}")

🔸 Original:   "Two things that I just can't respect.... A fuck nigga and a lyin ass bitch" @YoungDolph DOLPHHHHH!!!
🧽 Cleaned:    " two things that i just can not respect . . . . a fuck nigga and a lyin ass bitch " <user> dolphhhhh ! ! !
🌐 Translated: Dua perkara yang saya tidak boleh hormati.. A fucknigga dan seorang lipin pantat bitch <user> Dolphhhhh!
--------------------------------------------------------------------------------
🔸 Original:   If a white man called me, a black woman, a negro, I would be offended. I would consider it a slur.
🧽 Cleaned:    if a white man called me , a black woman , a negro , i would be offended . i would consider it a slur .
🌐 Translated: Jika seorang lelaki kulit putih memanggil saya, wanita berkulit hitam, Saya akan menganggapnya sebagai omong kosong.
--------------------------------------------------------------------------------
🔸 Original:   @y_alibhai you smear and insult Jews and cravenly cheer on the faction of left wing useful idiots who hat

In [9]:
import os
import pandas as pd

# Look for the latest checkpoint
checkpoint_files = sorted(
    [f for f in os.listdir() if f.startswith("checkpoint_") and f.endswith(".csv")],
    key=lambda x: int(re.findall(r'\d+', x)[0])
)

if checkpoint_files:
    latest_checkpoint = checkpoint_files[-1]
    print(f"🔄 Resuming from {latest_checkpoint}")
    
    df_checkpoint = pd.read_csv(latest_checkpoint)
    translated_texts = df_checkpoint["translated"].tolist()
    cleaned_texts = df_checkpoint["cleaned"].tolist()
    processed_count = len(translated_texts)
else:
    print("🆕 No checkpoint found. Starting fresh.")
    translated_texts, cleaned_texts = [], []
    processed_count = 0


🔄 Resuming from checkpoint_12000.csv


In [10]:
df_checkpoint.to_csv(f"checkpoint_{len(translated_texts)}.csv", index=False, encoding="utf-8-sig")

In [11]:
from concurrent.futures import ThreadPoolExecutor

# Prepare lists
translated_texts, cleaned_texts = [], []
batch, cleaned_batch = [], []

# Language detection
lang_model = malaya.language_detection.fasttext(
    model='mesolitica/fasttext-language-detection-ms-id'
)

# 🔹 Stage 1: Translate all tweets first
batch_count = 0
checkpoint_interval = 2000  # every 2000 tweets

for tweet in tqdm(df_filtered["tweet_text"].iloc[processed_count:], desc="Translating Tweets", initial=processed_count, total=len(df_filtered)):
    cleaned = " ".join(text_processor.pre_process_doc(tweet))
    cleaned_batch.append(cleaned)
    batch.append(tweet)

    if len(batch) == 32:
        short_texts = [t for t in cleaned_batch if len(t.split()) < 100]
        long_texts = [t for t in cleaned_batch if len(t.split()) >= 100]

        fast_trans = fast_batch_translate(short_texts, trans_model, trans_tokenizer)
        with ThreadPoolExecutor(max_workers=4) as executor:
            slow_trans = list(executor.map(safe_translate_with_fallback, long_texts))

        translated_texts.extend(fast_trans + slow_trans)
        cleaned_texts.extend(short_texts + long_texts)

        batch_count += 1

        # 🛟 Auto-save every 2000 translated tweets
        if len(translated_texts) % checkpoint_interval == 0:
            df_checkpoint = pd.DataFrame({
                "cleaned": cleaned_texts,
                "translated": translated_texts
            })
            df_checkpoint.to_csv(f"checkpoint_{len(translated_texts)}.csv", index=False, encoding="utf-8-sig")
            print(f"💾 Checkpoint saved at {len(translated_texts)} tweets")

        batch, cleaned_batch = [], []
        torch.cuda.empty_cache()

# Final flush
if batch:  
    short_texts = [t for t in cleaned_batch if len(t.split()) < 100]
    long_texts = [t for t in cleaned_batch if len(t.split()) >= 100]

    fast_trans = fast_batch_translate(short_texts, trans_model, trans_tokenizer)
    slow_trans = [safe_translate_text(t, trans_model, trans_tokenizer) for t in long_texts]

    translated_texts.extend(fast_trans + slow_trans)
    cleaned_texts.extend(short_texts + long_texts)


Translating Tweets:  34%|███▍      | 16000/47286 [43:22<5:04:35,  1.71it/s]

💾 Checkpoint saved at 4000 tweets


Translating Tweets:  42%|████▏     | 20004/47286 [1:23:39<4:23:26,  1.73it/s]

💾 Checkpoint saved at 8000 tweets


Translating Tweets:  51%|█████     | 23968/47286 [2:01:23<3:52:15,  1.67it/s]

💾 Checkpoint saved at 12000 tweets


Translating Tweets:  59%|█████▉    | 28000/47286 [2:46:25<3:30:29,  1.53it/s]

💾 Checkpoint saved at 16000 tweets


Translating Tweets:  68%|██████▊   | 32000/47286 [3:34:28<1:44:58,  2.43it/s]

💾 Checkpoint saved at 20000 tweets


Translating Tweets:  76%|███████▌  | 36000/47286 [3:51:58<36:36,  5.14it/s]  

💾 Checkpoint saved at 24000 tweets


Translating Tweets:  85%|████████▍ | 39968/47286 [4:12:36<1:00:18,  2.02it/s]

💾 Checkpoint saved at 28000 tweets


Translating Tweets:  93%|█████████▎| 43968/47286 [4:31:53<23:26,  2.36it/s]  

💾 Checkpoint saved at 32000 tweets


Translating Tweets: 100%|██████████| 47286/47286 [4:47:53<00:00,  2.04it/s]


In [12]:

df_translated = pd.DataFrame({
    "cleaned_english": cleaned_texts,
    "translated_malay": translated_texts,
    "cyberbullying_type": df_filtered["cyberbullying_type"].tolist()[processed_count:]
})

df_translated.to_csv("clean_translated_full_v2.csv", index=False, encoding="utf-8-sig")
print("✅ Saved translated output to translated_malay_full.csv")


✅ Saved translated output to translated_malay_full.csv


In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# ✅ Load original filtered dataframe
df_full = df_filtered.copy()

# ✅ Step 1: Decide how many tweets are missing
missing_start = 0
missing_end = 12000

# ✅ Step 2: Select only missing tweets
df_missing = df_full.iloc[missing_start:missing_end]

# ✅ Step 3: Translate missing tweets
translated_texts = []
cleaned_texts = []
batch, cleaned_batch = [], []

def safe_translate_with_fallback(text):
    try:
        return safe_translate_text(text, trans_model, trans_tokenizer)
    except Exception as e:
        print(f"[Error] {e} | Text: {text[:80]}")
        return ""

def fast_batch_translate(texts, model, tokenizer, max_tokens=256, batch_size=64):
    translated = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        prompts = [f"terjemah ke Melayu: {t}" for t in batch]
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items() if k != "token_type_ids"}
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_tokens, no_repeat_ngram_size=2)
        translated.extend([tokenizer.decode(o, skip_special_tokens=True) for o in outputs])
    return translated

# ✅ Step 4: Translate
for tweet in tqdm(df_missing["tweet_text"], desc="Translating Missing Tweets"):
    cleaned = " ".join(text_processor.pre_process_doc(tweet))
    cleaned_batch.append(cleaned)
    batch.append(tweet)

    if len(batch) == 32:
        short_texts = [t for t in cleaned_batch if len(t.split()) < 100]
        long_texts = [t for t in cleaned_batch if len(t.split()) >= 100]

        fast_trans = fast_batch_translate(short_texts, trans_model, trans_tokenizer)
        with ThreadPoolExecutor(max_workers=4) as executor:
            slow_trans = list(executor.map(safe_translate_with_fallback, long_texts))

        translated_texts.extend(fast_trans + slow_trans)
        cleaned_texts.extend(short_texts + long_texts)

        batch, cleaned_batch = [], []
        torch.cuda.empty_cache()

# ✅ Final small batch if still left
if batch:
    short_texts = [t for t in cleaned_batch if len(t.split()) < 100]
    long_texts = [t for t in cleaned_batch if len(t.split()) >= 100]

    fast_trans = fast_batch_translate(short_texts, trans_model, trans_tokenizer)
    slow_trans = [safe_translate_text(t, trans_model, trans_tokenizer) for t in long_texts]

    translated_texts.extend(fast_trans + slow_trans)
    cleaned_texts.extend(short_texts + long_texts)

# ✅ Step 5: Save missing 1–12000 into DIFFERENT file
df_missing_translated = pd.DataFrame({
    "cleaned": cleaned_texts,
    "translated": translated_texts,
    "cyberbullying_type": df_missing["cyberbullying_type"].values   # ⬅️ Add label here
})


df_missing_translated.to_csv("translated_missing_1_to_12000.csv", index=False, encoding="utf-8-sig")
print("✅ Saved missing translations: translated_missing_1_to_12000.csv")


In [22]:
# Reload saved CSV
df_translated_only = pd.read_csv("translated_missing_1_to_12000.csv")

# Get original labels
df_labels_only = df_filtered.iloc[0:len(df_translated_only)]["cyberbullying_type"].reset_index(drop=True)

# Combine back properly
df_fixed = pd.DataFrame({
    "cleaned_english": df_translated_only["cleaned_english"],
    "translated_malay": df_translated_only["translated_malay"],
    "cyberbullying_type": df_labels_only
})

# Save fixed version
df_fixed.to_csv("translated_missing_1_to_12000_fixed.csv", index=False, encoding="utf-8-sig")
print(f"✅ Saved recovered file with labels: translated_missing_1_to_12000_fixed.csv")


✅ Saved recovered file with labels: translated_missing_1_to_12000_fixed.csv


In [24]:
# Load missing translation
df_missing = pd.read_csv("translated_missing_1_to_12000_fixed.csv")

# Load existing translation
df_existing = pd.read_csv("clean_translated_full_v2.csv")

# ✅ Optional but recommended: ensure same columns
assert list(df_existing.columns) == list(df_missing.columns), "❗ Column mismatch detected!"

# ✅ Merge (missing first, then existing)
df_combined = pd.concat([df_missing, df_existing], ignore_index=True)

# ✅ Save
df_combined.to_csv("clean_translated_combined.csv", index=False, encoding="utf-8-sig")

print("✅ Successfully merged! Saved to clean_translated_combined.csv")

✅ Successfully merged! Saved to clean_translated_combined.csv


In [25]:
# Load your merged file
df = pd.read_csv("merged_translated_all_v2.csv")

# Show class distribution
print("📊 Cyberbullying Type Distribution:\n")
print(df["cyberbullying_type"].value_counts())


📊 Cyberbullying Type Distribution:

cyberbullying_type
religion               7999
age                    7996
ethnicity              7996
gender                 7931
not_cyberbullying      7772
other_cyberbullying    7592
Name: count, dtype: int64


In [26]:
# 🔹 Load merged dataset
df = pd.read_csv("merged_translated_all_v2.csv")
translated_texts = df["translated_malay"].tolist()
original_labels = df["cyberbullying_type"].tolist()

# 🔹 Load Malaya augmenter
augmenter = abstractive.huggingface(
    model='mesolitica/translation-nanot5-small-malaysian-cased'
)

# 🔹 Define augmentation rules
augmentation_prob = {
    "gender": 0.3,               # 🔹 Mild boost
    "not_cyberbullying": 0.5,    # 🔸 Bigger boost for low recall
    "other_cyberbullying": 0.7   # 🔸 Largest boost to fix false positives
}

# 🔹 Run selective augmentation
augmented_texts = []
augmented_flags = []

for text, label in tqdm(zip(translated_texts, original_labels), total=len(df), desc="Selective Augmentation"):
    try:
        lang = lang_model.predict(text)[0].lower()
        prob = augmentation_prob.get(label, 0)

        if lang in ["standard-malay", "local-malay", "malay"] and random.random() < prob:
            result = augmenter.generate([text], to_lang="pasar ms", max_length=128)[0]
            if isinstance(result, list):
                augmented_texts.append(result[0])
            else:
                augmented_texts.append(result)
            augmented_flags.append(True)
        else:
            augmented_texts.append(None)
            augmented_flags.append(False)
    except Exception as e:
        print("[Aug Error]", e)
        augmented_texts.append(None)
        augmented_flags.append(False)

Selective Augmentation: 100%|██████████| 47286/47286 [1:29:50<00:00,  8.77it/s]  


In [27]:
# 🔹 Combine back into the original DataFrame
df["augmented_malay"] = augmented_texts
df["was_augmented"] = augmented_flags
df["final_text"] = df["augmented_malay"].combine_first(df["translated_malay"])

In [28]:
# 🔍 Optional: Show class distribution of augmented samples
print("\n✅ Augmentation done.")
print(df["cyberbullying_type"].value_counts())
print(df[df["was_augmented"] == True]["cyberbullying_type"].value_counts())


✅ Augmentation done.
cyberbullying_type
religion               7999
age                    7996
ethnicity              7996
gender                 7931
not_cyberbullying      7772
other_cyberbullying    7592
Name: count, dtype: int64
cyberbullying_type
other_cyberbullying    2396
not_cyberbullying      1925
gender                 1411
Name: count, dtype: int64


In [29]:
# 🔹 Save updated version
df.to_csv("augmented_merged_translated_all_v3.csv", index=False, encoding="utf-8-sig")

In [30]:
# Load full DataFrame with all sources
df = pd.read_csv("augmented_merged_translated_all_v3.csv")

# Tag each row with source based on where the text is available
records = []

for idx, row in df.iterrows():
    label = row["cyberbullying_type"]

    if pd.notna(row.get("cleaned_english")):
        records.append({"source": "english", "cyberbullying_type": label})
    if pd.notna(row.get("translated_malay")):
        records.append({"source": "translated", "cyberbullying_type": label})
    if pd.notna(row.get("augmented_malay")):
        records.append({"source": "augmented", "cyberbullying_type": label})

# Create DataFrame
df_lang_dist = pd.DataFrame(records)

# Group and count
lang_summary = df_lang_dist.groupby(["cyberbullying_type", "source"]).size().unstack(fill_value=0)

print("📊 Language breakdown per class:")
print(lang_summary)


📊 Language breakdown per class:
source               augmented  english  translated
cyberbullying_type                                 
age                          0     7996        7996
ethnicity                    0     7996        7996
gender                    1411     7931        7931
not_cyberbullying         1925     7772        7772
other_cyberbullying       2396     7592        7592
religion                     0     7999        7999


In [32]:
df = pd.read_csv("augmented_merged_translated_all_v3.csv")

final_rows = []

# Target total per class
target_total = 12000

for label in df["cyberbullying_type"].unique():
    df_label = df[df["cyberbullying_type"] == label]

    # 🔹 Separate sources
    eng_rows = df_label[df_label["cleaned_english"].notna()]
    trans_rows = df_label[df_label["translated_malay"].notna()]
    aug_rows = df_label[df_label["augmented_malay"].notna()]

    # 1️⃣ Take all augmented
    selected_aug = aug_rows

    # 2️⃣ Take half English
    selected_eng = eng_rows.sample(frac=0.5, random_state=42)

    # 3️⃣ Remaining needed
    remaining = target_total - len(selected_aug) - len(selected_eng)

    # 4️⃣ Take from translated (only how many available)
    if remaining > len(trans_rows):
        selected_trans = trans_rows  # take all available
    else:
        selected_trans = trans_rows.sample(n=remaining, random_state=42)


    # 🧩 Combine selected parts
    final_rows.extend([selected_aug, selected_eng, selected_trans])

# 🧹 Merge all
df_final = pd.concat(final_rows, ignore_index=True)

# 🛠️ Merge text columns into single 'tweet_text'
def combine_text(row):
    if pd.notna(row.get("augmented_malay")):
        return row["augmented_malay"]
    elif pd.notna(row.get("translated_malay")):
        return row["translated_malay"]
    elif pd.notna(row.get("cleaned_english")):
        return row["cleaned_english"]
    return None

df_final["tweet_text"] = df_final.apply(combine_text, axis=1)

# Keep only necessary columns
df_final = df_final[["tweet_text", "cyberbullying_type"]]

# Show final counts
print("✅ Final class distribution:")
print(df_final["cyberbullying_type"].value_counts())

✅ Final class distribution:
cyberbullying_type
gender                 12000
other_cyberbullying    12000
not_cyberbullying      12000
religion               11999
age                    11994
ethnicity              11994
Name: count, dtype: int64


In [33]:
df_final.to_csv("final_balanced_dataset_v3.csv", index=False, encoding="utf-8-sig")