<a href="https://colab.research.google.com/github/JagrutiKate47/Malicious-Emoji-Detection/blob/main/MaliciousandBenignEmojis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
pip install pandas datasets emoji transformers



In [15]:
import pandas as pd
import random
import unicodedata
import re
import emoji # Make sure you have this installed: pip install emoji
from transformers import AutoTokenizer
import os # For checking if the file exists

print("--- Phase 1: Deep Understanding and Data Strategy ---")
print("--- Step 1: Obtaining Clean, Legitimate Text with Diverse Emoji Usage (Benign Data from your Downloaded File) ---")

# --- 1.1. Specify your local dataset file path ---
# <<< IMPORTANT: ENSURE YOU HAVE DOWNLOADED 'emoji-data.csv' FROM KAGGLE
# <<< AND THAT THIS PATH IS CORRECT. IF YOUR FILE HAS A DIFFERENT NAME, CHANGE IT HERE.
LOCAL_DATASET_FILE = '/content/drive/MyDrive/archive (1)/emoji_df.csv'

# Verify the file exists before proceeding
if not os.path.exists(LOCAL_DATASET_FILE):
    raise FileNotFoundError(f"Error: The file '{LOCAL_DATASET_FILE}' was not found. "
                            f"Please download it from https://www.kaggle.com/datasets/eliasdabbas/emoji-data-descriptions-codepoints/data "
                            f"and place it in the same directory as this script, or update the 'LOCAL_DATASET_FILE' path.")

print(f"Attempting to load data from your local file: '{LOCAL_DATASET_FILE}'...")
clean_texts_base_for_manipulation = []

try:
    df_raw_emojis = pd.read_csv(LOCAL_DATASET_FILE)
    print(f"Successfully loaded {len(df_raw_emojis)} entries from '{LOCAL_DATASET_FILE}'.")
    print(f"Columns in the dataframe: {df_raw_emojis.columns}")

    # --- 1.2. Preprocessing and constructing emoji-rich base sentences ---
    # The Kaggle dataset 'emoji-data-descriptions-codepoints' has 'Emoji' and 'Description' columns.
    # We will combine these to create sentences that explicitly contain the emoji.

    def preprocess_text_part(text):
        if not isinstance(text, str): # Handle potential non-string entries
            return ""
        text = text.strip()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+|#\w+', '', text)
        text = unicodedata.normalize('NFKC', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    print("Constructing benign base sentences by combining 'emoji' and 'name' from your dataset...")
    for index, row in df_raw_emojis.iterrows():
        current_emoji = preprocess_text_part(row.get('emoji', ''))
        description = preprocess_text_part(row.get('name', ''))

        if current_emoji and description:
            # Create a few sentence patterns to vary the benign context
            sentence_patterns = [
                f"{description} {current_emoji}.",
                f"This is a {description} emoji: {current_emoji}.",
                f"It's a {description} {current_emoji} and it looks great.",
                f"The symbol {current_emoji} represents {description}.",
                f"{description}, like {current_emoji}, is common."
            ]
            clean_texts_base_for_manipulation.append(random.choice(sentence_patterns))
        elif current_emoji: # Fallback if only emoji is available, but description is missing
            clean_texts_base_for_manipulation.append(f"Emoji: {current_emoji}")

    # Filter to ensure final base texts genuinely contain a Unicode emoji
    # This uses the robust `emoji` library's detection.
    clean_texts_final_for_manipulation = [
        text for text in clean_texts_base_for_manipulation if emoji.emoji_count(text) > 0
    ]

    # Remove any duplicate base sentences that might have been generated
    clean_texts_final_for_manipulation = list(set(clean_texts_final_for_manipulation))

    if not clean_texts_final_for_manipulation:
        raise RuntimeError("CRITICAL: No valid texts with Unicode emojis could be generated from your provided dataset. "
                           "Please ensure the CSV has 'emoji' and 'name' columns with actual Unicode emojis.")

    print(f"Constructed {len(clean_texts_final_for_manipulation)} unique emoji-rich texts for manipulation.")

except Exception as e:
    raise RuntimeError(f"Error loading or processing your local dataset '{LOCAL_DATASET_FILE}': {e}. "
                       f"Please check the file name, its format (should be CSV), and column headers ('emoji', 'name').")


print(f"First 3 examples of base texts (from your downloaded database, containing emojis):")
for i, text in enumerate(clean_texts_final_for_manipulation[:3]):
    print(f"  {i+1}. '{text}'")


### --- Step 2: Develop a Robust Synthetic Data Generation Pipeline for Malicious Emojis ---

print("\n--- Step 2: Developing Synthetic Data Generation Pipeline for Malicious Emojis ---")

# --- 2.1. Define Malicious Character Pools ---
ZERO_WIDTH_CHARS = [
    '\u200B', # Zero Width Space
    '\u200C', # Zero Width Non-Joiner
    '\u200D', # Zero Width Joiner (can be benign, but used maliciously to disrupt or force joins)
    '\uFEFF', # Byte Order Mark (sometimes used as ZWS)
    '\u00AD', # Soft Hyphen (invisible unless line breaks)
    '\u0009', # Tab (can be visually ambiguous in some contexts)
]

CONTROL_CHARS = [
    '\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007',
    '\u0008', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013', '\u0014',
    '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C',
    '\u001D', '\u001E', '\u001F', '\u007F', # DEL
    '\u0080', '\u0081', '\u0082', '\u0083', '\u0084', '\u0085', '\u0086', '\u0087',
    '\u0088', '\u0089', '\u008A', '\u008B', '\u008C', '\u008D', '\u008E', '\u008F',
    '\u0090', '\u0091', '\u0092', '\u0093', '\u0094', '\u0095', '\u0096', '\u0097',
    '\u0098', '\u0099', '\u009A', '\u009B', '\u009C', '\u009D', '\u009E', '\u009F' # C1 Control codes
]

HOMOGLYPH_MAP = {
    'a': ['\u0430', '\uFF41', '\u1D00', '\u2C65'], 'b': ['\u0432', '\uFF42'],
    'c': ['\u0441', '\uFF43', '\u2C61'], 'e': ['\u0435', '\uFF45', '\u0454', '\u03B5'],
    'f': ['\u0493', '\uFF46'], 'g': ['\u0433', 'ｇ'], 'h': ['һ', 'ｈ'],
    'i': ['і', 'ｉ', 'I', 'ī'], 'j': ['ј', 'ｊ'], 'k': ['к', 'ｋ'], 'l': ['л', 'ｌ', 'I'],
    'm': ['м', 'ｍ'], 'n': ['н', 'ｎ'],
    'o': ['о', 'ｏ', 'ο', 'Ｏ'], 'p': ['р', 'ｐ', 'ᴩ'],
    'q': ['я', 'ｑ'], 'r': ['г', 'ｒ', 'я'],
    's': ['ѕ', 'ｓ'], 't': ['т', 'ｔ'], 'u': ['ц', 'ｕ'],
    'v': ['ѵ', 'ｖ'], 'w': ['Ш', 'ｗ'], 'x': ['х', 'ｘ'],
    'y': ['у', 'ｙ'], 'z': ['з', 'ｚ'],
    '0': ['О', 'о'], '1': ['l', 'L', '１'],
}

# --- Helper Functions for Emoji/Word Detection ---

def get_emoji_indices(text):
    indices = []
    for i, char in enumerate(text):
        if emoji.emoji_count(char) > 0:
            indices.append((i, i + 1))
    return indices

def get_word_indices(text):
    return [(m.start(), m.end()) for m in re.finditer(r'\b\w+\b', text)]

def is_within_grapheme_cluster(index, text):
    # This is a simplified check. A truly robust grapheme cluster check
    # might require a more sophisticated library if complex ZWJ sequences are common.
    # For the purpose of this script, checking if the character at the index is an emoji
    # is a good approximation.
    if index < 0 or index >= len(text):
        return False
    return emoji.emoji_count(text[index]) > 0

# --- 2.2. Implement the Generation Pipeline Functions ---

def insert_zero_width_chars(text, char_pool=ZERO_WIDTH_CHARS, insertion_prob=0.15):
    modified_text_list = list(text)
    insert_points_with_chars = []

    for i in range(len(text) + 1):
        if is_within_grapheme_cluster(i, text) or (i > 0 and is_within_grapheme_cluster(i-1, text)):
            if random.random() < insertion_prob * 3:
                insert_points_with_chars.append((i, random.choice(char_pool)))
        elif i < len(text) and text[i].isalnum():
            if random.random() < insertion_prob:
                insert_points_with_chars.append((i, random.choice(char_pool)))
        elif i == len(text) and text:
            if random.random() < insertion_prob:
                insert_points_with_chars.append((i, random.choice(char_pool)))

    for idx, char_to_insert in sorted(insert_points_with_chars, key=lambda x: x[0], reverse=True):
        modified_text_list.insert(idx, char_to_insert)

    return "".join(modified_text_list)

def disrupt_emoji_sequence(text, char_pool=ZERO_WIDTH_CHARS, disruption_prob=0.3):
    modified_text = text
    non_zwj_chars = [c for c in char_pool if c != '\u200D']
    if not non_zwj_chars: non_zwj_chars = ['\u200B']

    zwj_matches = [m.start() for m in re.finditer(r'\u200D', text)]
    if zwj_matches and random.random() < disruption_prob:
        zwj_idx = random.choice(zwj_matches)
        disrupt_char = random.choice(non_zwj_chars)
        insert_pos = zwj_idx + random.choice([0, 1])
        modified_text = modified_text[:insert_pos] + disrupt_char + modified_text[insert_pos:]
        return modified_text

    all_emoji_spans = get_emoji_indices(text)
    for start, end in all_emoji_spans:
        emoji_segment = text[start:end]
        if len(emoji_segment) > 1 and random.random() < disruption_prob:
            if not (len(emoji_segment) == 2 and unicodedata.category(emoji_segment[1]) == 'Sk'):
                 insert_idx_in_segment = random.randint(1, len(emoji_segment) - 1)
                 char_to_insert = random.choice(non_zwj_chars)
                 new_emoji_segment = emoji_segment[:insert_idx_in_segment] + char_to_insert + emoji_segment[insert_idx_in_segment:]
                 modified_text = text[:start] + new_emoji_segment + text[end:]
                 return modified_text

    return modified_text


def substitute_homoglyphs(text, homoglyph_map=HOMOGLYPH_MAP, substitution_prob=0.07):
    modified_text_list = list(text)
    emoji_indices = get_emoji_indices(text)
    word_indices = get_word_indices(text)

    target_char_indices = set()

    for ws, we in word_indices:
        word_str = text[ws:we]

        is_relevant = False
        for es, ee in emoji_indices:
            if (ws <= ee + 10 and ws >= es - 10) or \
               (we <= ee + 10 and we >= es - 10):
                is_relevant = True
                break

        if not is_relevant and re.search(r'\b(?:https?://|www\.|[a-zA-Z0-9-]+\.(?:com|org|net|io|dev|app|bank|co|in|xyz))\b', word_str, re.IGNORECASE):
            is_relevant = True

        if is_relevant:
            for i in range(ws, we):
                if text[i].lower() in homoglyph_map:
                    target_char_indices.add(i)

    for i in sorted(list(target_char_indices)):
        char = modified_text_list[i]
        if random.random() < substitution_prob:
            modified_text_list[i] = random.choice(homoglyph_map[char.lower()])

    return "".join(modified_text_list)

def insert_control_codes(text, char_pool=CONTROL_CHARS, insertion_prob=0.01):
    modified_text_list = list(text)
    insert_points = []
    for i in range(len(text) + 1):
        if random.random() < insertion_prob:
            insert_points.append(i)

    for idx in sorted(insert_points, reverse=True):
        modified_text_list.insert(idx, random.choice(char_pool))

    return "".join(modified_text_list)


def generate_malicious_example(original_sentence):
    temp_sentence = original_sentence
    initial_sentence = original_sentence

    selected_transformations = []

    prob_zwc = 0.95
    prob_homoglyph = 0.7
    prob_zwj_disruption = 0.5
    prob_control_code = 0.15

    if random.random() < prob_zwc:
        selected_transformations.append("zwc")
    if get_word_indices(original_sentence) and random.random() < prob_homoglyph:
        selected_transformations.append("homoglyph")
    if get_emoji_indices(original_sentence) and random.random() < prob_zwj_disruption:
        selected_transformations.append("zwj_disruption")
    if random.random() < prob_control_code:
        selected_transformations.append("control_code")

    if not selected_transformations:
        if original_sentence and get_emoji_indices(original_sentence):
            selected_transformations.append("zwc")
        elif original_sentence and get_word_indices(original_sentence):
            selected_transformations.append("homoglyph")
        else:
            selected_transformations.append("zwc")

    random.shuffle(selected_transformations)

    for transform_type in selected_transformations:
        if transform_type == "zwc":
            temp_sentence = insert_zero_width_chars(temp_sentence)
        elif transform_type == "homoglyph":
            temp_sentence = substitute_homoglyphs(temp_sentence)
        elif transform_type == "zwj_disruption":
            temp_sentence = disrupt_emoji_sequence(temp_sentence)
        elif transform_type == "control_code":
            temp_sentence = insert_control_codes(temp_sentence)

    if temp_sentence == initial_sentence:
        if original_sentence:
            insert_idx = random.randint(0, len(original_sentence))
            temp_sentence = original_sentence[:insert_idx] + random.choice(ZERO_WIDTH_CHARS) + original_sentence[insert_idx:]
        else:
            temp_sentence = random.choice(ZERO_WIDTH_CHARS)

    return temp_sentence

# --- 2.3. Dataset Generation Loop (ONLY for malicious examples) ---

malicious_dataset = []

print(f"\nGenerating ONLY malicious examples from your provided base texts...")

NUM_MALICIOUS_VARIANTS_PER_BASE = 5 # Generate 2 to 5 malicious variants per base sentence

for i, s in enumerate(clean_texts_final_for_manipulation):
    if i % 1000 == 0:
        print(f"  Processing base text {i+1}/{len(clean_texts_final_for_manipulation)}")

    for _ in range(random.randint(2, NUM_MALICIOUS_VARIANTS_PER_BASE)):
        malicious_text = generate_malicious_example(s)
        if malicious_text != s:
            malicious_dataset.append({"text": malicious_text, "label": 1})

print(f"Generated {len(malicious_dataset)} malicious examples.")

# Convert to Pandas DataFrame (only malicious examples)
df_malicious_dataset = pd.DataFrame(malicious_dataset)
print(f"\nTotal malicious dataset size: {len(df_malicious_dataset)} examples.")
print("Malicious Dataset head (first 10 examples):")
print(df_malicious_dataset.head(10))

# --- 2.4. Validation and Quality Check (Tokenizer Inspection) ---

print("\n--- Validation and Quality Check (Tokenizer Inspection) ---")

tokenizer = AutoTokenizer.from_pretrained("gpt2")
print("Loaded GPT-2 tokenizer (gpt2 is a common base for GPT-like tokenization).")

print("\n--- Demonstrating tokenizer behavior on a malicious example: ---")
malicious_sample = None
if not df_malicious_dataset.empty:
    malicious_sample = df_malicious_dataset.sample(1).iloc[0]

if malicious_sample is not None:
    text_to_tokenize = malicious_sample['text']
    print(f"Malicious Text (Label {malicious_sample['label']}): '{text_to_tokenize}'")

    tokens = tokenizer.tokenize(text_to_tokenize)
    print(f"Tokens detected by GPT tokenizer: {tokens}")

    decoded_text = tokenizer.decode(tokenizer.encode(text_to_tokenize))
    print(f"Decoded text from tokens (may re-hide invisible chars): '{decoded_text}'")

    print(f"\nRaw Python string representation (shows escape codes for invisible chars):")
    print(f"'{repr(text_to_tokenize)}'")

    print("\nNote: 'repr()' shows the actual Unicode escape sequences (e.g., '\\u200B') for invisible characters.")
    print("This confirms the characters are present in the string data that the tokenizer processes.")
else:
    print("No malicious sample found for tokenizer demonstration. This indicates no malicious data was generated.")

print("\n--- Synthetic Malicious Emoji Dataset Generation Complete ---")
print("You now have a Pandas DataFrame 'df_malicious_dataset' containing ONLY your labeled malicious data.")
print("You can save it to CSV, JSON, or use it directly.")

# Example of saving the dataset to a CSV file
# df_malicious_dataset.to_csv("malicious_emoji_dataset_only.csv", index=False, encoding="utf-8")
# print("\nDataset saved to 'malicious_emoji_dataset_only.csv'")

--- Phase 1: Deep Understanding and Data Strategy ---
--- Step 1: Obtaining Clean, Legitimate Text with Diverse Emoji Usage (Benign Data from your Downloaded File) ---
Attempting to load data from your local file: '/content/drive/MyDrive/archive (1)/emoji_df.csv'...
Successfully loaded 4724 entries from '/content/drive/MyDrive/archive (1)/emoji_df.csv'.
Columns in the dataframe: Index(['emoji', 'name', 'group', 'sub_group', 'codepoints'], dtype='object')
Constructing benign base sentences by combining 'emoji' and 'name' from your dataset...
Constructed 4693 unique emoji-rich texts for manipulation.
First 3 examples of base texts (from your downloaded database, containing emojis):
  1. 'It's a person golfing: medium-light skin tone 🏌🏼 and it looks great.'
  2. 'The symbol 👨🏿‍🦲 represents man: dark skin tone, bald.'
  3. 'The symbol 👨‍👧‍👧 represents family: man, girl, girl.'

--- Step 2: Developing Synthetic Data Generation Pipeline for Malicious Emojis ---

Generating ONLY malicious exa

In [17]:
# --- Step 3: Create the Corresponding Benign Dataset ---

print("\n--- Step 3: Creating the Benign (Label 0) Dataset ---")

# The 'clean_texts_final_for_manipulation' list contains our clean, emoji-rich texts.
# We will now formalize this into a DataFrame with a 'label' of 0.

benign_dataset = [{"text": text, "label": 0} for text in clean_texts_final_for_manipulation]

# Convert to Pandas DataFrame
df_benign_dataset = pd.DataFrame(benign_dataset)

print(f"Created a benign dataset with {len(df_benign_dataset)} examples.")
print("Benign Dataset head (first 10 examples):")
display(df_benign_dataset.head(10))

print("\n--- Benign Dataset Creation Complete ---")
print("You now have a Pandas DataFrame 'df_benign_dataset' containing your labeled benign data.")


--- Step 3: Creating the Benign (Label 0) Dataset ---
Created a benign dataset with 4693 examples.
Benign Dataset head (first 10 examples):


Unnamed: 0,text,label
0,It's a person golfing: medium-light skin tone ...,0
1,The symbol 👨🏿‍🦲 represents man: dark skin tone...,0
2,"The symbol 👨‍👧‍👧 represents family: man, girl,...",0
3,This is a reverse button emoji: ◀.,0
4,The symbol 🙍🏾 represents person frowning: medi...,0
5,The symbol 🏄🏻‍♂️ represents man surfing: light...,0
6,It's a man shrugging 🤷‍♂ and it looks great.,0
7,This is a horse racing: medium-light skin tone...,0
8,"The symbol 👨🏿‍❤‍💋‍👨🏿 represents kiss: man, man...",0
9,It's a man in lotus position: medium skin tone...,0



--- Benign Dataset Creation Complete ---
You now have a Pandas DataFrame 'df_benign_dataset' containing your labeled benign data.


In [18]:
# --- Step 4: Combine and Shuffle the Datasets ---

print("\n--- Step 4: Combining Benign and Malicious Datasets ---")

# Concatenate the two dataframes
df_dataset = pd.concat([df_benign_dataset, df_malicious_dataset], ignore_index=True)

# Shuffle the dataset to mix benign and malicious examples
df_dataset = df_dataset.sample(frac=1).reset_index(drop=True)

print(f"Combined dataset created with {len(df_dataset)} total examples.")
print(f"Class distribution:\n{df_dataset['label'].value_counts()}")

print("\nCombined and Shuffled Dataset head (first 10 examples):")
display(df_dataset.head(10))

print("\n--- Dataset Combination and Shuffling Complete ---")
print("You now have a unified, shuffled dataset in the 'df_dataset' DataFrame, ready for model training.")


--- Step 4: Combining Benign and Malicious Datasets ---
Combined dataset created with 20969 total examples.
Class distribution:
label
1    16276
0     4693
Name: count, dtype: int64

Combined and Shuffled Dataset head (first 10 examples):


Unnamed: 0,text,label
0,The symbol 🚶🏼‍♀ represents woman walking: medi...,0
1,It's a woman shrugging: light skin tone 🤷🏻‍♀ a...,0
2,"kiss: woma\tn, woman, li﻿ght skin ​tone, d﻿ar﻿...",1
3,T﻿h﻿e sy\tmbol ​🚴‌ repr‍esents person bik‌i﻿n­g.,1
4,ｔhe sy‌mbol ‌👎🏽 rep﻿resеnts thumbs down: med‍﻿...,1
5,This is a flag: Cocos (Keeling) Islands emoji:...,0
6,Tｈe symbоl 🏌️‍‌♂ ﻿r‌epresents man go​lfing.,1
7,accordion 🪗.,0
8,T\th­e symbol 🛩️ ­rep­rese­nts ‍smal​l air­plane.,1
9,This is a ​goggleｓ e‍moji: 🥽.\t,1



--- Dataset Combination and Shuffling Complete ---
You now have a unified, shuffled dataset in the 'df_dataset' DataFrame, ready for model training.


In [26]:
# Save the malicious dataset to a CSV file
df_malicious_dataset.to_csv("malicious_emoji_dataset.csv", index=False, encoding="utf-8")

print("Malicious emoji dataset saved to 'malicious_emoji_dataset.csv'")

Malicious emoji dataset saved to 'malicious_emoji_dataset.csv'


In [27]:
# Save the combined dataset to a CSV file
df_dataset.to_csv("combined_emoji_dataset.csv", index=False, encoding="utf-8")

print("Combined emoji dataset saved to 'combined_emoji_dataset.csv'")

Combined emoji dataset saved to 'combined_emoji_dataset.csv'
