In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)


/kaggle/input/homoglyph-text/homoglyph_data.csv
/kaggle/input/watermarked-and-unwatermarked-text-truncated/data_trunk.csv


## Attack Simulation

In [2]:
# !pip install confusable_homoglyphs
# from confusable_homoglyphs import confusables

In [3]:
def replace_with_homoglyph(char):
    """
    Look up the look-alike substitute for a single character.

    :param char: The character to look up.
    :return: The mapped Cyrillic look-alike when ``char`` is one of the
        Latin letters in the table below; otherwise ``char`` unchanged.
    """
    # Latin letter -> visually similar Cyrillic letter.
    # Extend this table to cover more characters.
    lookalikes = dict([
        ('a', 'а'),  # Latin 'a' -> Cyrillic 'a'
        ('e', 'е'),  # Latin 'e' -> Cyrillic 'e'
        ('o', 'о'),  # Latin 'o' -> Cyrillic 'o'
        ('i', 'і'),  # Latin 'i' -> Cyrillic 'i'
    ])

    if char in lookalikes:
        return lookalikes[char]
    # No look-alike known: hand the input back untouched.
    return char

def homoglyph_attack(text):
    """
    Apply the homoglyph substitution to every character of a text.

    :param text: The original text.
    :return: A new string in which each character has been passed through
        ``replace_with_homoglyph``.
    """
    swapped_chars = []
    for original_char in text:
        swapped_chars.append(replace_with_homoglyph(original_char))
    return ''.join(swapped_chars)

# Demo: attack a short sample, then print the code points of the text
# before and after so the invisible substitutions become visible.
original_text = "Hello, World!"
homoglyph_text = homoglyph_attack(original_text)
print(homoglyph_text)
print(':'.join(map(hex, map(ord, original_text))))
print(':'.join(map(hex, map(ord, homoglyph_text))))


Hеllо, Wоrld!
0x48:0x65:0x6c:0x6c:0x6f:0x2c:0x20:0x57:0x6f:0x72:0x6c:0x64:0x21
0x48:0x435:0x6c:0x6c:0x43e:0x2c:0x20:0x57:0x43e:0x72:0x6c:0x64:0x21


In [4]:
data = pd.read_csv("/kaggle/input/watermarked-and-unwatermarked-text-truncated/data_trunk.csv")

# Keep only the "Generated Text" column of the rows labelled as watermarked
is_watermarked = data["label"] == "watermarked"
filtered_data = data[is_watermarked][["Generated Text"]]

# Attack every watermarked text; tqdm renders the progress bar
homoglyph_text = [
    homoglyph_attack(text)
    for text in tqdm(filtered_data["Generated Text"].tolist())
]

# Pair each original text with its homoglyph-attacked version
df = pd.DataFrame({
    "Original Text": filtered_data["Generated Text"],
    "Homoglyph Text": homoglyph_text,
})

# Persist the pairs for the detection/counteraction cells below
df.to_csv("homoglyph_data.csv", index=False)

print("Replacing homoglyph complete! Homoglyph data saved to homoglyph_data.csv")

100%|██████████| 1000/1000 [00:00<00:00, 3921.60it/s]


Replacing homoglyph complete! Homoglyph data saved to homoglyph_data.csv


## Detect and Counteract Homoglyph Attack

In [5]:
def find_homoglyphs(text, homoglyph_mapping=None):
    """
    Find the positions of characters in the text that are known homoglyphs.

    Args:
      text: The text to search for homoglyphs.
      homoglyph_mapping: Optional dictionary mapping each homoglyph character
        to the normal character it imitates. When omitted (``None``), the
        built-in Cyrillic-to-Latin table for 'a', 'e', 'o' and 'i' is used.

    Returns:
      A list of ``(index, normal_char)`` tuples in text order, where ``index``
      is the position of a homoglyph in ``text`` and ``normal_char`` is the
      normal character that the mapping assigns to it.
    """
    if homoglyph_mapping is None:
        # Default table: homoglyph -> normal character.
        homoglyph_mapping = {
            'а': 'a',  # Cyrillic 'a' to Latin 'a'
            'е': 'e',  # Cyrillic 'e' to Latin 'e'
            'о': 'o',  # Cyrillic 'o' to Latin 'o'
            'і': 'i',  # Cyrillic 'i' to Latin 'i'
        }

    homoglyphs = []
    for i, char in enumerate(text):
        # An entry that maps a character to itself is not a homoglyph;
        # this can only happen with a caller-supplied mapping.
        if char in homoglyph_mapping and char != homoglyph_mapping[char]:
            homoglyphs.append((i, homoglyph_mapping[char]))
    return homoglyphs

# Example usage
# text = "This is a test."
# homoglyphs = find_homoglyphs(homoglyph_text)

# if homoglyphs:
#     print("Found homoglyphs at positions:")
#     for index, homo in homoglyphs:
#         print(f"\t- Index: {index}, Character: ({homo})")
# else:
#     print("No homoglyphs found in the text.")


In [6]:
def normalize_homoglyph(text,i,char):
    """
    Map a single homoglyph character back to the normal character it imitates.

    :param text: The text the character came from (not used by the lookup).
    :param i: The position of the character in ``text`` (not used by the lookup).
    :param char: The character to normalize.
    :return: The mapped Latin character when ``char`` is one of the Cyrillic
        look-alikes in the table below; otherwise ``char`` unchanged.
    """
    # Homoglyph -> normal character.
    homoglyph_to_normal = dict([
        ('а', 'a'),  # Cyrillic 'a' to Latin 'a'
        ('е', 'e'),  # Cyrillic 'e' to Latin 'e'
        ('о', 'o'),  # Cyrillic 'o' to Latin 'o'
        ('і', 'i'),  # Cyrillic 'i' to Latin 'i'
    ])
    if char in homoglyph_to_normal:
        return homoglyph_to_normal[char]
    # Not a known homoglyph: return the input untouched.
    return char

def counteract_homoglyph(text):
    """
    Undo a homoglyph attack by normalizing every character of a text.

    :param text: The text that may contain homoglyphs.
    :return: A new string in which each character has been passed through
        ``normalize_homoglyph``.
    """
    restored_chars = []
    for position, glyph in enumerate(text):
        restored_chars.append(normalize_homoglyph(text, position, glyph))
    return ''.join(restored_chars)

# Example usage
# "Hello, World!" with Cyrillic 'е' (U+0435) and 'о' (U+043E) swapped in, written
# with escapes so the substitution is visible in the source. A plain-ASCII sample
# would contain no homoglyphs and would not exercise the counteraction at all.
text = "H\u0435ll\u043e, W\u043erld!"
homoglyphs = find_homoglyphs(counteract_homoglyph(text))

# After counteraction no homoglyph should remain.
if homoglyphs:
    print("Found homoglyphs at positions:")
    for index, homo in homoglyphs:
        print(f"\t- Index: {index}, Character: ({homo})")
else:
    print("No homoglyphs found in the text.")



No homoglyphs found in the text.


In [7]:
data = pd.read_csv("/kaggle/working/homoglyph_data.csv")

# Undo the homoglyph substitution for every attacked text; tqdm renders the progress bar
normalized_text = [
    counteract_homoglyph(text)
    for text in tqdm(data["Homoglyph Text"].tolist())
]

# Pair each attacked text with its normalized version
df = pd.DataFrame({
    "Homoglyph Text": data["Homoglyph Text"],
    "Normalized Text": normalized_text,
})

# Persist the pairs for the verification cells below
df.to_csv("normalized_data.csv", index=False)

print("Replacing homoglyph with normal characters complete! Homoglyph data saved to normalized_data.csv")

100%|██████████| 1000/1000 [00:00<00:00, 3265.77it/s]

Replacing homoglyph with normal characters complete! Homoglyph data saved to normalized_data.csv





In [8]:
data = pd.read_csv("/kaggle/working/normalized_data.csv")

# Scan the normalized text exactly as it was saved. Running counteract_homoglyph
# again before the scan would strip any leftover homoglyph and make this check
# pass no matter what the file contains.
for text in tqdm(data["Normalized Text"].tolist()):
    homoglyphs_found = find_homoglyphs(text)
    if homoglyphs_found:
        # Report the first offending text only, then stop.
        print("Found homoglyphs at positions:")
        for index, homo in homoglyphs_found:
            print(f"\t- Index: {index}, Character: ({homo})")
        break

100%|██████████| 1000/1000 [00:00<00:00, 2884.05it/s]


In [9]:
# Round-trip check: every normalized text must equal the original watermarked text.
data = pd.read_csv("/kaggle/input/watermarked-and-unwatermarked-text-truncated/data_trunk.csv")
filtered_data = data[data["label"] == "watermarked"][["Generated Text"]]
original_texts = filtered_data["Generated Text"].tolist()
normalized_data = pd.read_csv("/kaggle/working/normalized_data.csv")["Normalized Text"].tolist()

# A length mismatch means the two files are out of sync; comparing them
# pairwise would either raise IndexError or silently skip rows.
if len(original_texts) != len(normalized_data):
    raise ValueError(
        f"Row count mismatch: {len(original_texts)} original texts vs "
        f"{len(normalized_data)} normalized texts"
    )

# Number of texts that were not restored exactly.
count = sum(og_text != normalized for og_text, normalized in zip(original_texts, normalized_data))
if count == 0:
    print("Counteracting Successful!")
else:
    # Make a failed round trip visible instead of printing nothing.
    print(f"Counteracting failed: {count} of {len(original_texts)} texts differ from the original.")


Counteracting Successful!
