In [1]:
import pandas as pd
import numpy as np
import re
import random
from tqdm import tqdm
from google.colab import files

# Arabic keyboard layout mapping for typos.
#
# Maps a character to the list of characters that keyboard_typo() may
# substitute for it (one entry is drawn uniformly at random).
#
# Things to be aware of when editing this table:
#   * 'لا' is a two-character string. keyboard_typo() looks up a single
#     character (word[char_pos]), so the 'لا' *key* below can never match;
#     as a *replacement value* it replaces one character with two.
#   * 'ـ' appears as a replacement value (for 'أ' and 'آ') but has no key of
#     its own.
#   * NOTE(review): the neighbour lists do not obviously correspond to
#     physical key adjacency on a standard Arabic layout -- confirm against
#     the intended keyboard before relying on them.
ARABIC_KEYBOARD = {
    'ض': ['ص', 'ث', 'ق'],
    'ص': ['ض', 'ف', 'ث'],
    'ث': ['ص', 'ق', 'ف'],
    'ق': ['ف', 'غ', 'ث'],
    'ف': ['ق', 'غ', 'ع'],
    'غ': ['ف', 'ع', 'ه'],
    'ع': ['غ', 'ه', 'خ'],
    'ه': ['ع', 'خ', 'ح'],
    'خ': ['ه', 'ح', 'ج'],
    'ح': ['خ', 'ج', 'د'],
    'ج': ['ح', 'د', 'ش'],
    'د': ['ج', 'ش', 'س'],
    'ش': ['د', 'س', 'ي'],
    'س': ['ش', 'ي', 'ب'],
    'ي': ['س', 'ب', 'ل'],
    'ب': ['ي', 'ل', 'ا'],
    'ل': ['ب', 'ا', 'ت'],
    'ا': ['ل', 'ت', 'ن'],
    'ت': ['ا', 'ن', 'م'],
    'ن': ['ت', 'م', 'ك'],
    'م': ['ن', 'ك', 'ط'],
    'ك': ['م', 'ط', 'ئ'],
    'ط': ['ك', 'ئ', 'ء'],
    'ئ': ['ط', 'ء', 'ؤ'],
    'ء': ['ئ', 'ؤ', 'ر'],
    'ؤ': ['ء', 'ر', 'لا'],
    'ر': ['ؤ', 'لا', 'ى'],
    'لا': ['ؤ', 'ر', 'ى'],
    'ى': ['ر', 'لا', 'ة'],
    'ة': ['ى', 'و', 'ز'],
    'و': ['ة', 'ز', 'ظ'],
    'ز': ['و', 'ظ', 'ذ'],
    'ظ': ['ز', 'ذ', 'إ'],
    'ذ': ['ظ', 'إ', 'أ'],
    'إ': ['ذ', 'أ', 'آ'],
    'أ': ['إ', 'آ', 'ـ'],
    'آ': ['أ', 'ـ', 'ض']
}

# Phonetically similar letters in Arabic.
#
# Maps a character to the characters phonetic_mistake() may substitute for
# it. Characters without a key here are never altered by phonetic_mistake().
# The table is not symmetric: for example 'ص' maps to 'س', but 'س' has no
# entry of its own.
PHONETIC_SIMILAR = {
    'ض': ['ظ', 'د'],
    'ص': ['س'],
    'ث': ['ت', 'س'],
    'ذ': ['ز', 'د'],
    'ط': ['ت', 'د'],
    'ظ': ['ض', 'ز'],
    'ك': ['ق'],
    'ق': ['ك'],
    'أ': ['ا', 'إ', 'آ'],
    'إ': ['ا', 'أ', 'آ'],
    'آ': ['ا', 'أ', 'إ'],
    'ا': ['أ', 'إ', 'آ'],
    'ه': ['ة'],
    'ة': ['ه', 'ت'],
    'ي': ['ى'],
    'ى': ['ي'],
    'ء': ['ؤ', 'ئ'],
    'ؤ': ['ء', 'ئ'],
    'ئ': ['ء', 'ؤ']
}

# Hamza variants.
# Characters hamza_confusion() treats as interchangeable. The plain alif 'ا'
# is part of the list, so a word containing only a bare alif is also
# eligible for replacement.
HAMZA_VARIANTS = ['أ', 'إ', 'آ', 'ا', 'ء', 'ؤ', 'ئ']

# Tashkeel characters.
# Diacritic marks that tashkeel_noise() either strips from a word or inserts
# into it. Each entry is a single combining code point, which is why the
# literals render attached to the surrounding quote characters.
TASHKEEL = ['َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ْ', 'ّ']

def keyboard_typo(word):
    """Replace one character with a nearby key on the Arabic keyboard.

    The position is drawn only from characters that have an entry in
    ``ARABIC_KEYBOARD``. Picking a position blindly would silently return
    the word unchanged whenever the chosen character (a digit, punctuation
    mark, diacritic or unmapped letter) has no neighbours, which lowers the
    effective error rate below what the caller asked for.

    Args:
        word: The word to corrupt.

    Returns:
        The word with exactly one character substituted. The word is
        returned unchanged when it is shorter than two characters or when
        none of its characters has a keyboard mapping. Because one of the
        possible replacements is the two-character key 'لا', the result may
        be one character longer than the input.
    """
    if len(word) < 2:
        return word

    # Restrict the draw to positions that can actually be substituted.
    candidate_positions = [
        i for i, char in enumerate(word) if char in ARABIC_KEYBOARD
    ]
    if not candidate_positions:
        return word

    char_pos = random.choice(candidate_positions)
    replacement = random.choice(ARABIC_KEYBOARD[word[char_pos]])
    return word[:char_pos] + replacement + word[char_pos + 1:]

def character_deletion(word):
    """Drop a single, randomly selected character from ``word``.

    Words with fewer than two characters are handed back untouched so the
    result is never empty.
    """
    if len(word) >= 2:
        drop_at = random.randint(0, len(word) - 1)
        before = word[:drop_at]
        after = word[drop_at + 1:]
        return before + after
    return word

def character_insertion(word):
    """Return ``word`` with one extra Arabic letter added at a random slot.

    The slot may be anywhere from before the first character to after the
    last one, so an empty word comes back as a single letter.
    """
    alphabet = 'ابتثجحخدذرزسشصضطظعغفقكلمنهويءأإآؤئة'

    # Draw the position first, then the letter (same order of random draws).
    insert_at = random.randint(0, len(word))
    extra = random.choice(alphabet)

    return ''.join((word[:insert_at], extra, word[insert_at:]))

def character_swap(word):
    """Transpose one randomly chosen pair of neighbouring characters.

    Words shorter than two characters have no pair to transpose and are
    returned as they are.
    """
    if len(word) < 2:
        return word

    # The left index of the pair; the last character can only be the right one.
    left = random.randint(0, len(word) - 2)

    chars = list(word)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return ''.join(chars)

def phonetic_mistake(word):
    """Replace one character with a phonetically similar one.

    The position is drawn only from characters that have an entry in
    ``PHONETIC_SIMILAR``. Many common letters have no entry, so picking a
    position blindly would frequently leave the word unchanged even though
    it contains a confusable letter elsewhere.

    Args:
        word: The word to corrupt.

    Returns:
        The word with exactly one character substituted, or the word
        unchanged when it is empty or contains no character with a
        phonetic mapping.
    """
    if len(word) < 1:
        return word

    # Restrict the draw to positions that can actually be substituted.
    candidate_positions = [
        i for i, char in enumerate(word) if char in PHONETIC_SIMILAR
    ]
    if not candidate_positions:
        return word

    char_pos = random.choice(candidate_positions)
    replacement = random.choice(PHONETIC_SIMILAR[word[char_pos]])
    return word[:char_pos] + replacement + word[char_pos + 1:]

def hamza_confusion(word):
    """Swap the first hamza-family character for a different variant.

    Only the earliest character found in ``HAMZA_VARIANTS`` is changed; the
    substitute is drawn at random from the remaining variants. A word with
    no such character is returned unchanged.
    """
    first_hit = next(
        (idx for idx, ch in enumerate(word) if ch in HAMZA_VARIANTS),
        None,
    )
    if first_hit is None:
        return word

    found = word[first_hit]
    alternatives = [variant for variant in HAMZA_VARIANTS if variant != found]
    substitute = random.choice(alternatives)
    return word[:first_hit] + substitute + word[first_hit + 1:]

def tashkeel_noise(word):
    """Add or remove tashkeel (diacritics).

    If the word already carries any mark listed in ``TASHKEEL``, all such
    marks are stripped. Otherwise one random mark is inserted directly
    after a randomly chosen character.

    Args:
        word: The word to corrupt.

    Returns:
        The word with its diacritics removed, or with one diacritic added.
        An empty word is returned unchanged.
    """
    # Nothing to strip and no base character to attach a mark to.
    if not word:
        return word

    if any(mark in word for mark in TASHKEEL):
        # Remove every diacritic.
        return ''.join(c for c in word if c not in TASHKEEL)

    # A combining mark modifies the character that precedes it, so the
    # insertion index must be at least 1 (index 0 would put the mark before
    # the first letter) and may equal len(word) (mark on the last letter).
    char_pos = random.randint(1, len(word))
    random_tashkeel = random.choice(TASHKEEL)
    return word[:char_pos] + random_tashkeel + word[char_pos:]

def inject_noise(text, error_probability=0.3):
    """Corrupt a random subset of the Arabic words in ``text``.

    The text is split on whitespace. Each word that is at least two
    characters long and contains a character in the U+0600..U+06FF range is
    corrupted with probability ``error_probability`` by one randomly chosen
    error function; every other word passes through untouched. The words
    are re-joined with single spaces, so the original whitespace layout is
    not preserved.

    Falsy or non-string input is returned as-is.
    """
    if not text or not isinstance(text, str):
        return text

    def _maybe_corrupt(token):
        # Very short tokens and tokens without Arabic characters are left alone.
        if len(token) < 2 or not re.search(r'[\u0600-\u06FF]', token):
            return token

        # One random draw per eligible token decides whether it is corrupted.
        if not (random.random() < error_probability):
            return token

        corrupt = random.choice([
            keyboard_typo,
            character_deletion,
            character_insertion,
            character_swap,
            phonetic_mistake,
            hamza_confusion,
            tashkeel_noise
        ])
        return corrupt(token)

    return ' '.join(_maybe_corrupt(token) for token in text.split())

# Driver script (runs in Google Colab): load a CSV of Arabic text, build a
# noisy copy of every row with inject_noise(), write the (correct, incorrect)
# pairs to a new CSV, trigger a browser download and print a few samples.

# # Option 1: Upload the dataset directly to Colab
# print("Please upload your Dataset.csv file")
# uploaded = files.upload()  # This will prompt you to upload a file

# # Get the filename of the uploaded file
# dataset_path = list(uploaded.keys())[0]

# Option 2 (active): read the dataset from a mounted Google Drive.
from google.colab import drive
drive.mount('/content/drive')
dataset_path = '/content/drive/MyDrive/NTI Project/Arabic_dataset.csv'  # Update this path


# Load the dataset
df = pd.read_csv(dataset_path)

# Parallel lists that become the two columns of the output frame.
correct_words = []
incorrect_words = []

# Process each row and inject noise
print("Processing dataset and injecting noise...")
# NOTE: df.loc[i, ...] is label-based; iterating range(len(df)) relies on the
# default 0..n-1 integer index that read_csv produces when no index column
# is requested.
for i in tqdm(range(len(df))):
    # The text column is re-detected for every row: it is the first
    # object-dtype column whose value in *this* row is a str. As a
    # consequence the source column can differ from row to row (e.g. when
    # the first text column is empty/NaN and a later one is not).
    text_column = None

    # Check if there's any column that looks like it contains text
    for col in df.columns:
        if df[col].dtype == 'object' and isinstance(df.loc[i, col], str):
            text_column = col
            break

    if text_column is None:
        continue  # Skip rows that hold no string value in any object column

    original_text = df.loc[i, text_column]
    noisy_text = inject_noise(original_text)

    # The original keeps its whitespace verbatim, whereas inject_noise()
    # re-joins words with single spaces, so the two columns can differ in
    # whitespace even when no word was corrupted.
    correct_words.append(original_text)
    incorrect_words.append(noisy_text)

# Create new DataFrame with just the two columns
result_df = pd.DataFrame({
    'correct_words': correct_words,
    'incorrect_words': incorrect_words
})

# Save the result to the notebook's working directory (no index column).
output_path = 'Dataset_corrected_incorrect.csv'
result_df.to_csv(output_path, index=False)

# Download the file automatically
files.download(output_path)

print(f"Noise injection complete. Dataset saved and downloaded as {output_path}")

# Display some examples (at most five; fewer if the result is shorter).
print("\nExamples of correct and incorrect words:")
sample_rows = min(5, len(result_df))
for i in range(sample_rows):
    print(f"Correct:   {result_df.loc[i, 'correct_words']}")
    print(f"Incorrect: {result_df.loc[i, 'incorrect_words']}")
    print("-" * 50)

Mounted at /content/drive
Processing dataset and injecting noise...


100%|██████████| 30574/30574 [00:10<00:00, 2867.76it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Noise injection complete. Dataset saved and downloaded as Dataset_corrected_incorrect.csv

Examples of correct and incorrect words:
Correct:   بين أستوديوهات ورزازات وصحراء مرزوكة وآثار وليلي ثم الرباط والبيضاء انتهى المخرج المغربي سهيل بن بركة من تصوير مشاهد عمله السينمائي الجديد الذي خصصه لتسليط الضوء عن حياة الجاسوس الإسباني دومينغو باديا الذي عاش فترة من القرن التاسع عشر بالمغرب باسم علي باي هذا الفيلم الذي اختار له مخرجه عنوان حلم خليفة يصور حياة علي باي العباسي الذي ما زال أحد أحياء طنجة يحمل اسمه عاش حياة فريدة متنكرا بشخصية تاجر عربي من سلالة الرسول صلى الله عليه وسلم فيما كان يعمل جاسوسا لحساب إسبانيا وكشف مخرج الفيلم سهيل بن بركة في تصريح لهسبريس أن الفيلم السينمائي دخل مرحلة التوضيب التي تتم خارج المغرب مبرزا أن الفيلم الذي يروي حياة الجاسوس الإسباني دومينغو باديا منذ أن قرر من طنجة بدء رحلاته نحو عدد من المناطق في العالم الإسلامي بداية القرن العشرين سيكون جاهزا بعد شهرين ويجمع الفيلم السينمائي عددا من الممثلين من مختلف الجنسيات واختار لدور البطولة الممثلة السينمائية الإيطال