In [2]:
import pandas as pd
import random

RANDOM_SEED = 42

# Seed Python's global RNG so the `random`-based noise functions in this
# notebook give the same results on every Restart & Run All.
random.seed(RANDOM_SEED)

### Load dataset

In [4]:
# Read the sentence dataset; the cells below rely on its `language_code` column.
df = pd.read_csv("./data/sample_data.csv")

### Keep English & Swedish and add an equal-sized sample of every other language

In [None]:

# Languages that are kept in full; every other language is down-sampled below.
KEPT_LANGUAGES = ['eng', 'swe']
# Total number of rows to draw from all the other languages combined.
OTHER_LANGUAGES_BUDGET = 3000

# Build the membership mask once and reuse it for both halves of the split.
is_kept_language = df['language_code'].isin(KEPT_LANGUAGES)
df_filtered = df[is_kept_language]
df_remaining = df[~is_kept_language]

# All unique languages except English and Swedish
other_languages = df_remaining['language_code'].unique()

if len(other_languages) > 0:
    # Split the budget evenly across languages. Integer division means the
    # total drawn can be slightly below OTHER_LANGUAGES_BUDGET.
    num_samples_per_lang = OTHER_LANGUAGES_BUDGET // len(other_languages)

    # Same number of rows from each remaining language. Sampling is without
    # replacement, so pandas raises if a language has fewer rows than
    # num_samples_per_lang.
    df_other = df_remaining.groupby('language_code').sample(
        n=num_samples_per_lang, random_state=RANDOM_SEED
    )
else:
    # Only the kept languages are present: nothing to sample, and no division by zero.
    num_samples_per_lang = 0
    df_other = df_remaining

# Combine both parts and shuffle in a single assignment, so re-running the
# cell always rebuilds df_final from its inputs.
df_final = (
    pd.concat([df_filtered, df_other])
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
df_final

Unnamed: 0,sentence_id,language_code,sentence
0,4325196,por,"você viu o meu telefone? poxa, pior que não."
1,11116788,eng,ziri and rima should nevr have eaten at that r...
2,7380358,eng,"mary is married now, isnt she?"
3,11905621,swe,alla santiagos planer misslyckades.
4,10211752,spa,vosotras sois la razón por la que vine.
...,...,...,...
8995,4371134,swe,tom kommer hit så gott som varje dag.
8996,11938611,swe,undantag kan gälla i vissa fall.
8997,3807254,swe,ett gott samvete är bästa huvudkudden.
8998,3049523,eng,the experiments have been being carried out si...


# Functions to add noise to the data
### Function to get a neighboring key

In [None]:
# Keyboard-neighbour table: maps a typed character to a string of the keys
# treated as adjacent to it. The layout is QWERTY plus the å/ä/ö keys, and a
# space inside a value means the space bar counts as a neighbour.
# Entry order is kept stable because insert_character draws a random key
# from this dict.
KEYBOARD_ADJACENCY = {
    # Letters a-z
    'a': "qwszåä",
    'b': "vghn ",
    'c': "xdfv ",
    'd': "ersfxc",
    'e': "wrsd",
    'f': "rtdgcv",
    'g': "tyfhvb",
    'h': "yugjbn",
    'i': "uojk",
    'j': "uikmhn",
    'k': "ijolm,",
    'l': "kopöä-",
    'm': "njk,.",
    'n': "bhjm ",
    'o': "ipklö",
    'p': "olå",
    'q': "wa",
    'r': "etdf",
    's': "awedxzå",
    't': "ryfgh",
    'u': "yiokj",
    'v': "cfgb ",
    'w': "qeas",
    'x': "zsdc ",
    'y': "tughj",
    'z': "asx",
    # Letters å, ä, ö
    'å': "äp",
    'ä': "åö",
    'ö': "äpl-",
    # Digit row
    '1': "2q",
    '2': "13wq",
    '3': "24erw",
    '4': "35rte",
    '5': "46yt",
    '6': "57yu",
    '7': "68ui",
    '8': "79io",
    '9': "80op",
    '0': "9p-",
    # Punctuation
    '-': "öä.",
    ',': "mkl.",
    '.': ",-/",
    # Space bar
    ' ': "xcvbnm,.-",
}


def get_adjacent_key(char):
    """Pick one neighbour of ``char`` uniformly at random from KEYBOARD_ADJACENCY.

    Characters that have no entry in the table are returned unchanged.
    """
    neighbours = KEYBOARD_ADJACENCY.get(char)
    if neighbours is None:
        # Unknown character: nothing to substitute.
        return char
    return random.choice(neighbours)


### Frequency-Based Typo Mapping

In [None]:
# Frequency-based typo mapping: each character maps to a list of
# (replacement, weight) pairs, and a larger weight makes that replacement
# more likely.
# get_common_typo hands the weights to random.choices() as relative weights,
# so rows that do not sum to 1 (here 'e', 'r', 'n' and 'm') still work.
COMMON_TYPOS = {
    'å': [('a', 0.7), ('ä', 0.2), ('p', 0.1)],
    'ä': [('a', 0.5), ('ö', 0.3), ('å', 0.2)],
    'ö': [('o', 0.6), ('ä', 0.3), ('l', 0.1)],
    'e': [('r', 0.3), ('w', 0.2), ('d', 0.1)],
    'r': [('e', 0.4), ('t', 0.2), ('f', 0.1)],
    'o': [('i', 0.5), ('p', 0.3), ('ö', 0.2)],
    'n': [('m', 0.4), ('b', 0.2), (' ', 0.1)],
    'm': [('n', 0.5), (',', 0.2), (' ', 0.2)],
    ',': [('m', 0.4), ('k', 0.3), ('.', 0.2), ('l', 0.1)],
    '.': [(',', 0.5), ('-', 0.3), ('/', 0.2)],
    '-': [('.', 0.5), ('ö', 0.3), ('ä', 0.2)],
    ' ': [('m', 0.3), ('n', 0.3), (',', 0.2), ('.', 0.2)],
}

# Function to get a frequent typo replacement
def get_common_typo(char):
    """Draw a weighted-random replacement for ``char`` from COMMON_TYPOS.

    Each table entry is a list of (replacement, weight) pairs; the weights
    are passed to ``random.choices`` as relative weights. Characters without
    an entry are returned unchanged.
    """
    if char not in COMMON_TYPOS:
        return char
    # Split the (replacement, weight) pairs into two parallel tuples.
    replacements, relative_weights = zip(*COMMON_TYPOS[char])
    picked = random.choices(replacements, weights=relative_weights, k=1)
    return picked[0]


In [7]:
def swap_characters(word, swap_prob=0.8):
    """Transpose two neighbouring characters of ``word`` at a random position.

    The position is chosen uniformly; keyboard distance plays no role.

    Args:
        word: Text to perturb.
        swap_prob: Probability (between 0 and 1) that the swap is actually
            performed. Defaults to 0.8.

    Returns:
        ``word`` with one adjacent pair swapped, or ``word`` unchanged when
        it has fewer than two characters or the swap is skipped.
    """
    if len(word) < 2:
        return word
    # Draw the position first and the coin flip second, so the RNG stream is
    # always consumed in the same order.
    pos = random.randint(0, len(word) - 2)
    if random.random() < swap_prob:
        return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]
    return word


def delete_character(word):
    """Drop one character of ``word``, chosen uniformly at random.

    Words with fewer than two characters are returned unchanged, so a
    non-empty input never becomes empty.
    """
    if len(word) <= 1:
        return word
    victim = random.randint(0, len(word) - 1)
    head, tail = word[:victim], word[victim + 1:]
    return head + tail


def insert_character(word):
    """Insert one extra character at a random position in ``word``.

    The inserted character is a keyboard neighbour (via ``get_adjacent_key``)
    of the character just before the insertion point. When inserting at the
    very start there is no previous character, so a random key of
    ``KEYBOARD_ADJACENCY`` is used as the reference instead.
    """
    pos = random.randint(0, len(word))
    if pos > 0:
        base_char = word[pos - 1]
    else:
        base_char = random.choice(list(KEYBOARD_ADJACENCY))
    head, tail = word[:pos], word[pos:]
    return head + get_adjacent_key(base_char) + tail


def replace_character(word):
    """Substitute one randomly chosen character with a keyboard neighbour.

    The replacement comes from ``get_adjacent_key``, so a character it does
    not know is left as it is. An empty ``word`` is returned unchanged.
    """
    if len(word) == 0:
        return word
    pos = random.randint(0, len(word) - 1)
    before, after = word[:pos], word[pos + 1:]
    return before + get_adjacent_key(word[pos]) + after


In [8]:
test_word = "hello"

# Run each noise function once on the same word, in a fixed order, so the
# effects can be compared side by side.
print("Original:", test_word)
for _label, _noise_fn in (
    ("Swapped:", swap_characters),
    ("Deleted:", delete_character),
    ("Inserted:", insert_character),
    ("Replaced:", replace_character),
):
    print(_label, _noise_fn(test_word))


Original: hello
Swapped: helol
Deleted: helo
Inserted: herllo
Replaced: helll
