In [3]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict

RANDOM_SEED = 42

### Load dataset

In [4]:
df = pd.read_csv("./data/sample_data.csv")

### Filter out English & Swedish

In [5]:

df_filtered = df[df['language_code'].isin(['eng', 'swe'])]

# Get all unique languages except English and Swedish
other_languages = df[~df['language_code'].isin(['eng', 'swe'])]['language_code'].unique()

# Calculate how many samples per language (ensuring equal distribution)
num_samples_per_lang = 3000 // len(other_languages)  # Divide equally

# Select equal samples from each remaining language
df_other = df[df['language_code'].isin(other_languages)].groupby('language_code').sample(n=num_samples_per_lang, random_state=42)

# Combine both datasets
df_final = pd.concat([df_filtered, df_other])

# Shuffle the dataset
df_final = df_final.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
df_final

Unnamed: 0,sentence_id,language_code,sentence
0,4325196,por,"você viu o meu telefone? poxa, pior que não."
1,11116788,eng,ziri and rima should nevr have eaten at that r...
2,7380358,eng,"mary is married now, isnt she?"
3,11905621,swe,alla santiagos planer misslyckades.
4,10211752,spa,vosotras sois la razón por la que vine.
...,...,...,...
8995,4371134,swe,tom kommer hit så gott som varje dag.
8996,11938611,swe,undantag kan gälla i vissa fall.
8997,3807254,swe,ett gott samvete är bästa huvudkudden.
8998,3049523,eng,the experiments have been being carried out si...



# Define Keyboard/Input
### Mapping the input to 3d space

In [6]:
# Swedish QWERTY Keyboard Layout with Shift and AltGr Layers
swedish_keyboard_layout = {
    # Base layer (unshifted, 0)
    0: [
        ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '+', '´'],
        ['q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', 'å', '¨'],
        ['a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'ö', 'ä', "'"],
        ['z', 'x', 'c', 'v', 'b', 'n', 'm', ',', '.', '-']
    ],

    # Shift layer (uppercase & shifted symbols, 1)
    1: [
        ['!', '"', '#', '¤', '%', '&', '/', '(', ')', '=', '?', '`'],
        ['Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', 'Å', '^'],
        ['A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Ö', 'Ä', '*'],
        ['Z', 'X', 'C', 'V', 'B', 'N', 'M', ';', ':', '_']
    ],

    # AltGr layer (symbols, now 2 instead of 1)
    2: [
        ['§', '@', '£', '$', '€', '{', '[', ']', '}', '\\', '|', '~'],
        ['q', 'w', '€', 'r', 't', 'y', 'u', 'i', 'o', 'p', 'å', '¨'],  # Only '€' differs
        ['ä', 'ß', 'ð', 'đ', 'ŋ', 'ħ', 'j', 'ĸ', 'ł', 'ö', 'ä', "'"],  # Special letters
        ['æ', '©', '¢', 'v', 'b', 'n', 'µ', ',', '.', '-']
    ]
}

# Map each key to its (row, col, layer) coordinates
keyboard_mapping = {}

for layer in [0, 1, 2]:  # Iterate over base, shift, and AltGr layers
    for row_idx, row in enumerate(swedish_keyboard_layout[layer]):
        for col_idx, key in enumerate(row):
            keyboard_mapping[key] = (row_idx, col_idx, layer)


### Get the Euclidean distance between two input nodes

In [7]:
def key_distance(k1, k2, shift_weight=1, altgr_weight=2):
    """
    Computes the Euclidean distance between two keys in a 3D space.
    Shift layer switches are penalized normally, but AltGr switches are penalized more.
    """
    if k1 in keyboard_mapping and k2 in keyboard_mapping:
        x1, y1, l1 = keyboard_mapping[k1]
        x2, y2, l2 = keyboard_mapping[k2]
        
        # Adjust layer distance weight: Shift (1), AltGr (2)
        layer_distance = abs(l1 - l2)
        if layer_distance == 1:  # Shift key switch
            layer_penalty = shift_weight
        elif layer_distance == 2:  # AltGr key switch (more costly)
            layer_penalty = altgr_weight
        else:
            layer_penalty = 0
        
        # Euclidean distance with layer penalty
        return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2) + layer_penalty

    return float('inf')  # Large distance for non-existent keys

### Probability of mistyping k1 as k2 based on distance.

In [8]:
def error_probability(k1, k2, alpha=1.5):
    """
    Computes the probability of mistyping k1 as k2 based on keyboard distance.
    Uses an exponential decay function to favor closer mistakes.
    """
    d = key_distance(k1, k2)
    return np.exp(-alpha * d) if d < float('inf') else 0


###  Generate Typo Candidates Across All Layers

In [None]:
def generate_candidates(word, typo_rate=0.2):
    """
    Generates probable mistyped versions of a word based on keyboard adjacency.
    Includes substitutions, insertions, and deletions.
    """
    candidates = defaultdict(float)
    
    for i, letter in enumerate(word):
        if letter in keyboard_mapping:
            # 1. **Substitutions**: Replace letter with a neighboring key
            for key in keyboard_mapping.keys():
                if key != letter:
                    new_word = word[:i] + key + word[i+1:]
                    candidates[new_word] += error_probability(letter, key)
            
            # 2. **Insertions**: Add a random nearby letter
            if random.random() < typo_rate:
                nearby_keys = [k for k in keyboard_mapping.keys() if error_probability(letter, k) > 0.01]
                if nearby_keys:
                    insert_key = random.choice(nearby_keys)
                    new_word = word[:i] + insert_key + word[i:]
                    candidates[new_word] += 0.05  # Small probability for insertions
            
            # 3. **Deletions**: Remove a letter
            if random.random() < typo_rate:
                new_word = word[:i] + word[i+1:]
                candidates[new_word] += 0.05  # Small probability for deletions
    
    # Sort candidates by highest probability
    return dict(sorted(candidates.items(), key=lambda x: -x[1]))

In [10]:
print(generate_candidates("hello"))  # Generates probable typos for "hello"


{'gello': np.float64(0.22313016014842982), 'Hello': np.float64(0.22313016014842982), 'h3llo': np.float64(0.22313016014842982), 'hdllo': np.float64(0.22313016014842982), 'hEllo': np.float64(0.22313016014842982), 'heklo': np.float64(0.22313016014842982), 'heLlo': np.float64(0.22313016014842982), 'helko': np.float64(0.22313016014842982), 'helLo': np.float64(0.22313016014842982), 'helli': np.float64(0.22313016014842982), 'hellp': np.float64(0.22313016014842982), 'hellO': np.float64(0.22313016014842982), 'hell}': np.float64(0.22313016014842982), 'hellł': np.float64(0.22313016014842982), 'mello': np.float64(0.119873250103762), 'h2llo': np.float64(0.119873250103762), 'h4llo': np.float64(0.119873250103762), 'hsllo': np.float64(0.119873250103762), 'hfllo': np.float64(0.119873250103762), 'hellö': np.float64(0.119873250103762), 'hell]': np.float64(0.119873250103762), 'hell\\': np.float64(0.119873250103762), 'hellĸ': np.float64(0.119873250103762), '6ello': np.float64(0.049787068367863944), 'fello'

# Functions to add noise to the data
### Frequency-Based Typo Mapping

In [None]:
# Frequency-based typo mapping (more common mistakes occur more often)
COMMON_TYPOS = {
    'å': [('a', 0.7), ('ä', 0.2), ('p', 0.1)],
    'ä': [('a', 0.5), ('ö', 0.3), ('å', 0.2)],
    'ö': [('o', 0.6), ('ä', 0.3), ('l', 0.1)],
    'e': [('r', 0.3), ('w', 0.2), ('d', 0.1)],
    'r': [('e', 0.4), ('t', 0.2), ('f', 0.1)],
    'o': [('i', 0.5), ('p', 0.3), ('ö', 0.2)],
    'n': [('m', 0.4), ('b', 0.2), (' ', 0.1)],
    'm': [('n', 0.5), (',', 0.2), (' ', 0.2)],
    ',': [('m', 0.4), ('k', 0.3), ('.', 0.2), ('l', 0.1)],
    '.': [(',', 0.5), ('-', 0.3), ('/', 0.2)],
    '-': [('.', 0.5), ('ö', 0.3), ('ä', 0.2)],
    ' ': [('m', 0.3), ('n', 0.3), (',', 0.2), ('.', 0.2)],
}

# Function to get a frequent typo replacement
def get_common_typo(char):
    """Returns a common typo with probability-based mistakes."""
    if char in COMMON_TYPOS:
        choices, weights = zip(*COMMON_TYPOS[char]) 
        return random.choices(choices, weights=weights, k=1)[0]
    return char


In [7]:
def swap_characters(word):
    """Swap two adjacent characters with a higher probability for closer keys."""
    if len(word) > 1:
        pos = random.randint(0, len(word) - 2)
        # Swap adjacent characters with 80% probability
        if random.random() < 0.8:
            return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]
    return word


def delete_character(word):
    """Randomly delete a character."""
    if len(word) > 1:
        pos = random.randint(0, len(word) - 1)
        return word[:pos] + word[pos + 1:]
    return word


def insert_character(word):
    """Insert a random adjacent character with higher probability for closer keys."""
    pos = random.randint(0, len(word))
    base_char = word[pos - 1] if pos > 0 else random.choice(list(KEYBOARD_ADJACENCY.keys()))
    random_char = get_adjacent_key(base_char)  # More likely to insert adjacent key
    return word[:pos] + random_char + word[pos:]


def replace_character(word):
    """Replace a character with a nearby key instead of a fully random key."""
    if len(word) > 0:
        pos = random.randint(0, len(word) - 1)
        random_char = get_adjacent_key(word[pos])  # Use a neighboring key
        return word[:pos] + random_char + word[pos + 1:]
    return word


In [8]:
test_word = "hello"

print("Original:", test_word)
print("Swapped:", swap_characters(test_word))
print("Deleted:", delete_character(test_word))
print("Inserted:", insert_character(test_word))
print("Replaced:", replace_character(test_word))


Original: hello
Swapped: helol
Deleted: helo
Inserted: herllo
Replaced: helll
