# **Finding the Optimal Layout**

In [1]:
from collections import defaultdict

In [6]:
def compute_loss(keyboard_layout, corpus):
    key_map = {}
    for i, key in enumerate(keyboard_layout):
        for letter in key:
            key_map[letter] = i

    word_patterns = defaultdict(int)
    for word in corpus:
        pattern = []
        last_key = None
        for letter in word:
            key = key_map[letter]
            if key != last_key:
                pattern.append(key)
                last_key = key
        word_patterns[tuple(pattern)] += 1

    # The loss is the number of words that have the same pattern
    loss = sum(count for count in word_patterns.values() if count > 1)
    return loss

In [7]:
def backtrack(current_layout, remaining_letters, max_keys, best_layout, best_loss, corpus):
    # Check if we have assigned all letters and have at most max_keys keys
    if not remaining_letters and len(current_layout) <= max_keys:
        # Evaluate this layout
        loss = compute_loss(current_layout, corpus)
        if loss < best_loss[0]:
            best_loss[0] = loss
            best_layout[:] = current_layout[:]
        return

    # If already have max_keys and there are still letters left, return
    if len(current_layout) >= max_keys:
        return

    # Try to group remaining letters into the next key, considering all possible splits
    for i in range(1, len(remaining_letters) + 1):
        new_key = remaining_letters[:i]
        new_remaining = remaining_letters[i:]
        backtrack(current_layout + [new_key], new_remaining, max_keys, best_layout, best_loss, corpus)

In [8]:
def find_optimal_keyboard(corpus, max_keys=12):
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    best_layout = []
    best_loss = [float('inf')]

    backtrack([], alphabet, max_keys, best_layout, best_loss, corpus)

    return best_layout, best_loss[0]

In [9]:
# Example usage:
corpus = ["HELLO", "WORLD", "KEYBOARD", "CIRCUIT", "EXAMPLE", "PYTHON", "OPTIMIZATION", "TASK"]
optimal_layout, minimal_loss = find_optimal_keyboard(corpus)

print("Optimal Layout:", optimal_layout)
print("Minimal Loss:", minimal_loss)


Optimal Layout: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'LMNOPQRSTUVWXYZ']
Minimal Loss: 0


# **Creating Vocabulary**

We are using the COCA samples

In [79]:
import os
import re
from collections import Counter

# Define the path to your text files
directory = '../data/corpus/coca-samples-text'

# Initialize a counter for the vocabulary
vocabulary = Counter()

# This regex matches only alphabetic sequences (i.e., words)
word_pattern = re.compile(r'\b[a-zA-Z]+\b')

# Read and process each file
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            # Normalize the text
            text = text.lower()  # Convert to lowercase
            # Find all valid words
            words = word_pattern.findall(text)
            
            # Update the vocabulary counter with words
            vocabulary.update(words)

# Print the most common words
print(vocabulary.most_common(100))  # Print the 100 most common words
print(len(vocabulary))  # Print the number of unique words

[('the', 462883), ('to', 238874), ('and', 232584), ('of', 218668), ('a', 206816), ('in', 154602), ('i', 139623), ('that', 124059), ('you', 109927), ('p', 108290), ('s', 107201), ('it', 104072), ('is', 94201), ('for', 79002), ('on', 65490), ('was', 64461), ('with', 59800), ('he', 57779), ('this', 51981), ('t', 51527), ('as', 51304), ('n', 51142), ('we', 47814), ('are', 47246), ('have', 47011), ('be', 46709), ('not', 44061), ('but', 42634), ('they', 42499), ('at', 42245), ('do', 41723), ('what', 35786), ('from', 34702), ('his', 33609), ('by', 32861), ('or', 32280), ('all', 30252), ('she', 30008), ('my', 29416), ('an', 28691), ('about', 27869), ('so', 27507), ('there', 27373), ('one', 27128), ('her', 26401), ('had', 25676), ('if', 25430), ('me', 24875), ('your', 24687), ('who', 23555), ('can', 23406), ('out', 23357), ('their', 23236), ('no', 23179), ('has', 22791), ('up', 22668), ('were', 22508), ('like', 22124), ('when', 21978), ('just', 21765), ('would', 21669), ('more', 20965), ('will'

In [80]:
# Assuming 'vocabulary' is a Counter or set that contains your vocabulary

# Define the tags you want to check
tags = ['@5018041', '@5108241', '@5108341', '@5108141', '<p>', '!', 'p', '5108141', 'test', 'of']  # Example tags

# Check if each tag is in the vocabulary
for tag in tags:
    if tag in vocabulary:
        print(f"Tag '{tag}' is present in the vocabulary.")
    else:
        print(f"Tag '{tag}' is NOT present in the vocabulary.")


Tag '@5018041' is NOT present in the vocabulary.
Tag '@5108241' is NOT present in the vocabulary.
Tag '@5108341' is NOT present in the vocabulary.
Tag '@5108141' is NOT present in the vocabulary.
Tag '<p>' is NOT present in the vocabulary.
Tag '!' is NOT present in the vocabulary.
Tag 'p' is present in the vocabulary.
Tag '5108141' is NOT present in the vocabulary.
Tag 'test' is present in the vocabulary.
Tag 'of' is present in the vocabulary.


In [81]:
for word in vocabulary:
    if word.startswith('@'):
        print(word)  # Print the word starting with '@'
    if not word.isalpha():
        print(word)

In [82]:
# Valid single-letter words
valid_single_letter_words = {'a', 'i'}
valid_two_letter_words = {'am', 'an', 'as', 'at', 'ax', 'be', 'by', 'do', 'go', 'he', 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of,' 'ok', 'on', 'or', 'ox', 'so', 'to', 'up', 'us', 'we'}

# Function to filter vocabulary
def filter_vocabulary(vocabulary):
    filtered_vocab = Counter()
    
    for word, count in vocabulary.items():
        if (len(word) == 1 and word not in valid_single_letter_words) or (len(word) == 2 and word not in valid_two_letter_words):
            continue  # Skip this word
        filtered_vocab[word] = count
    
    return filtered_vocab   

# Filter the vocabulary
filtered_vocabulary = filter_vocabulary(vocabulary)

# Print the most common words after filtering
print(filtered_vocabulary.most_common(100))
print(len(filtered_vocabulary))

[('the', 462883), ('to', 238874), ('and', 232584), ('a', 206816), ('in', 154602), ('i', 139623), ('that', 124059), ('you', 109927), ('it', 104072), ('is', 94201), ('for', 79002), ('on', 65490), ('was', 64461), ('with', 59800), ('he', 57779), ('this', 51981), ('as', 51304), ('we', 47814), ('are', 47246), ('have', 47011), ('be', 46709), ('not', 44061), ('but', 42634), ('they', 42499), ('at', 42245), ('do', 41723), ('what', 35786), ('from', 34702), ('his', 33609), ('by', 32861), ('or', 32280), ('all', 30252), ('she', 30008), ('my', 29416), ('an', 28691), ('about', 27869), ('so', 27507), ('there', 27373), ('one', 27128), ('her', 26401), ('had', 25676), ('if', 25430), ('me', 24875), ('your', 24687), ('who', 23555), ('can', 23406), ('out', 23357), ('their', 23236), ('no', 23179), ('has', 22791), ('up', 22668), ('were', 22508), ('like', 22124), ('when', 21978), ('just', 21765), ('would', 21669), ('more', 20965), ('will', 20664), ('know', 18882), ('said', 18678), ('did', 17627), ('been', 17483

In [83]:
# Function to filter vocabulary
def filter_vocabulary_min_freq(vocabulary, min_count=2):
    filtered_vocab = Counter()
    
    for word, count in vocabulary.items():
        if count < min_count:
            continue  # Skip this word
        filtered_vocab[word] = count
    
    return filtered_vocab

In [84]:
# Filter the vocabulary
filtered_vocabulary_len = filter_vocabulary_min_freq(filtered_vocabulary, min_count=20)

In [86]:
# Get the least common words by reversing the output of most_common()
least_common_words = filtered_vocabulary_len.most_common()[::-1]

# Print the least common words, e.g., the 100 least common words
print(least_common_words[:100])
print(len(least_common_words))

[('tcr', 20), ('cspi', 20), ('guenther', 20), ('barristers', 20), ('korgano', 20), ('angkatell', 20), ('eun', 20), ('niran', 20), ('iphones', 20), ('soulmate', 20), ('stephanopoulo', 20), ('nair', 20), ('vinita', 20), ('outfront', 20), ('kaine', 20), ('karr', 20), ('todays', 20), ('gaylin', 20), ('lac', 20), ('churkin', 20), ('ntsb', 20), ('brigitte', 20), ('ewell', 20), ('estrogen', 20), ('lainey', 20), ('authoraffiliation', 20), ('teesha', 20), ('solicitors', 20), ('dermot', 20), ('kosnik', 20), ('chtarri', 20), ('shatlow', 20), ('shoo', 20), ('guruji', 20), ('rambling', 20), ('threaded', 20), ('dissipation', 20), ('chromosome', 20), ('originalist', 20), ('farnsworth', 20), ('ptl', 20), ('kompetenz', 20), ('mammography', 20), ('hpv', 20), ('illustrators', 20), ('oncol', 20), ('leicester', 20), ('comaroff', 20), ('winked', 20), ('popov', 20), ('catchment', 20), ('cfd', 20), ('bargmann', 20), ('groundfish', 20), ('causative', 20), ('parallelism', 20), ('veronese', 20), ('emulation', 20

In [87]:
print(len(filtered_vocabulary))
print(len(filtered_vocabulary_len))

122020
19951


In [None]:
# ADD THE CORPUS PHRASES

In [None]:
# Optionally, save the vocabulary to a file
with open('vocabulary.txt', 'w', encoding='utf-8') as vocab_file:
    for word, count in vocabulary.items():
        vocab_file.write(f'{word}: {count}\n')
