In [123]:
%pip install ankipandas numpy plotly kaleido


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [124]:
%pip install --upgrade nbformat


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [125]:
# Imports
import pandas as pd
import re
import random
import copy
import math
import json
import plotly.graph_objects as go
import numpy as np
import ankipandas
from collections import Counter

# Load Shared Datasets

This section of code loads the [Google N-Grams Data](https://github.com/orgtre/google-books-ngram-frequency/blob/main/ngrams/1grams_english.csv), along with `ipadict` for [phonetic information](https://github.com/open-dict-data/ipa-dict/blob/master/data/en_US.txt).

In [126]:
# Load in the one-gram information (word frequencies)
one_grams = pd.read_csv("1grams_english.csv")
one_grams_dict = {row[1]['ngram'].lower(): row[1]['freq'] for row in one_grams.iterrows()}

# Scoring methods designed to select one of the options from the IPA dictionary.
# It selects the shortest words with the most schwas (laziest pronunciation).


def score(word):
    word.replace(" ", "")
    return (len(word), -word.count("ə"))


def get_laziest(ipa_list):
    return list(sorted(ipa_list, key=score))[0]


# Loading the dictionary and parsing out the best scoring pronunciation
ipa = pd.read_csv("en_US.txt", sep='\t', names=["eng", "ipa"])
ipa_opt_dict = {row[1]['eng']: row[1]['ipa'].split(', ') for row in ipa.iterrows()}
ipa_dict = {row[1]['eng']: get_laziest(row[1]['ipa'].split(', ')) for row in ipa.iterrows()}

# Write Common Code

This is code that takes in:

1. A list of words,
2. A dictionary of word transformations,
3. An optional function that computes lengths of words (defaults to `len`),
4. An optional function that transforms a transformed word into the constituent "letters".

It then returns the various statistics of that model as a dictionary:
1. The mean length normalized to assume a binary alphabet (multiplied by the $log_2$ of the alphabet size),
2. The reconstruction entropy $H(X|f(X))$,
3. The reconstruction error $\mathbf{P}\{X = \mathrm{argmax}_{y : f(y) =  f(X)} p(y)\}$

In [127]:
def entropy(counts):
    entropy = 0
    total = 0
    for x in counts:
        total += counts[x]
    for x in counts:
        entropy -= (counts[x]/total)*math.log2(counts[x]/total)
    
    return entropy

def transmission_cost(letters,suprisal):
    entropy = 0
    for l in letters:
        entropy += suprisal[l]
    
    return entropy

def entropy_wl(word_list):
    countlist = {w:one_grams_dict[w] for w in word_list}
    return entropy(countlist)

def code_statistics(word_list, word_mapping, length_function = lambda x: len(x), letter_making = lambda x: x, delim = None):
    # Data to track mean outline length
    denom = 0
    total_length = 0

    # Data to track alphabet size
    letter_counts = Counter()

    # Data to track for computing reconstruction entropy and reconstruction probability
    forward = {}
    inverse = {"": {"": float('inf')}}
    most_probable = {}
    totals = {"": float('inf')}

    # iterate over word list keeping track of stats for mean length and the inverse mapping
    for word in word_list:
        count = one_grams_dict[word]

        image = word_mapping[word]
        if delim:
            image += delim

        word_length = length_function(image) * count
        total_length += word_length
        denom += count

        forward[word] = image
        if image not in inverse:
            inverse[image] = {}
            most_probable[image] = word
        inverse[image][word] = count+1
        totals[image] = totals.get(image, 1) + count

        current_letters = letter_making(image)
        letter_counts.update({l:count for l in current_letters})

    letter_total = letter_counts.total()
    suprisal = {letter: -math.log2(letter_counts[letter]/letter_total) for letter in letter_counts}

    # compute the reconstruction entropy and reconstruction probability
    total_reconstruction_entropy = 0
    total_reconstruction_probability = 0
    total_transmission_cost = 0
    for word in word_list:
        count = one_grams_dict[word]
        total_reconstruction_entropy -= float('-inf') if inverse[forward[word]][word]/totals[forward[word]] == 0 else count * math.log2(inverse[forward[word]][word]/totals[forward[word]])
        total_reconstruction_probability += count if word == most_probable[forward[word]] else 0
        total_transmission_cost += count*transmission_cost(letter_making(forward[word]), suprisal)

    running_lengths = 0
    running_count = 1

    for w in one_grams_dict:
        if w not in word_list:
            continue
        prob = one_grams_dict[w]/denom
        running_count += 1
        running_lengths += prob*math.log2(running_count)

    return {"mean_binary_length": math.log2(len(letter_counts))*total_length / denom,
            "mean_binary_excess": math.log2(len(letter_counts))*total_length / denom - running_lengths,
            "mean_entropy_shift": math.log2(len(letter_counts))*total_length / denom - entropy_wl(word_list),
            "mean_transmission_cost": total_transmission_cost / denom,
            "mean_transmission_excess": total_transmission_cost / denom - running_lengths,
            "mean_transmission_shift": total_transmission_cost / denom - entropy_wl(word_list),
            "reconstruction_entropy": total_reconstruction_entropy / denom,
            "reconstruction_error": (1-total_reconstruction_probability / denom),
            "alphabet": " ".join([x for x, _ in letter_counts.most_common()])}

# Pure Spelling

In [128]:
spelling = {x: x for x in one_grams_dict}

In [129]:
code_statistics([x for x in one_grams_dict if x in spelling], spelling)

{'mean_binary_length': 21.740674602773286,
 'mean_binary_excess': 15.093721783000312,
 'mean_entropy_shift': 11.973528232304314,
 'mean_transmission_cost': 18.81181655416222,
 'mean_transmission_excess': 12.164863734389247,
 'mean_transmission_shift': 9.044670183693249,
 'reconstruction_entropy': 0.0,
 'reconstruction_error': 0.0,
 'alphabet': "e t a o n i r s h d l c u f m w g p y b v k x j q z ' é"}

# Pure IPA representation
This removes some of the noise from the IPA representation to allow it to be tested as an encoding.

In [130]:
reduced_ipa = {x: ipa_dict[x].replace("/","").replace("ˌ","").replace("ˈ","") for x in ipa_dict}

In [131]:
code_statistics([x for x in one_grams_dict if x in reduced_ipa], reduced_ipa)

{'mean_binary_length': 20.47519327609292,
 'mean_binary_excess': 13.829919120320977,
 'mean_entropy_shift': 10.710090184552453,
 'mean_transmission_cost': 18.72784030908962,
 'mean_transmission_excess': 12.082566153317675,
 'mean_transmission_shift': 8.96273721754915,
 'reconstruction_entropy': 0.042481294979739674,
 'reconstruction_error': 0.014930232236564533,
 'alphabet': 'ə ɪ t n d s ɹ ɫ i ð ɝ k z ɛ m v p a ʊ æ w f b ʃ e ɑ h ɔ u o ŋ ɡ j ʒ θ'}

# Load Carter Briefwords
The dictionary is available [here](https://www.reddit.com/r/shorthand/comments/xg7k10/a_briefhand_resource/).

In [132]:
briefhand_raw = pd.read_csv("dict.csv")

briefhand = {}

for _,line in briefhand_raw.iterrows():
    briefhand[line['word']] = line['brief']

In [133]:
code_statistics([x for x in one_grams_dict if x in briefhand], briefhand)

{'mean_binary_length': 8.65318533787457,
 'mean_binary_excess': 3.69443099723304,
 'mean_entropy_shift': 0.9394725601340319,
 'mean_transmission_cost': 7.374164216241585,
 'mean_transmission_excess': 2.415409875600055,
 'mean_transmission_shift': -0.33954856149895285,
 'reconstruction_entropy': 1.0613532563363244,
 'reconstruction_error': 0.3048400643277738,
 'alphabet': "t a n e o s r l i d m f c b w g p z u h v k y 1 x j q 2 3 5 0 % 7 '"}

# Dutton Speedwords
The dictionary is available [here](http://www2.cmp.uea.ac.uk/~jrk/conlang.dir/Speedwords.dict).

In [134]:
with open("Speedwords.dict", "r") as file:
    speedwords_raw = file.readlines()

speedwords = {}

for line in speedwords_raw[89:]:
    if line == '\n':
        continue

    splits = line.replace("\n","").split("\t")
    if len(splits) != 2:
        continue

    rep = splits[0]
    words = splits[1]
    word_list = words.split(",")
    for word in word_list:
        speedwords[re.sub(r'[\(\[{].*?[\)\]}]', '', word)] = re.sub(r'[\(\[{].*?[\)\]}]', '', rep.replace("-",""))

In [135]:
code_statistics([x for x in one_grams_dict if x in speedwords], speedwords)

{'mean_binary_length': 9.329723457179032,
 'mean_binary_excess': 4.124412180082545,
 'mean_entropy_shift': 1.2843567331541461,
 'mean_transmission_cost': 8.55031702577631,
 'mean_transmission_excess': 3.3450057486798235,
 'mean_transmission_shift': 0.5049503017514247,
 'reconstruction_entropy': 0.38624091801605515,
 'reconstruction_error': 0.11017558339765876,
 'alphabet': 'i l u e a o s d r n t y v m p & q b g f h k z x c j w O K'}

# Gregg Anniversary

This loads in Gregg Anniversary from [here](https://github.com/grascii/dictionaries).  The key here is that blends should be counted as single alphabet entries, and all is separated by hyphens, so we need custom length and letter functions.

In [136]:
gregg_anniversary_raw = pd.read_csv("anniversary_core.csv")
gregg_anniversary_raw_2= pd.read_csv("anniversary_supplement.csv")

gregg_anniversary = {}

for _,line in gregg_anniversary_raw.iterrows():
    gregg_anniversary[str(line['word']).lower()] = re.sub("^/^-","",line['form'])

for _,line in gregg_anniversary_raw_2.iterrows():
    gregg_anniversary[str(line['word']).lower()] = re.sub("^/^-","",line['form'])

gregg_length = lambda x: x.count("-")+1
gregg_letters = lambda x: x.split("-")

In [137]:
code_statistics([x for x in one_grams_dict if x in gregg_anniversary], gregg_anniversary, length_function = gregg_length, letter_making = gregg_letters)

{'mean_binary_length': 15.425163710803988,
 'mean_binary_excess': 9.260242654630629,
 'mean_entropy_shift': 6.228619368943665,
 'mean_transmission_cost': 11.213200843065076,
 'mean_transmission_excess': 5.048279786891717,
 'mean_transmission_shift': 2.0166565012047535,
 'reconstruction_entropy': 0.25155755894450227,
 'reconstruction_error': 0.07505138574335446,
 'alphabet': 'e a o t n th s u r k m s2 h d l f nt \\ b p sh v i ^ / g pr ch j e2 tn mn th2 td ths pl tm ss ng df a2 o2 ld bl br fr ya ea nk s2s x fl dfl jnt ss2 w ye i2 ia mt ye2 kp c bld y s2h z us as2 s2s2 ya2 >  vr E ot s22 u\\ nd q ea2 ao dm mm < xs \\v oe se rd o1 re dya om s2o es pld nu ge ta rdf'}

# Gregg Notehand

This loads in Gregg Notehand from [here](https://gregg-shorthand.com/2015/09/05/notehand-dictionary/).  The key here is that blends should be counted as single alphabet entries, and all is separated by hyphens, so we need custom length and letter functions.

In [138]:
gregg_notehand_raw = pd.read_csv("Notehand.csv")

gregg_notehand = {}

for _,line in gregg_notehand_raw.iterrows():
    gregg_notehand[str(line['word']).lower()] = str(line['form'])

In [139]:
code_statistics([x for x in one_grams_dict if x in gregg_notehand], gregg_notehand, length_function = gregg_length, letter_making = gregg_letters)

{'mean_binary_length': 15.343301937676637,
 'mean_binary_excess': 9.916704355750431,
 'mean_entropy_shift': 7.050406666597668,
 'mean_transmission_cost': 11.667870112278752,
 'mean_transmission_excess': 6.241272530352546,
 'mean_transmission_shift': 3.3749748411997835,
 'reconstruction_entropy': 0.11181820004163713,
 'reconstruction_error': 0.033954582356027774,
 'alphabet': 'e a o t r s over_th n oo l m left_s k d f nd p b h I v sh under_th g ing ch th nt j  nan ng ld ten rd ted men oi ye ia ngk _ Ia ea md c . mt I_rd over_the x e_p r–s left d–s ya e_d over nk i er a_ ee + e_t a_sh'}

# Gregg Simplified

This is a different encoding of Gregg simplified, from [here](https://www.reddit.com/r/shorthand/comments/1e7ie7g/gregg_simplified_computerreadable_dictionary/)

In [140]:
gregg_simplified_raw = pd.read_csv("LEGS.csv",names=['word','form'],header = None)

gregg_simplified = {}

for _,line in gregg_simplified_raw.iterrows():
    gregg_simplified[str(line['word']).lower()] = str(line['form'])

In [141]:
code_statistics([x for x in one_grams_dict if x in gregg_simplified], gregg_simplified, letter_making = lambda x: x)

{'mean_binary_length': 11.769702509512225,
 'mean_binary_excess': 7.344900086927768,
 'mean_entropy_shift': 4.669998288575941,
 'mean_transmission_cost': 9.675608376709912,
 'mean_transmission_excess': 5.250805954125455,
 'mean_transmission_shift': 2.5759041557736273,
 'reconstruction_entropy': 0.2091399622585796,
 'reconstruction_error': 0.06600137338137413,
 'alphabet': '( e t u n a N s T r o l m A b k d f p z v i h D $ g j ) M : c % G 5 q V L ^ y R w X K 3 ~ J x P *'}

# bref

This loads the bref dictionary from [here](https://www.reddit.com/r/shorthand/comments/esjhdk/bref_shorthand/).

In [142]:
with open("11661 WORDS FORWARD.txt", "r") as file:
    bref_raw = file.readlines()

bref = {}

for line in bref_raw:
    splits = line.split(" = ")
    if len(splits) != 2:
        continue
    rep = splits[1].replace("\n","")
    word = splits[0]
    bref[word] = rep

In [143]:
code_statistics([x for x in one_grams_dict if x in bref], bref)

{'mean_binary_length': 12.321204615957537,
 'mean_binary_excess': 5.900270582891406,
 'mean_entropy_shift': 2.8401015434801753,
 'mean_transmission_cost': 11.283344313958496,
 'mean_transmission_excess': 4.862410280892365,
 'mean_transmission_shift': 1.8022412414811342,
 'reconstruction_entropy': 0.0,
 'reconstruction_error': 0.0,
 'alphabet': 't s r n e o l d c a m p i f w b g u h z x v k j y q'}

# Yublin

This loads the Yublin dictionary from [here](http://jonathanaquino.com/yublin.csv).

In [144]:
with open("yublin.csv", "r") as file:
    raw_yublin = file.readlines()

yublin = {}

for line in raw_yublin:
    splits = line.replace("\n","").split(",")
    if len(splits) != 2:
        continue
    rep = splits[1]
    word = splits[0]

    yublin[word] = rep

for x in one_grams_dict:
    if x not in yublin:
        yublin[x] = x

In [145]:
code_statistics([x for x in one_grams_dict if x in yublin], yublin)

{'mean_binary_length': 16.559197673153815,
 'mean_binary_excess': 9.912244853380841,
 'mean_entropy_shift': 6.792051302684843,
 'mean_transmission_cost': 14.726385582242223,
 'mean_transmission_excess': 8.07943276246925,
 'mean_transmission_shift': 4.9592392117732516,
 'reconstruction_entropy': 0.0,
 'reconstruction_error': 0.0,
 'alphabet': "t e o i n a s r l c d f h u m p g y b w v k j x q z ' é"}

# Cut Spelng

This loads the Cut Spelling dictionary form [here](https://github.com/DanielTillett/CutSpel/blob/master/cutspel.csv).

In [146]:
cut_spelng_raw = pd.read_csv("cutspel.csv",names = ["word","brief"], header = None)

cut_spelng = {}

for _,line in cut_spelng_raw.iterrows():
    cut_spelng[line['word']] = line['brief'].replace(" ","")
cut_spelng['muscle'] = 'musl'

for x in one_grams_dict:
    if x not in cut_spelng:
        cut_spelng[x] = x

In [147]:
code_statistics([x for x in one_grams_dict if x in cut_spelng], cut_spelng)

{'mean_binary_length': 19.852082039357892,
 'mean_binary_excess': 13.205129219584919,
 'mean_entropy_shift': 10.084935668888921,
 'mean_transmission_cost': 17.35120814244379,
 'mean_transmission_excess': 10.704255322670818,
 'mean_transmission_shift': 7.58406177197482,
 'reconstruction_entropy': 0.015095746507235148,
 'reconstruction_error': 0.005325956756667138,
 'alphabet': "t e a o n i r s h d l c u f m p y w b g v k j x q z é '"}

# Taylor

This attempts to construct an allignment of the spelling and of the IPA sounds then construct the Taylor representation of that word.

In [148]:
# Improved drop vowels function to handle stress and long vowels and ensure proper vowel removal
def drop_vowels(word):
    vowels = "aeiouAEIOUɑæɪɔʊəɜɛːɒʌ"
    punctuation_and_spaces = "ˈˌ. ,-'\";:!?()[]{}<>/@#%^&*~`"
    return ''.join([char for char in word if char not in vowels and char not in punctuation_and_spaces])

# Recursive alignment function with debug statements including match identification
def find_allowable_pairs_recursive(str1, str2, allowable_pairs, debug = False):
    def helper(s1, s2, pairs, path):
        if not s1 and not s2:
            return path
        for pair in pairs:
            l1, l2 = len(pair[0]), len(pair[1])
            if s1[:l1] == pair[0] and s2[:l2] == pair[1]:
                if debug:
                    print(f"Matching: {pair} with {s1[:l1]} and {s2[:l2]}")
                result = helper(s1[l1:], s2[l2:], pairs, path + [pair])
                if result is not None:
                    return result
        if debug:
            print(f"Failed to match: {s1} with {s2}")
        return None

    return helper(str1, str2, allowable_pairs, [])

# Function to process the word and find alignment with debug statements
def process_word(word, ipa_dict, consonant_pairs, debug = False):
    if word not in ipa_dict:
        return f"Word '{word}' not found in IPA dictionary."
    
    ipa_representation = ipa_dict[word]
    word_consonants = drop_vowels(word)
    ipa_consonants = drop_vowels(ipa_representation)
    
    if debug:
        print(f"Processing '{word}' -> '{word_consonants}' with IPA '{ipa_representation}' -> '{ipa_consonants}'")
    
    alignment = find_allowable_pairs_recursive(word_consonants, ipa_consonants, consonant_pairs, debug = debug)
    
    if alignment is None:
        return f"No valid alignment found for '{word}'."
    return alignment

# Reordered consonant_pairs to prioritize more common matches first and adding back the "f" matching to itself
consonant_pairs = [
    ("c", "k"),  # voiceless velar plosive, usually in "cat"
    ("t", "t"),  # voiceless alveolar plosive
    ("d", "d"),  # voiced alveolar plosive
    ("d", "t"),  # voiced alveolar plosive
    ("t", "d"),  # voiced alveolar plosive
    ("n", "n"),  # alveolar nasal
    ("s", "s"),  # voiceless alveolar fricative
    ("l", "l"),  # alveolar lateral approximant
    ("l", "ɫ"),
    ("r", "r"),  # alveolar approximant matching itself
    ("r", "ɹ"),  # alveolar approximant
    ("r", "ɝ"),  # alveolar approximant
    ("m", "m"),  # bilabial nasal
    ("b", "b"),  # voiced bilabial plosive
    ("k", "k"),  # voiceless velar plosive
    ("f", "f"),  # voiceless labiodental fricative
    ("g", "g"),  # voiced velar plosive
    ("g", "ɡ"),
    ("v", "v"),  # voiced labiodental fricative
    ("x", "ks"), # voiceless velar-alveolar fricative, "taxi"
    ("z", "z"),  # voiced alveolar fricative
    ("p", "p"),  # voiceless bilabial plosive
    ("h", "h"),  # voiceless glottal fricative
    ("w", "w"),  # voiced labial-velar approximant
    ("y", "j"),  # voiced palatal approximant
    ("j", "dʒ"), # voiced palato-alveolar affricate, often "j" sound
    ("j", "ʤ"), # voiced palato-alveolar affricate, often "j" sound
    ("j", "ʒ"), # voiced palato-alveolar affricate, often "j" sound
    ("ng", "ŋ"), # voiced velar nasal
    ("n", "ŋ"), # voiced velar nasal
    ("ch", "ʧ"), # voiceless palato-alveolar affricate
    ("ch", "tʃ"), # voiceless palato-alveolar fricative
    ("sh", "tʃ"), # voiceless palato-alveolar fricative
    ("sh", "ʧ"), # voiceless palato-alveolar affricate
    ("sh", "ʃ"), # voiceless palato-alveolar fricative
    ("ch", "ʃ"), # voiceless palato-alveolar fricative
    ("th", "θ"), # voiceless dental fricative
    ("th", "ð"), # voiced dental fricative, as in "this"
    ("t", "tʃ"), # voiceless palato-alveolar fricative
    ("g", "ʒ"),  # voiced palato-alveolar fricative
    ("s", "z"),  # voiced alveolar fricative, usually in "dogs"
    ("s", "ʒ"),  # voiced palato-alveolar fricative, as in "measure"
    ("s", "ʃ"),  # voiceless palato-alveolar fricative, as in "champagne"
    ("z", "ʒ"),  # voiced palato-alveolar fricative, as in "measure"
    ("z", "ʃ"),  # voiceless palato-alveolar fricative, as in "champagne"
    ("c", "ʃ"),  # voiceless palato-alveolar fricative, as in "champagne"
    ("ck", "k"), # voiceless velar plosive, as in "back"
    ("ps", "s"), # "ps" matching "s"
    ("q", "kw"), # voiceless velar and labial-velar approximant, usually in "queen"
    ("b", ""),   # silent b
    ("h", ""),   # silent h
    ("w", ""),   # silent w
    ("y", ""),   # sometimes silent y
    ("f", "v"),  # voiced labiodental fricative
    ("ph", "f"), # voiceless labiodental fricative, usually in "phone"
    ("gh", "f"), # voiceless labiodental fricative, as in "laugh"
    ("gh", ""),  # silent "gh", as in "though"
    ("kn", "n"), # silent "k", as in "knee"
    ("gn", "n"), # silent "g", as in "gnome"
    ("mn", "m"), # "mn" matching "m"
    ("mn", "n"), # "mn" matching "n"
    ("wr", "r"), # silent "w", as in "write" with matching "r"
    ("wh", "w"), # voiced labial-velar approximant, as in "what"
    ("sc", "s"), # "sc" matching "s"
    ("c", "s"),  # voiceless alveolar fricative, usually in "cent"
    ("ld", "d"), # "dd" matching "d"
    ("dd", "d"), # "dd" matching "d"
    ("ll", "l"), # "ll" matching "l"
    ("ll", "ɫ"),
    ("mm", "m"), # "mm" matching "m"
    ("nn", "n"), # "nn" matching "n"
    ("rr", "r"), # "nn" matching "n"
    ("pp", "p"), # "pp" matching "p"
    ("ff", "f"), # "pp" matching "p"
    ("tt", "t"), # "tt" matching "t"
    ("ss", "s"), # "ss" matching "s"
    ("ss", "z"), # "ss" matching "z"
    ("ss", "ʒ"), # "ss" matching "ʒ"
    ("ss", "ʃ"), # "ss" matching "ʃ"
    ("t", "ʒ"),  # "t" matching "ʒ"
    ("t", "ʃ"),  # "t" matching "ʃ"
    ("x", "gz"), # voiced velar-alveolar fricative, "exam"
    ("x", "z"),  # voiced alveolar fricative, "xylophone"
    ("zz", "ts"),# "zz" matching "ts"
    ("z", "ts"), # "z" matching "ts"
    ("g", "dʒ"), # voiced palato-alveolar affricate
    ("g", "ʤ"), # voiced palato-alveolar affricate
    ("gg", "g"), # "gg" matching voiced velar plosive
    ("gg", "ʒ"), # "gg" matching voiced palato-alveolar fricative
    ("gg", "dʒ"),# "gg" matching voiced palato-alveolar affricate
    ("gg", "ʤ"),# "gg" matching voiced palato-alveolar affricate
    ("ch", "k"), # voiceless velar plosive, as in "school"
    ("cc", "k"),  # voiceless velar plosive, usually in "cat"
    ("", "j"),   # silent matching "j"
    ("p",""),    # silent "p" in words like "pnumonia" or "coup"
    ("t",""),    # silent "p" in words like "pnumonia" or "coup"
    ("sl","l"),  # silent "s" like in "island"
    ("ch",""),   # to handle "yacht"
    ("q","k"),   # to handle "queue"
    ("l","r"),   # to handle "colonel"
    ("s",""),    # silent "s" in words like "debris"
    ("ct","t"),  # to handle "indict"
    ("l",""),    # silent "l" in words like "salmon"
    ("dn","n"),  # to handle "wednesday"
    ("z",""),    # to handle "rendezvous"
    ("r",""),    # to handle "dossier"
    ("ç","s"),   # to handle "façade"
    ("gm","m"),  # to handle "rendezvous"
    ("","w"),    # to handle "bourgeois"
    ("sc","ʃ"),  # "concience"
    ("d",""),    # "handkerchief"
    ('x', 'ɡz'),
    ('gg', 'ɡ'),
    ("cq","kw"),
    ("ng","n"),
    ("kd","t"),
    ("nm","m"),
    ("d","dʒ"),
    ("x","kʃ"),
    ("x","k"),
    ("x","ʃ"),
    ("l","ɝ"),
    ("x","ɡʒ"),
    ("s","tʃ")
]

In [149]:
taylor_convert = {
    ("c", "k"):"k",  # voiceless velar plosive, usually in "cat"
    ("t", "t"):"t",  # voiceless alveolar plosive
    ("d", "d"):"d",  # voiced alveolar plosive
    ("d", "t"):"t",  # voiced alveolar plosive
    ("t", "d"):"t",  # voiced alveolar plosive
    ("n", "n"):"n",  # alveolar nasal
    ("s", "s"):"s",  # voiceless alveolar fricative
    ("l", "l"):"l",  # alveolar lateral approximant
    ("l", "ɫ"):"l",
    ("r", "r"):"r",  # alveolar approximant matching itself
    ("r", "ɹ"):"r",  # alveolar approximant
    ("r", "ɝ"):"r",  # alveolar approximant
    ("m", "m"):"m",  # bilabial nasal
    ("b", "b"):"b",  # voiced bilabial plosive
    ("k", "k"):"k",  # voiceless velar plosive
    ("f", "f"):"f",  # voiceless labiodental fricative
    ("g", "g"):"g",  # voiced velar plosive
    ("g", "ɡ"):"g",
    ("v", "v"):"f",  # voiced labiodental fricative
    ("x", "ks"):"x", # voiceless velar-alveolar fricative, "taxi"
    ("z", "z"):"s",  # voiced alveolar fricative
    ("p", "p"):"p",  # voiceless bilabial plosive
    ("h", "h"):"h",  # voiceless glottal fricative
    ("w", "w"):"w",  # voiced labial-velar approximant
    ("y", "j"):"y",  # voiced palatal approximant
    ("j", "dʒ"):"g", # voiced palato-alveolar affricate, often "j" sound
    ("j", "ʤ"):"g", # voiced palato-alveolar affricate, often "j" sound
    ("j", "ʒ"):"g", # voiced palato-alveolar affricate, often "j" sound
    ("ng", "ŋ"):"ng", # voiced velar nasal
    ("n", "ŋ"):"n", # voiced velar nasal
    ("ch", "ʧ"):"C", # voiceless palato-alveolar affricate
    ("ch", "tʃ"):"C", # voiceless palato-alveolar fricative
    ("sh", "tʃ"):"S", # voiceless palato-alveolar fricative
    ("sh", "ʧ"):"S", # voiceless palato-alveolar affricate
    ("sh", "ʃ"):"S", # voiceless palato-alveolar fricative
    ("ch", "ʃ"):"C", # voiceless palato-alveolar fricative
    ("th", "θ"):"T", # voiceless dental fricative
    ("th", "ð"):"T", # voiced dental fricative, as in "this"
    ("t", "tʃ"):"t", # voiceless palato-alveolar fricative
    ("g", "ʒ"):"g",  # voiced palato-alveolar fricative
    ("s", "z"):"s",  # voiced alveolar fricative, usually in "dogs"
    ("s", "ʒ"):"s",  # voiced palato-alveolar fricative, as in "measure"
    ("z", "ʒ"):"s",  # voiced palato-alveolar fricative, as in "measure"
    ("z", "ʃ"):"s",  # voiceless palato-alveolar fricative, as in "champagne"
    ("s", "ʃ"):"s",  # voiceless palato-alveolar fricative, as in "champagne"
    ("c", "ʃ"):"c",  # voiceless palato-alveolar fricative, as in "champagne"
    ("ck", "k"):"k", # voiceless velar plosive, as in "back"
    ("ps", "s"):"s", # "ps" matching "s"
    ("q", "kw"):"q", # voiceless velar and labial-velar approximant, usually in "queen"
    ("b", ""):"",   # silent b
    ("h", ""):"",   # silent h
    ("w", ""):"",   # silent w
    ("y", ""):"",   # sometimes silent y
    ("f", "v"):"f",  # voiced labiodental fricative
    ("ph", "f"):"f", # voiceless labiodental fricative, usually in "phone"
    ("gh", "f"):"f", # voiceless labiodental fricative, as in "laugh"
    ("gh", ""):"",  # silent "gh", as in "though"
    ("kn", "n"):"n", # silent "k", as in "knee"
    ("gn", "n"):"n", # silent "g", as in "gnome"
    ("mn", "m"):"m", # "mn" matching "m"
    ("mn", "n"):"n", # "mn" matching "n"
    ("wr", "r"):"r", # silent "w", as in "write" with matching "r"
    ("wh", "w"):"w", # voiced labial-velar approximant, as in "what"
    ("sc", "s"):"s", # "sc" matching "s"
    ("c", "s"):"s",  # voiceless alveolar fricative, usually in "cent"
    ("ld", "d"):"d", # "dd" matching "d"
    ("dd", "d"):"d", # "dd" matching "d"
    ("ll", "l"):"l", # "ll" matching "l"
    ("ll", "ɫ"):"l",
    ("mm", "m"):"m", # "mm" matching "m"
    ("nn", "n"):"n", # "nn" matching "n"
    ("rr", "r"):"r", # "nn" matching "n"
    ("pp", "p"):"p", # "pp" matching "p"
    ("ff", "f"):"f", # "pp" matching "p"
    ("tt", "t"):"t", # "tt" matching "t"
    ("ss", "s"):"s", # "ss" matching "s"
    ("ss", "z"):"s", # "ss" matching "z"
    ("ss", "ʒ"):"s", # "ss" matching "ʒ"
    ("ss", "ʃ"):"s", # "ss" matching "ʃ"
    ("t", "ʒ"):"t",  # "t" matching "ʒ"
    ("t", "ʃ"):"t",  # "t" matching "ʃ"
    ("x", "gz"):"x", # voiced velar-alveolar fricative, "exam"
    ("x", "ɡz"):"x", # voiced velar-alveolar fricative, "exam"
    ("x", "z"):"x",  # voiced alveolar fricative, "xylophone"
    ("zz", "ts"):"z",# "zz" matching "ts"
    ("z", "ts"):"z", # "z" matching "ts"
    ("g", "dʒ"):"g", # voiced palato-alveolar affricate
    ("g", "ʤ"):"g", # voiced palato-alveolar affricate
    ("gg", "g"):"g", # "gg" matching voiced velar plosive
    ("gg", "ɡ"):"g",
    ("gg", "ʒ"):"g", # "gg" matching voiced palato-alveolar fricative
    ("gg", "dʒ"):"g",# "gg" matching voiced palato-alveolar affricate
    ("gg", "ʤ"):"g",# "gg" matching voiced palato-alveolar affricate
    ("ch", "k"):"C", # voiceless velar plosive, as in "school"
    ("cc", "k"):"k",  # voiceless velar plosive, usually in "cat"
    ("", "j"):"",   # silent matching "j"
    ("p",""):"",    # silent "p" in words like "pnumonia" or "coup"
    ("t",""):"",    # silent "p" in words like "pnumonia" or "coup"
    ("sl","l"):"l",  # silent "s" like in "island"
    ("ch",""):"",   # to handle "yacht"
    ("q","k"):"k",   # to handle "queue"
    ("l","r"):"r",   # to handle "colonel"
    ("s",""):"",    # silent "s" in words like "debris"
    ("ct","t"):"t",  # to handle "indict"
    ("l",""):"",    # silent "l" in words like "salmon"
    ("dn","n"):"n",  # to handle "wednesday"
    ("z",""):"",    # to handle "rendezvous"
    ("r",""):"",    # to handle "dossier"
    ("ç","s"):"s",   # to handle "façade"
    ("gm","m"):"m",  # to handle "rendezvous"
    ("","w"):"",    # to handle "bourgeois"
    ("sc","ʃ"):"s",  # "concience"
    ("d",""):"",    # "handkerchief"
    ("cq","kw"):"k",  # "aquire"
    ("ng","n"):"ng",
    ("kd","t"):"kt",
    ("nm","m"):"m", #government
    ("d","dʒ"):"d", #education
    ("x","kʃ"):"x",
    ("x","k"):"x", #excellent
    ("x","ʃ"):"x", #anxious
    ("l","ɝ"):"l", #colonel
    ("x","ɡʒ"):"x", #luxury
    ("s","tʃ"):"s" #tensions
}

In [150]:
def taylor_raw(word):
    consonants = "".join([taylor_convert[x] for x in process_word(word,ipa_dict, consonant_pairs)])
    vowels = "aeiouAEIOUɑæɪɔʊəɜɛːɒʌ"
    ipa = ipa_dict[word]
    punctuation_and_spaces = "ˈˌ. ,-'\";:!?()[]{}<>/@#%^&*~`"
    ipa =  ''.join([char for char in ipa if char not in punctuation_and_spaces])
    initial, final = False, False
    if ipa[0] in vowels:
        initial = True
    if len(consonants) > 0 and ipa[-1] in vowels:
        final = True
    if initial:
        consonants = "'" + consonants
    if final:
        consonants = consonants + "'"
    return consonants

In [151]:
taylor_endings = {
    'ing':(',',2),
    'ings':("_",3),
    'ble':("-b",2),
    'ful':("-f",2),
    'ly':(".",2),
    'ment':("-m",3),
    'ments':("-m",4),
    'ness':("-n",2),
    'rary':("-r",3),
    'self':("-s",3),
    'selvs':("-s",4),
    'ward':("-w",3),
    'ship':("-S",2),
    'ious':("I",1),
    'eous':("I",1),
    'uous':("I",1),
    'tion':("*",2),
    'sion':("*",2),
    'tions':("^",3),
    'sions':("^",3),
}

taylor_endings = {
    'ing':(',',2),
    'ings':("_",3),
    'ble':("b",2),
    'ful':("f",2),
    'ly':(".",2),
    'ment':("m",3),
    'ments':("m",4),
    'ness':("n",2),
    'rary':("r",3),
    'self':("s",3),
    'selvs':("s",4),
    'ward':("w",3),
    'ship':("S",2),
    'ious':("I",1),
    'eous':("I",1),
    'uous':("I",1),
    'tion':("*",2),
    'sion':("*",2),
    'tions':("^",3),
    'sions':("^",3),
}

taylor_briefs = {
    "be":"b",
    "by":"b",
    "been":"b",
    "do":"d",
    "did":"d",
    "of":"f",
    "off":"f",
    "if":"f",
    "god":"g",
    "give":"g",
    "go":"g",
    "have":"h",
    "he":"h",
    "know":"k",
    "known":"k",
    "no":"k",
    "lord":"l",
    "all":"l",
    "me":"m",
    "my":"m",
    "many":"m",
    "hand":"n",
    "and":"n",
    "an":"n",
    "in":"n",
    "peace":"p",
    "person":"p",
    "are":"r",
    "air":"r",
    "our":"r",
    "or":"r",
    "his":"s",
    "is":"s",
    "as":"s",
    "us":"s",
    "that":"t",
    "time":"t",
    "with":"w",
    "which":"w",
    "who":"w",
    "example":"x",
    "except":"x",
    "you":"y",
    "your":"y",
    "year":"y",
    "such":"C",
    "chance":"C",
    "shalt":"S",
    "shall":"S",
    "the":"T",
    "thee":"T",
    "they":"T",
    "conscious":"I",
    "judicious":"I"
}

def taylor_full(word):
    if word in taylor_briefs:
        return taylor_briefs[word]
    for ending in taylor_endings:
        if len(word) >= len(ending) and word[-len(ending):] == ending:
            symb, trim = taylor_endings[ending]
            return taylor_raw(word)[:-trim][:4] + symb
    return taylor_raw(word)

In [152]:
taylor = {}
failures = 0

for word in ipa_dict:
    try:
        taylor[word] = taylor_full(word)
    except:
        failures+=1

failures, "out of", len(ipa_dict), "failed."

(2283, 'out of', 125926, 'failed.')

In [153]:
code_statistics([x for x in one_grams_dict if x in taylor], taylor)

{'mean_binary_length': 12.003069219198045,
 'mean_binary_excess': 5.358130249730938,
 'mean_entropy_shift': 2.2383583238376072,
 'mean_transmission_cost': 9.924498156397696,
 'mean_transmission_excess': 3.27955918693059,
 'mean_transmission_shift': 0.15978726103725904,
 'reconstruction_entropy': 0.7987322326991305,
 'reconstruction_error': 0.2218609124591735,
 'alphabet': "' t n r s f l d T k m p w b g h , * . S C y x ^ q c _ I z"}

# Odell/Times Variant Approximation

This attempts to construct an allignment of the spelling and of the IPA sounds then construct the Taylor representation of that word. This is not a true implementation of Odell/Times Variant, but instead just allows for the the names of initial and final vowels to be kept.

In [154]:
# just kinda madeup lol
vowel_map = {
    "a":"a",
    "e":"e",
    "i":"i",
    "o":"o",
    "u":"u",
    "ɑ":"a",
    "æ":"a",
    "ɪ":"i",
    "ɔ":"o",
    "ʊ":"u",
    "ə":"e",
    "ɜ":"e",
    "ɛ":"e",
    "ɒ":"a",
    "ʌ":"a",
}

def taylor_plus_raw(word):
    consonants = "".join([taylor_convert[x] for x in process_word(word,ipa_dict, consonant_pairs)])
    vowels = "aeiouAEIOUɑæɪɔʊəɜɛːɒʌ"
    ipa = ipa_dict[word]
    punctuation_and_spaces = "ˈˌ. ,-'\";:!?()[]{}<>/@#%^&*~`"
    ipa =  ''.join([char for char in ipa if char not in punctuation_and_spaces])
    initial, final = False, False
    if ipa[0] in vowels:
        initial = vowel_map[ipa[0]]
        if word[0] in vowels:
            initial = word[0]
    if len(consonants) > 0 and ipa[-1] in vowels:
        final = vowel_map[ipa[-1]]
        if word[-1] in vowels:
            final = word[-1]
    if initial:
        consonants = initial + consonants
    if final:
        consonants = consonants + final
    return consonants

def taylor_plus_full(word):
    if word in taylor_briefs:
        return taylor_briefs[word]
    for ending in taylor_endings:
        if len(word) >= len(ending) and word[-len(ending):] == ending:
            symb, trim = taylor_endings[ending]
            return taylor_plus_raw(word)[:-trim][:4] + symb
    return taylor_plus_raw(word)

In [155]:
taylor_plus = {}
failures = 0

for word in ipa_dict:
    try:
        taylor_plus[word] = taylor_plus_full(word)
    except:
        failures+=1

failures, "out of", len(ipa_dict), "failed."

(2283, 'out of', 125926, 'failed.')

In [156]:
code_statistics([x for x in one_grams_dict if x in taylor_plus], taylor_plus)

{'mean_binary_length': 12.463657606792484,
 'mean_binary_excess': 5.818718637325378,
 'mean_entropy_shift': 2.698946711432047,
 'mean_transmission_cost': 10.490265252229612,
 'mean_transmission_excess': 3.845326282762506,
 'mean_transmission_shift': 0.7255543568691749,
 'reconstruction_entropy': 0.6857898653998239,
 'reconstruction_error': 0.1900756428174497,
 'alphabet': 't n r s f l d T k m p i a o w b g h e , * u . S C y x ^ q c _ I z'}

# Characterie

This uses a dictionary I made for [my webpage](https://characterie.neocities.org/).  This requires a custom alphabet and length measure that takes into account the strange way that characterie represents words.

In [157]:
words = {
    'a':['abound','about','accept','accuse','advance','air','again','age','all','almost','also','although','alter','am','amend','anger','anoint','apparel','appertain','appoint','arm','art','as','at','awe','away'], #added awe and away from the table of english words
    'b':['banish','bargain','bear','beast','beat','before','beg','begin','belly','bend','benefit','bestow','between','beware','by','because','bird','bishop','bite','blaze','blows','blood','blue','bless','bone','book','borrow','both','bottom','bread','break','breed','breast','bright','brittle','brother','bruise','burn','busy','but'], #blussy to blush by table guess
    'c':['call','can','captious','care','case','cave','cause','certain','challenge','change','christian','church','choose','kill','kind','circumstance','city','cloth','know','coin','color','command','comfort','common','compare','company','compel','continue','conceive','condition','contained','consider','confess','conscience','constant','convey','content','come','corner','corrupt','cover','council','count','cry','question','quit','compass','cut'], # caii to call and first cause to case by consulting Table.
    'd':['day','danger','deceive','declare','dedicate','dear','defend','delight','deprive','deputy','descend','desire','despise','destitute','destroy','diet','differ','dig','diligence','dissemble','distress','dizzy','do','doubt','draw','dream','dry','drink','drive','drop','due','double'], #dont to doubt by brachyography
    'e':['earth','edge','even','element','eloquence','enough','enter','enterprise','erect','err','escape','ever','example','except','exercise','expect','expert'], #every changed to ever due to overlap with all, second except changed to expect based on Bales
    'f':['face','faith','fair','fall','fare','far','fast','fat','fear','feast','feed','field','fetter','fight','fickle','fill','filthy','fine','find','fire','fish','flatter','flesh','fly','fling','flourish','follow','ford','force','forsake','fortune','foundation','fountain','free','friend','from','frown','fruit','furnish'], #third fair homophone changed to far.  Matches back table and Bales.
    'g':['gape','guard','garment','gather','gentle','guest','get','guide','given','go','god','good','gospel','glass','glory','grace','grass','grain','gravel','grave','great','grief','grove','grow'], #grassy to gravel by back table
    'h':['half','hand','hang','hard','heart','harvest','haste','have','halt','haunt','heal','he','head','help','herb','here','heat','hitherto','heaven','high','hill','history','hit','holy','hollow','honest','hope','how','hold','house','husband','hurt'], #hie to high and missing h to holy based on table
    'i':['yet','if','inheritance','enjoy','innocent','inquire','instrument','entertain','invent','you','join','young','judge','jewel'],
    'l':['labor','last','late','laugh','lean','learn','let','leather','leave','lie','liberality','life','light','like','limit','line','load','loose','love'], #ly to lie, lie to light by consulting back, removed louge for lack of matching backmatter word (all represented after other change)
    'm':['mad','make','man','manner','many','merchant','mark','marry','marvel','mass','master','matter','mean','measure','meet','mercy','merit','message','metal','mind','mine','mirth','mixed','mock','modesty','more','move','mouth','much','murmur'], #mast to master due to use with lord and maste going to rod in table (no master listed), many for money due to back and Bales
    'n':['nail','nature','necessary','neighbor','neither','net','nevertheless','nip','no','noble','nothing','nonetheless','now'],
    'o':['obey','office','offend','offer','oft','oh','omit','one','open','oppose','oppress','or','order','oath','other','over','overmuch','overtake','ought','own','our','out','outward'], 
    'p':['patient','parent','part','pass','peace','people','perfect','persuade','physique','place','plague','play','plain','plead','pledge','point','possible','power','pray','praise','preach','prejudice','prepare','present','pretend','prevail','prevent','prick','prince','promise','prophesy','proportion','prosper','prove','pulpit','punish','purge','purpose'],
    'r':['race','reign','rebuke','reach','recover','read','ready','region','rejoice','religion','remember','reap','repent','reason','resolve','rest','restore','reward','revenge','revile','rich','right','ripe','rob','rod','root','rough','rub','rule','rush'],
    's':['salute','save','scarce','school','slander','see','seed','seem','since','shine','ship','shoot','side','sink','sing','sit','skill','slip','smatter','smoke','sudden','soever','some','sore','sound','space','spare','spark','speak','spice','spit','spring','stay','start','step','steward','stone','strain','strong','study','stuff','stumble','substance','such','sweet','swell','surfeit','sun'], #shore to shoot to match back table, string to spring to also increase alignment
    't':['tame','taste','tear','temper','tempest','thank','that','then','thence','there','thither','thine','thing','think','this','thrive','tidings','till','time','together','tongue','touch','trade','treason','tree','tribute','triumph','trouble','true','turn'], # second tree changed to true (matches English Words)
    'w':['way','vain','wait','wear','warn','watch','water','weapon','weary','venture','very','virtue','vessel','weather','what','where','wherefore','which','whore','will','vine','wind','winter','violence','wife','visit','witness','wood','word','world','worship','worthy','up','uproar','wrinkle','write','use'] #wayne to way, vnrove to uproar based on back
}

particles = ['the', 'we', 'i', 'well', 'etc', 'be', 'fie', 'hence', 'they', 'myself', 'ourselves', 'so', 'so_as', 'and', 'in', 'of', 'to', 'a', 'for', 'with', 'it', 'it_is', 'it_self', 'as_it_were', 'that_is_to_say', 'that', 'least_that', 'thou', 'ye', 'self', 'ward', 'amen']

encode = {
    "0":"aa",
    "1":"ba",
    "2":"ca",
    "3":"da",
    "4":"ea",
    "5":"fa",
    "6":"ga",
    "7":"ha",
    "8":"ia",
    "9":"la"
}

num_encode = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ@#$%^&*()[]{}"

for letter in words:
    for i in range(len(words[letter])):
        encode[words[letter][i]] = num_encode[i] + letter

for i in range(len(particles)):
    encode["." + particles[i]] = num_encode[i] + "." 

In [158]:
with open('manual_curate.json') as f:
    characterie_raw = json.load(f)

def characterie_length(in_string):
    count = 0
    words = in_string.split(" ")
    for word in words:
        parts = word.split("-")
        count += len(parts)-1
        segments = parts[0].split(">")
        for segment in segments:
            if "." in segment:
                count += 1
            elif segment.isupper():
                count += len(segment)
            else:
                count += 2
    return count

parts_map = {
    "past":"-", #left dot
    "future":"+", #right dot
    "mark":"+",
    "er":":",
    "s":"+",
    "ing":"_",
    "not":"!",
    "ship":" As",
    "hood":" As",
}

def characterie_encode(in_word):
    in_string = characterie_raw[in_word]
    output = []
    words = in_string.split(" ")
    for word in words:
        parts = word.split("-")
        segments = parts[0].split(">")
        out_seg = []
        for segment in segments:
            if segment.isupper():
                out_seg.append(segment.lower())
            else:
                out_seg.append(encode[segment])
        output_word = "".join(out_seg)
        for i in range(1,len(parts)):
            output_word += parts_map[parts[i]]
        output.append(output_word)
    return " ".join(output)
    
characterie = {w:characterie_encode(w) for w in characterie_raw}
    
# # the letters, left dot, right dot, right double dot, under double dot, negation
# # the 48 foot/orientation combinations possible
# # the 32 particles (some also letters, and thus somewhat overcounted)
# characterie_alphabet = set()
# characterie_alphabet = characterie_alphabet | set("abcdefghilmnoprsztu1.:_!")
# characterie_alphabet = characterie_alphabet | set(f"o{i}" for i in range(48))
# characterie_alphabet = characterie_alphabet | set(f"p{i}" for i in range(32))

In [159]:
code_statistics([x for x in one_grams_dict if x in characterie], characterie)

{'mean_binary_length': 15.229091785638134,
 'mean_binary_excess': 9.756127863469656,
 'mean_entropy_shift': 6.8783592319593065,
 'mean_transmission_cost': 13.069736590440831,
 'mean_transmission_excess': 7.596772668272353,
 'mean_transmission_shift': 4.719004036762003,
 'reconstruction_entropy': 0.21794984827213726,
 'reconstruction_error': 0.0662984787750377,
 'alphabet': '. 0 t s a h E w c F o B D G b m f 5 H p i 8 r l I 1 d 7 n e 2 - 6 K g J M 9 C S + N A 3 P L Z   T % 4 _ V O Q R W : @ * X $ # Y U ( ] ) & { ^ ['}

# Teeline

Using the Anki deck from [here](https://ankiweb.net/shared/info/310534731).

In [160]:
from ankipandas import Collection

col = Collection("collection.anki21")

In [161]:
teeline = {}

stuff_map = {'+':'',
 '+C':' C',
 '+CH':' CH',
 '+CHmn':' CHmn',
 '+CHr':' CHr',
 '+CHs':' CHs',
 '+Cd':' Cd',
 '+Ci':' Ci',
 '+Cs':' Cs',
 '+ING':' i',
 '+N':' N',
 '+ONG':' o',
 '+S':' S',
 '+Si':' Si',
 '+W':' W',
 '+and':' and',
 '+ang':' a',
 '+ange':' ae',
 '+c':' c',
 '+ch':' ch',
 '+ci':' ci',
 '+cl':' cl',
 '+cr':' cr',
 '+eng':' e',
 '+iN':' iN',
 '+ibble':' I',
 '+ing':' i',
 '+ingC':' iC',
 '+ingc':' ic',
 '+inge':' ie',
 '+ingle':' ile',
 '+ingr':' ir',
 '+l':' l',
 '+li':' li',
 '+ly':' ly',
 '+nch':' nch',
 '+ong':' o',
 '+ung':' u'}

for _,row in col.notes.iterrows():
    info = row['nflds']
    head = info[1].replace("&nbsp;","")
    tail = info[2].replace("&nbsp;","")
    
    if "<" in info[1]:
        continue

    output = tail.split(" ")
    for i in range(len(output)):
        part = output[i]
        if len(part) > 0 and part[0] == '+':
            output[i] = stuff_map[part]
        else:
            output[i] = " "+output[i]
    final = "".join(output)[1:]

    if len(final) == 0:
        continue
        
    for word in head.split(" "):
        teeline[word] = final.lower()

teeline['things'] = 'th is'
teeline['sung'] = 's u'
teeline['compelling'] = 'cmpl i' # teeline['compelling'] = 'CMPL i'
teeline['mangle'] = 'm al'
teeline['tongue'] = 't o'
teeline['tongs'] = 't os'

In [162]:
code_statistics([x for x in one_grams_dict if x in teeline], teeline)

{'mean_binary_length': 13.596477070553554,
 'mean_binary_excess': 8.630874974793834,
 'mean_entropy_shift': 5.8566119086553075,
 'mean_transmission_cost': 11.298005285326841,
 'mean_transmission_excess': 6.332403189567121,
 'mean_transmission_shift': 3.5581401234285943,
 'reconstruction_entropy': 0.21836014216752608,
 'reconstruction_error': 0.054781492737337456,
 'alphabet': 'n s t i p r v h a w l d c m   o f b e u g y k x j q z / 8 -'}

# Dearborn Speedwriting

A dictionary of the original speedwriting from [here](https://pastebin.com/Ca0aRexr)

In [163]:
with open("dearborn.txt", "r") as file:
    speedwriting_raw = file.readlines()

speedwriting = {}

for line in speedwriting_raw:
    splits = line.replace("\n","").split(" = ")
    if len(splits) != 2:
        continue

    word = splits[0]
    rep = splits[1]
    speedwriting[word.lower()] = rep

In [164]:
code_statistics([x for x in one_grams_dict if x in speedwriting], speedwriting)

{'mean_binary_length': 9.891620202359249,
 'mean_binary_excess': 4.734476718881333,
 'mean_entropy_shift': 1.9143101331526058,
 'mean_transmission_cost': 7.722310811477187,
 'mean_transmission_excess': 2.565167327999271,
 'mean_transmission_shift': -0.2549992577294562,
 'reconstruction_entropy': 0.6123703821986793,
 'reconstruction_error': 0.18216280866243562,
 'alphabet': "t a s o n v l m w b r k f d i h p e u g , j c - z / y Z x S O ; K ' q N M A V W L U T E B F R P C G D H 2 J 3 X Q Y I"}

# Keyscript

Using a Keyscript dictionary extracted from the original PDF.

In [165]:
with open('keyscript.json') as f:
    keyscript_raw = json.load(f)

keyscript = {}
for word in keyscript_raw:
    keyscript[word.lower()] = keyscript_raw[word]

In [166]:
code_statistics([x for x in one_grams_dict if x in keyscript], keyscript)

{'mean_binary_length': 6.332050766621434,
 'mean_binary_excess': 2.061107371693759,
 'mean_entropy_shift': -0.5181941605179468,
 'mean_transmission_cost': 5.808707367137027,
 'mean_transmission_excess': 1.5377639722093521,
 'mean_transmission_shift': -1.0415375600023538,
 'reconstruction_entropy': 1.0871969584544927,
 'reconstruction_error': 0.27275805499274164,
 'alphabet': 'h a s v t i r c n w l o e x y u z m q d k p f j b g 8 9'}

# Superwrite

A superwrite dictionary created and contributed to this project by [Keith Rowe](https://github.com/keithrowe).

In [167]:
superwrite_raw = pd.read_csv("superwrite.csv",names=['form','word'],header = None)

superwrite = {}

for _,line in superwrite_raw.iterrows():
    superwrite[str(line['word']).lower()] = str(line['form'])

In [168]:
code_statistics([x for x in one_grams_dict if x in superwrite], superwrite)

{'mean_binary_length': 9.580136164172522,
 'mean_binary_excess': 5.153402737882618,
 'mean_entropy_shift': 2.561370833984843,
 'mean_transmission_cost': 8.089925367627396,
 'mean_transmission_excess': 3.663191941337492,
 'mean_transmission_shift': 1.071160037439717,
 'reconstruction_entropy': 0.3157422119141371,
 'reconstruction_error': 0.12144321942914205,
 'alphabet': 'n | t o s r a l w m f e d b u c h p i y g S v C k - 1 j x   q 2 O U N z T'}

# QC-Line Base

In [169]:
# Pattern to remove vowels that are not the first or last letter, and double consonants
vowel_pattern = r"(?<!^)[aeiou](?!$)"
double_consonant_pattern = r"(?<=(.))\1"

def replace_qc(s): #this is approximate, but should be good enough to plot it
    # Replace 'c' followed by 'e', 'i', or 'y' with 's'
    s = re.sub(r"c(?=[eiy])", "s", s)
    # Replace remaining 'c's with 'k' and 'q's with 'k'
    s = re.sub(r"[cq]", "k", s)
    return s

# Apply the vowel removal pattern first, then the double consonant pattern
qc_line = {word:re.sub(vowel_pattern, '', re.sub(double_consonant_pattern, '', replace_qc(word))) for word in one_grams_dict}

In [170]:
code_statistics([x for x in one_grams_dict if x in qc_line], qc_line)

{'mean_binary_length': 15.39536827849569,
 'mean_binary_excess': 8.748415458722716,
 'mean_entropy_shift': 5.628221908026719,
 'mean_transmission_cost': 13.517469480496352,
 'mean_transmission_excess': 6.8705166607233785,
 'mean_transmission_shift': 3.750323110027381,
 'reconstruction_entropy': 0.2327331929045948,
 'reconstruction_error': 0.0647610299624436,
 'alphabet': "t n s r h e d l k a o f m w g p y i b v u x j z ' é"}

# Yash

A rough approximation to Yash.

In [171]:
# THIS IS THE ONE THAT WORKS MOSTLY

# Remove punctuation and stress markers
def clean_word(word):
    punctuation_and_spaces = "ˈˌ. ,-'\";:!?()[]{}<>/@#%^&*~`"
    return ''.join([char for char in word if char not in punctuation_and_spaces])

# Recursive alignment function with debug statements
def find_allowable_pairs_recursive(str1, str2, allowable_pairs, debug=False):
    def helper(s1, s2, pairs, path):
        if not s1 and not s2:
            return path
        for pair in pairs:
            l1, l2 = len(pair[0]), len(pair[1])
            if s1[:l1] == pair[0] and s2[:l2] == pair[1]:
                if debug:
                    print(f"Matching: {pair} with {s1[:l1]} and {s2[:l2]}")
                result = helper(s1[l1:], s2[l2:], pairs, path + [pair])
                if result is not None:
                    return result
        if debug:
            print(f"Failed to match: {s1} with {s2}")
        return None

    return helper(str1, str2, allowable_pairs, [])

# Function to process the word and find alignment with debug statements
def process_word(word, ipa_representation, allowable_pairs, debug=False):
    cleaned_word = clean_word(word)
    cleaned_ipa = clean_word(ipa_representation)
    
    if debug:
        print(f"Processing '{word}' -> '{cleaned_word}'")
        print(f"IPA '{ipa_representation}' -> '{cleaned_ipa}'")
    
    alignment = find_allowable_pairs_recursive(cleaned_word, cleaned_ipa, allowable_pairs, debug=debug)
    
    return alignment

# Combined allowable pairs for both consonants and vowels
allowable_pairs = [
    # Consonant Combinations
    ("ch", "ʧ"), ("ch", "tʃ"), ("sh", "tʃ"), ("sh", "ʧ"), ("sh", "ʃ"),
    ("ch", "ʃ"), ("th", "θ"), ("th", "ð"), ("t", "tʃ"), ("g", "ʒ"),
    ("s", "z"), ("s", "ʒ"), ("s", "ʃ"), ("z", "ʒ"), ("z", "ʃ"), ("c", "ʃ"),
    ("ck", "k"), ("ps", "s"), ("q", "kw"), ("gh", "f"), ("ph", "f"),
    ("kn", "n"), ("gn", "n"), ("wr", "r"), ("wh", "w"), ("sc", "s"),
    ("c", "s"), ("ld", "d"), ("dd", "d"), ("ll", "l"), ("ll", "ɫ"),
    ("mm", "m"), ("nn", "n"), ("mn","m"), ("rr", "r"), ("pp", "p"), ("ff", "f"),
    ("tt", "t"), ("ss", "s"), ("ss", "z"), ("ss", "ʒ"), ("ss", "ʃ"),
    ("t", "ʒ"), ("t", "ʃ"), ("x", "gz"), ("x", "z"), ("zz", "ts"),
    ("z", "ts"), ("g", "dʒ"), ("g", "ʤ"), ("gg", "g"), ("gg", "ʒ"),
    ("gg", "dʒ"), ("gg", "ʤ"), ("ch", "k"), ("cc", "k"), 
    ("sl", "l"), ("q", "k"), ("l", "r"), 
    ("ct", "t"), ("dn", "n"),("ç", "s"),
    ("gm", "m"),("sc", "ʃ"), ("x", "ɡz"), ("gg", "ɡ"),
    ("cq", "kw"), ("ng", "n"), ("kd", "t"), ("nm", "m"), ("d", "dʒ"),
    ("x", "kʃ"), ("x", "k"), ("x", "ʃ"), ("l", "ɝ"), ("x", "ɡʒ"), ("s", "tʃ"),
    ("f", "v"), ("fth","θ"), ("cz","tʃ"), ("k","keɪ"),
    ("kn", "n"), ("gn", "n"), ("wr", "r"), ("mb", "m"),
    ("gu", "g"), ("que", "k"),("wh","hw"), ("g", "k"), ("c","tʃ"),("an","æm"),("th","tθ"),

    # Consonants
    ("c", "k"), ("t", "t"), ("d", "d"), ("d", "t"), ("t", "d"), ("n", "n"),
    ("s", "s"), ("l", "l"), ("l", "ɫ"), ("r", "r"), ("r", "ɹ"), ("r", "ɝ"),
    ("m", "m"), ("b", "b"), ("k", "k"), ("f", "f"), ("g", "g"), ("g", "ɡ"),
    ("v", "v"), ("x", "ks"), ("z", "z"), ("p", "p"), ("h", "h"), ("w", "w"), ("w", "hw"),
    ("y", "j"), ("j", "dʒ"), ("j", "ʤ"), ("j", "ʒ"), ("ng", "ŋ"), ("n", "ŋ"),

    # Vowel combinations
    ("ee","ɪ"),("ee","ə"),
    ("oo", "u"), ("oo", "ʊ"), ("ea", "i"), ("ee", "i"), 
    ("ai", "eɪ"), ("ay", "eɪ"), ("ei", "eɪ"), ("ey", "eɪ"), ("ai","ɛ"), ("ai","iɪ"),
    ("ie", "aɪ"), ("igh", "aɪ"),
    ("oa", "oʊ"), ("ow", "oʊ"), ("ow","aʊ"),
    ("ew", "ju"), ("ue", "ju"),
    ("au", "ɔ"), ("aw", "ɔ"), ("au","aʊ"), ('au', 'ə'), ("au","ɑ"),
    ("ou", "aʊ"), ("ou", "u"), ("ou", "ʊ"), ("ou","ɔ"),
    ("oi", "ɔɪ"), ("oy", "ɔɪ"), ("oo","ɪ"), ("oo","wɑ"), ("oo","wɔ"),
    ("ea","ɪ"),("ea","iə"),
    ("ay", "i"),("ay","ɛ"),
    ("eau","oʊ"),
    ("ui","wɪ"),
    ("ua","wə"),
    ("ua","weɪ"),("ua","wɪ"),
    ("oi","wɑ"), ("ui","wi"),
    
    # Vowels
    ("a", "æ"), ("a", "ə"), ("a", "ɑ"), ("a", "ɔ"), ("a", "eɪ"), ("a", "ɒ"), ("a","ɛ"), ("a","ɪ"), ("a","aɪ"), ("a", "ə"), 
    ("e", "ɛ"), ("e", "i"), ("e", "ə"), ("e", "ɪ"), ("e","æ"), ("e", "ɑ"), ("e","eɪ"),
    ("i", "ɪ"), ("i", "aɪ"), ("i", "i"), ("i", "ɜ"), ("i", "ə"), ("i","j"), ("i","jɪ"),
    ("o", "ɔ"), ("o", "ə"), ("o", "oʊ"), ("o", "ɒ"), ("o", "ɑ"), ("o", "u"), ("o","ʊ"), ("o","ɪ"), ("o", "wə"),
    ("u", "ʊ"), ("u", "ə"), ("u", "ju"), ("u", "ʌ"), ("u", "ɜ"), ("u", "u"), ("u", "ɪ"), ("u","j"), ("u","jʊ"), ("u","ɛ"), ("u","jəw"), ("u", "juw"), ("u","ɑ"), ("u","uw"),("u","ɔ"),
    ("y", "ɪ"), ("y", "aɪ"), ("y", "i"), ("y", "ə"),

    # stupid r/w stuff
    ("ar", "ɝ"),  ("er", "ɝ"),  ("or", "ɝ"),  ("yr", "ɝ"),  ("re", "ɝ"),  ("ir", "ɝ"),  ("ir", "aɪɝ"),
    ("ar", "ɹɝ"), ("er", "ɹɝ"), ("or", "ɹɝ"), ("yr", "ɹɝ"), ("re", "ɹɝ"), ("ir", "ɹɝ"), ("ir", "aɪɹɝ"),
    ("ar", "ɝɹ"), ("er", "ɝɹ"), ("or", "ɝɹ"), ("yr", "ɝɹ"), ("re", "ɝɹ"), ("ir", "ɝɹ"), ("ir", "aɪɝɹ"),
    ("u", "ɝ"), ("ew", "u"),
    
    # Silent Fallbacks and magic schwas
    ("t", ""), ("l", ""), ("ch", ""), ("s", ""), ("z", ""), ("r", ""), ("d", ""), ("n",""), ("k",""), ("p", ""),
    ("a", ""), ("e", ""), ("i", ""), ("o", ""), ("u", ""), ("gh", ""), ("b", ""), ("h", ""), ("w", ""), 
    ("", "ə"), 

    # Magic appearing "z" ??
    ("","z"),

    # Speaking the letters
    ("s","ɛs"), ("m","ɛm"), ("d","di"), ("c", "si")
]

In [172]:
def aligned(word):
    if word not in ipa_dict:
        return None
    return process_word(word, ipa_dict[word], allowable_pairs, debug=False)

In [173]:
yash_pair_mapping = {
    # Consonant Combinations
    ("ch", "ʧ"):'j', ("ch", "tʃ"):'j', ("sh", "tʃ"):'j', ("sh", "ʧ"):'j', ("sh", "ʃ"):'c',
    ("ch", "ʃ"):'c', ("th", "θ"):'y', ("th", "ð"):'y', ("t", "tʃ"):'t', ("g", "ʒ"):'g',
    ("s", "z"):'s', ("s", "ʒ"):'s', ("s", "ʃ"):'s', ("z", "ʒ"):'ts', ("z", "ʃ"):'ts', ("c", "ʃ"):'s',
    ("ck", "k"):'k', ("ps", "s"):'ps', ("q", "kw"):'kv', ("gh", "f"):'f', ("ph", "f"):'f',
    ("kn", "n"):'n', ("gn", "n"):'n', ("wr", "r"):'r', ("wh", "w"):'v', ("sc", "s"):'sc',
    ("c", "s"):'s', ("ld", "d"):'ld', ("dd", "d"):'d', ("ll", "l"):'l', ("ll", "ɫ"):'l',
    ("mm", "m"):'m', ("nn", "n"):'n', ("mn","m"):'mn', ("rr", "r"):'r', ("pp", "p"):'p', ("ff", "f"):'f',
    ("tt", "t"):'t', ("ss", "s"):'s', ("ss", "z"):'s', ("ss", "ʒ"):'s', ("ss", "ʃ"):'s',
    ("t", "ʒ"):'c', ("t", "ʃ"):'c', ("x", "gz"):'ks', ("x", "z"):'ks', ("zz", "ts"):'ts',
    ("z", "ts"):'ts', ("g", "dʒ"):'g', ("g", "ʤ"):'g', ("gg", "g"):'g', ("gg", "ʒ"):'g',
    ("gg", "dʒ"):'g', ("gg", "ʤ"):'g', ("ch", "k"):'k', ("cc", "k"):'k', 
    ("sl", "l"):'sl', ("q", "k"):'k', ("l", "r"):'l', 
    ("ct", "t"):'ct', ("dn", "n"):'dn',("ç", "s"):'s',
    ("gm", "m"):'gm',("sc", "ʃ"):'sc', ("x", "ɡz"):'ks', ("gg", "ɡ"):'g',
    ("cq", "kw"):'kv', ("ng", "n"):'ng', ("kd", "t"):'kd', ("nm", "m"):'nm', ("d", "dʒ"):'d',
    ("x", "kʃ"):'ks', ("x", "k"):'ks', ("x", "ʃ"):'ks', ("l", "ɝ"):'l', ("x", "ɡʒ"):'ks', ("s", "tʃ"):'s',
    ("f", "v"):'f', ("fth","θ"):'fy', ("cz","tʃ"):'cts', 
    ("kn", "n"):'kn', ("gn", "n"):'gn', ("wr", "r"):'r', ("mb", "m"):'mb',
    ("gu", "g"):'g', ("que", "k"):'k',("wh","hw"):'v', ("g", "k"):'g', ("c","tʃ"):'c',("an","æm"):'n',("th","tθ"):'y',

    # Consonants
    ("c", "k"):'k', ("t", "t"):'t', ("d", "d"):'d', ("d", "t"):'d', ("t", "d"):'t', ("n", "n"):'n',
    ("s", "s"):'s', ("l", "l"):'l', ("l", "ɫ"):'l', ("r", "r"):'r', ("r", "ɹ"):'r', ("r", "ɝ"):'r',
    ("m", "m"):'m', ("b", "b"):'b', ("k", "k"):'k', ("f", "f"):'f', ("g", "g"):'g', ("g", "ɡ"):'g',
    ("v", "v"):'v', ("x", "ks"):'ks', ("z", "z"):'ts', ("p", "p"):'p', ("h", "h"):'h', ("w", "w"):'v', ("w", "hw"):'v',
    ("y", "j"):'i', ("j", "dʒ"):'j', ("j", "ʤ"):'j', ("j", "ʒ"):'i', ("ng", "ŋ"):'q', ("n", "ŋ"):'n',

    # Vowel combinations
    ("ee","ɪ"):'i', ("ee","ə"):'',
    ("oo", "u"):'o', ("oo", "ʊ"):'o', ("ea", "i"):'i', ("ee", "i"):'i', 
    ("ai", "eɪ"):'a', ("ay", "eɪ"):'a', ("ei", "eɪ"):'e', ("ey", "eɪ"):'e', ("ai","ɛ"):'a', ("ai","iɪ"):'i',
    ("ie", "aɪ"):'i', ("igh", "aɪ"):'a',
    ("oa", "oʊ"):'o', ("ow", "oʊ"):'v', ("ow","aʊ"):'v',
    ("ew", "ju"):'v', ("ue", "ju"):'u',
    ("au", "ɔ"):'o', ("aw", "ɔ"):'v', ("au","aʊ"):'a', ('au', 'ə'):'a', ("au","ɑ"):'a',
    ("ou", "aʊ"):'o', ("ou", "u"):'u', ("ou", "ʊ"):'o', ("ou","ɔ"):'o',
    ("oi", "ɔɪ"):'i', ("oy", "ɔɪ"):'i', ("oo","ɪ"):'i', ("oo","wɑ"):'o', ("oo","wɔ"):'o',
    ("ea","ɪ"):'i', ("ea","iə"):'e',
    ("ay", "i"):'i', ("ay","ɛ"):'e',
    ("eau","oʊ"):'o',
    ("ui","wɪ"):'i',
    ("ua","wə"):'e',
    ("ua","weɪ"):'a',("ua","wɪ"):'i',
    ("oi","wɑ"):'o', ("ui","wi"):'u',
    
    # Vowels
    ("a", "æ"):'.', ("a", "ə"):'', ("a", "ɑ"):'.', ("a", "ɔ"):'.', ("a", "eɪ"):'.', ("a", "ɒ"):'.', ("a","ɛ"):'', ("a","ɪ"):'', ("a","aɪ"):'.', ("a", "ə"):'', 
    ("e", "ɛ"):'', ("e", "i"):'.', ("e", "ə"):'', ("e", "ɪ"):'', ("e","æ"):'.', ("e", "ɑ"):'.', ("e","eɪ"):'..',
    ("i", "ɪ"):'.', ("i", "aɪ"):'.', ("i", "i"):'.', ("i", "ɜ"):'', ("i", "ə"):'', ("i","j"):'.', ("i","jɪ"):'.',
    ("o", "ɔ"):'.', ("o", "ə"):'', ("o", "oʊ"):'.', ("o", "ɒ"):'.', ("o", "ɑ"):'.', ("o", "u"):'.', ("o","ʊ"):'.', ("o","ɪ"):'', ("o", "wə"):'.',
    ("u", "ʊ"):'.', ("u", "ə"):'', ("u", "ju"):'.', ("u", "ʌ"):'', ("u", "ɜ"):'', ("u", "u"):'.', ("u", "ɪ"):'', ("u","j"):'.', ("u","jʊ"):'.', ("u","ɛ"):'', ("u","jəw"):'.', ("u", "juw"):'.', ("u","ɑ"):'.', ("u","uw"):'.',("u","ɔ"):'.',
    ("y", "ɪ"):'.', ("y", "aɪ"):'.', ("y", "i"):'i', ("y", "ə"):'',

    # stupid r/w stuff
    ("ar", "ɝ"):'r',  ("er", "ɝ"):'r',  ("or", "ɝ"):'r',  ("yr", "ɝ"):'r',  ("re", "ɝ"):'r',  ("ir", "ɝ"):'r',  ("ir", "aɪɝ"):'r',
    ("ar", "ɹɝ"):'r', ("er", "ɹɝ"):'r', ("or", "ɹɝ"):'r', ("yr", "ɹɝ"):'r', ("re", "ɹɝ"):'r', ("ir", "ɹɝ"):'r', ("ir", "aɪɹɝ"):'r',
    ("ar", "ɝɹ"):'r', ("er", "ɝɹ"):'r', ("or", "ɝɹ"):'r', ("yr", "ɝɹ"):'r', ("re", "ɝɹ"):'r', ("ir", "ɝɹ"):'r', ("ir", "aɪɝɹ"):'r',
    ("u", "ɝ"):'', ("ew", "u"):'v',
    
    # Silent Fallbacks and magic schwas
    ("t", ""):'', ("l", ""):'', ("ch", ""):'', ("s", ""):'', ("z", ""):'', ("r", ""):'', ("d", ""):'', ("n",""):'', ("k",""):'', ("p", ""):'',
    ("a", ""):'', ("e", ""):'', ("i", ""):'', ("o", ""):'', ("u", ""):'', ("gh", ""):'', ("b", ""):'', ("h", ""):'', ("w", ""):'', 
    ("", "ə"):'', 

    # Magic appearing "z" ??
    ("","z"):'',

    # Speaking the letters
    ("s","ɛs"):'s', ("m","ɛm"):'m', ("d","di"):'d', ("c", "si"):'s', ("k","keɪ"):'k'
}

In [174]:
def yash_aligned(word, verbose = False):
    if word == "a":
        return "a"
    if word == "i":
        return "i"
    if word == "not":
        return "x"
    
    base = aligned(word)
    if base == None:
        return None

    if verbose: 
        print(base)

    mapped = "".join([yash_pair_mapping[p] for p in base])

    if verbose: 
        print(mapped)

    mapped = mapped.replace("nd","x").replace("nt","x") # X rule
    mapped = mapped.replace("st","z") # Z rule
    mapped = mapped.replace("rld","w").replace("rt","w").replace("tr","w").replace("rd","w").replace("ld","w").replace("dl","w").replace("lt","w").replace("td","w") # W rules

    if verbose:
        print(mapped)

    # not a listed rule, but he seems to almost always leave off the 
    if mapped[-2:] == "cn":
        mapped = mapped[:-1]

    if mapped[-3:] == "cns":
        mapped = mapped[:-2] + "s"

    mapped = mapped.replace(".","") # get rid of significant vowel markes

    return mapped

In [175]:
yash = {x:yash_aligned(x) for x in one_grams_dict if yash_aligned(x) != None}

In [176]:
for w in list(yash):
    if yash[w] == '':
        del yash[w]

In [177]:
code_statistics([x for x in one_grams_dict if x in yash], yash)

{'mean_binary_length': 11.047253230410739,
 'mean_binary_excess': 4.403937749275396,
 'mean_entropy_shift': 1.2842536453698834,
 'mean_transmission_cost': 10.24735486754008,
 'mean_transmission_excess': 3.6040393864047378,
 'mean_transmission_shift': 0.48435528249922477,
 'reconstruction_entropy': 0.6985334759219873,
 'reconstruction_error': 0.18388474099533814,
 'alphabet': 's r n t y l v k f m x i d p w b h a g z c o q j u e'}

# Jeake Philosophical Transaction No. 487

[This](https://royalsocietypublishing.org/doi/epdf/10.1098/rstl.1748.0041) system is very simple, and related closely to an experimental system I had code for already.

In [178]:
def translate_jotter(string, rules, default = "."):
    temp = "".join(map(lambda x: rules.get(x,""), string))
    if len(temp) > 0:
        return temp
    else:
        return default

def compile_rules_jotter(merge_list):
    rule_dict = {}
    for equiv_class in merge_list:
        head = equiv_class[0]
        for letter in equiv_class:
            rule_dict[letter] = head
    return rule_dict

In [179]:
jeake_rule = ['dt','lr','mn','uvw','sxz','bfp','qkg','y','c'] # Jeake Philosophical Transaction No. 487
compiled_jeake = compile_rules_jotter(jeake_rule)
jeake = {w:translate_jotter(w,compiled_jeake) for w in one_grams_dict}

def replace_c(s): #this is approximate, but should be good enough to plot it
    # Replace 'c' followed by 'e', 'i', or 'y' with 's'
    s = re.sub(r"c(?=[eiy])", "s", s)
    # Replace remaining 'c's with 'k'
    s = re.sub(r"c", "g", s)
    return s

def combine_letters(s):
    # Use regex to find consecutive identical letters and replace them with a single capital letter
    result = re.sub(r"(.)\1+", lambda m: m.group(1).upper(), s)
    return result

jeake = {w:combine_letters(replace_c(jeake[w])) for w in jeake}

In [180]:
code_statistics([x for x in one_grams_dict if x in jeake], jeake)

{'mean_binary_length': 10.531745445747445,
 'mean_binary_excess': 3.8847926259744714,
 'mean_entropy_shift': 0.7645990752784737,
 'mean_transmission_cost': 8.440936714765089,
 'mean_transmission_excess': 1.793983894992115,
 'mean_transmission_shift': -1.3262096557038827,
 'reconstruction_entropy': 1.4819684205694112,
 'reconstruction_error': 0.35139657535538305,
 'alphabet': 'd m l s u b g q y . D L M S B U G Q'}

# A Readable Polyphonic Cipher
Not shorthand per se, but related, [here](https://digitalcommons.butler.edu/wordways/vol8/iss1/16/)

In [181]:
polyphonic_rule = ['e','txz','acq','ilb','ogj','npkv','ryw','sfm','hdu']
compiled_polyphonic = compile_rules_jotter(polyphonic_rule)
polyphonic = {w:translate_jotter(w,compiled_polyphonic) for w in one_grams_dict}

In [182]:
code_statistics([x for x in one_grams_dict if x in polyphonic], polyphonic)

{'mean_binary_length': 14.335438221763479,
 'mean_binary_excess': 7.688485401990505,
 'mean_entropy_shift': 4.568291851294507,
 'mean_transmission_cost': 14.312929725431069,
 'mean_transmission_excess': 7.665976905658095,
 'mean_transmission_shift': 4.545783354962097,
 'reconstruction_entropy': 0.08207522153529219,
 'reconstruction_error': 0.023091815341297117,
 'alphabet': 'h e i a s t n o r'}

# Pitman 2000

This uses the output from [this online translator](https://steno.tu-clausthal.de/Pitman.php) with `proof:7` which returns the underlying ascii information that it uses to feed into metafont to draw the proper Pitman form.  This online translator uses a machine-generated outline, and is known to not be particularly accurate, so this dot location should taken with a grain of salt.  This is also essentially unabbreviated Pitman (aside from some brief forms), so it is not fully representative of true practical performance.

In [183]:
pitman2k_raw = pd.read_csv("Pitman.txt",sep=r"\|\|",engine='python',names=['word','outline'],header=None)

pitman2k = {}
pitman2k_novowel = {}

pitman_translation_dictionary = {
    # initial positional vowels
    "{1}":["1"],
    "{2}":["2"],
    "{3}":["3"],
    # connecters (ignoring concatenation)
    "&":[],
    "/":["/"],
    " ":[" "],
    "=":["="],
    # basic consonants
    "(b)":["b"],
    "(p)":["p"],
    "(d)":["d"],
    "(t)":["t"],
    "(v)":["v"],
    "(f)":["f"],
    "(dh)":["dh"],
    "(th)":["th"],
    "(zh)":["zh"],
    "(sh)":["sh"],
    "(ng)":["ng"],
    "(n)":["n"],
    "(m)":["m"],
    "(l)":["l"],
    "(r)":["r"],
    "(w)":["w"],
    "(hw)":["hw"],
    "(y)":["y"],
    "(h)":["h"],
    "(s)":["s"],
    "(z)":["z"],
    "(ch)":["ch"],
    "(jh)":["jh"],
    "(k)":["k"],
    "(g)":["g"],
    "(_r)":["_r"],
    # combining consonants with l
    "(b,l)":["bl"],
    "(p,l)":["pl"],
    "(d,l)":["dl"],
    "(t,l)":["tl"],
    "(v,l)":["vl"],
    "(f,l)":["fl"],
    "(dh,l)":["dhl"],
    "(th,l)":["thl"],
    "(zh,l)":["zhl"],
    "(sh,l)":["shl"],
    "(ng,l)":["ngl"],
    "(n,l)":["nl"],
    "(m,l)":["ml"],
    "(l,l)":["ll"],
    "(r,l)":["rl"],
    "(w,l)":["wl"],
    "(hw,l)":["hwl"],
    "(y,l)":["yl"],
    "(h,l)":["hl"],
    "(s,l)":["sl"],
    "(z,l)":["zl"],
    "(ch,l)":["chl"],
    "(jh,l)":["jhl"],
    "(k,l)":["kl"],
    "(g,l)":["gl"],
    "(_r,l)":["_rl"],
    # combining consonants with r
    "(b,r)":["br"],
    "(p,r)":["pr"],
    "(d,r)":["dr"],
    "(t,r)":["tr"],
    "(v,r)":["vr"],
    "(f,r)":["fr"],
    "(dh,r)":["dhr"],
    "(th,r)":["thr"],
    "(zh,r)":["zhr"],
    "(sh,r)":["shr"],
    "(ng,r)":["ngr"],
    "(n,r)":["nr"],
    "(m,r)":["mr"],
    "(l,r)":["lr"],
    "(r,r)":["rr"],
    "(w,r)":["wr"],
    "(hw,r)":["hwr"],
    "(y,r)":["yr"],
    "(h,r)":["hr"],
    "(s,r)":["sr"],
    "(z,r)":["zr"],
    "(ch,r)":["chr"],
    "(jh,r)":["jhr"],
    "(k,r)":["kr"],
    "(g,r)":["gr"],
    "(_r,r)":["_rr"],
    # basic vowels
    "[a]":["a"],
    "[aa]":["aa"],
    "[o]":["o"],
    "[oo]":["oo"],
    "[e]":["e"],
    "[ei]":["ei"],
    "[uh]":["uh"],
    "[ou]":["ou"],
    "[i]":["i"],
    "[ii]":["ii"],
    "[u]":["u"],
    "[uu]":["uu"],
    "[ai]":["ai"],
    "[oi]":["oi"],
    "[ow]":["ow"],
    "[yuu]":["yuu"],
    # punctuation
    "(_period_)":["punc:."],
    # briefs
    "(_i_)":["brief:i"],
    "(_a_)":["brief:a"],
    "(_all_)":["brief:all"],
    "(_and_)":["brief:and"],
    "(_as_)":["brief:as"],
    "(_but_)":["brief:but"],
    "(_you_)":["brief:_u_"],
    "(_first_)":["brief:first"],
    "(_would_)":["brief:would"],
    "(_with_)":["brief:with"],
    "(_is_)":["brief:is"],
    "(_how_)":["brief:how"],
    "(_in_)":["brief:in"],
    "(_who_)":["brief:who"],
    "(_you_)":["brief:you"],
    "(_to_)":["brief:to"],
    "(_too_)":["brief:too"],
    "(_of_)":["brief:of"],
    "(_oh_)":["brief:oh"],
    "(_on_)":["brief:on"],
    "(_should_)":["brief:should"],
    "(_the_)":["brief:the"],
    "(_u_)":["brief:_u_"],
    # numbers
    "(_zero_)":["#0"],
    "(_one_)":["#1"],
    "(_two_)":["#2"],
    "(_three_)":["#3"],
    "(_four_)":["#4"],
    "(_five_)":["#5"],
    "(_six_)":["#6"],
    "(_seven_)":["#7"],
    "(_eight_)":["#8"],
    "(_nine_)":["#9"],
    # decorations (need to fix to attach to previous consonant?)
    ",_str":["dec:str"],
    ",st":["dec:st"],
    ",ses":["dec:ses"],
    ",sis":["dec:sis"],
    ",sais":["dec:sais"],
    ",s":["dec:s"],
    ",S":["dec:s"],
    ";n":["dec:n"],
    ";sishn":["dec:sishn"],
    ";shn":["dec:shn"],
    ";Shn":["dec:shn"],
    ";f":["dec:f"],
    ";v":["dec:v"],
    # prefix
    "^con":["prefix:con"],
    "^h":["prefix:h"],
    # suffix
    "~ing":["suffix:ing"],
    # sizes
    ":tr":["size:2"],
    ":t":["size:0.5"],
    ":dr":["size:2"],
    ":dhr":["size:2"],
    ":d":["size:0.5"],
    # unknown bits, might be translated wrong
    "[_ow]":["ow"],
    "[ow_]":["ow"],
    "[_yuu]":["yuu"],
    "[i_]":["i"],
    "[ii_]":["ii"],
    "[ai_]":["ai"],
    "[_ai]":["ai"],
    "(_v,r)":["vr"],
    "(_v,l)":["vl"],
    "(k,w)":["kw"],
    "(g,w)":["gw"],
    "[ai{(0u,-13u)}]":["ai"],
    "[ou{(5u,-5u)}]":["ou"],
    "[e{(10u,5u)}]":["e"],
    "[i{(2u,10u)}]":["i"],
    "[ai{(-2u,-25u)}]":["ai"],
    "[e{(5u,0u)}]":["e"],
    "[e{(-7u,7u)}]":["e"],
    "(_dh,r)":["dhr"],
    "(_l_)":["l"],
    "(_sh)":["sh"],
    "(_l)":["l"],
    "(l_)":["l"],
    "{_three_}":["3"],
    "(m_)":["m"],
    "(n_)":["n"],
    "(_m)":["m"],
    "(_n)":["n"],
    "[ou_]":["ou"],
    "(_zh)":["zh"],
    "[uu_]":["uu"]
}

pitman_vowels = {"a","aa","o","oo","e","ei","uh","ou","i","ii","u","uu","ai","oi","ow","yuu"}

def pitman_translate(outline, verbose = False, vowel_drop_percent = 0.0):
    pos = 0
    out = []
    
    while pos < len(outline):
        matched = False
        for head in pitman_translation_dictionary:
            if head == outline[pos:pos+len(head)]:
                out+=pitman_translation_dictionary[head]
                pos += len(head)
                matched = True
                break
        if not matched:
            if verbose:
                print(f"Failed in {outline} at {outline[pos:]}")
            return None
    vowel_dropped = []
    for x in out:
        if x not in pitman_vowels or random.random() > vowel_drop_percent:
            vowel_dropped.append(x)
    #print(out,vowel_dropped)
    return vowel_dropped

for _,line in pitman2k_raw.iterrows():
    translated = pitman_translate(line['outline'],verbose=True,vowel_drop_percent=0.0)
    translated_novowel = pitman_translate(line['outline'],verbose=True,vowel_drop_percent=1.0)
    if translated == None:
        #print(f"{line['word']:<10}{line['outline']:<20}{translated}")
        continue
    else:
        pitman2k[line['word']] = tuple(translated)
        pitman2k_novowel[line['word']] = tuple(translated_novowel)

print(len(pitman2k),len(pitman2k_raw))

999 999


In [184]:
[pitman2k[x] for x in pitman2k if pitman2k[x][0] in ['1','2','3']]

[('1', 'brief:all', 'l', 's'),
 ('1', 'brief:all', 'w', 'dec:s'),
 ('1', 'n'),
 ('3', 'd', 'dec:f'),
 ('3', 'dr', 'e', 'k', 'size:0.5'),
 ('1', 'd'),
 ('1', 'f'),
 ('1', 'd'),
 ('3', 'h', 'w', 'dec:s'),
 ('3', 'r'),
 ('3', 'f'),
 ('3', 'dec:s', 't'),
 ('1', 'jh'),
 ('1', 'mr'),
 ('3', 'r'),
 ('1', 'pr', 'size:0.5'),
 ('2', 'pr', 'h', 'a', 'p', 'dec:s'),
 ('3', 'p'),
 ('1', 'shr', 'size:0.5'),
 ('1', 'th'),
 ('1', 'dh', 'size:0.5'),
 ('3', 'l'),
 ('3', '_r')]

In [185]:
code_statistics([x for x in one_grams_dict if x in pitman2k], pitman2k)

{'mean_binary_length': 13.634455336659894,
 'mean_binary_excess': 9.008772205004034,
 'mean_entropy_shift': 6.377524380264968,
 'mean_transmission_cost': 11.624689840821999,
 'mean_transmission_excess': 6.999006709166139,
 'mean_transmission_shift': 4.367758884427073,
 'reconstruction_entropy': 0.06492057146871243,
 'reconstruction_error': 0.0188530058822729,
 'alphabet': 'brief:the e size:0.5 dec:s m dec:n brief:of l brief:and 1 brief:to dh _r i ii brief:a d f k n a uh b ai ei t r ou brief:in w o p prefix:h dec:st brief:as brief:i v ch s aa ow hw brief:with g 3 brief:on sh brief:_u_ uu _rr brief:is zr brief:all size:2 jh yuu h brief:but th ng vr oo u pr tr pl dhr gr brief:would mr bl y brief:too brief:who thr br prefix:con dec:v brief:how dr kl oi brief:first tl fr kw brief:should dec:shn suffix:ing dec:ses nr dec:f kr vl shl gl fl jhr   #0 nl gw shr 2 dec:sishn dec:str brief:oh dl #1 #2 zhr dec:sais ml z zh'}

In [186]:
code_statistics([x for x in one_grams_dict if x in pitman2k_novowel], pitman2k_novowel)

{'mean_binary_length': 10.107191714684433,
 'mean_binary_excess': 5.481508583028573,
 'mean_entropy_shift': 2.850260758289507,
 'mean_transmission_cost': 8.376784683619775,
 'mean_transmission_excess': 3.751101551963915,
 'mean_transmission_shift': 1.1198537272248492,
 'reconstruction_entropy': 0.41180440117056166,
 'reconstruction_error': 0.12174817795441739,
 'alphabet': 'brief:the size:0.5 dec:s m dec:n brief:of l brief:and 1 brief:to dh _r brief:a d f k n b t r brief:in w p prefix:h dec:st brief:as brief:i v ch s hw brief:with g 3 brief:on sh brief:_u_ _rr brief:is zr brief:all size:2 jh h brief:but th ng vr pr tr pl dhr gr brief:would mr bl y brief:too brief:who thr br prefix:con dec:v brief:how dr kl brief:first tl fr kw brief:should dec:shn suffix:ing dec:ses nr dec:f kr vl shl gl fl jhr   #0 nl gw shr 2 dec:sishn dec:str brief:oh dl #1 #2 zhr dec:sais ml z zh'}

In [187]:
from itertools import combinations

# Define the iterator class
class VowelFilterIterator:
    def __init__(self, outline, vowel_list):
        self.outline = outline
        self.vowel_list = set(vowel_list)
        self.vowel_indices = [i for i, ch in enumerate(outline) if ch in self.vowel_list]
        self.current_count = 0
        self.to_keep_combinations = []

    def __iter__(self):
        return self

    def __next__(self):
        # If all combinations for the current count are exhausted, increment count
        if not self.to_keep_combinations:
            if self.current_count > len(self.vowel_indices):
                raise StopIteration
            # Generate new combinations for the current count
            self.to_keep_combinations = list(combinations(self.vowel_indices, self.current_count))
            self.current_count += 1

        # Pick the next combination of vowel indices to keep
        keep_indices = set(self.to_keep_combinations.pop(0))
        sub_list = [ch for i, ch in enumerate(self.outline) if i not in self.vowel_indices or i in keep_indices]

        return sub_list

# Test case as per user's example
outline = list(pitman2k["populate"])
vowel_list = pitman_vowels

# Run the iterator and collect results for analysis
iterator = VowelFilterIterator(outline, vowel_list)
results = list(iterator)
results

[['p', 'p', 'l', 'size:0.5'],
 ['p', 'o', 'p', 'l', 'size:0.5'],
 ['p', 'p', 'yuu', 'l', 'size:0.5'],
 ['p', 'p', 'l', 'ei', 'size:0.5'],
 ['p', 'o', 'p', 'yuu', 'l', 'size:0.5'],
 ['p', 'o', 'p', 'l', 'ei', 'size:0.5'],
 ['p', 'p', 'yuu', 'l', 'ei', 'size:0.5'],
 ['p', 'o', 'p', 'yuu', 'l', 'ei', 'size:0.5']]

In [188]:
pitman_vowel_optimized = {}
used = set()

failures = 0
for word in one_grams_dict:
    if word not in pitman2k:
        continue
    
    for candidate in VowelFilterIterator(list(pitman2k[word]), pitman_vowels):
        test = tuple(candidate)
        if test not in used:
            pitman_vowel_optimized[word] = test
            used.add(test)
            break
    
    if word not in pitman_vowel_optimized:
        failures += 1
        pitman_vowel_optimized[word] = pitman2k[word]

failures

24

In [189]:
code_statistics([x for x in one_grams_dict if x in pitman_vowel_optimized], pitman_vowel_optimized)

{'mean_binary_length': 11.136021518086377,
 'mean_binary_excess': 6.510338386430517,
 'mean_entropy_shift': 3.8790905616914513,
 'mean_transmission_cost': 9.291889834666065,
 'mean_transmission_excess': 4.6662067030102055,
 'mean_transmission_shift': 2.03495887827114,
 'reconstruction_entropy': 0.07683250655124771,
 'reconstruction_error': 0.022772203395222124,
 'alphabet': 'brief:the size:0.5 dec:s m dec:n brief:of l brief:and 1 brief:to dh _r brief:a d f k n b t r brief:in w p prefix:h ai dec:st ii brief:as brief:i v ch s ei hw brief:with g 3 brief:on sh brief:_u_ ou _rr brief:is zr e brief:all size:2 jh h brief:but th a ng vr pr tr pl uh o dhr yuu gr brief:would mr bl ow u y brief:too brief:who i thr br prefix:con dec:v oo brief:how dr kl brief:first tl fr aa kw brief:should dec:shn uu suffix:ing dec:ses nr dec:f kr vl shl gl fl oi jhr   #0 nl gw shr 2 dec:sishn dec:str brief:oh dl #1 #2 zhr dec:sais ml z zh'}

# Swiftograph - Abbott 15 

This is a simple orthographic system which can be efficiently implemented.

In [274]:
swiftograph_briefs = abbreviations = {
    "a": ["a"],
    "an": ["a"],
    "about": ["a", "b", "t"],
    "always": ["a", "l", "s"],
    "am": ["m"],
    "amount": ["a", "m", "t"],
    "and": ["n", "d"],
    "are": ["r"],
    "be, but": ["b"],
    "been": ["b", "n"],
    "because": ["b", "c"],
    "between": ["b", "t"],
    "can": ["c", "n"],
    "could": ["c", "d"],
    "do": ["d"],
    "ever": ["e", "v"],
    "for": ["f"],
    "from": ["f", "r"],
    "friend": ["f", "d"],
    "good, go": ["g"],
    "great": ["g", "r"],
    "government": ["g", "v"],
    "had": ["a", "d"],
    "have": ["v"],
    "he": ["h"],
    "in": ["n"],
    "no": ["n"],
    "made": ["m", "d"],
    "man": ["m", "n"],
    "any": ["a", "y"],
    "more": ["m", "o"],
    "must": ["m", "s", "t"],
    "most": ["m", "s", "t"],
    "name": ["n", "a"],
    "not": ["n", "t"],
    "of": ["o"],
    "only": ["n", "l", "y"],
    "parliament": ["p", "a", "r"],
    "people": ["p"],
    "question": ["q", "u"],
    "said": ["s", "d"],
    "shall": ["s", "h"],
    "should": ["s", "u"],
    "to": ["t"],
    "the": ["t", "h"],
    "than": ["t", "n"],
    "they": ["t", "i"],
    "that": ["t", "a"],
    "this": ["t", "s"],
    "their": ["t", "r"],
    "there": ["t", "r"],
    "through": ["t", "u"],
    "under": ["u"],
    "very": ["v", "r"],
    "was": ["w"],
    "what": ["w", "t"],
    "whatsoever": ["w", "t", "s", "r"],
    "where": ["w", "r"],
    "were": ["w", "r"],
    "when": ["w", "e", "n"],
    "who": ["w", "o"],
    "whole": ["o", "l"],
    "which": ["c", "h"],
    "will": ["l", "o"],
    "with": ["w", "t", "h"],
    "world": ["w", "l", "d"],
    "would": ["w", "d"],
    "you": ["y"]
}


swiftograph_blends = {
    "ai":"ai", "ea":"ea", "ao":"ao", "ei":"ei", "ie":"ie", "eu":"eu", "sg":"sg",
    "em":"m", "im":"m",
    "en":"n", "in":"n",
    "te":"t", "ti":"t",
    "de":"d", "di":"d",
    "bd":"bd", "bt":"bt",
    "ch":"ch", "th":"th",
    "au":"au", "oi":"oi", "ou":"ou"
}

def make_swiftographic(word):
    # handle briefs
    if word in swiftograph_briefs:
        sequence = swiftograph_briefs[word]
        pos = 0
        out = []
        while pos < len(sequence):
            if pos+1 < len(sequence) and sequence[pos] + sequence[pos+1] in swiftograph_blends:
                out.append(swiftograph_blends[sequence[pos] + sequence[pos+1]])
                pos += 2
            else:
                out.append(sequence[pos])
                pos += 1
        return tuple(out), tuple(out)
    
    # process word endings
    patterns = [
        (r'ed\b', 'd'),   # Replace "ed" at the end with "d"
        (r'ing\b', 'n'),  # Replace "ing" at the end with "n"
        (r'tion\b', 'un'), # Replace "tion" at the end with "un"
        (r'ight\b', ' t') # Replace "ight" at the end with " t"
    ]
    
    previous_word = None
    while word != previous_word:  # Keep applying replacements until no more changes
        previous_word = word
        for pattern, replacement in patterns:
            word = re.sub(pattern, replacement, word)

    # additional abbreviation principles
    word = word.replace("ee","e") # first trim double e's to a single "e"
    word = word.replace("qu","q") # u always assumed after q
    word = re.sub(r'(\w)\1', r'\1_', word) # replace all other double letters with the letter followed by an underscore
    word = re.sub(r'(?<!\b)[ao](?=[mn])', '', word) # remove a and o before m or n except when it is initial

    # implement blends
    sequence = list(word)

    pos = 0
    out = []
    while pos < len(sequence):
        if pos+1 < len(sequence) and sequence[pos] + sequence[pos+1] in swiftograph_blends:
            out.append(swiftograph_blends[sequence[pos] + sequence[pos+1]])
            pos += 2
        else:
            out.append(sequence[pos])
            pos += 1

    # don't write i with a dot or underscores
    out2 = []
    for i in out:
        if i == "i":
            out2.append("e")
        elif i == "_":
            continue
        else:
            out2.append(i)

    return tuple(out), tuple(out2[:5]) #return both the "by the book" version and the "by the examples" version

In [275]:
swiftograph_trim

{'the': ('th',),
 'of': ('o',),
 'and': ('n', 'd'),
 'to': ('t',),
 'a': ('a',),
 'in': ('n',),
 'that': ('t', 'a'),
 'is': ('e', 's'),
 'for': ('f',),
 'i': ('e',),
 'it': ('e', 't'),
 'was': ('w',),
 'as': ('a', 's'),
 'not': ('n', 't'),
 'with': ('w', 'th'),
 'he': ('h',),
 'on': ('o', 'n'),
 'you': ('y',),
 'be': ('b', 'e'),
 'his': ('h', 'e', 's'),
 'this': ('t', 's'),
 'by': ('b', 'y'),
 'or': ('o', 'r'),
 'are': ('r',),
 'at': ('a', 't'),
 'her': ('h', 'e', 'r'),
 'from': ('f', 'r'),
 'she': ('s', 'h', 'e'),
 'had': ('a', 'd'),
 'but': ('b', 'u', 't'),
 'have': ('v',),
 'an': ('a',),
 'they': ('te',),
 'we': ('w', 'e'),
 'were': ('w', 'r'),
 'one': ('o', 'n', 'e'),
 'all': ('a', 'l'),
 'can': ('c', 'n'),
 'which': ('ch',),
 'their': ('t', 'r'),
 'my': ('m', 'y'),
 'if': ('e', 'f'),
 'what': ('w', 't'),
 'do': ('d',),
 'there': ('t', 'r'),
 'when': ('w', 'en'),
 'would': ('w', 'd'),
 'him': ('h', 'em'),
 'so': ('s', 'o'),
 'more': ('m', 'o'),
 'me': ('m', 'e'),
 'will': ('l', 'o'

In [276]:
make_swiftographic("constant")

(('c', 'n', 's', 't', 'n', 't'), ('c', 'n', 's', 't', 'n'))

In [277]:
swiftograph = {w:make_swiftographic(w)[0] for w in one_grams_dict}
swiftograph_trim = {w:make_swiftographic(w)[1] for w in one_grams_dict}

In [278]:
code_statistics([x for x in one_grams_dict if x in swiftograph], swiftograph)

{'mean_binary_length': 18.635976768037462,
 'mean_binary_excess': 11.989023948264489,
 'mean_entropy_shift': 8.868830397568491,
 'mean_transmission_cost': 15.414464477003271,
 'mean_transmission_excess': 8.767511657230298,
 'mean_transmission_shift': 5.6473181065343,
 'reconstruction_entropy': 0.09461940533612552,
 'reconstruction_error': 0.02466666732051115,
 'alphabet': "n t s r e a o d l i c th m p u w y _ f h b g v k ou ea ch ai au x ie j q bt   z oi ei eu bd ao ' sg é"}

In [279]:
code_statistics([x for x in one_grams_dict if x in swiftograph_trim], swiftograph_trim)

{'mean_binary_length': 15.903743546645053,
 'mean_binary_excess': 9.25679072687208,
 'mean_entropy_shift': 6.136597176176082,
 'mean_transmission_cost': 12.896174290303671,
 'mean_transmission_excess': 6.249221470530697,
 'mean_transmission_shift': 3.1290279198346997,
 'reconstruction_entropy': 0.39707130141218433,
 'reconstruction_error': 0.10909772718267308,
 'alphabet': "e n t r s a o d l th c m p w u f h b y g v k ea ou ch ai x ie j au q bt   oi ei z eu bd ao ' sg é"}

# Plotting

Now we plot!  I will also include an allowable region based upon using Fano's inequality to this problem. 

In [307]:
def word_list(dict):
    return [x for x in one_grams_dict if x in dict]

scores = []
scores.append(("Spelling",code_statistics(word_list(spelling), spelling),'top center'))
scores.append(("IPA",code_statistics(word_list(reduced_ipa), reduced_ipa),'top center'))
scores.append(("Carter</br></br>Briefhand",code_statistics(word_list(briefhand), briefhand),'top center'))
scores.append(("Dutton</br></br>Speedwords",code_statistics(word_list(speedwords), speedwords),'bottom center'))
scores.append(("Gregg</br></br>Anniversary",code_statistics(word_list(gregg_anniversary), gregg_anniversary, length_function = gregg_length, letter_making = gregg_letters),'top center'))
scores.append(("Gregg</br></br>Notehand",code_statistics(word_list(gregg_notehand), gregg_notehand, length_function = gregg_length, letter_making = gregg_letters),'bottom left'))
scores.append(("Gregg</br></br>Simplified",code_statistics(word_list(gregg_simplified), gregg_simplified),'bottom center'))
scores.append(("bref",code_statistics(word_list(bref), bref),'top center'))
scores.append(("Yublin",code_statistics(word_list(yublin), yublin),'top center'))
scores.append(("Cut Spelng",code_statistics(word_list(cut_spelng), cut_spelng),'top center'))
scores.append(("Taylor",code_statistics(word_list(taylor), taylor),'top center'))
scores.append(("Taylor+",code_statistics(word_list(taylor_plus), taylor_plus),'top center'))
scores.append(("Characterie",code_statistics(word_list(characterie), characterie),'top center'))
scores.append(("Teeline",code_statistics(word_list(teeline), teeline),'bottom center'))
scores.append(("Keyscript",code_statistics(word_list(keyscript), keyscript),'top right'))
scores.append(("Dearborn</br></br>Speedwriting",code_statistics(word_list(speedwriting), speedwriting),'top center'))
scores.append(("QC-line",code_statistics(word_list(qc_line), qc_line),'middle right'))
scores.append(("Yash",code_statistics(word_list(yash), yash),'bottom center'))
scores.append(("Jeake</br></br>Ph. Tr. 487",code_statistics(word_list(jeake), jeake),'top right'))
scores.append(("Polyphonic</br></br>Cipher",code_statistics(word_list(polyphonic), polyphonic),'top right'))
scores.append(("Pitman</br></br>2000",code_statistics(word_list(pitman2k), pitman2k),'bottom left'))
scores.append(("Pitman 2000</br></br>(No Vowel)",code_statistics(word_list(pitman2k_novowel), pitman2k_novowel),'top right'))
#scores.append(("Pitman 2000</br></br>(Optimal Vowel)",code_statistics(word_list(pitman_vowel_optimized), pitman_vowel_optimized),'bottom left'))
scores.append(("Superwrite",code_statistics(word_list(superwrite), superwrite),'top left'))
scores.append(("Swiftograph</br></br>Curtailed",code_statistics(word_list(swiftograph_trim), swiftograph_trim),'top center'))
scores.append(("Swiftograph</br></br>Full",code_statistics(word_list(swiftograph), swiftograph),'middle right'))

In [308]:
with open(f"system_scores_clean.json", "w") as f:
            f.write(json.dumps(scores))

In [309]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[max(0,-x/4) for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', name = "Excluded Region", fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[-1 for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', showlegend=False, fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[0 for x in np.linspace(-1.5,10,24)], mode='lines', fill='none', showlegend=False,line=dict(color='#EF553B')))

fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in scores],
                         y=[x[1]['reconstruction_error'] for x in scores],
                         text=[x[0] for x in scores], textposition=[x[2] for x in scores],
                         mode='markers+text', name="Shorthand Systems", line=dict(color='#636EFA')))
fig.update_layout(width=1280, height=1024, xaxis_range=[-1.5, 10], yaxis_range=[-0.01, 0.4],
                  xaxis_title="Average Outline Complexity Overhead (bits)",
                  yaxis_title="Reconstruction Error (probability)")
fig.show()

In [193]:
fig.write_image("comparison_graph.svg")

In [194]:
print(" ".join(word for word in list(one_grams_dict.keys())[0:2048]))

the of and to a in that is for i it was as not with he on you be his this by or are at her from she had but have an they we were one all can which their my if what do there when would him so more me will been out up about who has no into time other them your said could did then some these also like than its may only see new how two such over our first any just after back now through even people where well most between way because know should before many down very made life work those us here get use make being good much each both man right while used world same must long go years still day own does too under take another part state little however off three around think need never come might different again without god system going want hand social during place thought against something found high case say why away information head within let eyes number left old every great example since data came men set look children water university few face research chapter small things year based 