In [1]:
# Imports
import pandas as pd
import re
import random
import copy
import math
import json
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from collections import Counter

# Load Shared Datasets

This section of code loads the [Google N-Grams Data](https://github.com/orgtre/google-books-ngram-frequency/blob/main/ngrams/1grams_english.csv), along with `ipadict` for [phonetic information](https://github.com/open-dict-data/ipa-dict/blob/master/data/en_US.txt).

In [2]:
# Load in the one-gram information (word frequencies)
one_grams = pd.read_csv("1grams_english.csv")
one_grams_dict = {row[1]['ngram'].lower(): row[1]['freq'] for row in one_grams.iterrows()}

# Scoring methods designed to select one of the options from the IPA dictionary.
# It selects the shortest words with the most schwas (laziest pronunciation).


def score(word):
    word.replace(" ", "")
    return (len(word), -word.count("ə"))


def get_laziest(ipa_list):
    return list(sorted(ipa_list, key=score))[0]


# Loading the dictionary and parsing out the best scoring pronunciation
ipa = pd.read_csv("en_US.txt", sep='\t', names=["eng", "ipa"])
ipa_opt_dict = {row[1]['eng']: row[1]['ipa'].split(', ') for row in ipa.iterrows()}
ipa_dict = {row[1]['eng']: get_laziest(row[1]['ipa'].split(', ')) for row in ipa.iterrows()}

In [3]:
# Print out all our letters
[(x,'') for x in {y for word in ipa_dict.values() for y in word}]

[('p', ''),
 ('ð', ''),
 ('ə', ''),
 ('ɡ', ''),
 ('θ', ''),
 ('ɛ', ''),
 ('ɑ', ''),
 ('ɫ', ''),
 ('n', ''),
 ('t', ''),
 ('ɹ', ''),
 ('ɪ', ''),
 ('h', ''),
 ('v', ''),
 ('a', ''),
 ('ˈ', ''),
 ('j', ''),
 ('/', ''),
 ('e', ''),
 ('ˌ', ''),
 ('i', ''),
 ('m', ''),
 ('k', ''),
 ('u', ''),
 ('f', ''),
 ('æ', ''),
 ('s', ''),
 ('ɔ', ''),
 ('w', ''),
 ('ʃ', ''),
 ('ʒ', ''),
 ('z', ''),
 ('ŋ', ''),
 ('ɝ', ''),
 ('o', ''),
 ('ʊ', ''),
 ('d', ''),
 ('b', '')]

In [48]:
# Code to do basic string transformations
def translate_add(string, rules, verbose = False): 
    out = []
    pos = 0
    while pos < len(string):
        matched = False
        for rule_head, rule_tail in rules:
            if string[pos:pos+len(rule_head)] == rule_head:
                if rule_tail != None:
                    out.append(rule_tail)
                matched = True
                pos += len(rule_head)
                if verbose:
                    print(f"Matched: {rule_head} -> {rule_tail}")
                break
        if not matched:
            if verbose:
                print(f"Skipped: {string[pos]}")
            pos += 1
    return out

def translate(word, rules, verbose = False):
    return_string = "".join(translate_add("~" + ipa_dict[word].replace('/','').replace('ˈ','').replace('ˌ','') + "~",rules, verbose))
    if len(return_string) == 0:
        return 'e'
    else:
        return return_string

In [49]:
# Consonants without variable representation
unpaired_consonants = [('ɫ', 'l'),
 ('ɹ', 'r'),
 ('m', 'm'),
 ('ŋ', 'ŋ'),
 ('n', 'n'),
 ('w', 'w'),
 ('h', 'h'),
 ('j', 'y')]

# ch and j
full_affrictives = [
 ('dʒ', 'j'),
 ('tʃ', 'C')]

merged_affrictives = [
 ('dʒ', 'C'),
 ('tʃ', 'C')]

# Frictives
full_frictives = [
 ('s', 's'),
 ('ʒ', 'ʒ'),
 ('ʃ', 'ʃ'),
 ('v', 'v'),
 ('θ', 'θ'),
 ('f', 'f'),
 ('z', 'z'),
 ('ð', 'ð')]

merged_frictives = [
 ('s', 's'),
 ('ʒ', 'ʃ'),
 ('ʃ', 'ʃ'),
 ('v', 'f'),
 ('θ', 'θ'),
 ('f', 'f'),
 ('z', 's'),
 ('ð', 'θ')]

# Plosives
full_plosives = [
 ('t', 't'),
 ('b', 'b'),
 ('ɡ', 'g'),
 ('p', 'p'),
 ('d', 'd'),
 ('k', 'k')]

merged_plosives = [
 ('t', 't'),
 ('b', 'p'),
 ('ɡ', 'k'),
 ('p', 'p'),
 ('d', 't'),
 ('k', 'k')]

consonant_types = {
    'full consonants': full_affrictives + unpaired_consonants + full_frictives + full_plosives,
    'full plosives, merged frictives': full_affrictives + unpaired_consonants + merged_frictives + full_plosives,
    'merged consonants': merged_affrictives + unpaired_consonants + merged_frictives + merged_plosives,
}

short_consonant_names = {
    'full consonants': 'FC',
    'full plosives, merged frictives': 'FPMF',
    'merged consonants': 'MC',
}

In [50]:
# Schwa's Name
schwa_name = 'e'

# Stupid R
r_with_vowels = [('ɝ', schwa_name + 'r')]
r_with_lateral_vowels = [('~ɝ', schwa_name + 'r'),('ɝ', 'r')]
r_with_flattened_lateral_vowels = [('~ɝ', 'er'),('ɝ', 'r')]
r_without_vowels = [('ɝ', 'r')]

In [51]:
# Long Vowels
long_vowels = [
 ('i', 'e'),
 ('eɪ', 'a'),
 ('oʊ', 'o'),
 ('aɪ', 'i'),
 ('ju', 'u'),
]

lateral_long_vowels = [
 ('~i', 'e'),
 ('~eɪ', 'a'),
 ('~oʊ', 'o'),
 ('~aɪ', 'i'),
 ('~ju', 'u'),
 ('i~', 'e'),
 ('eɪ~', 'a'),
 ('oʊ~', 'o'),
 ('aɪ~', 'i'),
 ('ju~', 'u'),
]

flattened_lateral_long_vowels = [
 ('~i', 'e'),
 ('~eɪ', 'e'),
 ('~oʊ', 'e'),
 ('~aɪ', 'e'),
 ('~ju', 'e'),
 ('i~', 'e'),
 ('eɪ~', 'e'),
 ('oʊ~', 'e'),
 ('aɪ~', 'e'),
 ('ju~', 'e'),
]

# Schwa
schwa = [
 ('ə', schwa_name),
]

lateral_schwa = [
 ('~ə', schwa_name),
 ('ə~', schwa_name),
]

flattened_lateral_schwa = [
 ('~ə', 'e'),
 ('ə~', 'e'),
]

# remaining vowels
remaining_vowels = [
 ('ɑ', 'a'),
 ('æ', 'a'),
 ('e', 'e'),
 ('ʊ', 'u'),
 ('ɛ', 'e'),
 ('ɔ', 'o'),
 ('a', 'a'),
 ('o', 'o'),
 ('ɪ', 'i'),
 ('u', 'u'),
]

lateral_remaining_vowels = [
 ('~ɑ', 'a'),
 ('~æ', 'a'),
 ('~e', 'e'),
 ('~ʊ', 'u'),
 ('~ɛ', 'e'),
 ('~ɔ', 'o'),
 ('~a', 'a'),
 ('~o', 'o'),
 ('~ɪ', 'i'),
 ('~u', 'u'),
 ('ɑ~', 'a'),
 ('æ~', 'a'),
 ('e~', 'e'),
 ('ʊ~', 'u'),
 ('ɛ~', 'e'),
 ('ɔ~', 'o'),
 ('a~', 'a'),
 ('o~', 'o'),
 ('ɪ~', 'i'),
 ('u~', 'u'),
]

flattened_lateral_remaining_vowels = [
 ('~ɑ', 'e'),
 ('~æ', 'e'),
 ('~e', 'e'),
 ('~ʊ', 'e'),
 ('~ɛ', 'e'),
 ('~ɔ', 'e'),
 ('~a', 'e'),
 ('~o', 'e'),
 ('~ɪ', 'e'),
 ('~u', 'e'),
 ('ɑ~', 'e'),
 ('æ~', 'e'),
 ('e~', 'e'),
 ('ʊ~', 'e'),
 ('ɛ~', 'e'),
 ('ɔ~', 'e'),
 ('a~', 'e'),
 ('o~', 'e'),
 ('ɪ~', 'e'),
 ('u~', 'e'),
]

# kill 'y' as part of long vowels
no_long_u = [('ju','')]

vowel_types = {
    'full vowels': r_with_vowels + long_vowels + schwa + remaining_vowels,
    'schwa suppressed vowels': r_with_lateral_vowels + long_vowels + lateral_schwa + remaining_vowels,
    'short suppressed vowels': r_with_lateral_vowels + long_vowels + lateral_schwa + lateral_remaining_vowels,
    'long vowels': r_without_vowels + long_vowels,
    'lateral vowels': r_with_lateral_vowels + lateral_long_vowels + lateral_schwa + lateral_remaining_vowels + no_long_u,
    'flattened lateral vowels': r_with_flattened_lateral_vowels + flattened_lateral_long_vowels + lateral_schwa + flattened_lateral_remaining_vowels + no_long_u,
    'no vowels': r_without_vowels + no_long_u 
}

short_vowel_names = {
    'full vowels': 'FV',
    'schwa suppressed vowels': 'SSV',
    'short suppressed vowels': 'ShSV',
    'long vowels': 'LoV',
    'lateral vowels': 'LV',
    'flattened lateral vowels': 'FLV',
    'no vowels': 'NV' 
}

In [52]:
# quick test code
word = random.choice(list(ipa_dict.keys()))
consonant_type = random.choice(list(consonant_types.keys()))
vowel_type = random.choice(list(vowel_types.keys()))
print(f"{word} -> {ipa_dict[word].replace('/','').replace('ˈ','').replace('ˌ','')} -> {translate(word,vowel_types[vowel_type] + consonant_types[consonant_type],verbose=False)} ({consonant_type}, {vowel_type})")

rich -> ɹɪtʃ -> rC (full plosives, merged frictives, long vowels)


In [116]:
sentence = "a robot may not injure a human being or through inaction allow a human being to come to harm"
for vowel_type in vowel_types:
    for consonant_type in consonant_types:
        print(" ".join([translate(word,vowel_types[vowel_type] + consonant_types[consonant_type]) for word in sentence.split()])+f" ({consonant_type}, {vowel_type})")

e robet ma nat injer e umen beiŋ er θru inakʃen elau e umen beiŋ te kem te harm (full consonants, full vowels)
e robet ma nat injer e umen beiŋ er θru inakʃen elau e umen beiŋ te kem te harm (full plosives, merged frictives, full vowels)
e ropet ma nat inCer e umen peiŋ er θru inakʃen elau e umen peiŋ te kem te harm (merged consonants, full vowels)
e robt ma nat injr e umn beiŋ er θru inakʃn elau e umn beiŋ te km te harm (full consonants, schwa suppressed vowels)
e robt ma nat injr e umn beiŋ er θru inakʃn elau e umn beiŋ te km te harm (full plosives, merged frictives, schwa suppressed vowels)
e ropt ma nat inCr e umn peiŋ er θru inakʃn elau e umn peiŋ te km te harm (merged consonants, schwa suppressed vowels)
e robt ma nt injr e umn beŋ er θru inkʃn elu e umn beŋ te km te hrm (full consonants, short suppressed vowels)
e robt ma nt injr e umn beŋ er θru inkʃn elu e umn beŋ te km te hrm (full plosives, merged frictives, short suppressed vowels)
e ropt ma nt inCr e umn peŋ er θru inkʃn e

# Write Common Code

This is code that takes in:

1. A list of words,
2. A dictionary of word transformations,
3. An optional function that computes lengths of words (defaults to `len`),
4. An optional function that transforms a transformed word into the constituent "letters".

It then returns the various statistics of that model as a dictionary:
1. The mean length normalized to assume a binary alphabet (multiplied by the $log_2$ of the alphabet size),
2. The reconstruction entropy $H(X|f(X))$,
3. The reconstruction error $\mathbf{P}\{X = \mathrm{argmax}_{y : f(y) =  f(X)} p(y)\}$

In [54]:
import math
from collections import Counter

def entropy(counts):
    total = sum(counts.values())
    entropy_value = 0
    for x in counts:
        p_x = counts[x] / total
        if p_x > 0:
            entropy_value -= p_x * math.log2(p_x)
    return entropy_value

def transmission_cost(letters, suprisal):
    total_suprisal = 0
    for l in letters:
        total_suprisal += suprisal[l]
    return total_suprisal

def entropy_wl(word_list, one_grams_dict):
    countlist = {w: one_grams_dict[w] for w in word_list}
    return entropy(countlist)

def code_statistics(word_list, word_mapping,
                    length_function=lambda x: len(x),
                    letter_making=lambda x: x, delim=None):
    # Data to track mean outline length
    denom = 0
    total_length = 0

    # Data to track alphabet size
    letter_counts = Counter()

    # Data to track for computing reconstruction entropy and reconstruction probability
    forward = {}
    inverse = {}
    most_probable = {}
    totals = {}

    # Iterate over word list keeping track of stats for mean length and the inverse mapping
    for word in word_list:
        count = one_grams_dict[word]

        image = word_mapping[word]
        if delim:
            image += delim

        word_length = length_function(image) * count
        total_length += word_length
        denom += count

        forward[word] = image
        if image not in inverse:
            inverse[image] = {}
            totals[image] = 0

        inverse[image][word] = count
        totals[image] += count

        # Update most probable word for each image
        if image not in most_probable or inverse[image][word] > inverse[image][most_probable[image]]:
            most_probable[image] = word

        current_letters = letter_making(image)
        letter_counts.update({l:count for l in current_letters})

    letter_total = sum(letter_counts.values())
    suprisal = {letter: -math.log2(letter_counts[letter] / letter_total) for letter in letter_counts}

    # Compute the reconstruction entropy and reconstruction probability
    total_reconstruction_entropy = 0
    total_reconstruction_probability = 0
    total_transmission_cost = 0

    for word in word_list:
        count = one_grams_dict[word]
        image = forward[word]
        prob = inverse[image][word] / totals[image]
        if prob > 0:
            total_reconstruction_entropy -= count * math.log2(prob)
        total_reconstruction_probability += count if word == most_probable[image] else 0
        total_transmission_cost += count * transmission_cost(letter_making(image), suprisal)

    # Compute running lengths (assuming words are sorted by frequency)
    sorted_words = sorted(
        [w for w in word_list],
        key=lambda w: one_grams_dict[w],
        reverse=True
    )

    running_lengths = 0
    running_count = 1

    for w in sorted_words:
        prob = one_grams_dict[w] / denom
        running_lengths += prob * math.log2(running_count)
        running_count += 1

    # Mean binary length based on fixed-length encoding of letters
    mean_binary_length = (total_length / denom) * math.log2(len(letter_counts))

    # Entropy of the word list
    word_list_entropy = entropy_wl(word_list, one_grams_dict)

    return {
        "mean_binary_length": mean_binary_length,
        "mean_binary_excess": mean_binary_length - running_lengths,
        "mean_entropy_shift": mean_binary_length - word_list_entropy,
        "mean_transmission_cost": total_transmission_cost / denom,
        "mean_transmission_excess": (total_transmission_cost / denom) - running_lengths,
        "mean_transmission_shift": (total_transmission_cost / denom) - word_list_entropy,
        "reconstruction_entropy": total_reconstruction_entropy / denom,
        "reconstruction_error": 1 - (total_reconstruction_probability / denom),
        "alphabet": " ".join([x for x, _ in letter_counts.most_common()])
    }


# Plotting

Now we plot!  I will also include an allowable region based upon using Fano's inequality to this problem. 

In [55]:
def word_list(dict):
    return [x for x in one_grams_dict if x in dict]

scores = []
for vowel_type in vowel_types:
    for consonant_type in consonant_types:
        dictionary = {w:translate(w,vowel_types[vowel_type] + consonant_types[consonant_type],verbose=False) for w in ipa_dict}
        scores.append((short_consonant_names[consonant_type] + "," + short_vowel_names[vowel_type],code_statistics(word_list(dictionary), dictionary),'top right'))

In [56]:
fig = go.Figure()

#fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[max(0,(1-x)/4) for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', name = "Hard Region", fillcolor='rgba(255,0,0,0.5)'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[max(0,-x/4) for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', name = "Excluded Region", fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[-1 for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', showlegend=False, fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[0 for x in np.linspace(-1.5,10,24)], mode='lines', fill='none', showlegend=False,line=dict(color='#EF553B')))

fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in scores],
                         y=[x[1]['reconstruction_error'] for x in scores],
                         text=[x[0] for x in scores], textposition=[x[2] for x in scores],
                         mode='markers+text', name="Shorthand Systems", line=dict(color='#636EFA')))
# fig.update_layout(width=1280, height=1024, xaxis_range=[-1.5, 10], yaxis_range=[-0.01, 0.4],
#                   xaxis_title="Average Outline Complexity Overhead (bits)",
#                   yaxis_title="Reconstruction Error (probability)")
fig.update_layout(width=1280, height=1024, xaxis_range=[-1.5, 10], yaxis_range=[-0.01, 0.4],
                  xaxis_title="Average Outline Complexity Overhead (bits)",
                  yaxis_title="Reconstruction Error (probability)")
fig.show()

In [106]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[max(0,-x/4) for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', name = "Excluded Region", fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[-1 for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', showlegend=False, fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[0 for x in np.linspace(-1.5,10,24)], mode='lines', fill='none', showlegend=False,line=dict(color='#EF553B')))

for i in range(0,len(scores),3):
    fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in scores[i:i+3]],
                         y=[x[1]['reconstruction_error'] for x in scores[i:i+3]],
                         text=['',list(short_vowel_names)[i//3].replace(' ','<br>'),''] if i!=0 else [list(short_vowel_names)[i//3].replace(' ','<br>'),'',''], textposition=['bottom left' for x in scores[i:i+3]] if i!=0 else ['bottom center' for x in scores[i:i+3]],
                         mode='markers+lines+text', line=dict(color='#636EFA'), showlegend=False))
    
fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in scores[:3]],
                    y=[x[1]['reconstruction_error'] for x in scores[:3]],
                    text=[x.replace(' ','<br>')+"<br>" for x in list(short_consonant_names)], textposition=['top right', 'top center', 'top left'],
                    mode='markers+lines', line=dict(color='#636EFA'), name="Consonant Representation"))
for i in range(3):
    fig.add_annotation(x=scores[i][1]['mean_transmission_shift'], y=scores[i][1]['reconstruction_error']+0.002,
                text=list(short_consonant_names)[i].replace(' ','<br>')+"<br>",
                showarrow=True,
                arrowhead=2,ay=-50, ax=20*(2-i)-10)

fig.update_layout(width=1280, height=1024, xaxis_range=[-1.5, 10], yaxis_range=[-0.01, 0.4],
                  xaxis_title="Average Outline Complexity Overhead (bits)",
                  yaxis_title="Reconstruction Error (probability)")
fig.show()

In [107]:
fig.write_image("principle_comparison_graph.svg")

In [98]:
with open('system_scores_clean.json', 'r') as file:
        comparison_data = json.load(file)

In [109]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[max(0,-x/4) for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', name = "Excluded Region", fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[-1 for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', showlegend=False, fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[0 for x in np.linspace(-1.5,10,24)], mode='lines', fill='none', showlegend=False,line=dict(color='#EF553B')))

for i in range(0,len(scores),3):
    fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in scores[i:i+3]],
                         y=[x[1]['reconstruction_error'] for x in scores[i:i+3]],
                         text=['',list(short_vowel_names)[i//3].replace(' ','<br>'),''], textposition=['bottom left' for x in scores[i:i+3]],
                         mode='lines', line=dict(color='#636EFA'), showlegend=False))
    
fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in scores[:3]],
                         y=[x[1]['reconstruction_error'] for x in scores[:3]],
                         mode='lines', line=dict(color='#636EFA'), name = "Basic Systems"))
    
fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in comparison_data],
                         y=[x[1]['reconstruction_error'] for x in comparison_data],
                         text=[x[0] for x in comparison_data], textposition=[x[2] for x in comparison_data],
                         mode='markers+text', name="Shorthand Systems", line=dict(color='#636EFA')))

fig.update_layout(width=1280, height=1024, xaxis_range=[-1.5, 10], yaxis_range=[-0.01, 0.4],
                  xaxis_title="Average Outline Complexity Overhead (bits)",
                  yaxis_title="Reconstruction Error (probability)")
fig.show()

# Analysis of Curtailment

Now, we will add simple one-sided trimming, and try to extract the front.

In [60]:
def curtail(word,l):
    return word[:l]

curtailed_scores = []
for l in ([100] + [x for x in range(1,10)]):
    for vowel_type in vowel_types:
        for consonant_type in consonant_types:
            dictionary = {w:curtail(translate(w,vowel_types[vowel_type] + consonant_types[consonant_type],verbose=False),l) for w in ipa_dict}
            curtailed_scores.append((short_consonant_names[consonant_type] + "," + short_vowel_names[vowel_type] + "," + str(l),code_statistics(word_list(dictionary), dictionary),'top right'))

null_dict = {w:'.' for w in ipa_dict}
curtailed_scores.append(('Null',code_statistics(word_list(null_dict), null_dict),'top right'))

full_ipa_dict = {w:ipa_dict[w].replace('/','').replace('ˈ','').replace('ˌ','') for w in ipa_dict}
curtailed_scores.append(('IPA',code_statistics(word_list(full_ipa_dict), full_ipa_dict),'top right'))

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[max(0,-x/4) for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', name = "Excluded Region", fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[-1 for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', showlegend=False, fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[0 for x in np.linspace(-1.5,10,24)], mode='lines', fill='none', showlegend=False,line=dict(color='#EF553B')))

fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in curtailed_scores],
                         y=[x[1]['reconstruction_error'] for x in curtailed_scores],
                         text=[x[0] for x in curtailed_scores], textposition=[x[2] for x in curtailed_scores],
                         mode='markers', name="Curtailed Systems", line=dict(color='#FFAA50')))

for i in range(0,len(scores),3):
    fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in scores[i:i+3]],
                         y=[x[1]['reconstruction_error'] for x in scores[i:i+3]],
                         text=['',list(short_vowel_names)[i//3].replace(' ','<br>'),''], textposition=['bottom left' for x in scores[i:i+3]],
                         mode='lines', line=dict(color='#636EFA'), showlegend=False))

fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in scores[:3]],
                         y=[x[1]['reconstruction_error'] for x in scores[:3]],
                         mode='lines', line=dict(color='#636EFA'), name = "Basic Systems"))

# fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in comparison_data],
#                          y=[x[1]['reconstruction_error'] for x in comparison_data],
#                          text=[x[0] for x in comparison_data], textposition=[x[2] for x in comparison_data],
#                          mode='markers+text', name="Shorthand Systems", line=dict(color='#636EFA')))

fig.update_layout(width=1280, height=1024, xaxis_range=[-1.5, 10], yaxis_range=[-0.01, 0.4],
                  xaxis_title="Average Outline Complexity Overhead (bits)",
                  yaxis_title="Reconstruction Error (probability)")
fig.show()

In [62]:
# ChatGPT implementation of the extraction of supported points

from collections import namedtuple

Point = namedtuple("Point", ["x", "y", "metadata"])  # Extendable metadata

class SupportedParetoFront:
    def __init__(self, points):
        """Initialize with a list of Point(x, y, metadata)."""
        self.points = sorted(points, key=lambda p: (p.x, p.y))  # Sort by x
        self.supported_points = self.compute_lower_hull()

    def cross_product(self, o, a, b):
        """Cross product of vector OA and OB (O is origin).
        Returns positive if counter-clockwise turn, negative if clockwise, 0 if collinear."""
        return (a.x - o.x) * (b.y - o.y) - (a.y - o.y) * (b.x - o.x)

    def compute_lower_hull(self):
        """Finds the lower convex hull of the points (supported Pareto front)."""
        lower_hull = []
        for p in self.points:
            while len(lower_hull) >= 2 and self.cross_product(lower_hull[-2], lower_hull[-1], p) <= 0:
                lower_hull.pop()
            lower_hull.append(p)
        return lower_hull

    def get_supported_points(self):
        """Return the supported points with metadata."""
        return self.supported_points

In [63]:
# Sample points with metadata
points = [Point(x[1]['mean_transmission_shift'], x[1]['reconstruction_error'], x[0]) for x in curtailed_scores]

pareto = SupportedParetoFront(points)
result = pareto.get_supported_points()

# Print the supported points
for p in result:
    print(f"Point({p.x}, {p.y}), Metadata: {p.metadata}")

Point(-9.765103091540468, 0.931164129364203), Metadata: Null
Point(0.8883777311644074, 0.16594955358722174), Metadata: MC,LoV,100
Point(1.592305546304365, 0.1299224631398923), Metadata: FPMF,LoV,100
Point(2.476816578043813, 0.09118930364096578), Metadata: MC,ShSV,100
Point(3.1806016360919287, 0.0683186320679362), Metadata: FPMF,ShSV,100
Point(4.249192760675465, 0.04347157613251906), Metadata: MC,SSV,100
Point(4.953369763511537, 0.02965160691192459), Metadata: FPMF,SSV,100
Point(5.46272074018883, 0.025486654793430263), Metadata: FC,SSV,100
Point(8.96273721754915, 0.014930232236564533), Metadata: IPA


In [64]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[max(0,-x/4) for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', name = "Excluded Region", fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[-1 for x in np.linspace(-1.5,10,24)], mode='none', fill='tozeroy', showlegend=False, fillcolor='#EF553B'))
fig.add_trace(go.Scatter(x=np.linspace(-1.5,10,24), y=[0 for x in np.linspace(-1.5,10,24)], mode='lines', fill='none', showlegend=False,line=dict(color='#EF553B')))

fig.add_trace(go.Scatter(x=[p.x for p in result],
                        y=[p.y for p in result],
                        text=[p.metadata for p in result],
                        mode='lines', line=dict(color='#636EFA'), showlegend=False))
    
fig.add_trace(go.Scatter(x=[x[1]['mean_transmission_shift'] for x in comparison_data],
                         y=[x[1]['reconstruction_error'] for x in comparison_data],
                         text=[x[0] for x in comparison_data], textposition=[x[2] for x in comparison_data],
                         mode='markers+text', name="Shorthand Systems", line=dict(color='#636EFA')))

fig.update_layout(width=1280, height=1024, xaxis_range=[-1.5, 10], yaxis_range=[-0.01, 0.4],
                  xaxis_title="Average Outline Complexity Overhead (bits)",
                  yaxis_title="Reconstruction Error (probability)")
fig.show()

In [65]:
result

[Point(x=-9.765103091540468, y=0.931164129364203, metadata='Null'),
 Point(x=0.8883777311644074, y=0.16594955358722174, metadata='MC,LoV,100'),
 Point(x=1.592305546304365, y=0.1299224631398923, metadata='FPMF,LoV,100'),
 Point(x=2.476816578043813, y=0.09118930364096578, metadata='MC,ShSV,100'),
 Point(x=3.1806016360919287, y=0.0683186320679362, metadata='FPMF,ShSV,100'),
 Point(x=4.249192760675465, y=0.04347157613251906, metadata='MC,SSV,100'),
 Point(x=4.953369763511537, y=0.02965160691192459, metadata='FPMF,SSV,100'),
 Point(x=5.46272074018883, y=0.025486654793430263, metadata='FC,SSV,100'),
 Point(x=8.96273721754915, y=0.014930232236564533, metadata='IPA')]