# Our Goal:
Be able to determine, given a consecutive string of letters, whether it is pronounceable by English-speaking people.


# Problem statement
- We want to improve at classifying "words" as being pronounceable or not
- We will measure our progress by the percentage of words correctly classified
- based on our database of a literal dictionary and randomly-ish generated non-pronounceable words


# Methodology
Train several different models on our dataset, trying to teach them what "pronounceable" words look like
This will include a manually designed heuristic and different ML models

## Input Formulation (ML Models)
We need to transform words into input vectors, since we need to have quantifiable data. 

Since pronounceability depends on which letters appear next to each other, we represent each word by its bigrams (ordered pairs of adjacent letters). We define the features of our vectors as the list of all possible bigrams over the English alphabet, which results in a 26*26 = 676-dimensional space. The order of the letters *within* a bigram is preserved ("ab" and "ba" are different features), but we will not be able to encode the position of each bigram within the word, because any meaningful encoding of this would result in either difficulty plotting the data or skewed data. For example, if the first bigram in the word was given a value of 1, the second was given a value of 2, et cetera, then the feature vectors of longer words would lie further and further from the origin in the dimensions of their later bigrams.

If our model seems to be less accurate than we would like, we will experiment with finding a way to encode the order.


### For example, 
Our feature vector will take the following shape:
`["aa": int, "ab": int, "ac": int, "ad": int .. "zz": int]`

So for a word like "abba", which contains the bigrams `["ab", "bb", "ba"]`,
Our feature vector would be:

`["aa": 0, "ab": 1, "ac": 0 ... "ba": 1, "bb": 1, "bc": 0 ...]`





## Helper Methods

In [53]:
import time, sys
from IPython.display import clear_output

def update_progress(progress, label: str = ""):
    """Redraw a 20-character text progress bar in the current output area.

    Args:
        progress: Fraction complete. Ints are converted to float, any other
            non-float value is treated as 0, and the value is clamped to [0, 1].
        label: Text printed on the line above the bar.
    """
    bar_length = 20

    # Normalise the input to a number inside [0, 1].
    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    elif progress >= 1:
        progress = 1

    filled = int(round(bar_length * progress, 0))
    bar = "#" * filled + "-" * (bar_length - filled)

    # Clear the previous output before printing so the bar updates in place.
    clear_output(wait = True)
    print(label + "\nProgress: [{0}] {1:.1f}%".format(bar, progress * 100))


def get_bigrams(word:str) -> list:
    """Return the adjacent letter pairs of ``word`` in order of appearance.

    A word with fewer than two characters yields an empty list.
    """
    following = word[1:]
    return [first + second for first, second in zip(word, following)]


import functools
def foldl(func, xs, acc):
    """Left fold: combine ``acc`` with each item of ``xs`` in order via ``func(acc, item)``.

    Returns ``acc`` unchanged when ``xs`` is empty.
    """
    result = acc
    for item in xs:
        result = func(result, item)
    return result

def is_probability(input):
    """Return True only when ``input`` is a float lying in the closed interval [0, 1].

    Non-float values (including the ints 0 and 1) are rejected.
    """
    if not isinstance(input, float):
        return False
    return 0 <= input <= 1



# Static variables

In [38]:
# Cache of bigram -> heuristic score; pronounceable_score_heuristic fills it on first lookup.
letter_likelihood: dict = {}
# Model display name -> measured accuracy, written by each evaluation cell.
accuracy_dict: dict = {}

# Vectorization Code

In [39]:
from itertools import combinations_with_replacement
import string


def generate_feature_vector(input):
    """Build the 676-entry bigram presence vector for a word or a list of bigrams.

    Args:
        input: Either a word (``str``), which is first split into bigrams with
            ``get_bigrams``, or a ``list`` of bigram strings.

    Returns:
        dict: Maps every lowercase bigram "aa" .. "zz" (in that order) to 1 if
        it occurs in the input and 0 otherwise.

    Raises:
        TypeError: If ``input`` is neither a ``str`` nor a ``list``.
    """
    # we will be outsourcing this to a c subprocess to increase perf
    if isinstance(input, str):
        input = get_bigrams(input)
    elif not isinstance(input, list):
        # isinstance replaces the old substring match on str(type(input)), which
        # misrouted any type whose name merely contained "str" or "list".
        raise TypeError(f"Requires either 'str' or List[str] as input for generate_feature_vector(), found {type(input)}.")

    # Set membership keeps each of the 676 lookups constant-time.
    present = set(input)
    return {
        first + second: 1 if (first + second) in present else 0
        for first in string.ascii_lowercase
        for second in string.ascii_lowercase
    }
        
import pandas as pd
import numpy as np
def get_vectors_for_series(data: pd.Series, label: str = ""):
    """Vectorize every word in ``data`` while reporting progress.

    Args:
        data: Series of words. Values are visited in positional order, so the
            Series does not need a 0..n-1 label index.
        label: Caption forwarded to ``update_progress``; optional.

    Returns:
        pd.Series: One numpy array per word holding the values of
        ``generate_feature_vector`` for that word's bigrams.
    """
    length = len(data)
    vectors = []
    for position, word in enumerate(data):
        # Redrawing the bar is comparatively slow, so only do it every 5th word.
        if position % 5 == 0:
            update_progress(position / length, label=label)
        vectors.append(np.asarray(list(generate_feature_vector(get_bigrams(word)).values())))
    if length:
        # The loop never reports the final step, so finish the bar explicitly.
        update_progress(1.0, label=label)
    return pd.Series(vectors)




# Dataset Creation Code

In [40]:
import os
import re
import random
import typing
import math

# Filtered word table keyed by resolved CSV path, so the file is parsed only once.
_WORD_FRAME_CACHE: dict = {}


def _load_candidate_words() -> pd.DataFrame:
    """Load unigram_freq.csv and keep only rows whose word has at least 3 characters."""
    # abspath() of a bare file name is resolved against the current working
    # directory, so this points at ../unigram_freq.csv relative to the cwd.
    data_path = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath("word_pronounceability.ipynb")), '..', 'unigram_freq.csv'))
    if data_path not in _WORD_FRAME_CACHE:
        dataframe = pd.read_csv(data_path)
        dataframe = dataframe[dataframe.word.str.len() >= 3]
        _WORD_FRAME_CACHE[data_path] = dataframe.set_axis(range(0, dataframe.shape[0]), axis=0)
    return _WORD_FRAME_CACHE[data_path]


def get_n_pronounceable_words(n: int) -> typing.Set[str]:
    """Draw ``n`` random rows from the word list and return their words as a set.

    The CSV is read and filtered on the first call only; later calls sample
    from the cached table instead of re-parsing the file.

    Args:
        n (int): Number of rows to sample (without replacement).

    Returns:
        typing.Set[str]: The sampled words. It holds fewer than ``n`` entries
        only if the sampled rows contain duplicate words.
    """
    sample_size_n = _load_candidate_words().sample(n = n)
    return set(sample_size_n["word"])

def get_n_unpronounceable_words(n: int) -> typing.Set[str]:
    """Generate ``n`` distinct random lowercase strings that score as unpronounceable.

    Candidates of 3 to 14 random letters are drawn until ``n`` of them exceed
    the incorrectness threshold below.

    Args:
        n (int): Number of distinct words to produce.

    Returns:
        typing.Set[str]: The accepted random strings.
    """
    def euclidean_length(components: list):
        return math.sqrt(sum([c * c for c in components]))

    def extend_consonant_run(run_length, letter):
        # Any vowel (or "y") resets the run, so after the fold this is the
        # length of the consonant run at the END of the word.
        return 0 if letter in "aeiouy" else run_length + 1

    candidate_lengths = list(range(3,15))
    accepted: typing.Set[str] = set()
    while len(accepted) < n:
        letter_count = random.choice(candidate_lengths)
        candidate = ''.join(random.choice(string.ascii_lowercase) for _ in range(letter_count))

        lone_q = "q" in candidate and "qu" not in candidate
        consonant_run = foldl(extend_consonant_run, candidate, 0)
        all_consonants = consonant_run == len(candidate)

        penalties = [0.8*(1 if all_consonants else 0), 0.4*(consonant_run/4)]
        base = euclidean_length(penalties)
        # NOTE(review): a "q" without "qu" scales this component DOWN by 0.8,
        # which lowers the score — confirm that is the intended direction.
        penalties.append(0.8*base if lone_q else base)

        # ADJUST IF THE WORDS ARE TOO PRONOUNCEABLE
        if euclidean_length(penalties) > 0.7:
            # set.add ignores duplicates, so no separate membership test is needed
            accepted.add(candidate)
    return accepted


def get_dataset(size: int) -> pd.DataFrame:
    """Generates a dataset of "words." Half are pronounceable, half are not

    Each row is labelled by the generator it came from, so the labels stay
    correct even if one generator returns fewer than ``size`` distinct words.

    Args:
        size (int): Size of each half of the dataset (pronounceable / unpronounceable)

    Returns:
        pd.DataFrame: dataframe with two columns: "word" and "is_pronounceable",
        pronounceable rows first; `2*size` rows when both halves are full.
    """
    pronounceable = list(get_n_pronounceable_words(size))
    unpronounceable = list(get_n_unpronounceable_words(size))

    data_set = pd.DataFrame(pronounceable + unpronounceable, columns=["word"])
    data_set["is_pronounceable"] = [True] * len(pronounceable) + [False] * len(unpronounceable)
    return data_set


# Heuristic Model
Our initial hypothesis was that the pronounceability of a word correlates very strongly with its *likelihood*. That is to say, if it is statistically probable that a sequence of letters could make a word, it is also statistically probable that it can be pronounced. This is not without caveats, however: we have idiomatically accepted brand names like "Exxon" which contain strings of letters that no (or at least very few) dictionary words contain. This drives down the estimated likelihood of such strings of letters, yet we can pronounce them perfectly fine. Despite these caveats, we feel that this is a reasonable heuristic.

## Heuristic Model Code

In [42]:
from contextlib import contextmanager
from statistics import mean
def pronounceable_score_heuristic(letters: str) -> float:
    """Score how common a bigram is among dictionary words.

    The score is the fraction of 9000 sampled dictionary words containing
    ``letters``, multiplied by 10. A bigram that never occurs is counted as -1
    occurrences, so it gets a small negative score. Results are memoised in
    ``letter_likelihood``.

    NOTE(review): every bigram not yet cached draws its own fresh random
    sample, so scores of different bigrams come from different samples.

    Args:
        letters (str): string of length 2 (bigram) to look for in the sample

    Returns:
        float: the occurrence score described above; higher means more common.
    """
    assert len(letters) == 2
    # if we have already checked this bigram, it'll be in our letter_likelihood dictionary, we can return it
    if letters in letter_likelihood:
        return letter_likelihood[letters]

    # otherwise, count the sampled words that contain [letters].
    sample = get_n_pronounceable_words(9000)
    occurrences = sum(1 for word in sample if letters in word)
    if occurrences == 0:
        # never found: almost certainly unpronounceable (or just missing from
        # the sample), so penalise it with a negative count.
        occurrences = -1

    # Divide by the true sample size. Previously the -1 penalty also reduced
    # the denominator by one. (multiply by 10 to trim leading zeroes)
    letter_likelihood[letters] = (occurrences / len(sample))*10
    return letter_likelihood[letters]
def is_pronounceable_heuristic(word: str) -> bool:
    """Classify ``word`` as pronounceable when its mean bigram score reaches the cutoff.

    Example of the pipeline:
    "hello" ->  ["he", "el", "ll", "lo"] -> [0.45..., 0.62..., 0.57..., 0.44...]
    """
    # Threshold of 0 gives 50% accuracy, anything above  0.5 gives 50% accuracy
    cutoff = 0.1

    bigram_scores = list(map(pronounceable_score_heuristic, get_bigrams(word)))

    # a word whose average score falls below the cutoff is assumed unpronounceable
    return mean(bigram_scores) >= cutoff


# this function allows for more concise and readable code in our test flow.
# uses a contextlib contextmanager to implement __enter__ and __exit__ for our function so we can use it in 'with' statements.
@contextmanager
def heuristic_function():
    """Context manager that hands ``is_pronounceable_heuristic`` to a ``with`` block.

    There is nothing to set up or tear down; it exists so the test flow can be
    written as ``with heuristic_function() as is_pronounceable: ...``.
    """
    yield is_pronounceable_heuristic

Now that we have defined our heuristic model, we can test it on a large segment of data and check to see if it appropriately classifies incoming data.


## Heuristic Tests

In [43]:
skip_heuristic = False
with heuristic_function() as is_pronounceable:
    if not skip_heuristic:
        # Evaluation set: 1000 sampled dictionary words labelled True ...
        pronounceable_data = pd.DataFrame(columns=["word"], data=pd.Series(list(get_n_pronounceable_words(1000))))
        pronounceable_data["is_pronounceable"] = True

        # ... followed by 1000 generated non-words labelled False.
        unpronounceable_data = pd.DataFrame(columns=["word"], data=pd.Series(list(get_n_unpronounceable_words(1000))))
        unpronounceable_data["is_pronounceable"] = False

        test_data = pd.concat(objs=[pronounceable_data, unpronounceable_data])

        # Tally how many predictions agree with the known label.
        scoring = dict(right= 0, total = 0)
        for idx, row in test_data.iterrows():
            update_progress(scoring["total"] / test_data.shape[0])
            guessed_right = is_pronounceable(row["word"]) == row["is_pronounceable"]
            scoring["right"] += 1 if guessed_right else 0
            scoring["total"] += 1

        accuracy_dict["Heuristic Model"] = (scoring["right"] / scoring["total"])

 ## ML Code


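 Now that we have the ability to turn words into vectors (a binary bag-of-bigrams encoding: each feature is 1 if that bigram occurs in the word and 0 otherwise), we can begin to train models on these vectors. Since this is a binary classification problem (pronounceable vs. not pronounceable), we can use, for example: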
 1. Naive Bayes
 2. K Nearest Neighbors
 3. Semi-supervised learning
 4. Support Vector Machines
 5. essentially any model in the scikit-learn `multiclass` package

 ### Training / Testing Data

In [7]:
from sklearn.model_selection import train_test_split
import numpy as np
size = 3000 # size of each half of the dataset
data_set = get_dataset(size)

# One bigram feature vector per word, in the same row order as data_set.
X_vectors = list(get_vectors_for_series(data_set["word"], label=""))

# Stack the per-word arrays into one 2-D matrix and map the boolean labels to 1/0.
X = np.array(X_vectors)
y = np.array([1 if flag else 0 for flag in data_set["is_pronounceable"]])

# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, list(y), test_size=0.25, random_state=7)

Progress: [####################] 99.9%


### Naive Bayes

In [8]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes classifier on the 0/1 bigram features and
# record its accuracy on the held-out split.
naive_bayes_model = BernoulliNB().fit(X_train, y_train)
accuracy_dict["Naive Bayes"] = naive_bayes_model.score(X_test, y_test)

### K Nearest Neighbors

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours consulted per prediction; also shown in the result label.
n_neighbors = 10
knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
knn_model.fit(X_train, y_train)
accuracy_dict[f"K-Nearest-Neighbors ({n_neighbors} neighbors)"] = knn_model.score(X_test, y_test)

### Semi-supervised Learning


In [10]:
from sklearn.semi_supervised import LabelPropagation

# get a "LOT" of extra data, and label it as -1

# Extra words from both generators, pooled without their labels.
words_unlabeled = pd.Series(list(get_n_pronounceable_words(4500).union(get_n_unpronounceable_words(4500))))
# get_vectors_for_series is declared with a `label` parameter; pass it explicitly
# so this call does not fail on a missing argument.
X_unlabeled = np.array(list(get_vectors_for_series(words_unlabeled, label="")))
# Every extra sample gets the label -1, i.e. "unlabeled".
y_unlabeled = np.array([-1] * len(X_unlabeled))

# Labelled training rows first, unlabeled rows appended after them.
X_mixed = np.concatenate((X_train, X_unlabeled), axis=0)
y_mixed = np.concatenate((y_train, y_unlabeled), axis=0)

semi_supervised_model = LabelPropagation().fit(X_mixed, y_mixed)

accuracy_dict["Semi Supervised (Label Propagation)"] = (semi_supervised_model.score(X_test, y_test))

Progress: [####################] 100.0%


### Support Vector Machine

In [11]:
from sklearn.svm import NuSVC

#TODO Tune parameters
# Support vector classifier with default hyper-parameters, scored on the held-out split.
svc_model = NuSVC()
svc_model.fit(X_train, y_train)
accuracy_dict["Support Vector Machine (NuSVC)"] = svc_model.score(X_test, y_test)

### Neural Network

In [12]:
from sklearn.neural_network import MLPClassifier

#  TODO Tune parameters:
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver="sgd"; with the default solver it may have no effect — confirm.
nn_model = MLPClassifier(learning_rate = "invscaling")
nn_model.fit(X_train, y_train)
accuracy_dict["Neural Network (Multi-Layer Perceptron)"] = nn_model.score(X_test, y_test)

In [45]:
# Report every recorded accuracy as a percentage, cut to the first five characters.
print("Stats for most recent run:")
for model, accuracy in accuracy_dict.items():
    shown = str(accuracy*100)[:5]
    print(f"Accuracy for {model}: {shown}%")

Stats for most recent run:
Accuracy for Heuristic Model: 88.7%


# Hypothesis Testing

In [50]:
accuracies = []
# increase sample size by increasing n
best_model = NuSVC()
n = 30
size = 1500 # size of each half of the dataset
for i in range(n):
    # Draw a fresh dataset on every iteration so each accuracy is a new sample.
    data_set = get_dataset(size)

    X_vectors = list(get_vectors_for_series(data_set["word"], label=f"Iteration {i+1}/{n}"))

    # Convert from list of np arrays to single 2darray
    X = np.array(X_vectors)
    y = np.array([1 if flag else 0 for flag in data_set["is_pronounceable"]])
    X_train, X_test, y_train, y_test = train_test_split(X, list(y), test_size=0.25, random_state=7)
    model = best_model.fit(X_train, y_train)
    # Score the model fitted in THIS iteration; the previous code scored the
    # older `svc_model`, which was trained on a different dataset.
    accuracies.append(model.score(X_test, y_test))
    


Iteration 30/30
Progress: [####################] 99.8%


In [54]:
from scipy import stats

# Sanity checks before testing: 30 samples, each a float in [0, 1].
assert len(accuracies) == 30
assert all(is_probability(sample) for sample in accuracies)
assert is_probability(accuracy_dict["Heuristic Model"])

# One-sample t-test of the collected accuracies against the heuristic model's accuracy.
one_sample = stats.ttest_1samp(accuracies, accuracy_dict["Heuristic Model"])
print("The t-statistic is %.3f and the p-value is %.3f." % one_sample)

The t-statistic is 66.922 and the p-value is 0.000.
