In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
from keras.utils import to_categorical
from nltk.tag import hmm
from nltk.probability import LaplaceProbDist
import random

from finger_letter_mapping import letter_finger, finger_letter

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import nltk
nltk.download('abc')
nltk.download('punkt')

[nltk_data] Downloading package abc to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package abc is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def wordsplit_to_sent(data) : 
    return [' '.join(sent)[:-2].lower() for sent in data]

In [4]:
sentences_wordsplit = nltk.corpus.abc.sents()[:9000]
tester_wordsplit = nltk.corpus.abc.sents()[10000:11000]

In [5]:
train_sentences = wordsplit_to_sent(sentences_wordsplit)
test_sentences = wordsplit_to_sent(tester_wordsplit)

In [6]:
train_sentences[:2]

['pm denies knowledge of awb kickbacks the prime minister has denied he knew awb was paying kickbacks to iraq despite writing to the wheat exporter asking to be kept fully informed on iraq wheat sales',
 'letters from john howard and deputy prime minister mark vaile to awb have been released by the cole inquiry into the oil for food program']

In [7]:
test_sentences[:2]

['compo for murray valley irrigators ruled out the new south wales government has ruled out compensation for murray valley irrigators affected by water cut backs',
 'high security allocations and carry over water in the murray valley have been cut by 20 per cent because of record low inflows']

In [8]:
training = train_sentences
testing = test_sentences

In [9]:
# f1 = open('training.txt', 'r')
# f2 = open('testing.txt', 'r')
# training = f1.read().lower().splitlines()
# testing = f2.read().lower().splitlines()
# f1.close()
# f2.close()

In [10]:
print('Letter space (hidden states):\n ', letter_finger.keys())
print('Finger space (observed states): \n', finger_letter.keys())

Letter space (hidden states):
  dict_keys(['q', 'a', 'z', 'w', 's', 'x', 'e', 'd', 'c', 'r', 'f', 'v', 't', 'g', 'b', 'p', 'o', 'l', 'i', 'k', 'u', 'j', 'm', 'y', 'h', 'n', ' '])
Finger space (observed states): 
 dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8])


In [11]:
print(training[0])

pm denies knowledge of awb kickbacks the prime minister has denied he knew awb was paying kickbacks to iraq despite writing to the wheat exporter asking to be kept fully informed on iraq wheat sales


In [12]:
def text_to_finger(data): 
    output = [] 
    for row in data:
        output.append(np.array([letter_finger[c] for c in row.lower() if c in letter_finger.keys()]))
    return np.array(output)
def text_to_label(data): 
    output = []
    for row in data: 
        output.append(np.array(list(filter(lambda c: c in letter_finger.keys(), row))))
    return np.array(output)

In [13]:
mapped_training = text_to_finger(training)
mapped_testing = text_to_finger(testing)
labels_training = text_to_label(training)
labels_testing = text_to_label(testing)

# list(map(lambda c: letter_finger[c] if c in letter_finger.keys() else None, sample_text.lower()))
print('reality check:', mapped_training.size, len(labels_training))
print(mapped_training[0].size, labels_training[0].size)

reality check: 9000 9000
198 198


In [14]:
def finger_to_onehot(data): 
    return np.array([to_categorical(row) for row in data])

In [15]:
onehot_training = finger_to_onehot(mapped_training)
onehot_testing = finger_to_onehot(mapped_testing)

In [16]:
# TO DO: add noise to one hot encoding 

# to turn one hot back to label: np.argmax(mapped_one_hot[0])
# mapped_noise = [letter + NOISE for letter in mapped_one_hot]
# for letter in mapped_one_hot: 
    

turn finger mapping back to text using HMM

In [17]:
trainer = hmm.HiddenMarkovModelTrainer(states = letter_finger.keys(), symbols = finger_letter.keys())
# symbols: observations ; states: hidden states

In [18]:
est = LaplaceProbDist

In [19]:
def make_seq(finger, labels): 
    out = []
    for i in range(len(finger)): 
        out.append([(ss, target) for ss, target in zip(finger[i], labels[i])])
    return out

In [20]:
seq = make_seq(mapped_training, labels_training)
test_seq = make_seq(mapped_testing, labels_testing)

In [21]:
tagger = trainer.train_supervised(seq, estimator=est)

In [22]:
tagger.test(test_seq)

accuracy over 133408 tokens: 78.84


In [23]:
def letters_to_sents(tuple_list):  # turn [[(finger, letter), (.,.)], [...]] to list of sentences 
    return [''.join([y for (x,y) in entry]) for entry in tuple_list]

In [24]:
test_true_output = letters_to_sents(test_seq)

In [25]:
test_input = [[x for (x,y) in entry] for entry in test_seq]

In [26]:
test_out = [tagger.tag(sent) for sent in test_input]

In [27]:
test_out_sents = letters_to_sents(test_out)

In [28]:
from spell import correction

In [107]:
# fig1, ax1 = plt.subplots()
# ax1.pie([78.39, 100-78.39], explode=[0.1,0], autopct='%1.1f%%', shadow=True)
# ax1.axis('equal')
# plt.title("HMM Accuracy")
# plt.show()

In [186]:
letters_to_sents([tagger.tag([letter_finger[c] for c in 'hello this is a testing open computer'])])[0]

'helll this is a besting open compurer'