HMM for the ideal case, i.e. where we are given the mapped finger with probability 1 for each letter in the input string, according to the touch-typing mapping defined in `finger_letter_mapping`. 
Input dataset: `nltk 'abc'` sentences. 

In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
from keras.utils import to_categorical
from nltk.tag import hmm
from nltk.probability import LaplaceProbDist
import random

from finger_letter_mapping import letter_finger, finger_letter
from test_generation import * # importantly, text_to_label (for generating labels) and text_to_realistic

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import nltk
# Fetch the corpora into the local nltk_data directory (a no-op when already up to date).
nltk.download('abc')
# NOTE(review): 'nps_chat' is downloaded here but is not referenced by any cell visible in this notebook.
nltk.download('nps_chat')

[nltk_data] Downloading package abc to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package abc is already up-to-date!
[nltk_data] Downloading package nps_chat to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


True

In [13]:
# nltk.corpus.abc.sents()[:30]

In [3]:
from smartcheck.smartcheck import Smartcheck

In [3]:
def wordsplit_to_sent(data):
    ''' Turn word-tokenised sentences into lower-cased sentence strings.

    Input: iterable of sentences, where each sentence is a sequence of word
    tokens (strings).
    Output: list of sentences, where each sentence is a single string made of
    its tokens joined by one space and lower-cased.

    A sentence-final token that contains no letter or digit (e.g. '.', '?',
    '!') is dropped. The trailing token is only removed when it really is
    punctuation, so a sentence that does not end with a punctuation token
    keeps all of its characters, and an empty sentence yields ''.
    '''
    sentences = []
    for sent in data:
        words = list(sent)
        # Strip the terminal punctuation token, if there is one, instead of
        # blindly cutting the last two characters of the joined string.
        if words and not any(ch.isalnum() for ch in words[-1]):
            words = words[:-1]
        sentences.append(' '.join(words).lower())
    return sentences

In [4]:
# get 9000 sentences each as array split by word, for training 
sentences_wordsplit = nltk.corpus.abc.sents()[:9000]   
# get 1000 sentences for testing 
# NOTE(review): sentences 9000-9999 lie between the two slices and end up in neither split.
tester_wordsplit = nltk.corpus.abc.sents()[10000:11000]   

In [5]:
# turn each sentence from a word array into a sentence string
# (wordsplit_to_sent joins the words with spaces, lower-cases the text and trims its tail)
train_sentences = wordsplit_to_sent(sentences_wordsplit)  
test_sentences = wordsplit_to_sent(tester_wordsplit)

In [6]:
print(train_sentences[:2])
print(test_sentences[:2])

['pm denies knowledge of awb kickbacks the prime minister has denied he knew awb was paying kickbacks to iraq despite writing to the wheat exporter asking to be kept fully informed on iraq wheat sales', 'letters from john howard and deputy prime minister mark vaile to awb have been released by the cole inquiry into the oil for food program']
['compo for murray valley irrigators ruled out the new south wales government has ruled out compensation for murray valley irrigators affected by water cut backs', 'high security allocations and carry over water in the murray valley have been cut by 20 per cent because of record low inflows']


In [7]:
# rename 
# `training` / `testing` are the names the mapping cells further down operate on.
# Plain assignment: both names refer to the same list objects as train_sentences / test_sentences (no copy).
training = train_sentences
testing = test_sentences

In [8]:
# f1 = open('training.txt', 'r')
# f2 = open('testing.txt', 'r')
# training = f1.read().lower().splitlines()
# testing = f2.read().lower().splitlines()
# f1.close()
# f2.close()

In [9]:
print('Letter space (hidden states):\n ', letter_finger.keys())
print('Finger space (observed states): \n', finger_letter.keys())

Letter space (hidden states):
  dict_keys(['q', 'a', 'z', 'w', 's', 'x', 'e', 'd', 'c', 'r', 'f', 'v', 't', 'g', 'b', 'p', 'o', 'l', 'i', 'k', 'u', 'j', 'm', 'y', 'h', 'n', ' '])
Finger space (observed states): 
 dict_keys([10, 9, 8, 7, 1, 5, 4, 3, 2])


In [10]:
print(training[0])

pm denies knowledge of awb kickbacks the prime minister has denied he knew awb was paying kickbacks to iraq despite writing to the wheat exporter asking to be kept fully informed on iraq wheat sales


In [11]:
print(text_to_finger('hi, this is a TEST'))
print(text_to_label('hi, this is a TEST'))

[ 2  3  1  7  2  3  9  1  3  9  1 10  1  7  8  9  7]
['h' 'i' ' ' 't' 'h' 'i' 's' ' ' 'i' 's' ' ' 'a' ' ' 't' 'e' 's' 't']


In [24]:
# ideal mapping, no repeats or probabilities
# NOTE(review): text_to_finger / text_to_label are not defined in this notebook; presumably they
# arrive through the star-import from test_generation — confirm.
mapped_training = text_to_finger(training)
mapped_testing = text_to_finger(testing)
labels_training = text_to_label(training)
labels_testing = text_to_label(testing)

# list(map(lambda c: letter_finger[c] if c in letter_finger.keys() else None, sample_text.lower()))
# Sanity check 1: there must be as many finger sequences as label sequences.
print('reality check:', mapped_training.size, len(labels_training))
# Sanity check 2: the first sentence must carry exactly one label per finger symbol.
print(mapped_training[0].size, labels_training[0].size)

reality check: 9000 9000
198 198


In [25]:
# no repeats, one-hot encoding
# NOTE(review): onehot_training / onehot_testing are not referenced again in the cells visible here.
onehot_training = finger_to_onehot(mapped_training)
onehot_testing = finger_to_onehot(mapped_testing)

In [204]:
# Trial run of text_to_realistic on the first 1000 training sentences only.
# NOTE(review): `out` is not used by any cell visible here, and this cell's execution count (204)
# is out of sequence with its neighbours — verify the notebook under Restart & Run All.
out = text_to_realistic(training[:1000])

In [26]:
# important one! repeats + probability vector
# NOTE(review): siggy_generation is not consumed by the HMM cells visible below, which train on
# the ideal mapping (mapped_training) instead.
siggy_generation = text_to_realistic(training)  # 18 secs 

In [17]:
# to turn one hot back to label: np.argmax(mapped_one_hot[0])

Turn the finger mapping back into text using an HMM.

In [None]:
# Hidden states are the letters (keys of letter_finger); observed symbols are the finger ids
# (keys of finger_letter).
# NOTE(review): this cell carries no execution count although `trainer` is required by the
# train_supervised call further down — run the notebook in order.
trainer = hmm.HiddenMarkovModelTrainer(states = letter_finger.keys(), symbols = finger_letter.keys())
# symbols: observations ; states: hidden states

In [None]:
# Probability-distribution class passed as `estimator` to trainer.train_supervised below.
est = LaplaceProbDist

In [20]:
def make_seq(finger, labels): 
    '''Zip observations and targets together, one list of pairs per sentence.

    finger: indexable collection of per-sentence observation sequences.
    labels: indexable collection of per-sentence target sequences, looked up
            by the same position as the matching entry of `finger`.

    Returns a list with one entry per element of `finger`; each entry is a
    list of (observation, target) tuples, cut to the shorter of the two
    sequences for that sentence.
    '''
    return [list(zip(symbols, labels[idx])) for idx, symbols in enumerate(finger)]

In [21]:
# Build the per-sentence lists of (finger, letter) pairs for the training and test splits.
seq = make_seq(mapped_training, labels_training)
test_seq = make_seq(mapped_testing, labels_testing)

In [22]:
# Supervised training on the (finger, letter) training sequences, using `est` (LaplaceProbDist) as estimator.
tagger = trainer.train_supervised(seq, estimator=est)

In [23]:
# Evaluate on the held-out sequences; the per-token accuracy is printed as the cell output.
tagger.test(test_seq)

accuracy over 133408 tokens: 78.84


In [24]:
def letters_to_sents(tuple_list):
    '''Convert [[(finger, letter), ...], ...] into a list of sentence strings.

    For every inner list the second element of each pair is kept, in order,
    and the pieces are concatenated without a separator.
    '''
    sentences = []
    for pairs in tuple_list:
        letters = [letter for _, letter in pairs]
        sentences.append(''.join(letters))
    return sentences

In [25]:
# Ground-truth test sentences, rebuilt from the letter half of each (finger, letter) pair.
test_true_output = letters_to_sents(test_seq)

In [26]:
# Keep only the observed finger symbols of each test sentence: this is what the tagger is given.
test_input = [[x for (x,y) in entry] for entry in test_seq]

In [27]:
# Decode every test sentence; the results are fed to letters_to_sents, which expects (finger, letter) pairs.
test_out = [tagger.tag(sent) for sent in test_input]

In [28]:
# Predicted sentences as plain strings, in the same order as test_true_output.
test_out_sents = letters_to_sents(test_out)

In [29]:
# Switch the kernel's working directory to ./smartcheck before building the checker.
# NOTE(review): presumably Smartcheck() loads resources relative to the working directory — confirm.
# The directory change persists for every later cell, and re-running this cell from inside
# smartcheck/ will look for a nested smartcheck/ directory.
% cd smartcheck 
% pwd
checker = Smartcheck()

C:\Users\Home\Desktop\Projects\NeuroTech2020\NeuroTech-ML\siggy\smartcheck


In [30]:
checker.correction('sentance', 'the')

'sentence'

In [43]:
print(test_out_sents[1:6])
print(test_true_output[1:6])

['mith wedutith allleatious and catry orer warer in the ungran tallen hare been cut th  per ceng becanse or becore lls intolss', 'the hathral besongees minister  ian haccomale  wans the warer will be pake tack in ththre  wo there is mo hede tor compensation', 'he wans more heasures to ncop the tathing counmmith will be anmounded in the hest torthitht', 'ctontht cominares hatiomal ticod cans hes and inmorative wans or coping with the ctontht are the most populat eshitits at this heat  s hatiomal ticod cans  engrenton under wan at orange in cengral  westery hes wonth wales', 'anazinton  the long  thuning ctontht has mor caupende the spitits or thowe atrending the hatiomal ticod cans at torenore']
['high security allocations and carry over water in the murray valley have been cut by  per cent because of record low inflows', 'the natural resources minister  ian macdonald  says the water will be paid back in future  so there is no need for compensation', 'he says more measures to help the f

In [58]:
# Spell-check only the first 100 decoded sentences; the scoring cell slices the ground truth
# with the same [:100], so the two limits must be changed together.
spellchecked = [checker.correct_sentence(sentence) for sentence in test_out_sents[:100]]

In [59]:
# from spell import correction
# letters_to_sents([tagger.tag([letter_finger[c] for c in 'hello this is a testing open computer'])])[0]

In [62]:
def percent_correct(out_list, true_list): 
    '''Word-level accuracy of predicted sentences against reference sentences.

    out_list:  list of predicted sentence strings.
    true_list: list of reference sentence strings; true_list[i] is compared
               with out_list[i].

    Both sentences of a pair are split on whitespace and compared word by
    word at the same position; a pair of words counts as correct only when
    the two strings are equal. Words beyond the length of the shorter
    sentence of a pair are never counted as correct.

    Returns the number of matching words divided by the total number of words
    in `true_list`, as a fraction in [0, 1] (multiply by 100 for a
    percentage). Returns 0.0 when `true_list` contains no words at all, so
    empty input does not raise ZeroDivisionError.

    Raises IndexError if `out_list` has more entries than `true_list`.
    '''
    total = sum(len(reference.split()) for reference in true_list)
    if total == 0:
        # Nothing to score against: avoid dividing by zero.
        return 0.0

    total_count = 0
    for i, predicted in enumerate(out_list):
        word_pairs = zip(predicted.split(), true_list[i].split())
        total_count += sum(1 for a, b in word_pairs if a == b)
    return total_count / total

In [64]:
# percent_correct returns a fraction, hence the *100. The [:100] matches the slice used to build `spellchecked`.
percent_correct(spellchecked, test_true_output[:100])*100

54.091300602928506