In [20]:
from __future__ import absolute_import, division, print_function, unicode_literals
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.utils as utils
from keras.models import Sequential
import keras.layers
import numpy as np
import os

In [29]:
def prepareSequences(encodedLines, maxLength=None):
    """Turn encoded sentences into (history, next-token) training pairs.

    Every prefix of length >= 2 of every encoded line becomes one sample:
    all tokens but the last form the history, the last token is the target.

    Args:
        encodedLines: iterable of token-id sequences (one per sentence).
        maxLength: width every prefix is padded to; when falsy, the length of
            the longest generated prefix is used instead.

    Returns:
        (x, y): ``x`` holds every padded prefix without its final column,
        ``y`` holds that final column (the token to predict).
    """
    # Prefixes end at positions 2..len(line); a single token has no history.
    prefixes = [
        line[:end]
        for line in encodedLines
        for end in range(2, len(line) + 1)
    ]
    width = maxLength or max([len(prefix) for prefix in prefixes])
    padded = np.array(pad_sequences(prefixes, maxlen=width, padding='pre'))

    # Last column is the prediction target, everything before it the input.
    return padded[:, :-1], padded[:, -1]

def getAccuracy(predSentences, trueSentences):
    """Return the fraction of predictions that exactly equal their reference.

    Args:
        predSentences: sequence of predicted sentences.
        trueSentences: sequence of reference sentences, aligned by index with
            ``predSentences``.

    Returns:
        A float in [0, 1]. An empty ``predSentences`` yields 0.0 rather than
        raising ZeroDivisionError.

    Raises:
        IndexError: if ``trueSentences`` is shorter than ``predSentences``.
    """
    total = len(predSentences)
    if total == 0:
        # Nothing was predicted, so nothing can be counted as correct.
        return 0.0
    matches = sum(
        1 for i in range(total) if predSentences[i] == trueSentences[i]
    )
    return matches / float(total)

def _scoreEncoding(encoded, model, maxLength):
    """Return the model's probability for one encoded sentence."""
    x, y = prepareSequences([encoded], maxLength)
    probs = model.predict(x, verbose=0)
    return get_sentence_prob(probs, y)


def processSentences(sentences, model, tokenizer, maxLength):
    """Try to restore a missing token in each sentence using the language model.

    For every sentence, each token id derived from REMOVABLE_TOKENS is inserted
    at every position (including the end). The variant the model scores
    strictly higher than everything seen so far is kept; if no insertion beats
    the original sentence, the original encoding is returned unchanged.

    Args:
        sentences: iterable of raw sentence strings.
        model: object exposing ``predict(x, verbose=0)``.
        tokenizer: object exposing ``texts_to_sequences(list_of_texts)``.
        maxLength: padding length forwarded to ``prepareSequences``.

    Returns:
        A list with one token-id list per input sentence.
    """
    # texts_to_sequences returns one id list per input text. Taking only
    # element [0] would keep just the first removable token, so flatten the
    # whole result. This is loop-invariant, hence computed once.
    candidateToks = [
        tok
        for toks in tokenizer.texts_to_sequences(REMOVABLE_TOKENS)
        for tok in toks
    ]

    correctedSentences = []
    for sentence in sentences:
        encoded = tokenizer.texts_to_sequences([sentence])[0]
        if len(encoded) < 2:
            # prepareSequences builds no (history, target) pair from fewer
            # than two tokens, so there is no baseline to compare against.
            correctedSentences.append(encoded)
            continue

        bestEncoding = encoded
        bestProb = _scoreEncoding(encoded, model, maxLength)
        for i in range(len(encoded) + 1):
            for tok in candidateToks:
                # Build a fresh list so neither `encoded` nor `bestEncoding`
                # is ever mutated while candidates are explored.
                candidate = encoded[:i] + [tok] + encoded[i:]
                candidateProb = _scoreEncoding(candidate, model, maxLength)
                if candidateProb > bestProb:
                    bestProb = candidateProb
                    bestEncoding = candidate
        correctedSentences.append(bestEncoding)
    return correctedSentences

def decodeSentence(tokens, vocab):
    """Map token ids back to words and join them with single spaces.

    Args:
        tokens: iterable of token ids.
        vocab: mapping from token id to word; a missing id raises KeyError.

    Returns:
        The decoded sentence as one string.
    """
    words = []
    for token in tokens:
        words.append(vocab[token])
    return ' '.join(words)

def get_sentence_prob(probs, encodedSentence):
    """Multiply the probabilities assigned to each actual token of a sentence.

    Args:
        probs: iterable of per-step distributions; ``probs[i]`` is indexed by
            token id.
        encodedSentence: token id observed at each step, aligned with ``probs``.

    Returns:
        exp(sum(log p_i)) over all steps; 1.0 when ``probs`` is empty.
    """
    # Summing logs keeps the running value in log space until the very end.
    log_total = sum(
        np.log(distribution[encodedSentence[step]])
        for step, distribution in enumerate(probs)
    )
    return np.exp(log_total)

def prepareIncorrectSentences(sentences):
    """Apply ``removeToken`` to every sentence and return the results as a list."""
    damaged = []
    for sentence in sentences:
        damaged.append(removeToken(sentence))
    return damaged

def removeToken(sentence):
    """Drop the first space-separated word that appears in REMOVABLE_TOKENS.

    Prints ``'removing <word>'`` when a word is dropped. A sentence without
    any removable word is returned unchanged.

    Args:
        sentence: sentence whose words are separated by single spaces.

    Returns:
        The sentence with at most one word removed.
    """
    words = sentence.split(' ')
    # Position of the first removable word, or None when there is none.
    hit = next(
        (pos for pos, word in enumerate(words) if word in REMOVABLE_TOKENS),
        None,
    )
    if hit is not None:
        print('removing ' + words[hit])
        del words[hit]
    return ' '.join(words)

def getIncorrectSentences(sentences, num_sentences):
    """Collect up to ``num_sentences`` (damaged, original) sentence pairs.

    Each sentence is passed through ``removeToken``; only sentences that were
    actually changed are kept.

    Args:
        sentences: iterable of candidate sentences.
        num_sentences: maximum number of pairs to collect. Zero or a negative
            value yields two empty lists.

    Returns:
        (x, y): ``x`` is the list of damaged sentences and ``y`` the list of
        their untouched originals, aligned by index.
    """
    x = []
    y = []
    for sent in sentences:
        # Check the limit before doing any work, with >= so a limit of zero
        # (or below) stops immediately instead of never matching.
        if len(x) >= num_sentences:
            break
        new_sent = removeToken(sent)
        if sent != new_sent:
            x.append(new_sent)
            y.append(sent)
    return x, y

In [22]:
# Configuration and data loading.
# CUDA_VISIBLE_DEVICES="-1" presumably forces CPU execution - TODO confirm it
# still takes effect when set after the keras import above.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
NUM_SENTENCES = 1000
# Tokens that removeToken may delete to fabricate an "incorrect" sentence.
# NOTE(review): '\'nt' may have been meant as 'n\'t' - verify against the corpus tokenisation.
REMOVABLE_TOKENS = ['the', '\'ve', '\'m', '\'s', '\'d', '\'ll', '\'nt']
rawDataFile = 'C:/Users/phili/GrammarChecker/rawtext.txt'
# Context manager so the file handle is closed as soon as the text is read.
with open(rawDataFile, 'r') as rawFile:
    text = rawFile.read()
sentences = text.split('\n')
# First NUM_SENTENCES lines train the model; the damaged/reference pairs come
# from the following NUM_SENTENCES lines, capped at 20% of NUM_SENTENCES pairs.
trainSentences = sentences[:NUM_SENTENCES]
incorrectSentences, trueSentences = getIncorrectSentences(sentences[NUM_SENTENCES:int(NUM_SENTENCES * 2)], int(NUM_SENTENCES * 0.2))

removing 's
removing the
removing the
removing the
removing 's
removing 'm
removing 's
removing the
removing the
removing the
removing 'll
removing the
removing 'll
removing 'll
removing the
removing 's
removing 's
removing 'm
removing 's
removing 's
removing 'm
removing 's
removing 'm
removing 's
removing 've
removing 's
removing 've
removing 've
removing 's
removing 'm
removing 'm
removing 's
removing the
removing 'm
removing 's
removing the
removing 'm
removing the
removing 'm
removing 's
removing 'll
removing 'd
removing 'll
removing the
removing the
removing 's
removing the
removing the
removing 's
removing the
removing the
removing 've
removing 'd
removing 'd
removing 'm
removing the
removing 's
removing the
removing 've
removing 's
removing 'm
removing 'm
removing 's
removing the
removing 's
removing 's
removing 's
removing 's
removing 's
removing 's
removing 'm
removing 'm
removing 's
removing 's
removing the
removing 's
removing 's
removing the
removing the
removing 'll
removi

In [23]:
# Build the vocabulary from the training sentences plus the reference
# (undamaged) evaluation sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(trainSentences) + list(trueSentences))
trainEncoded = tokenizer.texts_to_sequences(trainSentences)
trueSentencesEncoded = tokenizer.texts_to_sequences(trueSentences)
# +1 because token id 0 is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1
print('vocab size: ' + str(vocab_size))

# The encoded sentences have different lengths, so they are combined with plain
# list concatenation; np.concatenate would first try to build an array from
# ragged lists, which recent NumPy versions reject.
maxLength = max(len(encodedLine) for encodedLine in trainEncoded + trueSentencesEncoded)
x_train, y_train = prepareSequences(trainEncoded, maxLength)
x_test, y_test = prepareSequences(trueSentencesEncoded, maxLength)
# One-hot targets over the whole vocabulary.
y_train = utils.to_categorical(y_train, num_classes=vocab_size)
y_test = utils.to_categorical(y_test, num_classes=vocab_size)

# Embedding -> GRU -> softmax over the vocabulary. The input is one token
# shorter than maxLength because the last token of each prefix is the target.
model = Sequential()
model.add(keras.layers.Embedding(vocab_size, 10, input_length=maxLength - 1))
model.add(keras.layers.GRU(20, recurrent_dropout=0.1, dropout=0.1))
model.add(keras.layers.Dense(vocab_size, activation='softmax'))

vocab size: 2307


In [24]:
# Train the next-token model, then report loss and accuracy on the training
# sequences and on the sequences built from the reference evaluation sentences.
# categorical_crossentropy pairs with the one-hot targets made by to_categorical.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# verbose=2 produces the one-line-per-epoch log captured below.
model.fit(x_train, y_train, epochs=300, verbose=2)
# `loss` and `accuracy` are rebound by the second evaluate call, so after this
# cell they hold the test-set figures.
loss, accuracy = model.evaluate(x_train, y_train, verbose=1)
print('training loss: ' + str(loss) + ', accuracy: ' + str(accuracy))
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)
print('test loss: ' + str(loss) + ', accuracy: ' + str(accuracy))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/300
 - 36s - loss: 6.7674 - accuracy: 0.0365
Epoch 2/300
 - 35s - loss: 6.1682 - accuracy: 0.0350
Epoch 3/300
 - 35s - loss: 6.0951 - accuracy: 0.0356
Epoch 4/300
 - 35s - loss: 6.0277 - accuracy: 0.0378
Epoch 5/300
 - 35s - loss: 5.9643 - accuracy: 0.0389
Epoch 6/300
 - 35s - loss: 5.9072 - accuracy: 0.0446
Epoch 7/300
 - 35s - loss: 5.8603 - accuracy: 0.0445
Epoch 8/300
 - 35s - loss: 5.8125 - accuracy: 0.0474
Epoch 9/300
 - 35s - loss: 5.7675 - accuracy: 0.0481
Epoch 10/300
 - 35s - loss: 5.7214 - accuracy: 0.0575
Epoch 11/300
 - 35s - loss: 5.6730 - accuracy: 0.0603
Epoch 12/300
 - 35s - loss: 5.6243 - accuracy: 0.0620
Epoch 13/300
 - 35s - loss: 5.5753 - accuracy: 0.0627
Epoch 14/300
 - 35s - loss: 5.5216 - accuracy: 0.0632
Epoch 15/300
 - 35s - loss: 5.4663 - accuracy: 0.0671
Epoch 16/300
 - 35s - loss: 5.4132 - accuracy: 0.0720
Epoch 17/300
 - 35s - loss: 5.3567 - accuracy: 0.0795
Epoch 18/300
 - 35s - loss: 5.3022 - accuracy: 0.0834
Epoch 19/300
 - 35s - loss: 5.2491 - 

Epoch 152/300
 - 31s - loss: 3.4226 - accuracy: 0.2687
Epoch 153/300
 - 30s - loss: 3.4119 - accuracy: 0.2683
Epoch 154/300
 - 30s - loss: 3.4064 - accuracy: 0.2772
Epoch 155/300
 - 31s - loss: 3.3920 - accuracy: 0.2742
Epoch 156/300
 - 30s - loss: 3.3905 - accuracy: 0.2789
Epoch 157/300
 - 31s - loss: 3.4018 - accuracy: 0.2770
Epoch 158/300
 - 30s - loss: 3.3954 - accuracy: 0.2733
Epoch 159/300
 - 30s - loss: 3.3736 - accuracy: 0.2799
Epoch 160/300
 - 31s - loss: 3.3844 - accuracy: 0.2778
Epoch 161/300
 - 31s - loss: 3.3740 - accuracy: 0.2784
Epoch 162/300
 - 31s - loss: 3.3646 - accuracy: 0.2799
Epoch 163/300
 - 31s - loss: 3.3749 - accuracy: 0.2788
Epoch 164/300
 - 30s - loss: 3.3596 - accuracy: 0.2832
Epoch 165/300
 - 31s - loss: 3.3641 - accuracy: 0.2787
Epoch 166/300
 - 30s - loss: 3.3570 - accuracy: 0.2827
Epoch 167/300
 - 31s - loss: 3.3462 - accuracy: 0.2817
Epoch 168/300
 - 30s - loss: 3.3610 - accuracy: 0.2842
Epoch 169/300
 - 31s - loss: 3.3514 - accuracy: 0.2808
Epoch 170/

training loss: 3.011261841139971, accuracy: 0.3491867482662201
test loss: 8.52145636308658, accuracy: 0.12420670688152313


In [43]:
# Alternative input (disabled): damage a slice of the corpus instead of one sentence.
# incorrectSentences, trueSentences = getIncorrectSentences(sentences[NUM_SENTENCES:int(NUM_SENTENCES * 2)], int(NUM_SENTENCES * 0.2))
# trueSentencesEncoded = tokenizer.texts_to_sequences(trueSentences)

# Damage a single hand-picked sentence, then ask the model to repair it.
incorrectSentences, trueSentences = getIncorrectSentences(['it \'s that hot rod joey , right ? that \'s who you want me to bend my rules for ?'], 1)
trueSentencesEncoded = tokenizer.texts_to_sequences(trueSentences)
correctedSentences = processSentences(incorrectSentences, model, tokenizer, maxLength)

# Inverse vocabulary (token id -> word) for printing decoded sentences.
vocab = tokenizer.word_index
vocab_inv = dict((index, word) for word, index in vocab.items())

# Show the damaged input, the model's repair and the reference side by side.
for damaged, repaired, reference in zip(incorrectSentences, correctedSentences, trueSentencesEncoded):
    print('orig: ' + damaged)
    print('pred: ' + decodeSentence(repaired, vocab_inv))
    print('true: ' + decodeSentence(reference, vocab_inv))


# print(getAccuracy(correctedSentences, tokenizer.texts_to_sequences(['he \'ll never get it right . try the log ride !'])))
# Fraction of repaired sentences that exactly match their reference encoding.
print(getAccuracy(correctedSentences, trueSentencesEncoded))

removing 's
orig: it that hot rod joey , right ? that 's who you want me to bend my rules for ?
pred: it that the hot rod joey right that 's who you want me to bend my rules for
true: it 's that hot rod joey right that 's who you want me to bend my rules for
0.0


In [46]:
# evaluate
testText = 'it \'s that hot rod joey , right ? that \'s who you want me to bend my rules for ?'
testEncoded = tokenizer.texts_to_sequences(testText.split('\n'))
x_test, y_test = prepareSequences(testEncoded, maxLength)
probabilities = model.predict(x_test, verbose=0)

vocab = tokenizer.word_index
vocab_inv = {v: k for k, v in vocab.items()}
log_p_sentence = 0
for i, prob in enumerate(probabilities):
    word = vocab_inv[y_test[i]]  # Index 0 from vocab is reserved to <PAD>
    history = ' '.join([vocab_inv[w] for w in x_test[i, :] if w != 0])
    prob_word = prob[y_test[i]]
    log_p_sentence += np.log(prob_word)
    print('P(w={}|h={})={}'.format(word, history, prob_word))
print('Prob. sentence: {}'.format(np.exp(log_p_sentence)))

P(w='s|h=it)=0.4375620484352112
P(w=that|h=it 's)=0.010767229832708836
P(w=hot|h=it 's that)=1.9072000156938884e-07
P(w=rod|h=it 's that hot)=2.3153397601949344e-11
P(w=joey|h=it 's that hot rod)=2.749241390098689e-10
P(w=right|h=it 's that hot rod joey)=5.533802323043346e-07
P(w=that|h=it 's that hot rod joey right)=0.018543928861618042
P(w='s|h=it 's that hot rod joey right that)=0.08000627905130386
P(w=who|h=it 's that hot rod joey right that 's)=4.960089086125663e-07
P(w=you|h=it 's that hot rod joey right that 's who)=0.001036177622154355
P(w=want|h=it 's that hot rod joey right that 's who you)=0.02686837688088417
P(w=me|h=it 's that hot rod joey right that 's who you want)=0.008253298699855804
P(w=to|h=it 's that hot rod joey right that 's who you want me)=0.46150097250938416
P(w=bend|h=it 's that hot rod joey right that 's who you want me to)=2.244991519073647e-11
P(w=my|h=it 's that hot rod joey right that 's who you want me to bend)=0.0015792143531143665
P(w=rules|h=it 's tha

In [None]:
# 'console' with 'the': .35111
# 'console' without 'the': .04584