In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.utils as utils
from keras.models import Sequential
import keras.layers
import numpy as np
import os

In [52]:
def prepareSequences(encodedLines, maxLength=None):
    sequences = list()
    for encodedLine in encodedLines:
        for i in range(1, len(encodedLine)):
            seq = encodedLine[:i + 1]
            sequences.append(seq)
    maxlen = maxLength if (maxLength) else max([len(seq) for seq in sequences])
    sequences = pad_sequences(sequences, maxlen=maxlen, padding='pre')

    # split into input and output elements
    sequences = np.array(sequences)
    x = sequences[:, :-1]
    y = sequences[:, -1]
    return x, y

def getAccuracy(predSentences, trueSentences):
    accuracy = 0
    for i in range(len(predSentences)):
        if (predSentences[i] == trueSentences[i]):
            accuracy += 1
    return accuracy / len(predSentences)

def processSentences(sentences, model, tokenizer, maxLength):
    correctedSentences = []
    vocab = tokenizer.word_index
    vocab_inv = {v: k for k, v in vocab.items()}
    for sentence in sentences:
        encoded = tokenizer.texts_to_sequences([sentence])[0]
        x, y = prepareSequences([encoded], maxLength)
        probs = model.predict(x, verbose=0)
        sentence_prob = get_sentence_prob(probs, y)
        bestEncoding = encoded
        for i, token in enumerate(encoded):
            encodedToks = tokenizer.texts_to_sequences(REMOVABLE_TOKENS)[0]
            for tok in encodedToks:
                tmpEncoded = list(encoded)
                encoded.insert(i, tok)
                tmp_x, tmp_y = prepareSequences([encoded], maxLength)
                new_probs = model.predict(tmp_x, verbose=0)
                new_sentence_prob = get_sentence_prob(new_probs, tmp_y)
                if (new_sentence_prob > sentence_prob):
                    sentence_prob = new_sentence_prob
                    bestEncoding = encoded
                encoded = tmpEncoded
        correctedSentences.append(bestEncoding)
    return correctedSentences

def decodeSentence(tokens, vocab):
    return ' '.join([vocab[token] for token in tokens])

def get_sentence_prob(probs, encodedSentence):
    log_p_sentence = 0
    for i, prob in enumerate(probs):
        prob_word = prob[encodedSentence[i]]
        log_p_sentence += np.log(prob_word)
    return np.exp(log_p_sentence)

def prepareIncorrectSentences(sentences):
    return [removeToken(x) for x in sentences]

def removeToken(sentence):
    words = sentence.split(' ')
    delete_i = -1
    for i, word in enumerate(words):
        if (word in REMOVABLE_TOKENS):
            print('removing ' + word)
            delete_i = i
            break
    if (delete_i != -1):
        words.pop(delete_i)
    return ' '.join([x for x in words])

def getIncorrectSentences(sentences, num_sentences):
    x = []
    y = []
    for sent in sentences:
        new_sent = removeToken(sent)
        if (sent != new_sent):
            x.append(new_sent)
            y.append(sent)
        if (len(x) == num_sentences):
            break
    return x, y

In [53]:
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
NUM_SENTENCES = 100
REMOVABLE_TOKENS = ['the', '\'ve', '\'m', '\'s', '\'d', '\'ll', '\'nt']
rawDataFile = 'C:/Users/phili/GrammarChecker/rawtext.txt'
text = open(rawDataFile, 'r').read()
sentences = text.split('\n')
trainSentences = sentences[:NUM_SENTENCES]
incorrectSentences, trueSentences = getIncorrectSentences(sentences[NUM_SENTENCES:int(NUM_SENTENCES * 2)], int(NUM_SENTENCES * 0.2))

removing the
removing 'll
removing the
removing 'm
removing the
removing the
removing the
removing 'm
removing 've
removing 's
removing 's
removing 'd
removing 's
removing the
removing the
removing 'll
removing 've
removing the
removing 's
removing the


In [56]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((trainSentences, trueSentences)))
trainEncoded = tokenizer.texts_to_sequences(trainSentences)
trueSentencesEncoded = tokenizer.texts_to_sequences(trueSentences)
vocab_size = len(tokenizer.word_index) + 1
print('vocab size: ' + str(vocab_size))

maxLength = max([len(x) for x in np.concatenate((trainEncoded, trueSentencesEncoded))])
x_train, y_train = prepareSequences(trainEncoded, maxLength)
x_test, y_test = prepareSequences(trueSentencesEncoded, maxLength)
y_train = utils.to_categorical(y_train, num_classes=vocab_size)
y_test = utils.to_categorical(y_test, num_classes=vocab_size)

model = Sequential()
model.add(keras.layers.Embedding(vocab_size, 10, input_length=maxLength - 1))
model.add(keras.layers.LSTM(50)
model.add(keras.layers.Dense(vocab_size, activation='softmax'))

SyntaxError: invalid syntax (<ipython-input-56-3519d571dc4e>, line 17)

In [57]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=300, verbose=2)
loss, accuracy = model.evaluate(x_train, y_train, verbose=1)
print('training loss: ' + str(loss) + ', accuracy: ' + str(accuracy))
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)
print('test loss: ' + str(loss) + ', accuracy: ' + str(accuracy))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/300
 - 8s - loss: 5.7312 - accuracy: 0.0528
Epoch 2/300
 - 7s - loss: 5.6715 - accuracy: 0.0528
Epoch 3/300
 - 7s - loss: 5.6555 - accuracy: 0.0528
Epoch 4/300


KeyboardInterrupt: 

In [None]:
incorrectSentences, trueSentences = getIncorrectSentences(sentences[:NUM_SENTENCES], int(NUM_SENTENCES * 0.2))
trueSentencesEncoded = tokenizer.texts_to_sequences(trueSentences)
# incorrectSentences, trueSentences = getIncorrectSentences(['it \'s that hot rod joey , right ? that \'s who you want me to bend my rules for ?'], 1)
trueSentencesEncoded = tokenizer.texts_to_sequences(trueSentences)
correctedSentences = processSentences(incorrectSentences, model, tokenizer, maxLength)

vocab = tokenizer.word_index
vocab_inv = {v: k for k, v in vocab.items()}

for i in range(len(correctedSentences)):
    print('orig: ' + incorrectSentences[i])
    print('pred: ' + decodeSentence(correctedSentences[i], vocab_inv))
    print('true: ' + decodeSentence(trueSentencesEncoded[i], vocab_inv))


# print(getAccuracy(correctedSentences, tokenizer.texts_to_sequences(['he \'ll never get it right . try the log ride !'])))
print(getAccuracy(correctedSentences, trueSentencesEncoded))

In [None]:
# evaluate
testText = 'it \'s that hot rod joey , right ? that \'s who you want me to bend my rules for ?'
testEncoded = tokenizer.texts_to_sequences(testText.split('\n'))
x_test, y_test = prepareSequences(testEncoded, maxLength)
probabilities = model.predict(x_test, verbose=0)

vocab = tokenizer.word_index
vocab_inv = {v: k for k, v in vocab.items()}
log_p_sentence = 0
for i, prob in enumerate(probabilities):
    word = vocab_inv[y_test[i]]  # Index 0 from vocab is reserved to <PAD>
    history = ' '.join([vocab_inv[w] for w in x_test[i, :] if w != 0])
    prob_word = prob[y_test[i]]
    log_p_sentence += np.log(prob_word)
    print('P(w={}|h={})={}'.format(word, history, prob_word))
print('Prob. sentence: {}'.format(np.exp(log_p_sentence)))

In [None]:
# 'console' with 'the': .35111
# 'console' without 'the': .04584