In [None]:
%sh pip install keras

In [None]:
%sh pip install tensorflow

In [None]:
%sh pip install --upgrade --force-reinstall tensorflow

In [None]:
%sh pip install --upgrade --force-reinstall numpy==1.19.5

In [None]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict

# Show up to 200 characters per DataFrame cell when frames are displayed.
pd.set_option('display.max_colwidth', 200)
# Render matplotlib figures inside the notebook output.
%matplotlib inline

# Shared spaCy pipeline; later cells use it to parse text and to back the hunspell extension.
nlp = spacy.load('en_core_web_lg')

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
import numpy as np

In [None]:
# Load the tab-separated query file (first column is the index) and normalise
# the 'Query' column in one chain: missing values become '' and all text is
# lower-cased.
candidate_sentences = pd.read_csv(
    "/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/qa.csv",
    delimiter='\t',
    encoding='utf-8',
    index_col=0,
).assign(Query=lambda frame: frame['Query'].fillna('').str.lower())

In [None]:
def prepare_sentence(seq, maxlen):
    """Turn one token-id sequence into (history, next-word) training pairs.

    For every position ``k`` in ``seq`` the history ``seq[:k]`` is passed to
    ``pad_sequences`` with ``maxlen - 1`` and ``padding='pre'``, and the token
    at position ``k`` is recorded as the target for that history.

    Args:
        seq: Sequence of integer token ids for a single sentence.
        maxlen: Sentence length used for padding; histories are padded to
            ``maxlen - 1``.

    Returns:
        A ``(histories, targets)`` pair of lists with one entry per token of
        ``seq``.
    """
    history_len = maxlen - 1
    histories = []
    targets = []
    for position, target in enumerate(seq):
        # pad_sequences works on a batch, so wrap the single prefix in a list
        # and unwrap the single padded row afterwards.
        padded_batch = pad_sequences([seq[:position]],
                                     maxlen=history_len,
                                     padding='pre')
        histories.append(padded_batch[0])
        targets.append(target)
    return histories, targets

In [None]:
# Corpus: every query string is one training sentence.
data = candidate_sentences['Query'].tolist()

# Fit the tokenizer on the corpus and convert each sentence to word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
vocab = tokenizer.word_index
seqs = tokenizer.texts_to_sequences(data)

# The longest sentence fixes the padded history length (maxlen - 1).
maxlen = max(len(seq) for seq in seqs)

# Collect (padded history, next word) pairs from every sentence.
x, y = [], []
for seq in seqs:
    x_windows, y_windows = prepare_sentence(seq, maxlen)
    x.extend(x_windows)
    y.extend(y_windows)
x = np.array(x)
# Word indices are shifted down by one to become class ids (index 0 is the
# <PAD> slot and has no class), then one-hot encoded.
y = np.eye(len(vocab))[np.array(y) - 1]

# Next-word model: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential([
    Embedding(input_dim=len(vocab) + 1,   # vocabulary size plus the <PAD> slot
              output_dim=5,               # size of embeddings
              input_length=maxlen - 1),   # length of the padded sequences
    LSTM(10),
    Dense(len(vocab), activation='softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Train network
model.fit(x, y, epochs=500)

In [None]:
# Compute the probability of occurrence of a sentence
def return_sent_probability(sentence, sentence_level=False):
    """Score ``sentence`` with the trained next-word language model.

    The sentence is converted to word indices with the notebook-level
    ``tokenizer``. Each prefix is fed to ``model`` and the probability the
    model assigns to the word that actually follows is printed as
    ``P(w|h)``. The product of these probabilities (the sentence probability)
    is printed as well.

    Args:
        sentence: Raw text to score.
        sentence_level: When ``False`` (the default, matching the historical
            behaviour) the conditional probability of the last scored word is
            returned. When ``True`` the probability of the whole sentence is
            returned instead.

    Returns:
        The selected probability, or ``0.0`` when the tokenizer produces no
        word indices for ``sentence`` (for example an empty string).
    """
    tok = tokenizer.texts_to_sequences([sentence])[0]
    if len(tok) == 0:
        # Nothing to score. Without this guard the prediction call receives an
        # empty array and `prob_word` is never assigned, so the call crashed.
        print('Prob. sentence: {}'.format(0.0))
        return 0.0

    x_test, y_test = prepare_sentence(tok, maxlen)
    x_test = np.array(x_test)
    y_test = np.array(y_test) - 1  # The word <PAD> does not have a class
    p_pred = model.predict(x_test)
    vocab_inv = {index: word for word, index in vocab.items()}

    log_p_sentence = 0.0
    prob_word = 0.0
    for i, prob in enumerate(p_pred):
        word = vocab_inv[y_test[i] + 1]  # Index 0 from vocab is reserved to <PAD>
        history = ' '.join(vocab_inv[w] for w in x_test[i, :] if w != 0)
        prob_word = prob[y_test[i]]
        # Accumulate in log space so long sentences do not underflow.
        log_p_sentence += np.log(prob_word)
        print('P(w={}|h={})={}'.format(word, history, prob_word))

    p_sentence = np.exp(log_p_sentence)
    print('Prob. sentence: {}'.format(p_sentence))
    return p_sentence if sentence_level else prob_word

In [None]:
# Quick check of the language model on a short query; the cell output is the
# value returned by return_sent_probability for this sentence.
sentence = "red dress"
return_sent_probability(sentence)

In [None]:
import hunspell
from spacy_hunspell import spaCyHunSpell

# Dictionary (.dic) and affix (.aff) files shared by both spell-checker objects.
_hunspell_files = ('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')

# Stand-alone checker used for the is-this-word-spelled-correctly test.
hobj = hunspell.HunSpell(*_hunspell_files)

# spaCy wrapper built on the shared `nlp` pipeline.
# NOTE: this rebinds the name `hunspell` from the imported module to the
# wrapper instance, so `hunspell.HunSpell` is unavailable after this line.
hunspell = spaCyHunSpell(nlp, _hunspell_files)

In [None]:
# Smoke test of the hunspell token extension on a sentence containing "rer".
string = 'I have rer scarf'
doc = nlp(string)
red = doc[2]  # token at index 2 -- presumably "rer"; verify against the tokenization
red._.hunspell_suggest  # last expression: displays the suggestions for that token

In [None]:
# Build candidate corrected sentences: for every token hunspell rejects, splice
# each suggested word into `string` in place of that token.
# Re-parse here so the token offsets used below always refer to the current `string`.
doc = nlp(string)
replacement_texts = []
for token in doc:
    if not hobj.spell(token.text):
        print("text:", token.text)
        possible_candidates = []
        suggestions = token._.hunspell_suggest  # fetch once, reuse below
        print('suggest:', suggestions)
        # Character span of the rejected token inside `string`. Splicing by
        # offset replaces only this token; str.replace would also rewrite the
        # same characters wherever they occur inside other words.
        start = token.idx
        end = start + len(token.text)
        for tok in suggestions:
            nlp_local = nlp(tok)
            # NOTE(review): a multi-word suggestion is split into tokens and
            # each token becomes its own candidate -- confirm this is intended.
            for loc in nlp_local:
                print('loc_text:', loc.text, 'loc_pos:', loc.pos_, 'tag:', loc.tag_)
                replacement_texts.append(string[:start] + loc.text + string[end:])
                possible_candidates.append(loc.text)
        print('possible_candidates:', possible_candidates)
    print()

In [None]:
replacement_texts

In [None]:
# Score every candidate sentence with the language model, keyed by the
# candidate text (duplicate candidates collapse into a single dict entry).
# NOTE: the stored score is whatever return_sent_probability returns, which by
# default is the conditional probability of the last scored word rather than
# the probability of the whole sentence.
sent_probs = dict()
for sentence in replacement_texts:  # rebinds the notebook-level name `sentence`
    sent_probs[sentence] = return_sent_probability(sentence)

In [None]:
sent_probs

In [None]:
# Second example input. NOTE: `doc` is not rebuilt in this cell.
string = 'rer dress'

In [None]:
# Build candidate corrected sentences for the current `string`.
# `string` was reassigned after `doc` was last built, so re-parse it here;
# otherwise the loop walks the tokens of the previous sentence.
doc = nlp(string)
replacement_texts = []
for token in doc:
    if not hobj.spell(token.text):
        print("text:", token.text)
        possible_candidates = []
        suggestions = token._.hunspell_suggest  # fetch once, reuse below
        print('suggest:', suggestions)
        # Character span of the rejected token inside `string`. Splicing by
        # offset replaces only this token; str.replace would also rewrite the
        # same characters wherever they occur inside other words.
        start = token.idx
        end = start + len(token.text)
        for tok in suggestions:
            nlp_local = nlp(tok)
            # NOTE(review): a multi-word suggestion is split into tokens and
            # each token becomes its own candidate -- confirm this is intended.
            for loc in nlp_local:
                print('loc_text:', loc.text, 'loc_pos:', loc.pos_, 'tag:', loc.tag_)
                replacement_texts.append(string[:start] + loc.text + string[end:])
                possible_candidates.append(loc.text)
        print('possible_candidates:', possible_candidates)
    print()

In [None]:
# Score the candidates generated for the second example, keyed by candidate
# text (duplicate candidates collapse into a single dict entry).
# NOTE: the stored score is whatever return_sent_probability returns, which by
# default is the conditional probability of the last scored word rather than
# the probability of the whole sentence.
sent_probs = dict()
for sentence in replacement_texts:  # rebinds the notebook-level name `sentence`
    sent_probs[sentence] = return_sent_probability(sentence)

In [None]:
sent_probs