## Importing Packages

### Colab: https://colab.research.google.com/drive/1AVepdMUZBFMorMtKHKGI5S1VubsbUCn0?usp=sharing

In [None]:
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, LeakyReLU, Dropout, BatchNormalization, Conv2D, MaxPool2D, Flatten, Reshape
from keras.callbacks import EarlyStopping, ModelCheckpoint
import pandas as pd
import random 

## Importing Data

In [None]:
# Read the corpus and keep its 'text' column as a plain Python list.
df = pd.read_csv("refined_bigdata.csv")
lines = df['text'].tolist()

In [None]:
# Train on a random subset of at most 9000 lines. The size is clamped because
# random.sample raises ValueError when asked for more items than are available.
lines = random.sample(lines, min(9000, len(lines)))
print(len(lines))

## Tokenize Data

In [None]:
# Fit a Keras Tokenizer on the sampled lines to build the word -> index map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# Number of distinct words found (shown as the cell output).
len(tokenizer.word_index)

### Save the tokenizer

In [None]:
# Persist the fitted tokenizer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('auto_suggest.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

### Load the tokenizer

In [None]:
# Restore the fitted tokenizer; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load a file that this
# notebook (or another trusted source) produced.
with open('auto_suggest.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
def get_sequence_of_tokens(corpus, tokenizer):
    """Expand every line of ``corpus`` into its growing token prefixes.

    Each line is encoded with ``tokenizer.texts_to_sequences`` and then
    contributes one sequence per prefix of length 2, 3, ..., up to the full
    encoded line. Lines that encode to fewer than two tokens contribute
    nothing.

    Args:
        corpus: iterable of text lines.
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``total_words`` is
        ``len(tokenizer.word_index) + 1``.
    """
    prefixes = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        # Prefix lengths run from 2 to len(encoded) inclusive.
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, len(tokenizer.word_index) + 1

In [None]:
# Build the token-prefix sequences for the sampled lines; vocab_size is
# len(tokenizer.word_index) + 1.
X, vocab_size = get_sequence_of_tokens(lines, tokenizer)

In [None]:
def generate_padded_sequences(input_sequences, total_words):
    """Pad the sequences to a common length and split off the label column.

    Args:
        input_sequences: list of integer token sequences.
        total_words: number of classes used for the one-hot labels.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: every padded row
        without its last token, the one-hot encoding of that last token, and
        the padded row length (the length of the longest input sequence).
    """
    longest = max(len(seq) for seq in input_sequences)
    print(longest)
    # 'pre' padding keeps the real tokens right-aligned, so the final column
    # of every row holds the word to predict.
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    predictors = padded[:, :-1]
    targets = to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, targets, longest

# Pad the prefix sequences and split them into inputs X, one-hot targets y,
# and the padded length seq_length (inputs are seq_length - 1 tokens wide).
X, y, seq_length = generate_padded_sequences(X, vocab_size)

In [None]:
# Number of input tokens per sample (padded length minus the label column).
X.shape[1]

### Loading a Pretrained Model to Improve on It

In [None]:
# Load a previously saved model from 'auto_suggest_model.h5'; compile=False
# means the model is loaded without being compiled.
# NOTE(review): a later cell rebinds `model` to a freshly built Sequential
# network, which discards this loaded model if both cells run -- confirm intent.
from tensorflow import keras
model = keras.models.load_model('auto_suggest_model.h5', compile=False)

In [None]:
# Print the layer-by-layer summary of the current model.
model.summary()

## Building the Model

In [None]:
# Next-word classifier: embedding -> two stacked LSTMs -> two dense blocks ->
# softmax over the whole vocabulary. Inputs are seq_length - 1 tokens wide
# because the last token of every padded sequence is used as the label.
# (A Dropout(0.2) between the two LSTM layers is left disabled.)
model = Sequential([
    Embedding(vocab_size, 512, input_length=seq_length - 1),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(4096),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dense(2048),
    LeakyReLU(alpha=0.1),
    Dense(vocab_size, activation='softmax'),
])
model.summary()

In [None]:
# Configure training: categorical cross-entropy against the one-hot labels,
# the RMSprop optimizer, and accuracy reported as a metric.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
# Both callbacks watch the training loss ('loss'), not a validation metric.
# Stop after 3 epochs without improvement and roll back to the best weights.
es = EarlyStopping(
    monitor='loss',
    patience=3,
    verbose=1,
    mode='min',
    restore_best_weights=True,
)
# Write the weights to disk each time the monitored loss improves.
mc = ModelCheckpoint(
    'auto_suggest_weights.h5',
    monitor='loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

### Define batch_size and epochs for fitting the model on the data

In [None]:
# Train on the padded prefixes for up to `epochs` epochs; the early-stopping
# callback may end training sooner, and the checkpoint callback saves weights
# whenever the training loss improves.
batch_size=100
epochs=20
model.fit(X, y, batch_size=batch_size, epochs=epochs, callbacks=[es, mc]) 

### Saving the Model

In [None]:
# Save the current weights on their own.
# NOTE(review): this is the same path the ModelCheckpoint callback writes to,
# so the best-loss checkpoint is overwritten by the current weights -- confirm
# that is intended.
weights_name = 'auto_suggest_weights.h5'
model.save_weights(weights_name)

# Save the complete model to a single HDF5 file.
model_name = 'auto_suggest_model.h5'
model.save(model_name)

In [None]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Suggest the most probable next words for ``seed_text``.

    Args:
        model: trained model whose ``predict`` returns one probability per
            vocabulary index for each input row.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: number of tokens to pad/truncate the seed text to; pass
            the input length the model was built with.
        seed_text: text to continue.
        n_words: maximum number of suggestions to return (previously ignored
            in favour of a hard-coded 4).

    Returns:
        A list of ``{'word': str, 'probability': float}`` dicts ordered from
        most to least probable. Predicted indices with no entry in
        ``tokenizer.word_index`` (such as the padding index 0) are skipped,
        so the list may hold fewer than ``n_words`` items.
    """
    if n_words <= 0:
        return []

    # Encode the seed text and bring it to the fixed input length; with
    # truncating='pre' the most recent tokens are kept when it is too long.
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')

    probabilities = model.predict(encoded)[0]
    # sorted() is stable, so equal probabilities keep ascending index order.
    best = sorted(
        enumerate(probabilities), key=lambda pair: pair[1], reverse=True
    )[:n_words]

    # Invert the vocabulary once instead of rescanning it for every candidate.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return [
        {'word': index_to_word[index], 'probability': probability}
        for index, probability in best
        if index in index_to_word
    ]

## Output Predictive Sequence

In [None]:
# Pad the seed text to the model's input width (seq_length - 1, matching the
# Embedding layer) instead of a hard-coded 72, which only holds for one
# particular random sample of the corpus.
generated = generate_seq(model, tokenizer, seq_length - 1, 'i just want good', 4)
print(generated)