## Imports

In [1]:
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
import datetime
from pickle import dump

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

%load_ext tensorboard

## Import dataset & check data

In [2]:
# Load the first 200 rows of the fantasy plot dataset.
# Forward slashes keep the relative path portable across operating systems and
# avoid the invalid "\D" / "\F" escape sequences of a backslash-separated literal.
df_fantasy = pd.read_csv('Data/Dataset/Fantasy-V1.csv', nrows=200)
df_fantasy.head()

Unnamed: 0,title,plot_synopsis,tags
0,the forbidden kingdom,South Boston teenager Jason Tripitikas is a fa...,"fantasy, murder, violence, flashback, philosop..."
1,twilight,Seventeen-year-old Bella Swan (Kristen Stewart...,"fantasy, gothic, murder, boring, cult, violenc..."
2,guardians of the galaxy,"On planet Earth in 1988, young Peter Quill (Wy...","fantasy, murder, violence, flashback, good ver..."
3,shrek 2,Shrek and Fiona return from their honeymoon to...,"comedy, fantasy, cult, violence, humor, satire..."
4,matilda,Matilda Wormwood (Mara Wilson) is an incredibl...,"comedy, fantasy, cruelty, paranormal, dramatic..."


In [3]:
# Work with the plot synopsis column only.
df_fantasy_plot = df_fantasy.loc[:, 'plot_synopsis']
df_fantasy_plot.head()

0    South Boston teenager Jason Tripitikas is a fa...
1    Seventeen-year-old Bella Swan (Kristen Stewart...
2    On planet Earth in 1988, young Peter Quill (Wy...
3    Shrek and Fiona return from their honeymoon to...
4    Matilda Wormwood (Mara Wilson) is an incredibl...
Name: plot_synopsis, dtype: object

## Preprocessing

In [4]:
# Only token texts are read from the pipeline below, so the listed components
# are switched off when loading the model.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'tagger', 'ner', 'lemmatizer'],
)

In [5]:
# Merge every synopsis into one space-separated string (missing entries are skipped).
df_fantasy_plot_str = ' '.join(df_fantasy_plot.dropna())
len(df_fantasy_plot_str)

1923547

In [6]:
# spaCy rejects texts longer than `nlp.max_length`. Derive the limit from the
# corpus itself so this cell keeps working when more rows are loaded, instead
# of relying on a hard-coded ceiling.
nlp.max_length = max(nlp.max_length, len(df_fantasy_plot_str) + 1)

# Punctuation / whitespace fragments to drop. NOTE: `in` on a string is a
# substring test, so any token whose text occurs anywhere inside `skip`
# (for example '--' or '\n\n') is removed, not only single characters.
skip = '\r\n \n\n \n\n\n!"-#$%&()--.*+,-./:;<=>?@[\\]^_`{|}~\t\n\r '
tokens = [token.text.lower() for token in nlp(df_fantasy_plot_str) if token.text not in skip]
len(tokens)

347177

In [7]:
# Each training sample is 30 context tokens followed by the 1 token to predict.
train_len = 30 + 1

# Slide a window of `train_len` tokens over the corpus. The `+ 1` on the upper
# bound keeps the final window, so the very last token is also used as a
# target (stopping at `len(tokens)` would silently drop that sample).
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens) + 1)
]

In [8]:
# Preview the first training window as plain text.
' '.join(text_sequences[0])

'south boston teenager jason tripitikas is a fan of martial arts films and awakens from a dream of a battle between the monkey king and celestial soldiers in the clouds he'

In [9]:
# Fit a Keras Tokenizer on the token windows to build the word <-> integer id mapping.
# NOTE(review): the windows overlap, so each token is counted once per window it
# appears in — fine for building the vocabulary, but the counts are not raw corpus frequencies.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [10]:
# Replace every token in every window with its integer id from the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [11]:
# Number of distinct words the tokenizer counted while fitting.
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

20372

In [12]:
# Turn the list of id lists into an array so it can be sliced by column below.
sequences = np.asarray(sequences)

In [13]:
# Model inputs: every column except the last one of each window.
X = sequences[:, :-1]

# The context length fed to the model is the number of input columns.
sequence_length = X.shape[-1]
sequence_length

30

In [14]:
# Targets: the last token id of each window, one-hot encoded.
# One extra class is added on top of the vocabulary size
# (presumably because Keras Tokenizer ids start at 1, leaving index 0 unused).
# NOTE(review): the dense one-hot matrix has len(sequences) x (vocabulary_size + 1)
# entries, which can be very large — watch memory usage.
Y = to_categorical(sequences[:, -1], num_classes=vocabulary_size + 1)

## Create the LSTM model

In [15]:
def create_model(vocabulary_size, sequence_length, embedding_dim=None,
                 lstm_units=50, dense_units=50):
    """Build and compile a stacked-LSTM next-word classifier.

    Architecture: Embedding -> LSTM (returns sequences) -> LSTM -> Dense(relu)
    -> Dense(softmax over the vocabulary).

    Args:
        vocabulary_size: Number of embedding rows and of softmax output classes.
        sequence_length: Number of token ids in each input sample.
        embedding_dim: Size of each embedding vector. When None (default) it
            falls back to `sequence_length`, which reproduces the original
            architecture where the two values were tied together.
        lstm_units: Number of units in each of the two LSTM layers.
        dense_units: Number of units in the hidden Dense layer.

    Returns:
        The compiled `Sequential` model. As a side effect the model summary
        is printed.
    """
    # Embedding width and context length are independent hyper-parameters;
    # only couple them when the caller does not choose a width explicitly.
    if embedding_dim is None:
        embedding_dim = sequence_length

    model = Sequential()

    model.add(Embedding(input_dim=vocabulary_size,
                        output_dim=embedding_dim,
                        input_length=sequence_length))
    # The first LSTM must emit the full sequence so the second LSTM can consume it.
    model.add(LSTM(units=lstm_units, return_sequences=True))
    model.add(LSTM(units=lstm_units))
    model.add(Dense(units=dense_units, activation='relu'))
    model.add(Dense(units=vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()

    return model

In [16]:
# Build the network. The +1 on the vocabulary size matches the num_classes
# used when one-hot encoding Y, so the softmax output lines up with the targets.
model = create_model(vocabulary_size=(vocabulary_size + 1), sequence_length=sequence_length)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 30)            611190    
_________________________________________________________________
lstm (LSTM)                  (None, 30, 50)            16200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 20373)             1039023   
Total params: 1,689,163
Trainable params: 1,689,163
Non-trainable params: 0
_________________________________________________________________


## Train model

In [17]:
# One TensorBoard run directory per execution, named by the current timestamp.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "PA_logs/fit/" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

# Train for 5 epochs in mini-batches of 128 windows, logging to TensorBoard.
model.fit(
    x=X,
    y=Y,
    batch_size=128,
    epochs=5,
    verbose=1,
    callbacks=[tensorboard_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1894a28b5e0>

## Result

In [18]:
# Score the model on the same X / Y arrays it was fitted on, so these numbers
# describe how well the training data is fit, not performance on unseen text.
loss, accuracy =  model.evaluate(x=X, y=Y)



## Save

In [19]:
# Persist the fitted tokenizer with pickle. The context manager guarantees the
# file is flushed and closed even if pickling raises, instead of leaking the handle.
with open('tokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [20]:
# Write the trained model to a single HDF5 file in the working directory.
model.save('Fantasy-LSTM-relu-adam.h5')

## Test model

In [21]:
# Reload the network from the saved HDF5 file; this rebinds `model` to the loaded copy.
model = load_model('Fantasy-LSTM-relu-adam.h5')

In [22]:
def generate_text(model, tokenizer, sequence_length, seed_text, num_generate_words, temperature):
    """Generate text by repeatedly sampling the model's next-word distribution.

    Args:
        model: Object whose `predict(batch, verbose=0)` returns, per sample, a
            probability vector indexed by token id.
        tokenizer: Fitted tokenizer providing `texts_to_sequences` and `index_word`.
        sequence_length: Number of token ids the model expects as input.
        seed_text: Prompt string the generation starts from.
        num_generate_words: How many words to sample.
        temperature: Positive sampling temperature. Probabilities are raised to
            the power `1 / temperature` and renormalized, so values below 1
            sharpen the distribution and values above 1 flatten it.

    Returns:
        The generated words joined by single spaces (the seed text is not included).

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be a positive number')

    # Encode the seed once and then extend the id list with each sampled id.
    # Re-tokenizing the growing string every step would repeat work and could
    # split or drop generated words that contain characters the tokenizer filters.
    context_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    output_text = []

    for _ in range(num_generate_words):
        # Left-pad / keep only the most recent `sequence_length` ids.
        pad_encoded = pad_sequences([context_ids], maxlen=sequence_length, truncating='pre')
        pred_distribution = model.predict(pad_encoded, verbose=0)[0]

        # Work in float64: normalized float32 probabilities can miss the
        # sum-to-one tolerance that np.random.choice enforces.
        scaled = np.power(np.asarray(pred_distribution, dtype=np.float64), 1.0 / temperature)
        # Id 0 is the padding value and has no entry in `index_word`; sampling
        # it would raise KeyError, so it is excluded before renormalizing.
        scaled[0] = 0.0
        new_pred_distribution = scaled / scaled.sum()

        pred_word_ind = int(np.random.choice(new_pred_distribution.size, p=new_pred_distribution))
        pred_word = tokenizer.index_word[pred_word_ind]

        context_ids.append(pred_word_ind)
        output_text.append(pred_word)

    return ' '.join(output_text)

In [23]:
# Prompt that the text generator continues from.
seed_text = 'The Minister of Magic, Rufus, addresses the wizarding media, stating that the Ministry will remain strong even as the demons gains strength.'

In [24]:
# Continue the prompt with 80 sampled words.
generated_text = generate_text(
    model=model,
    tokenizer=tokenizer,
    sequence_length=sequence_length,
    seed_text=seed_text,
    num_generate_words=80,
    temperature=0.9,
)

# Show the prompt followed by the generated continuation.
print(f'{seed_text} {generated_text}')

The Minister of Magic, Rufus, addresses the wizarding media, stating that the Ministry will remain strong even as the demons gains strength. of a willo the island or restates r.k. into the big night 's wife wearing a beeline and anvil in the stone that his feelings ring he likes this from a suv 's mission and george they have at any left but while most of the orcs before the king 's powerful jack will escape him thomas takes him to rescue the cage to cross the ship over his destruction and says that the western saito opens them anastasia who
