In [1]:
import numpy as np
import pandas as pd
# Read data. 
movies_raw_df = pd.read_csv('wiki_movie_plots_deduped.csv')

movies_raw_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [2]:
movies_to_select = ((movies_raw_df['Genre'] == 'horror') &
                    # Restrict to American movies. 
                    (movies_raw_df['Origin/Ethnicity'] == 'American') &
                    # Only movies from 2000.
                    (movies_raw_df['Release Year'] > 1999))

In [3]:
horror_df = movies_raw_df[movies_to_select]['Plot']

horror_df.head()

13617    In November 1999, tourists and fans of The Bla...
13640    Matthew Van Helsing, the alleged descendant of...
13681    A small group of fervent Roman Catholics belie...
13731    Cotton Weary, now living in Los Angeles and th...
13763    Amy Mayfield, a student at a prestigious film ...
Name: Plot, dtype: object

In [4]:
horror_df.shape

(260,)

In [5]:
# Join all plots into a string.
horror_str = horror_df.str.cat(sep=' ')

In [6]:
import spacy

# Load language model. 
nlp = spacy.load('en', disable = ['parser', 'tagger', 'ner'])

In [7]:
def get_tokens(doc_text):
    # This pattern is a modification of the defaul filter from the
    # Tokenizer() object in keras.preprocessing.text. 
    # It just indicates which patters no skip.
    skip_pattern = '\r\n \n\n \n\n\n!"-#$%&()--.*+,-./:;<=>?@[\\]^_`{|}~\t\n\r '
    
    tokens = [token.text.lower() for token in nlp(doc_text) if token.text not in skip_pattern]
    
    return tokens

In [8]:
# Get tokens.
tokens = get_tokens(horror_str)
# Let us see the first tokens.
tokens[0:9]


['in', 'november', '1999', 'tourists', 'and', 'fans', 'of', 'the', 'blair']

In [9]:
# Compute the number of tokens list.
len(tokens) 

165871

In [10]:
len_0 = 25

tokens[0:len_0]

['in',
 'november',
 '1999',
 'tourists',
 'and',
 'fans',
 'of',
 'the',
 'blair',
 'witch',
 'project',
 'descend',
 'on',
 'the',
 'small',
 'town',
 'of',
 'burkittsville',
 'maryland',
 'where',
 'the',
 'film',
 'is',
 'set',
 'local']

In [11]:
tokens[len_0:len_0 + 1]

['resident']

In [12]:
train_len = len_0 + 1

text_sequences = []
for i in range(train_len, len(tokens)):
    # Construct sequence.
    seq = tokens[i - train_len: i]
    # Append.
    text_sequences.append(seq)

In [13]:
' '.join(text_sequences[0])

'in november 1999 tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident'

In [14]:
len(text_sequences[0])

26

In [15]:
for i in range(0, 5):
    print(' '.join(text_sequences[i]))
    print('-----')

in november 1999 tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident
-----
november 1999 tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident jeff
-----
1999 tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident jeff a
-----
tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident jeff a former
-----
and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident jeff a former psychiatric
-----


In [16]:
#import tensorflow as tf
#gpus = tf.config.experimental.list_physical_devices('GPU')
#for gpu in gpus:
  #tf.config.experimental.set_memory_growth(gpu, True)

In [17]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

tokenizer.fit_on_texts(text_sequences)

Using TensorFlow backend.


In [18]:
# Get numeric sequences.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [19]:
sequences[0]

[8,
 12586,
 12585,
 2397,
 2,
 12584,
 5,
 1,
 5558,
 630,
 2195,
 2927,
 20,
 1,
 450,
 157,
 5,
 12583,
 7487,
 42,
 1,
 117,
 7,
 362,
 231,
 2928]

In [20]:
tokenizer.index_word[8]

'in'

In [21]:
vocabulary_size = len(tokenizer.word_counts)

vocabulary_size

12586

In [22]:
# We store the sequences in a numpy array.
sequences = np.array(sequences)
sequences

array([[    8, 12586, 12585, ...,   362,   231,  2928],
       [12586, 12585,  2397, ...,   231,  2928,   297],
       [12585,  2397,     2, ...,  2928,   297,     4],
       ...,
       [   20,     4,  1551, ...,     1,    59,     5],
       [    4,  1551,  1684, ...,    59,     5,     6],
       [ 1551,  1684,    22, ...,     5,     6,   169]])

In [23]:
from keras.utils import to_categorical

# select all but last word indices.
X = sequences[:, :-1]
X

array([[    8, 12586, 12585, ...,     7,   362,   231],
       [12586, 12585,  2397, ...,   362,   231,  2928],
       [12585,  2397,     2, ...,   231,  2928,   297],
       ...,
       [   20,     4,  1551, ...,    22,     1,    59],
       [    4,  1551,  1684, ...,     1,    59,     5],
       [ 1551,  1684,    22, ...,    59,     5,     6]])

In [24]:
X.shape

(165845, 25)

In [25]:
seq_len = X.shape[1]
# select all last word indices.
y = sequences[:, -1]
y

array([2928,  297,    4, ...,    5,    6,  169])

In [26]:
# Convert to categorical (we add + 1 because Keras needs a placeholder).
y = to_categorical(y, num_classes=(vocabulary_size + 1))
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [27]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

def create_model(vocabulary_size, seq_len):
    
    model = Sequential()
    
    model.add(Embedding(input_dim=vocabulary_size, 
                        output_dim=seq_len, 
                        input_length=seq_len))
    
    model.add(LSTM(units=50, return_sequences=True))
    
    model.add(LSTM(units=50))
    
    model.add(Dense(units=50, activation='relu'))
    
    model.add(Dense(units=vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])
    
    model.summary()
    
    return model
# Let us create the model and see summary.
model = create_model(vocabulary_size=(vocabulary_size + 1), seq_len=seq_len)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            314675    
_________________________________________________________________
lstm (LSTM)                  (None, 25, 50)            15200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 12587)             641937    
Total params: 994,562
Trainable params: 994,562
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.fit(x=X, y=y, batch_size=128, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x26d01ad35e0>

In [30]:
# Get model metrics.
loss, accuracy =  model.evaluate(x=X, y=y)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Loss: 3.182206392288208
Accuracy: 0.34490036964416504


In [31]:
model.fit(x=X, y=y, batch_size=128, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x26d12eef310>

In [None]:
# Get model metrics.
loss, accuracy =  model.evaluate(x=X, y=y)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

In [32]:
model.fit(x=X, y=y, batch_size=512, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x26d12f0d580>

In [33]:
# Get model metrics.
loss, accuracy =  model.evaluate(x=X, y=y)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Loss: 2.2861976623535156
Accuracy: 0.4980553984642029


In [34]:
from pickle import dump

dump(tokenizer, open('tokenizer', 'wb'))

In [35]:
model.save('model.h5')


In [36]:
from keras.models import load_model

model = load_model('model.h5')


In [37]:
from keras.preprocessing.sequence import pad_sequences 

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    # List to store the generated words. 
    output_text = []
    # Set seed_text as input_text. 
    input_text = seed_text
    
    for i in range(num_gen_words):
        # Encode input text. 
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # Add if the input tesxt does not have length len_0.
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Do the prediction. Here we automatically choose the word with highest probability. 
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        # Convert from numeric to word. 
        pred_word = tokenizer.index_word[pred_word_ind]
        # Attach predicted word. 
        input_text += ' ' + pred_word
        # Append new word to the list. 
        output_text.append(pred_word)
        
    return ' '.join(output_text)

In [38]:
sample_text = horror_df.iloc[100][:383]
print(sample_text)

Officer Frank Williams (Steven Vidler) and his partner Blaine investigate an abandoned house, where they find a young woman with her eyes ripped out. A large figure with an axe then murders Blaine and Frank has his arm chopped off before he is able to shoot the attacker in the head. Afterwards, detectives find seven bodies in the house, all of which have had their eyes ripped out.


In [39]:
seed_text = sample_text[:190]
print(seed_text)

Officer Frank Williams (Steven Vidler) and his partner Blaine investigate an abandoned house, where they find a young woman with her eyes ripped out. A large figure with an axe then murders 


In [40]:
generated_text = generate_text(model=model, 
                               tokenizer=tokenizer,
                               seq_len=seq_len, 
                               seed_text=seed_text, 
                               num_gen_words=40)

print(seed_text + ' ' + generated_text + '...')

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Officer Frank Williams (Steven Vidler) and his partner Blaine investigate an abandoned house, where they find a young woman with her eyes ripped out. A large figure with an axe then murders  blaine and the remote founding fathers lying to retrieve her preston spins out the citizens abruptly hudson is arrested for henry hewitt a hound of iran she travels to the conclusion that she is not the same treatment mad tells...


In [41]:
seed_text = 'the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear'

In [42]:
generated_text = generate_text(model=model, 
                               tokenizer=tokenizer,
                               seq_len=seq_len, 
                               seed_text=seed_text, 
                               num_gen_words=80)

print(seed_text + ' ' + generated_text + '...')

the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear the parasite but quentin is rendered distraught from the hospital and finds a bottle of report a few years later the rest of the group are then a dream of the mirror prompting him about screen deucalion is sitting somewhere ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ......


In [43]:
seed_text = movies_raw_df[movies_raw_df['Genre'] == 'comedy']['Plot'].iloc[330]
print(seed_text)

Cocky college football star Francis Finnegan has his eye on the attractive Gloria van Dayham, as does his rival, Larry Stacey.
Francis gets a job in a department store owned by Stacey's father, where salesgirl June Cort develops an attraction to him. Finnegan proposes that Stacey's store sponsor a football team, which causes rival shop owner Whimple to do likewise. The team's head cheerleader, Mimi, falls for team mascot Joe, meanwhile, and everybody pairs off with the perfect partner after the big game.


In [44]:
def generate_text2(model, tokenizer, seq_len, seed_text, num_gen_words, temperature):
    
    output_text = []
    
    input_text = seed_text
    
    for i in range(num_gen_words):
        # Encode input text. 
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
         # Add if the input tesxt does not have length len_0.
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Get learned distribution.
        pred_distribution = model.predict(pad_encoded, verbose=0)[0]
        
        # Apply temperature transformation.
        new_pred_distribution = np.power(pred_distribution, (1 / temperature)) 
        new_pred_distribution = new_pred_distribution / new_pred_distribution.sum()
        
        # Sample from modified distribution.
        choices = range(new_pred_distribution.size)
 
        pred_word_ind = np.random.choice(a=choices, p=new_pred_distribution)
        
        # Convert from numeric to word. 
        pred_word = tokenizer.index_word[pred_word_ind]
        # Attach predicted word. 
        input_text += ' ' + pred_word
        # Append new word to the list. 
        output_text.append(pred_word)
        
    return ' '.join(output_text)

In [45]:
seed_text = 'the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear'

In [46]:
generated_text = generate_text2(model=model, 
                                tokenizer=tokenizer,
                                seq_len=seq_len, 
                                seed_text=seed_text, 
                                num_gen_words=80, 
                                temperature=0.9)

print(seed_text + ' ' + generated_text + ' ...')

the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear the parasite in the process ultimately then 've ca house explosion or enter the basement in jennifer 's involvement doris is dramatic fictions hybrid chokes ambushes the car ringing the spirit 's eventually friend zoe confronts laurel being finally a dream as screen bathory has been dreams that it is herself wrong they are in the sanctuary with entering sonja timmy eva notices a woman named logan taylor and abigail embrace to the family on the future at his house ...


In [47]:
generated_text = generate_text2(model=model, 
                                tokenizer=tokenizer,
                                seq_len=seq_len, 
                                seed_text=seed_text, 
                                num_gen_words=82, 
                                temperature=0.5)

print(seed_text + ' ' + generated_text + ' ...')

the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear the parasite but quentin and scott popular chase the cops eden making to rather by smiles and throws her off to blister they search and slaughter by letch where if g.m. g.m. g.m. g.m. has unknowingly revealed to sound an advantage to pursue the vatican and stabs her in the head and to find her and uses it who does n't edward brodus jeff to lift it out of the tub and are unable to evacuate them and the cannibals – other ...


In [48]:
generated_text = generate_text2(model=model, 
                                tokenizer=tokenizer,
                                seq_len=seq_len, 
                                seed_text=seed_text, 
                                num_gen_words=82, 
                                temperature = 0.1)

print(seed_text + ' ' + generated_text + ' ...')

the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear the parasite but quentin is rendered catatonic and murders the next scene katie confesses that he is suicidal and is violently treated the zombies but captain roman andrews twenty lamia the cage 's exit dying and snaps at the beach she is a psychopath necrophile and serial rapist he holds the creature in the basement where she finds that the rollins family the next day andrews promises to fix the containers which is knocked on the gun and kills her with the ...
