In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
file = '/content/drive/My Drive/proj/wiki_movie_plots_deduped.csv'

movies_raw_df = pd.read_csv(file)

movies_raw_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [0]:
movies_to_select = ((movies_raw_df['Genre'] == 'horror') &
                    # Restrict to Amerian movies. 
                    (movies_raw_df['Origin/Ethnicity'] == 'American') &
                    # Only movies from 2000.
                    (movies_raw_df['Release Year'] > 1999))

In [4]:
# plots = movies_raw_df['Plot']
plots = movies_raw_df[movies_to_select]['Plot']

print(plots.head())
print(plots.shape)

13617    In November 1999, tourists and fans of The Bla...
13640    Matthew Van Helsing, the alleged descendant of...
13681    A small group of fervent Roman Catholics belie...
13731    Cotton Weary, now living in Los Angeles and th...
13763    Amy Mayfield, a student at a prestigious film ...
Name: Plot, dtype: object
(260,)


In [0]:
text = plots.str.cat(sep=' ')

In [0]:
import spacy

nlp = spacy.load('en', disable = ['parser', 'tagger', 'ner'])

In [0]:
def get_tokens(doc_text):
    # This pattern is a modification of the defaul filter from the
    # Tokenizer() object in keras.preprocessing.text. 
    # It just indicates which patters no skip.
    skip_pattern = '\r\n \n\n \n\n\n!"-#$%&()--.*+,-./:;<=>?@[\\]^_`{|}~\t\n\r '
    
    tokens = [token.text.lower() for token in nlp(doc_text) if token.text not in skip_pattern]
    
    return tokens

In [0]:
# tokens = set()
# maxlen = 25+1
# text_sequences = []
# for i in range(len(plots)):
#   temp = get_tokens(plots[i])
#   for i in range(maxlen, len(temp)):
#     seq = temp[i - maxlen: i]
#     text_sequences.append(seq)
#   for token in temp:
#     tokens.add(token)

# tokens = list(tokens)
# print(tokens[0:9])
# print(len(tokens))

tokens = get_tokens(text)

In [0]:
train_len = 25 + 1

text_sequences = []

for i in range(train_len, len(tokens)):
    # Construct sequence.
    seq = tokens[i - train_len: i]
    # Append.
    text_sequences.append(seq)

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

tokenizer.fit_on_texts(text_sequences)

sequences = tokenizer.texts_to_sequences(text_sequences)

In [11]:
vocabulary_size = len(tokenizer.word_counts)

vocabulary_size

12586

In [0]:
sequences = np.array(sequences)

In [13]:
from tensorflow.keras.utils import to_categorical

# select all but last word indices.
X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=(vocabulary_size + 1))
print(X)
print(X.shape)

[[    8 12586 12585 ...     7   362   231]
 [12586 12585  2397 ...   362   231  2928]
 [12585  2397     2 ...   231  2928   297]
 ...
 [   20     4  1551 ...    22     1    59]
 [    4  1551  1684 ...     1    59     5]
 [ 1551  1684    22 ...    59     5     6]]
(165845, 25)


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

def create_model(vocabulary_size, seq_len):
    
  model = Sequential()
      
  model.add(Embedding(input_dim=vocabulary_size, 
                      output_dim=seq_len, 
                      input_length=seq_len))
      
  model.add(LSTM(units=50, return_sequences=True))
      
  model.add(LSTM(units=50))
      
  model.add(Dense(units=50, activation='relu'))
      
  model.add(Dense(units=vocabulary_size, activation='softmax'))
      
  model.compile(loss='categorical_crossentropy', 
                optimizer='adam', 
                metrics=['accuracy'])
      
  model.summary()
    
  return model

In [15]:
model = create_model(vocabulary_size=(vocabulary_size + 1), seq_len=X.shape[1])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            314675    
_________________________________________________________________
lstm (LSTM)                  (None, 25, 50)            15200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 12587)             641937    
Total params: 994,562
Trainable params: 994,562
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.fit(x=X, y=y, batch_size=128, epochs=700, verbose=1)

Epoch 1/700
Epoch 2/700
Epoch 3/700
Epoch 4/700
Epoch 5/700
Epoch 6/700
Epoch 7/700
Epoch 8/700
Epoch 9/700
Epoch 10/700
Epoch 11/700
Epoch 12/700
Epoch 13/700
Epoch 14/700
Epoch 15/700
Epoch 16/700
Epoch 17/700
Epoch 18/700
Epoch 19/700
Epoch 20/700
Epoch 21/700
Epoch 22/700
Epoch 23/700
Epoch 24/700
Epoch 25/700
Epoch 26/700
Epoch 27/700
Epoch 28/700
Epoch 29/700
Epoch 30/700
Epoch 31/700
Epoch 32/700
Epoch 33/700
Epoch 34/700
Epoch 35/700
Epoch 36/700
Epoch 37/700
Epoch 38/700
Epoch 39/700
Epoch 40/700
Epoch 41/700
Epoch 42/700
Epoch 43/700
Epoch 44/700
Epoch 45/700
Epoch 46/700
Epoch 47/700
Epoch 48/700
Epoch 49/700
Epoch 50/700
Epoch 51/700
Epoch 52/700
Epoch 53/700
Epoch 54/700
Epoch 55/700
Epoch 56/700
Epoch 57/700
Epoch 58/700
Epoch 59/700
Epoch 60/700
Epoch 61/700
Epoch 62/700
Epoch 63/700
Epoch 64/700
Epoch 65/700
Epoch 66/700
Epoch 67/700
Epoch 68/700
Epoch 69/700
Epoch 70/700
Epoch 71/700
Epoch 72/700
Epoch 73/700
Epoch 74/700
Epoch 75/700
Epoch 76/700
Epoch 77/700
Epoch 78

In [0]:
from pickle import dump

dump(tokenizer, open('tokenizer', 'wb'))

model.save('/content/drive/My Drive/proj/model.h5')

In [0]:
def generate_text2(model, tokenizer, seq_len, seed_text, num_gen_words, temperature):
    
    output_text = []
    
    input_text = seed_text
    
    for i in range(num_gen_words):
        # Encode input text. 
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
         # Add if the input tesxt does not have length len_0.
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Get learned distribution.
        pred_distribution = model.predict(pad_encoded, verbose=0)[0]
        
        # Apply temperature transformation.
        new_pred_distribution = np.power(pred_distribution, (1 / temperature)) 
        new_pred_distribution = new_pred_distribution / new_pred_distribution.sum()
        
        # Sample from modified distribution.
        choices = range(new_pred_distribution.size)
 
        pred_word_ind = np.random.choice(a=choices, p=new_pred_distribution)
        
        # Convert from numeric to word. 
        pred_word = tokenizer.index_word[pred_word_ind]
        # Attach predicted word. 
        input_text += ' ' + pred_word
        # Append new word to the list. 
        output_text.append(pred_word)
        
    return ' '.join(output_text)

In [0]:
l = len(plots)
for i in range(5):
  seed_text = plots.iloc[np.random.randint(l)][:150]
  print(seed_text)
  generated_text = generate_text2(model=model, 
                  tokenizer=tokenizer,
                  seq_len=X.shape[1], 
                  seed_text=seed_text, 
                  num_gen_words=60, 
                  temperature=0.9)
  print(seed_text + ' ' + generated_text + ' ...')
  generated_text = generate_text2(model=model, 
                  tokenizer=tokenizer,
                  seq_len=X.shape[1], 
                  seed_text=seed_text, 
                  num_gen_words=60, 
                  temperature=0.5)
  print(seed_text + ' ' + generated_text + ' ...')
  generated_text = generate_text2(model=model, 
                  tokenizer=tokenizer,
                  seq_len=X.shape[1], 
                  seed_text=seed_text, 
                  num_gen_words=60, 
                  temperature=0.1)
  print(seed_text + ' ' + generated_text + ' ...')
  print()