## Import Packages

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np

## Download From Kaggle


In [2]:
!pip install -q kaggle
from google.colab import files
files.upload()
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d 'xvivancos/star-wars-movie-scripts'

Saving kaggle.json to kaggle.json
Downloading star-wars-movie-scripts.zip to /content
  0% 0.00/158k [00:00<?, ?B/s]
100% 158k/158k [00:00<00:00, 61.3MB/s]


In [3]:
!unzip "/content/star-wars-movie-scripts.zip"

Archive:  /content/star-wars-movie-scripts.zip
  inflating: SW_EpisodeIV.txt        
  inflating: SW_EpisodeV.txt         
  inflating: SW_EpisodeVI.txt        
  inflating: wordcloud_masks/r2d2.png  
  inflating: wordcloud_masks/rebel alliance.png  
  inflating: wordcloud_masks/vader.png  
  inflating: wordcloud_masks/yoda.png  


## Tokenize Text

In [4]:
tokenizer = Tokenizer()

# Read the Episode IV script. The context manager closes the file handle as
# soon as the text is loaded instead of leaving it open in the kernel.
with open("/content/SW_EpisodeIV.txt") as script_file:
  data = script_file.read()

# One lowercase training line per line of the script file.
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one extra slot: this value is used both as the
# Embedding input_dim and as the number of output classes.
total_words = len(tokenizer.word_index) + 1

## From Text to Numbers

In [5]:
# Turn every script line into its growing token prefixes: a line tokenized as
# [a, b, c] contributes [a, b] and [a, b, c]. Lines with fewer than two tokens
# contribute nothing.
input_sequences = []
for line in corpus:
  token_list = tokenizer.texts_to_sequences([line])[0]
  input_sequences.extend(
      [token_list[:end] for end in range(2, len(token_list) + 1)]
  )

# Pad on the left ("pre") so every sequence has the length of the longest one
# and the token to predict always sits in the last column.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding="pre")
)

# Inputs are all columns but the last; the last column is the label, which is
# then one-hot encoded over the vocabulary.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

## Build the LSTM Model

In [6]:
from tensorflow.keras import regularizers

# Next-word prediction network: embedding -> two stacked bidirectional LSTMs
# -> regularized dense layer -> softmax over the whole vocabulary.
model = Sequential([
  # Maps each word index to a 100-dimensional vector; inputs are the padded
  # sequences without their last (label) column.
  Embedding(input_dim = total_words, output_dim = 100, input_length= max_sequence_len -1),
  # Every LSTM layer except the last must return the full sequence so the
  # next recurrent layer receives one vector per time step.
  Bidirectional(LSTM(150, return_sequences = True)),
  Dropout(0.3),
  Bidirectional(LSTM(96)),
  # Integer division: Dense expects an integer number of units, and
  # `total_words / 2` is a float in Python 3.
  Dense(total_words // 2, activation = "relu", kernel_regularizer= regularizers.l2(0.01)),
  # One output probability per vocabulary index.
  Dense(total_words, activation = "softmax")
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 113, 100)          284400    
                                                                 
 bidirectional (Bidirectiona  (None, 113, 300)         301200    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 113, 300)          0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 192)              304896    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1422)              274446    
                                                                 
 dense_1 (Dense)             (None, 2844)              4

## Train the Neural Network

In [None]:
# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy")
# Train for 50 passes over the prefix/next-word pairs, showing progress.
model.fit(x=xs, y=ys, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50

## Generate Text

In [None]:

def predict_next_words(seed_text, next_words):
  """Extend `seed_text` greedily, one predicted word at a time.

  Each step tokenizes the text generated so far, left-pads it to the model's
  input length, and appends the word with the highest predicted probability.

  Relies on the notebook-level `tokenizer`, `model` and `max_sequence_len`.

  Args:
    seed_text: Starting text for the generation.
    next_words: Number of words to append.

  Returns:
    The seed text followed by the generated words, separated by spaces. The
    same string is also printed.
  """
  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # verbose=0 avoids printing a progress bar for every generated word.
    probabilities = model.predict(padded, verbose=0)
    # The batch holds a single sequence; take its argmax as a plain int
    # rather than comparing against a one-element array.
    predicted_index = int(np.argmax(probabilities, axis=-1)[0])
    # Direct reverse lookup instead of scanning the whole vocabulary. An index
    # with no word (e.g. 0) yields an empty string, as before.
    output_word = tokenizer.index_word.get(predicted_index, "")
    seed_text += " " + output_word

  print(seed_text)
  return seed_text

In [None]:
# Seed for the generation and number of words to append.
# NOTE(review): the seed presumably mirrors the start of a script line;
# verify against the data file.
seed_text, next_words = '"1" "THREEPIO', 100
generated_text = predict_next_words(seed_text=seed_text, next_words=next_words)