In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.utils import to_categorical

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the voice-line CSV from the Kaggle input directory into a DataFrame.
voice_lines = pd.read_csv(
    filepath_or_buffer="/kaggle/input/league-of-legends-voice-lines/voice_lines.csv"
)

# Show the first rows as a quick look at the columns.
voice_lines.head()

In [None]:
# Summary statistics for every column (include='all'), displayed as the cell output.
voice_lines.describe(include='all')

In [None]:
# Remove the rows flagged as not spoken, then discard the two helper columns.
#
# The frame is rebound instead of mutated in place, and the column guard plus
# errors="ignore" make the cell safe to re-run: once 'is_spoken' and
# 'Unnamed: 0' are gone, a second execution is a no-op instead of an error.
if "is_spoken" in voice_lines.columns:
    # Same row selection as dropping the index of `is_spoken == False`:
    # rows with any other value (including missing values) are kept.
    voice_lines = voice_lines[~(voice_lines["is_spoken"] == False)]
voice_lines = voice_lines.drop(columns=["Unnamed: 0", "is_spoken"], errors="ignore")

voice_lines.describe(include='all')

In [None]:
# Tiny corpus used to try out the vectorization layer.
example_text = [
    "this should be easy",
    "thats a mink if I've ever mink mink",
    "time to",
]

# Lower-case and strip punctuation, split on whitespace, emit integer token ids.
vectorize_layer = keras.layers.TextVectorization(
    output_mode="int",
    split="whitespace",
    standardize="lower_and_strip_punctuation",
)
vectorize_layer.adapt(example_text)

# Encode two short sentences with the vocabulary learned above and display the result.
example = vectorize_layer(["mink be easy", "easy be mink"])
example

In [None]:
# Fit the vectorizer's vocabulary on the voice lines.
vectorize_layer.adapt(voice_lines["voice_line"])

# One entry per token prefix (length >= 2) of every voice line.
voice_lines_tokenized = []

for line in voice_lines["voice_line"]:
    # Convert the token tensor to a plain list once per line; slicing the list
    # is then cheap, instead of slicing a tensor and converting it for every prefix.
    token_list = vectorize_layer(line).numpy().tolist()
    # Prefixes of length 2..len(token_list), in increasing length order.
    voice_lines_tokenized.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Vocabulary size as reported by the layer.
total_words = len(vectorize_layer.get_vocabulary())

In [None]:
# Display the first ten token prefixes as a sanity check of the tokenization step.
voice_lines_tokenized[0:10]

In [None]:
from keras.preprocessing.sequence import pad_sequences
def generate_padded_sequences(input_sequences):
    """Pad token sequences to a common length and split them into inputs and targets.

    Args:
        input_sequences: list of integer token sequences.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: every column of the
        padded matrix except the last, the last column one-hot encoded over
        ``total_words`` classes, and the length of the longest input sequence.

    Note:
        Reads the module-level ``total_words`` for the number of classes.
    """
    max_sequence_len = max(len(sequence) for sequence in input_sequences)

    # padding='pre' puts the fill values in front, so the final token of every
    # sequence always sits in the last column.
    padded = np.array(
        pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
    )

    predictors = padded[:, :-1]
    label = to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, label, max_sequence_len

# Turn the token prefixes into model inputs, one-hot targets and the padded length.
predictors, label, max_sequence_len = generate_padded_sequences(
    input_sequences=voice_lines_tokenized
)

# Display the input matrix.
predictors

In [None]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    The network is Embedding -> LSTM -> Dropout -> softmax Dense, compiled with
    categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: length of the padded sequences including the target
            token; the model input length is ``max_sequence_len - 1``.
        total_words: vocabulary size; used as the embedding input dimension
            and as the number of output classes.
        embedding_dim: size of each embedding vector (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout rate applied after the LSTM (default 0.1).

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    input_len = max_sequence_len - 1
    model = keras.Sequential()

    # Input embedding layer.
    # NOTE(review): `input_length` is reported as deprecated in newer Keras
    # releases — confirm against the installed version.
    model.add(keras.layers.Embedding(total_words, embedding_dim, input_length=input_len))

    # Recurrent hidden layer followed by dropout.
    model.add(keras.layers.LSTM(lstm_units))
    model.add(keras.layers.Dropout(dropout_rate))

    # Output layer: one softmax score per vocabulary entry.
    model.add(keras.layers.Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Instantiate the network for the padded length and vocabulary size, then print its layers.
model = create_model(max_sequence_len=max_sequence_len, total_words=total_words)
model.summary()

In [None]:
# Train for 100 epochs in batches of 3000 samples.
# verbose=2 prints one line per epoch; the previous value (5) is not one of the
# verbosity modes documented for Model.fit ("auto", 0, 1, 2).
# NOTE(review): with verbose=5 the per-epoch loss appeared not to be printed — confirm.
model.fit(predictors, label, batch_size=3000, epochs=100, verbose=2)

In [None]:
from tensorflow.keras.models import load_model
# Persist the trained model (the .h5 extension selects the HDF5 format).
# NOTE(review): '/path_to_model/' looks like a placeholder directory — confirm the
# intended output location (the first cell's comments name /kaggle/working/ as the
# directory whose contents are preserved as output).
model.save('/path_to_model/model.h5')

In [1]:
def generate_text(seed_text, next_words, model, max_sequence_len, top_k=5):
    """Extend ``seed_text`` by sampling ``next_words`` words from ``model``.

    For every new word the current text is vectorized, pre-padded to the model
    input length, scored by the model, and one of the ``top_k`` highest-scoring
    vocabulary entries is picked uniformly at random and appended.

    Args:
        seed_text: starting text.
        next_words: number of words to append.
        model: trained model exposing ``predict``; expected to return one score
            per vocabulary entry.
        max_sequence_len: padded sequence length used in training; the model
            input length is ``max_sequence_len - 1``.
        top_k: number of highest-scoring candidates to sample from (default 5,
            the previously hard-coded value). Clamped to the number of
            non-reserved output classes.

    Returns:
        The extended text, title-cased.

    Note:
        Uses the module-level ``vectorize_layer`` and ``pad_sequences``.
    """
    # The vocabulary is constant during generation; fetch it once instead of
    # rebuilding the list for every generated word.
    vocabulary = vectorize_layer.get_vocabulary()

    for _ in range(next_words):
        token_list = vectorize_layer(seed_text)
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # verbose=0 stops Keras from printing a progress bar for every word.
        # np.array(...) makes a private float copy that is safe to modify below.
        prediction = np.array(model.predict(token_list, verbose=0)[0], dtype=float)

        # With output_mode="int", TextVectorization reserves index 0 for padding
        # and index 1 for out-of-vocabulary tokens; never emit those as words.
        prediction[:2] = -np.inf

        # Clamp so argpartition stays in bounds and masked entries are never chosen.
        k = min(top_k, prediction.shape[0] - 2)
        if k < 1:
            break

        candidates = np.argpartition(prediction, -k)[-k:]
        word_index = candidates[np.random.choice(candidates.shape[0])]
        seed_text += " " + vocabulary[word_index]

    return seed_text.title()

In [3]:
# Generate 15 words after the seed "world" and print the result.
# Requires `model` and `max_sequence_len` from the earlier cells in this kernel session.
print(
    generate_text(
        seed_text="world",
        next_words=15,
        model=model,
        max_sequence_len=max_sequence_len,
    )
)

NameError: name 'model' is not defined