In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.utils import to_categorical

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/league-of-legends-voice-lines/voice_lines.csv
/kaggle/input/league-of-legends-voice-lines/LICENSE
/kaggle/input/league-of-legends-voice-lines/README.md
/kaggle/input/league-of-legends-voice-lines/utils/all_voice_lines.csv
/kaggle/input/league-of-legends-voice-lines/utils/Wiki Scraping.ipynb
/kaggle/input/league-of-legends-voice-lines/utils/.ipynb_checkpoints/Wiki Scraping-checkpoint.ipynb


In [2]:
voice_lines = pd.read_csv("/kaggle/input/league-of-legends-voice-lines/voice_lines.csv")

voice_lines.head()

Unnamed: 0.1,Unnamed: 0,champion,voice_line,is_spoken
0,0,Aatrox,"Now, hear the silence of annihilation!",True
1,1,Aatrox,Let me end this!,True
2,2,Aatrox,"Pantheon, I see you! Like a lamprey you attach...",True
3,3,Aatrox,"Pantheon! I will crush your godhood, I will ev...",True
4,4,Aatrox,Targon sends an aspect against me? Prepare you...,True


In [3]:
voice_lines.describe(include='all')

Unnamed: 0.1,Unnamed: 0,champion,voice_line,is_spoken
count,28904.0,28904,28904,28904
unique,,158,28898,2
top,,Kayn,This power is mine to command.,True
freq,,864,2,26889
mean,18923.205785,,,
std,11698.44545,,,
min,0.0,,,
25%,8213.75,,,
50%,18205.5,,,
75%,29397.25,,,


In [4]:
voice_lines.drop(voice_lines[voice_lines.is_spoken == False].index, inplace=True)
voice_lines.drop(['Unnamed: 0', 'is_spoken'], axis=1, inplace=True)

voice_lines["lines_augmented"] = "lname" + voice_lines["champion"] + " " + voice_lines["voice_line"] + " " + "endlstop"

voice_lines.describe(include='all')

Unnamed: 0,champion,voice_line,lines_augmented
count,26889,26889,26889
unique,156,26883,26887
top,Kayn,Rhaast.,lnameKayn Rhaast. endlstop
freq,838,2,2


In [5]:
example_text = ["this should be easy", "thats a mink if I've ever mink mink", "time to"]

vectorize_layer = keras.layers.TextVectorization(standardize="lower_and_strip_punctuation",
                                         split="whitespace",
                                         output_mode="int")

vectorize_layer.adapt(example_text)

example = vectorize_layer(["mink be easy", "easy be mink"])
example

2022-07-20 04:51:36.609341: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-07-20 04:51:36.723811: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[ 2, 12, 11],
       [11, 12,  2]])>

In [6]:
vectorize_layer.adapt(voice_lines["lines_augmented"])

voice_lines_tokenized = []

for line in voice_lines["lines_augmented"]:
    token_list = vectorize_layer(line)
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        voice_lines_tokenized.append(n_gram_sequence.numpy().tolist())
total_words = len(vectorize_layer.get_vocabulary())

In [7]:
voice_lines_tokenized[0:10]

[[160, 47],
 [160, 47, 250],
 [160, 47, 250, 3],
 [160, 47, 250, 3, 705],
 [160, 47, 250, 3, 705, 10],
 [160, 47, 250, 3, 705, 10, 1483],
 [160, 47, 250, 3, 705, 10, 1483, 2],
 [160, 112],
 [160, 112, 11],
 [160, 112, 11, 147]]

In [8]:
from keras.preprocessing.sequence import pad_sequences
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(voice_lines_tokenized)

predictors

array([[   0,    0,    0, ...,    0,    0,  160],
       [   0,    0,    0, ...,    0,  160,   47],
       [   0,    0,    0, ...,  160,   47,  250],
       ...,
       [   0,    0,    0, ...,  781,   15, 2514],
       [   0,    0,    0, ...,   15, 2514,    8],
       [   0,    0,    0, ..., 2514,    8,  428]], dtype=int32)

In [9]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = keras.Sequential()
    
    # Add Input Embedding Layer
    model.add(keras.layers.Embedding(total_words, 10, input_length=input_len, mask_zero=True))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(keras.layers.LSTM(100))
    model.add(keras.layers.Dropout(0.1))
    
    # Add Output Layer
    model.add(keras.layers.Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 47, 10)            127140    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 12714)             1284114   
Total params: 1,455,654
Trainable params: 1,455,654
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.fit(predictors, label, batch_size=2000, epochs=100, verbose=5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f81f4ec30d0>

In [11]:
from tensorflow.keras.models import load_model
model.save('/path_to_model/model.h5')

In [12]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = vectorize_layer(seed_text)
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        prediction = model.predict(token_list)[0]
                
        partition = np.argpartition(prediction, -5)[-5:]
        word_index = partition[np.random.choice(partition.shape[0], 1, replace=False)][0]
        seed_text += " "+vectorize_layer.get_vocabulary()[word_index]
        
    return seed_text.title()

In [13]:
print (generate_text("lnameahri this", 15, model, max_sequence_len))

Lnameahri This World Has A Little Bit To A Good Endlstop But Its Not The Best Side


In [14]:
print (generate_text("lnameaatrox this", 15, model, max_sequence_len))

Lnameaatrox This Will Be Judged In My Blade The Truth And A God Endlstop Endlstop And I
