In [1]:
import pandas as pd

from nlp_mining import WordCleaner

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jgnsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jgnsa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:
# Load the base quote table; the first CSV column becomes the row index.
df = pd.read_csv('BASE.csv', index_col=0)
# Preview the first few rows as the cell output.
df.head()

Unnamed: 0,title,quote,ID,Year,Genre
0,10 things i hate about you,Who knocked up your sister?,tt0147800,1999,"Comedy,Drama,Romance"
1,10 things i hate about you,"I was watching you out there, before. I've nev...",tt0147800,1999,"Comedy,Drama,Romance"
2,10 things i hate about you,"You're 18, you don't know what you want. And y...",tt0147800,1999,"Comedy,Drama,Romance"
3,10 things i hate about you,"Ooh, see that, there. Who needs affection when...",tt0147800,1999,"Comedy,Drama,Romance"
4,10 things i hate about you,"Just 'cause you're beautiful, that doesn't mea...",tt0147800,1999,"Comedy,Drama,Romance"


In [15]:
# Load the EXPANSE table; the first CSV column becomes the row index.
# NOTE(review): judging by the preview this looks like one row per (quote, genre)
# pair -- confirm against whatever produced EXPANSE.csv.
# NOTE(review): `expanse` is not referenced by any later cell visible here.
expanse = pd.read_csv('EXPANSE.csv', index_col=0)
# Preview the first few rows as the cell output.
expanse.head()

Unnamed: 0,title,quote,ID,Year,Genre
0,10 things i hate about you,Who knocked up your sister?,tt0147800,1999,Comedy
1,10 things i hate about you,Who knocked up your sister?,tt0147800,1999,Drama
2,10 things i hate about you,Who knocked up your sister?,tt0147800,1999,Romance
0,10 things i hate about you,"I was watching you out there, before. I've nev...",tt0147800,1999,Comedy
1,10 things i hate about you,"I was watching you out there, before. I've nev...",tt0147800,1999,Drama


In [16]:
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import random
import io

## 1. Input Preprocessing

In [40]:
# All quotes from the base table, in row order
quotes = list(df.quote)

# Corpus: every quote prefixed by a single space, concatenated into one string
text = "".join(" " + quote for quote in quotes)
print("Corpus length:", len(text))

# Vocabulary: the distinct characters of the corpus in sorted order
chars = sorted(set(text))
print("Total chars:", len(chars))

# Lookup tables between a character and its position in the vocabulary
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Training windows: `maxlen` characters of context taken every `step`
# characters, each paired with the single character that follows it
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start : start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print("Number of sequences:", len(sentences))

Corpus length: 630844
Total chars: 78
Number of sequences: 210268


In [41]:
# Defining X and y
# x[i, t, c] is True when position t of window i holds vocabulary character c;
# y[i, c] is True when character c is the one that follows window i.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError. Both name the
# same boolean dtype, so the arrays are unchanged.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

## 2. Building the Model

In [44]:
# Single-layer LSTM over one-hot character windows, followed by a softmax
# over the vocabulary that scores the next character.
model = keras.Sequential()
model.add(keras.layers.InputLayer(input_shape=(maxlen, len(chars))))
model.add(layers.LSTM(128))
model.add(layers.Dense(len(chars), activation="softmax"))

# Targets are one-hot rows, hence categorical cross-entropy.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="categorical_crossentropy")

In [45]:
def sample(preds, temperature=1.0):
    """Sample one index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from the result with
    ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Scaling factor. Values below 1 sharpen the distribution,
            values above 1 flatten it. ``0`` selects the most probable index
            deterministically.

    Returns:
        The sampled index (a NumPy integer, as returned by ``np.argmax``).
    """
    preds = np.asarray(preds).astype("float64")
    if temperature == 0:
        # The limit temperature -> 0 is greedy decoding; handling it here
        # avoids a division by zero below.
        return np.argmax(preds)
    with np.errstate(divide="ignore"):
        # log(0) is deliberately -inf so zero-probability entries stay at zero;
        # only the warning is suppressed.
        scaled = np.log(preds) / temperature
    # Shifting by the maximum does not change the normalised result, but it
    # guarantees the largest term is exp(0) == 1, so a small temperature can no
    # longer underflow every term to 0 and produce a 0/0 (NaN) distribution.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## 3. Training the Model

In [46]:
epochs = 40
batch_size = 128

for epoch in range(epochs):
    # Fit one epoch per iteration so sample text can be printed between epochs.
    model.fit(x, y, batch_size=batch_size, epochs=1)
    print()
    print("Generating text after epoch: %d" % epoch)

    # One random seed window per epoch, shared by every diversity setting.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_end = start_index + maxlen
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("...Diversity:", diversity)

        sentence = text[start_index:seed_end]
        print('...Generating with seed: "' + sentence + '"')

        produced = []
        for _ in range(400):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for pos, ch in enumerate(sentence):
                x_pred[0, pos, char_indices[ch]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char
            produced.append(next_char)
        generated = "".join(produced)

        print("...Generated: ", generated)
        print()


...Generated:   you night, thent hole are fucking dashon plan! Potter open to a blackes. To the times... shand... and feel into the Can civatte, and you're going to servick the side. They make anything. And I don't make anyone wanna comes my neirk with rung so deal back of uncartre? This is your seeing like the bourno kik, phocwly getheag alright my drise was gonna not again thing over to buy the offest, but old

...Diversity: 1.2
...Generating with seed: " of perm maintenance that you are forbid"
...Generated:  . Wellohimle was the book of tking preeps thy write waathed. Tumblet. I got the comubbaly you always take ciss. Barconice? eye talking? if we have listen on the oictomat's inherition tires as much Sum forgevin's myself! I can't hel lay e, anesses, foutly wrongs  gh them posstr, , reach inseers to get horrib. She a makin' back I I am I? That's ago around. Ssigging. This willing I started him nigyes


Generating text after epoch: 31
...Diversity: 0.2
...Generating with seed: "es

In [47]:
# Write the fitted model to a fixed file name relative to the working directory.
# NOTE(review): the .h5 suffix presumably selects Keras' HDF5 format -- confirm
# this is still the desired format for the installed Keras version.
model.save('Base_Quote_Generator.h5')