<a href="https://colab.research.google.com/github/GlassesNoGlasses/TFProjects/blob/light-novel-generator/projects/text/light-novel-text-generation/LightNovelGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# clone github repo

!git clone https://github.com/GlassesNoGlasses/TFProjects.git

Cloning into 'TFProjects'...
remote: Enumerating objects: 1816, done.[K
remote: Counting objects: 100% (1816/1816), done.[K
remote: Compressing objects: 100% (1530/1530), done.[K
remote: Total 1816 (delta 301), reused 1761 (delta 277), pack-reused 0[K
Receiving objects: 100% (1816/1816), 22.93 MiB | 24.82 MiB/s, done.
Resolving deltas: 100% (301/301), done.


In [2]:
# imports

import tensorflow as tf
import numpy as np
import pandas as pd
import re

In [3]:
# fetch csv file data

file_path = "/content/TFProjects/data/csv/light-novel-titles.csv"

df = pd.read_csv(file_path)

df

Unnamed: 0,titles,descriptions,genres,links
0,Thereafter of an Exiled Magician ~Somehow My E...,"Lain, a reincarnated person, lives his life as...","['Action', 'Adventure', 'Ecchi', 'Fantasy', 'H...",https://www.novelupdates.com/series/thereafter...
1,Mahoutsukai no Konyakusha (LN),I who was reincarnated into a world of sword a...,"['Action', 'Drama', 'Fantasy', 'Harem', 'Matur...",https://www.novelupdates.com/series/mahoutsuka...
2,Exiled Prince Without Skills: Infinite Growth ...,"Henry, born in a royal family without skills, ...","['Action', 'Adventure', 'Fantasy', 'Horror', '...",https://www.novelupdates.com/series/exiled-pri...
3,Reincarnation Of The Strongest Sword God,"Starting over once more, he has entered this “...","['Action', 'Fantasy', 'Martial Arts', 'School ...",https://www.novelupdates.com/series/reincarnat...
4,Hard Work Will Never Betray Me (LN),A youth whose effort wasn’t rewarded lost his ...,"['Action', 'Adventure', 'Fantasy']",https://www.novelupdates.com/series/hard-work-...
...,...,...,...,...
1361,Slayers,"""Slayers follows the adventures of teenage sor...","['Action', 'Fantasy', 'Shounen', 'Supernatural']",https://www.novelupdates.com/series/slayers/
1362,No. 6,"The year is 2013, and it's the city of No. 6: ...","['Action', 'Adventure', 'Fantasy', 'School Life']",https://www.novelupdates.com/series/no-6/
1363,Kidou Senshi Gundam High-Streamer,Originally published in Animage magazine in th...,"['Comedy', 'Shounen Ai']",https://www.novelupdates.com/series/kidou-sens...
1364,Oda Nobuna no Yabou,"Suddenly finding himself in the Sengoku era, r...","['Action', 'Adventure', 'Comedy', 'Fantasy', '...",https://www.novelupdates.com/series/oda-nobuna...


In [4]:
# Extract title and descriptions from data

column_names = list(df)

print(column_names)

df = df[['titles', 'descriptions']]
df

['titles', 'descriptions', 'genres', 'links']


Unnamed: 0,titles,descriptions
0,Thereafter of an Exiled Magician ~Somehow My E...,"Lain, a reincarnated person, lives his life as..."
1,Mahoutsukai no Konyakusha (LN),I who was reincarnated into a world of sword a...
2,Exiled Prince Without Skills: Infinite Growth ...,"Henry, born in a royal family without skills, ..."
3,Reincarnation Of The Strongest Sword God,"Starting over once more, he has entered this “..."
4,Hard Work Will Never Betray Me (LN),A youth whose effort wasn’t rewarded lost his ...
...,...,...
1361,Slayers,"""Slayers follows the adventures of teenage sor..."
1362,No. 6,"The year is 2013, and it's the city of No. 6: ..."
1363,Kidou Senshi Gundam High-Streamer,Originally published in Animage magazine in th...
1364,Oda Nobuna no Yabou,"Suddenly finding himself in the Sengoku era, r..."


In [5]:
# get only long descriptions

df.descriptions.apply(lambda x: x.strip())
df['desc_word_count'] = df.descriptions.apply(lambda x: len(str(x).split()))

text_df = df.descriptions[df['desc_word_count'] > 10]
text = text_df.to_numpy()
text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['desc_word_count'] = df.descriptions.apply(lambda x: len(str(x).split()))


array(['Lain, a reincarnated person, lives his life as an 「Anti-Demon Aristocrat」 who is responsible for defeating demons in the frontier.At the age of 15, his engagement to the princess, a privilege of the Anti-Demon Aristocrats, is supposed to be announced, but he is told to break the engagement and is also stripped of his status as a member of the anti-demon aristocracy.\nCasted out not only by his country, but also by his adoptive-father, Lain leaves the royal capital and uses wind magic to fly through the air to a faraway village.\nFor some reason, Lain decides to live in the village, where only young girls live, and works with the girls to strengthen the village, teach them magic, and enjoy the freedom he has never had before…\nLain has escaped his black environment and is enjoying the slow life, but Lain’s power is affecting the world――!',
       'I who was reincarnated into a world of sword and magic. The revived Demon King, the Hero chosen by the holy sword─────in the middle o

In [6]:
# tokenize and parse descriptions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(text)

In [7]:
# create input texts for model

input_sequences = []
for title in text:
  # convert each description into a sequence
  token_list = tokenizer.texts_to_sequences([title])[0]
  for i in range(1, len(token_list)):
    # append n-length tokens to input sequence
    n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)


In [8]:
# padding sequences

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [9]:
# create predictors and labels
total_words = len(tokenizer.word_index) + 1

xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

labels

array([   2,  100,  135, ..., 2079,   56, 3314], dtype=int32)

In [10]:
# creating the model
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

model = Sequential([
  layers.Embedding(total_words, 95, input_length=max_sequence_len -1),
  layers.Bidirectional(keras.layers.LSTM(128)),
  layers.Dense(total_words, activation='softmax'),
])

model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 497, 95)           1424620   
                                                                 
 bidirectional (Bidirection  (None, 256)               229376    
 al)                                                             
                                                                 
 dense (Dense)               (None, 14996)             3853972   
                                                                 
Total params: 5507968 (21.01 MB)
Trainable params: 5507968 (21.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# training model

epochs = 40

history = model.fit(
  xs,
  ys,
  epochs=epochs,
  verbose=1
)