# Training an LSTM model on a books dataset

First, we import the packages needed for this project.

In [2]:
from matplotlib import pyplot as plt
import numpy as np
import os
from pathlib import Path
import pickle
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

## Preparing Dataset

Initialize the project root, data, and model directories.

In [3]:
# Resolve the project root and the data/model folders derived from it.
# '__file__' is a quoted placeholder name, not the module variable: resolve()
# anchors it to the current working directory, so parents[1] is the directory
# one level above the working directory.
project_root = Path('__file__').resolve().parents[1]
data_dir = project_root.joinpath('data/')
model_dir = project_root.joinpath('models/bn_lstm')
model_dir.mkdir(parents=True, exist_ok=True)  # no error if it already exists

Get list of files in the data directory

In [4]:
# Collect every .txt file below the data directory (recursively) as a string path.
filenames = [str(path) for path in Path(data_dir).glob('**/*.txt')]
# sorted() returns a new list, so the result has to be assigned back; otherwise
# `filenames` keeps whatever order glob yielded and later slices of it are not
# reproducible. The path itself breaks ties between files of equal size.
filenames = sorted(filenames, key=lambda path: (os.path.getsize(path), path))
filenames  # display the list, smallest file first

['/home/shafquat/bangla-next-word-prediction/data/রুদ্র মুহম্মদ শহিদুল্লাহ/মৌলিক মুখোশ (১৯৯০).txt',
 '/home/shafquat/bangla-next-word-prediction/data/শক্তি চট্টোপাধ্যায়/মন্ত্রের মতন আছি স্থির (১৯৮০).txt',
 '/home/shafquat/bangla-next-word-prediction/data/শক্তি চট্টোপাধ্যায়/অস্ত্রের গৌরবহীন একা (১৯৭৫).txt',
 '/home/shafquat/bangla-next-word-prediction/data/শক্তি চট্টোপাধ্যায়/প্রভু, নষ্ট হয়ে যাই (১৯৭২).txt',
 '/home/shafquat/bangla-next-word-prediction/data/বাংলা মহাভারত/০৮. কর্ণপর্ব (সংস্কৃত).txt',
 '/home/shafquat/bangla-next-word-prediction/data/বাংলা মহাভারত/০৯. শল্যপর্ব (সংস্কৃত).txt',
 '/home/shafquat/bangla-next-word-prediction/data/শক্তি চট্টোপাধ্যায়/ঈশ্বর থাকেন জলে (১৯৭৫).txt',
 '/home/shafquat/bangla-next-word-prediction/data/বাংলা মহাভারত/০৭. দ্রোণপর্ব (সংস্কৃত).txt',
 '/home/shafquat/bangla-next-word-prediction/data/বাংলা মহাভারত/০৪. বিরাটপর্ব (সংস্কৃত).txt',
 '/home/shafquat/bangla-next-word-prediction/data/বাংলা মহাভারত/০৩. অরণ্যপর্ব (সংস্কৃত).txt',
 '/home/shafquat/bangla

Create a training and test `TextLineDataset` from a list of files.

In [5]:
def lines_from_files(paths):
    """Return a dataset that reads the given files one after another, line by line."""
    path_dataset = tf.data.Dataset.from_tensor_slices(paths)
    return path_dataset.flat_map(tf.data.TextLineDataset)


# First 50 files feed training; the following 10 are held out for testing.
train_data = lines_from_files(filenames[:50])
test_data = lines_from_files(filenames[50:60])

2022-03-30 21:30:07.031214: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-30 21:30:07.163734: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-30 21:30:07.164019: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-30 21:30:07.166270: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

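Tokenize the data: fit a `TextVectorization` layer on the training lines, then map every line to a fixed-length sequence of integer token ids.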


In [6]:
# Vocabulary cap handed to the text vectorizer as `max_tokens`.
MAX_WORDS = 20_000
# Number of token ids per vectorized line (`output_sequence_length`).
MAX_SEQUENCE_LENGTH = 100

In [7]:
# Text-to-integer layer: turns each line into MAX_SEQUENCE_LENGTH token ids,
# using a vocabulary capped at MAX_WORDS entries.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=MAX_WORDS,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)
# Learn the vocabulary from the raw training lines.
vectorize_layer.adapt(train_data)
# NOTE(review): this rebinds `train_data` from text lines to integer sequences,
# so running the cell a second time would call adapt() on vectorized data.
train_data = train_data.map(vectorize_layer)

In [12]:
# TODO: create X_train, y_train, etc. from the vectorized dataset (a tf.data dataset cannot be indexed with []).

TypeError: 'MapDataset' object is not subscriptable

## Training Model

Create a sequential LSTM model

In [87]:
# Sequential classifier over raw text lines:
# vectorize -> embed -> two stacked LSTMs -> softmax over MAX_WORDS + 1 classes.
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(
        input_dim=MAX_WORDS + 1,
        # NOTE(review): the embedding width is derived from the sequence length
        # here; presumably a placeholder value - confirm the intended dimension.
        output_dim=MAX_SEQUENCE_LENGTH - 1,
        # vectorize_layer hands this layer sequences of exactly
        # MAX_SEQUENCE_LENGTH ids, so the declared length has to match it
        # (it was MAX_SEQUENCE_LENGTH - 1 before).
        input_length=MAX_SEQUENCE_LENGTH),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(MAX_WORDS + 1, activation='softmax')
])
# NOTE(review): 'categorical_crossentropy' expects one-hot targets of width
# MAX_WORDS + 1; the targets are not built in the visible cells - confirm the
# encoding once y_train exists.
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

Create two callbacks: `EarlyStopping`, which stops training once `val_loss` has not improved for 10 consecutive epochs, and `ModelCheckpoint`, which saves the best model (lowest `val_loss`) to `model_dir`.

In [88]:
# Stop training once val_loss has gone 10 epochs without improving.
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    patience=10,
    min_delta=0,
)
# Write the model to model_dir each time val_loss reaches a new minimum.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=str(model_dir),
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [89]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
# (The cell that builds `model` already issues this same compile call.)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train with 20% of the samples held out for validation. The callbacks were
# created earlier but never handed to fit(), so early stopping and best-model
# checkpointing did not run; both monitor the val_loss produced by the split.
# NOTE(review): X_train, y_train and epochs are not defined in the visible part
# of the notebook - they must exist before this cell can run.
history = model.fit(X_train, y_train, epochs=epochs,
                    validation_split=0.2, verbose=2,
                    callbacks=[earlystop, checkpoint])