In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/1661-0.txt


# # **Downloading Dependencies**

In [31]:
! pip install tensorflow



In [32]:
import pandas as pd
import os
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Reading the file and processing the text**

In [33]:
file='/kaggle/input/dataset/1661-0.txt'
text=open(file).read().lower()

In [34]:
text = text.lower()

In [35]:
def remove_punctuation(input_string):
    # Create a translation table to remove punctuation
    translator = str.maketrans("", "", string.punctuation)

    # Apply the translation table to remove punctuation
    result_string = input_string.translate(translator)
    
    return result_string

In [36]:
text = remove_punctuation(text)
lst = str.split(text)

In [37]:
def remove_stopwords(lst):
    stop=stopwords.words('english')
    new_lst=[]
    for i in lst:
        if i not in stop:
            new_lst.append(i)
    return new_lst

In [38]:
lst = remove_stopwords(lst)

In [39]:
lemmatizer=nltk.stem.WordNetLemmatizer()
def lemmatzation(lst):
    new_lst=[]
    for i in lst:
        i=lemmatizer.lemmatize(i)
        new_lst.append(i)
    return new_lst



# **Tokenisation**

In [40]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([lst])
total_words = len(tokenizer.word_index) + 1

In [41]:
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [42]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [43]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [44]:
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))


# **Using LSTM**

In [45]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='relu'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 100)           1028400   
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 10284)             1552884   
                                                                 
Total params: 2731884 (10.42 MB)
Trainable params: 2731884 (10.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


# **Running epochs**

In [47]:
model.compile(loss='categorical_crossentropy'', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a5ba73a9fc0>

# **Function word generate**

In [52]:
def word_generate(seed_text, next_words):
    generated_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
        generated_text += " " + output_word
    return generated_text

In [53]:
word_generate("remarkable as being the scene",500)



'remarkable as being the scene jewel robbery hotel went back yard went back home body upon door door opened passage behind head face “it standing windows face man goose head way looking bed bed instant upon scent five upon face face even pile could heard heard hair upon advice picked whither though light slip light still dark floor little dark eyes face hair hair red purple plush old old dark eyes red hair said sherlock holmes sat beside beside patted one side twenty points made looking bed little blow him” light passage hurried face face face freckled standing running hard three small place—within side edge edge holmes—lord edge edge white edge small neck neck white white brown jet board year paper year keep house beyond red right right together uncertain body step body lay upon heart made absolutely fell everywhere face five face dried shadow faced looked upon affair upon dark saw dead dead way one floor even became shall see gets gets bottom certainly it” said holmes “if left anyone

#  **Downloading the weights in h5 format**

In [51]:
model.save_weights("lstm_weights.h5")