<a href="https://colab.research.google.com/github/Mfiso1/Recurrent-Neural-Networks/blob/main/rnn_lstm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# 📘 Author Prediction Bot - LSTM (Single Folder)
# ============================================

# 1️⃣ Install necessary libraries
!pip install tensorflow keras nltk

import os
import re
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Fetch the NLTK "punkt" tokenizer data (no-op when it is already present).
# NOTE(review): none of the cells visible here call into nltk — confirm this download
# is still needed before keeping it.
nltk.download('punkt')





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# 1️⃣ Path to your txt file(s)
data_path = "/content"
# os.listdir() returns entries in arbitrary order. The row order of `df` carries
# through to every later step, so sort the listing to keep runs reproducible.
txt_files = sorted(f for f in os.listdir(data_path) if f.endswith(".txt"))

# 2️⃣ Read txt files and extract author from filename
# The author is whatever follows the last "_by_" in the file name; a name that
# contains no "_by_" falls back to the whole stem.
texts = []
authors = []

for file_name in txt_files:
    file_path = os.path.join(data_path, file_name)
    # errors="ignore" silently drops bytes that are not valid UTF-8.
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        texts.append(f.read())
    # Strip only the trailing extension; str.replace(".txt", "") would also remove
    # any ".txt" that happens to appear in the middle of the name.
    author = os.path.splitext(file_name)[0].split("_by_")[-1]
    authors.append(author)

df = pd.DataFrame({"author": authors, "text": texts})
print("Dataset shape:", df.shape)
df.head()





Dataset shape: (20, 2)


Unnamed: 0,author,text
0,Charles_Dickens,The Project Gutenberg EBook of Great Expectati...
1,Oscar_Wilde,"The Project Gutenberg eBook, The Importance of..."
2,The_Brothers_Grimm,The Project Gutenberg EBook of Grimms’ Fairy T...
3,Arthur_Conan_Doyle,Project Gutenberg's The Adventures of Sherlock...
4,Jane_Austen,"The Project Gutenberg EBook of Emma, by Jane A..."


In [None]:
# 3️⃣ Split text into chunks (~200 words)
def chunk_text(text, chunk_size=200):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    A "word" is any maximal run of non-whitespace characters. Words inside a
    chunk are re-joined with single spaces, so the original whitespace
    (newlines, tabs, repeated spaces) is not preserved.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in document order. The final chunk may hold
        fewer than ``chunk_size`` words. Empty or whitespace-only ``text``
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Without this guard a negative size makes range() empty, silently
    # returning [] and discarding the whole text.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = re.findall(r'\S+', text)
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Expand every document into (author, chunk) pairs, keeping document order and,
# within a document, chunk order.
author_chunk_pairs = [
    (doc_author, piece)
    for doc_author, doc_text in zip(df["author"], df["text"])
    for piece in chunk_text(doc_text)
]

# Parallel lists: one entry per chunk, the author repeated for each of its chunks.
chunked_texts = [piece for _, piece in author_chunk_pairs]
chunked_authors = [doc_author for doc_author, _ in author_chunk_pairs]

df_chunks = pd.DataFrame({"author": chunked_authors, "text": chunked_texts})
print("Chunked dataset shape:", df_chunks.shape)
df_chunks.head()


Chunked dataset shape: (12102, 2)


Unnamed: 0,author,text
0,Charles_Dickens,The Project Gutenberg EBook of Great Expectati...
1,Charles_Dickens,of them (for their days were long before the d...
2,Charles_Dickens,raw afternoon towards evening. At such a time ...
3,Charles_Dickens,"water, and smothered in mud, and lamed by ston..."
4,Charles_Dickens,"man, licking his lips, “what fat cheeks you ha..."


In [None]:
# 4️⃣ Encode author labels
# Author names -> integer class ids -> one-hot rows (one column per author).
label_encoder = LabelEncoder()
author_class_ids = label_encoder.fit_transform(df_chunks["author"])
y = to_categorical(author_class_ids)

# 5️⃣ Tokenize text
# Vocabulary is capped by num_words; words outside it map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
chunk_corpus = df_chunks["text"]
tokenizer.fit_on_texts(chunk_corpus)
token_id_sequences = tokenizer.texts_to_sequences(chunk_corpus)
# Pad or cut every sequence to exactly 200 ids, padding and truncating at the end.
X = pad_sequences(
    token_id_sequences,
    maxlen=200,
    padding="post",
    truncating="post",
)

print("X shape:", X.shape, "y shape:", y.shape)

X shape: (12102, 200) y shape: (12102, 16)


In [None]:
# 6️⃣ Build LSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input

model = Sequential([
    # Explicit input layer: replaces Embedding's `input_length` argument (deprecated
    # in recent Keras releases) and lets model.summary() report real shapes.
    # The sequence length is read from X so it cannot drift from the padded data.
    Input(shape=(X.shape[1],)),
    # input_dim follows the tokenizer's vocabulary cap. mask_zero=True makes the
    # LSTM skip the id-0 padding that pad_sequences appends; without it a short
    # text is followed by a long run of padding steps before the final state is read.
    Embedding(input_dim=tokenizer.num_words, output_dim=128, mask_zero=True),
    LSTM(128),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    # One softmax unit per author (columns of the one-hot label matrix).
    Dense(y.shape[1], activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()




In [None]:
# 7️⃣ Train model
# Keras' `validation_split` holds out the LAST fraction of the rows, before any
# shuffling. The rows here are grouped by source file, so that hold-out would contain
# only the trailing files' chunks and the validation metrics would be meaningless.
# Use a shuffled hold-out, stratified by author, instead.
class_ids = np.argmax(y, axis=1)  # recover integer labels from the one-hot rows
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=class_ids, random_state=42
)

history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=16,
    validation_data=(X_val, y_val),
)

# 8️⃣ Test prediction
def predict_author(sample_text):
    """Return the author label the trained model considers most likely for a text.

    The text goes through the same pipeline as the training chunks: the fitted
    ``tokenizer`` maps it to ids, and the ids are padded or truncated to 200.

    Args:
        sample_text: A single string to classify.

    Returns:
        The author label (as stored in ``label_encoder``) with the highest
        predicted probability.
    """
    seq = tokenizer.texts_to_sequences([sample_text])
    # truncating="post" matches training; the pad_sequences default ("pre") would
    # keep the END of an over-long text while training kept the beginning.
    seq = pad_sequences(seq, maxlen=200, padding="post", truncating="post")
    pred = model.predict(seq)
    # `pred` holds one row of class probabilities; argmax gives the class index.
    return label_encoder.inverse_transform([np.argmax(pred)])[0]

# Smoke test: classify one hand-written sentence and show the predicted author.
sample = "This play explores the conflicts between duty, morality, and societal expectations."
predicted_author = predict_author(sample)
print("Prediction:", predicted_author)

Epoch 1/5
[1m606/606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 163ms/step - accuracy: 0.2174 - loss: 2.3055 - val_accuracy: 0.0136 - val_loss: 4.2000
Epoch 2/5
[1m606/606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 169ms/step - accuracy: 0.2676 - loss: 2.0992 - val_accuracy: 0.1330 - val_loss: 6.5769
Epoch 3/5
[1m519/606[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m13s[0m 154ms/step - accuracy: 0.3288 - loss: 1.8923