<a href="https://colab.research.google.com/github/Mfiso1/Recurrent-Neural-Networks/blob/main/rnn_lstm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import os                           # Provides functions to interact with the operating system
import re                           # Allows regular expression operations for pattern matching and text cleaning
import numpy as np                  # Numerical computing library for arrays and mathematical operations
import pandas as pd                 # Data manipulation and analysis library (DataFrames, CSV/Excel I/O)
import nltk                         # Natural Language Toolkit for text processing and tokenization
from sklearn.model_selection import train_test_split   # Splits datasets into training and testing sets
from sklearn.preprocessing import LabelEncoder        # Converts categorical labels into numeric form
from tensorflow.keras.preprocessing.text import Tokenizer        # Converts text into sequences of integers for ML models
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Pads sequences to a fixed length for uniform input
from tensorflow.keras.utils import to_categorical               # Converts numeric labels into one-hot encoded vectors
from tensorflow.keras.models import Sequential                  # Keras class for building sequential neural network models
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout  # Layers: Embedding (word vectors), LSTM (sequence learning), Dense (fully connected), Dropout (regularization)


# Spark
from pyspark.sql import SparkSession          # Entry point for using Spark SQL and DataFrames in PySpark
from pyspark.sql.functions import input_file_name, udf  # input_file_name: gets the source file path; `udf` defines custom user functions
from pyspark.sql.types import StringType     # Defines the data type of columns when using UDFs or creating DataFrames -  in this instance -  string type

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
# Initializing Spark
spark = SparkSession.builder \
    .appName("AuthorPrediction") \
    .getOrCreate()


In [28]:
# Clone repo
!git clone https://github.com/Mfiso1/Recurrent-Neural-Networks.git
data_path = "Recurrent-Neural-Networks/text_files"  # folder name

# Read all .txt files into Spark DataFrame
df = spark.read.text(f"{data_path}/*.txt").withColumnRenamed("value", "text") \
    .withColumn("file_path", input_file_name())


fatal: destination path 'Recurrent-Neural-Networks' already exists and is not an empty directory.


In [29]:
df.show(5)   # Displays the first 5 rows in the DataFrame

+--------------------+--------------------+
|                text|           file_path|
+--------------------+--------------------+
|Project Gutenberg...|file:///content/R...|
|                    |file:///content/R...|
|This eBook is for...|file:///content/R...|
|no restrictions w...|file:///content/R...|
|under the terms o...|file:///content/R...|
+--------------------+--------------------+
only showing top 5 rows



In [30]:
# Show all unique file paths
df.select("file_path").distinct().show(truncate=False)


+----------------------------------------------------------------------------------------------------------------+
|file_path                                                                                                       |
+----------------------------------------------------------------------------------------------------------------+
|file:///content/Recurrent-Neural-Networks/text_files/Emma_by_Jane_Austen.txt                                    |
|file:///content/Recurrent-Neural-Networks/text_files/A_Tale_of_Two_Cities_by_Charles_Dickens.txt                |
|file:///content/Recurrent-Neural-Networks/text_files/Great_Expectations_by_Charles_Dickens.txt                  |
|file:///content/Recurrent-Neural-Networks/text_files/The_Adventures_of_Sherlock_Holmes_by_Arthur_Conan_Doyle.txt|
|file:///content/Recurrent-Neural-Networks/text_files/Ulysses_by_James_Joyce.txt                                 |
|file:///content/Recurrent-Neural-Networks/text_files/Adventures_of_Huckleberry_

In [22]:
# Extract author from filename
def extract_author(path):
    return re.findall(r'_by_(.*).txt', path)[0]

author_udf = udf(extract_author, StringType())
df = df.withColumn("author", author_udf("file_path"))

# Optional: clean text, remove punctuation
def clean_text(text):
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())

clean_udf = udf(clean_text, StringType())
df = df.withColumn("clean_text", clean_udf("text"))

df.show(5)


+--------------------+--------------------+---------------+--------------------+
|                text|           file_path|         author|          clean_text|
+--------------------+--------------------+---------------+--------------------+
|Project Gutenberg...|file:///content/R...|Alexandre_Dumas|project gutenberg...|
|                    |file:///content/R...|Alexandre_Dumas|                    |
|This eBook is for...|file:///content/R...|Alexandre_Dumas|this ebook is for...|
|no restrictions w...|file:///content/R...|Alexandre_Dumas|no restrictions w...|
|under the terms o...|file:///content/R...|Alexandre_Dumas|under the terms o...|
+--------------------+--------------------+---------------+--------------------+
only showing top 5 rows



In [23]:
# Show all unique authors
df.select("author").distinct().show(truncate=False)

+------------------------+
|author                  |
+------------------------+
|Arthur_Conan_Doyle      |
|Charles_Dickens         |
|Alexandre_Dumas         |
|Jane_Austen             |
|Mark_Twain              |
|Anonymous               |
|James_Joyce             |
|Bram_Stoker             |
|Oscar_Wilde             |
|Charlotte_Perkins_Gilman|
|Franz_Kafka             |
|Lewis_Carroll           |
|Henrik_Ibsen            |
|Nicolo_Machiavelli      |
|Mary_Shelley            |
|The_Brothers_Grimm      |
+------------------------+



In [24]:
pandas_df = df.select("clean_text", "author").toPandas()
texts = pandas_df['clean_text'].tolist()
labels = pandas_df['author'].tolist()


In [25]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=500, padding='post')

le = LabelEncoder()
y = to_categorical(le.fit_transform(labels))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=500),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)




Epoch 1/5
[1m 292/6182[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:32:25[0m 942ms/step - accuracy: 0.2104 - loss: 2.4779