# Artificial Neural Network

### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
tf.__version__




'2.18.0'

## Data Preprocessing

### Importing the dataset

In [2]:
import os

current_dir = os.path.dirname(os.path.realpath('__file__'))
root = os.path.join(current_dir, "Books")

data = []

# Loop through author and book folders, adding .txt file contents to data with matching labels
for author in os.listdir(root):
    author_path = os.path.join(root, author)

    for book in os.listdir(author_path):
        book_path = os.path.join(author_path, book)

        for chapter in os.listdir(book_path):
            chapter_path = os.path.join(book_path, chapter)
            
            with open(chapter_path, 'r', encoding='utf-8') as file:
                text = file.read()
                
            data.append({
                'Author': author,
                'Book': book,
                'Chapter': chapter[:-4],
                'Text': text
            })

dataset = pd.DataFrame(data)
print(dataset.head())

            Author                  Book Chapter  \
0  Charles Dickens  A Tale of Two Cities   1 - 1   
1  Charles Dickens  A Tale of Two Cities   1 - 2   
2  Charles Dickens  A Tale of Two Cities   1 - 3   
3  Charles Dickens  A Tale of Two Cities   1 - 4   
4  Charles Dickens  A Tale of Two Cities   1 - 5   

                                                Text  
0   It was the best of times, it was the worst of...  
1  It was the Dover road that lay, on a Friday ni...  
2  A wonderful fact to reflect upon, that every h...  
3  Then the mail got successfully to Dover, in th...  
4  A large cask of wine had been dropped and brok...  


### Splitting and Cleaning

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

fragment_size = 200
overlap = 50

# First we clean our text, converting to lower case and removing unwanted characted
# Then we apply corpus, simplifying our text
# Finally we split our text into fragments of 'fragment_size', with an overlap of 'overlap' words from the previous fragment

def preprocess_text(text, ps, all_stopwords):
    # Clean text
    text = text.lower()  # Lowercase
    text = re.sub(r'\n', " ", text)  # Newlines
    text = re.sub(r'[^a-zA-Z\s]', " ", text)  # Punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Extra spaces

    # Apply corpus
    words = text.split()
    words = [ps.stem(word) for word in words if word not in all_stopwords]
    processed_text = " ".join(words)

    return processed_text

def fragment_text(text, fragment_size, overlap):
    # Split text into fragments of fragment_size length, returns array of fragments
    words = text.split()
    current_text_fragments = []
    
    step_size = fragment_size - overlap  
    
    for i in range(0, len(words), step_size):
        current_fragment = " ".join(words[i:i + fragment_size])
        current_text_fragments.append(current_fragment)

        # Handle situation where final chapter fragment is already contained in the previous fragment
        if len(words) - i < fragment_size:
            break
        
    return current_text_fragments

In [4]:
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

# Apply our cleaning and create a new dataset to replace our previous one, this time with processed text
text_fragments = []
for index, row in dataset.iterrows():
    text = row["Text"]
    text = preprocess_text(text, ps, all_stopwords)
    current_text_fragments = fragment_text(text, fragment_size, overlap)
    
    for text_fragment in current_text_fragments:
        text_fragments.append({
            "Book": row["Book"],
            "Author": row["Author"],
            "Text": text_fragment
        })

# Convert the data fragments into a Pandas DataFrame and replace the original
dataset = pd.DataFrame(text_fragments)

### Encoding

In [5]:
X = dataset["Text"].values
y = dataset["Author"].values

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',    # Remove common stopwords if they don't carry stylistic information
    max_df=0.95,             # Optionally ignore words that appear in too many documents
    min_df=5                 # Optionally ignore rare words that might be noise
)

X = vectorizer.fit_transform(X)

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Create the model instance
nb_model = MultinomialNB()

# Create Stratified K-Fold object
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using K-fold cross-validation
author_names = dataset['Author']
scores = cross_val_score(nb_model, X, author_names, cv=cv, scoring='accuracy')
print("Mean Accuracy: {:.2f}%".format(scores.mean() * 100))

# Fit model
nb_model.fit(X, author_names)

Mean Accuracy: 94.01%


In [24]:
# Complete set of tests, one for each author, using text not in dataset
tests_path = os.path.join(current_dir, "Tests")

for author in os.listdir(tests_path):
    test_file_path = os.path.join(tests_path, author)
    
    with open(test_file_path, 'r', encoding='utf-8') as file:
        new_text = file.read()
    
    processed_new_text = preprocess_text(new_text, ps, all_stopwords)
    
    encoded_new_text = vectorizer.transform([processed_new_text])
    
    predicted_author = nb_model.predict(encoded_new_text)
    
    print(f"Predicted: {predicted_author} vs Actual: {author[:-4]}")

Predicted: ['Charles Dickens'] vs Actual: Charles Dickens
Predicted: ['F Scott Fitzgerald'] vs Actual: F Scott Fitzgerald
Predicted: ['Fyodor Dostoyevsky'] vs Actual: Fyodor Dostoyevsky
Predicted: ['Fyodor Dostoyevsky'] vs Actual: Herman Melville
Predicted: ['Mary Shelley'] vs Actual: Mary Shelley
Predicted: ['Oscar Wilde'] vs Actual: Oscar Wilde
Predicted: ['Robert Louise Stevenson'] vs Actual: Robert Louise Stevenson
