# Artificial Neural Network

### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
tf.__version__




'2.18.0'

## Data Preprocessing

### Importing the dataset

In [2]:
import os

current_dir = os.path.dirname(os.path.realpath('__file__'))
root = os.path.join(current_dir, "Books")

data = []

# Loop through author and book folders, adding .txt file contents to data with matching labels
for author in os.listdir(root):
    author_path = os.path.join(root, author)

    for book in os.listdir(author_path):
        book_path = os.path.join(author_path, book)

        for chapter in os.listdir(book_path):
            chapter_path = os.path.join(book_path, chapter)
            
            with open(chapter_path, 'r', encoding='utf-8') as file:
                text = file.read()
                
            data.append({
                'Author': author,
                'Book': book,
                'Chapter': chapter[:-4],
                'Text': text
            })

dataset = pd.DataFrame(data)
print(dataset.head())

            Author                  Book Chapter  \
0  Charles Dickens  A Tale of Two Cities   1 - 1   
1  Charles Dickens  A Tale of Two Cities   1 - 2   
2  Charles Dickens  A Tale of Two Cities   1 - 3   
3  Charles Dickens  A Tale of Two Cities   1 - 4   
4  Charles Dickens  A Tale of Two Cities   1 - 5   

                                                Text  
0   It was the best of times, it was the worst of...  
1  It was the Dover road that lay, on a Friday ni...  
2  A wonderful fact to reflect upon, that every h...  
3  Then the mail got successfully to Dover, in th...  
4  A large cask of wine had been dropped and brok...  


### Splitting and Cleaning

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

fragment_size = 200
overlap = 50

# First we clean our text, converting to lower case and removing unwanted characted
# Then we apply corpus, simplifying our text
# Finally we split our text into fragments of 'fragment_size', with an overlap of 'overlap' words from the previous fragment

def preprocess_text(text, ps, all_stopwords):
    # Clean text
    text = text.lower()  # Lowercase
    text = re.sub(r'\n', " ", text)  # Newlines
    text = re.sub(r'[^a-zA-Z\s]', " ", text)  # Punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Extra spaces

    # Apply corpus
    words = text.split()
    words = [ps.stem(word) for word in words if word not in all_stopwords]
    processed_text = " ".join(words)

    return processed_text

def fragment_text(text, fragment_size, overlap):
    # Split text into fragments of fragment_size length, returns array of fragments
    words = text.split()
    current_text_fragments = []
    
    step_size = fragment_size - overlap  
    
    for i in range(0, len(words), step_size):
        current_fragment = " ".join(words[i:i + fragment_size])
        current_text_fragments.append(current_fragment)

        # Handle situation where final chapter fragment is already contained in the previous fragment
        if len(words) - i < fragment_size:
            break
        
    return current_text_fragments

In [4]:
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

# Apply our cleaning and create a new dataset to replace our previous one, this time with processed text
text_fragments = []
for index, row in dataset.iterrows():
    text = row["Text"]
    text = preprocess_text(text, ps, all_stopwords)
    current_text_fragments = fragment_text(text, fragment_size, overlap)
    
    for text_fragment in current_text_fragments:
        text_fragments.append({
            "Book": row["Book"],
            "Author": row["Author"],
            "Text": text_fragment
        })

# Convert the data fragments into a Pandas DataFrame and replace the original
dataset = pd.DataFrame(text_fragments)

### Encoding

In [5]:
X = dataset["Text"].values
y = dataset["Author"].values

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

## Encoding

### Tokenisation and Padding

In [47]:
# Remove empty strings after .strip() applied
# There shouldn't be any but tokeniser fails if not done
X = [x for x in X if x.strip() != '']

In [44]:
from transformers import BertTokenizerFast

# Max length chosen based on token distribution coming from fragment size to minimise truncation
# Edit if changing fragment_size, or remove entirely although this should be less efficient
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
X_tokenized = tokenizer(
    X, 
    padding=True, 
    truncation=True,
    max_length=330,
    return_tensors='tf'
)

In [84]:
from transformers import TFBertModel
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model

# Load pretrained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Define input layers
input_ids = tf.keras.layers.Input(shape=(330,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(330,), dtype=tf.int32, name="attention_mask")

# Wrap BERT in a Lambda layer to handle KerasTensors correctly
def bert_layer(inputs):
    return bert_model(input_ids=inputs[0], attention_mask=inputs[1])[1]

bert_output = Lambda(bert_layer, output_shape=(768,))([input_ids, attention_mask])

# Add dropout (prevent overfitting)
dropout = Dropout(0.3)(bert_output)

# Create classification layers
num_authors = len(dataset['Author'].unique())
output = Dense(num_authors, activation="softmax")(dropout)

# # Create model
model = Model(inputs=[input_ids, attention_mask], outputs=output)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [85]:


# Make predictions on the test data
predictions = model.predict([X_test_tokenized['input_ids'], X_test_tokenized['attention_mask']])

# Get the predicted class (author) for each sample
predicted_classes = predictions.argmax(axis=-1)  # Get the index of the highest probability

# Optionally, map these indices to actual author names
predicted_authors = [author_mapping[i] for i in predicted_classes]

print(predicted_authors)

NameError: name 'X_test_tokenized' is not defined

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size = 0.2, random_state = 0)

In [64]:
from tensorflow.keras.layers import Input, Lambda
from tensorflow.keras.models import Model
from transformers import TFBertModel

# Load BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Define input layers
input_ids = Input(shape=(X_tokenised['input_ids'].shape[1],), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(X_tokenised['attention_mask'].shape[1],), dtype=tf.int32, name="attention_mask")

# Wrap BERT in a Lambda layer to ensure TensorFlow compatibility
def bert_layer(inputs):
    return bert_model(input_ids=inputs[0], attention_mask=inputs[1])[1]  # [1] = pooled output

bert_output = Lambda(bert_layer, output_shape=(768,))([input_ids, attention_mask])

# Build the model
model = Model(inputs=[input_ids, attention_mask], outputs=bert_output)

# Print summary
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [66]:
loss, accuracy = model.evaluate([X_tokenized['input_ids'], X_tokenized['attention_mask']], y)
print(f'Accuracy: {accuracy}')

ValueError: You must call `compile()` before using the model.

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_tokenised = tokenizer.texts_to_sequences(X)
# print("Word Index:", tokenizer.word_index)
# print("Sequences:", X_tokenised)

Word Index: {'<OOV>': 1, 'not': 2, 'one': 3, 'said': 4, 'look': 5, 'would': 6, 'like': 7, 'man': 8, 'mr': 9, 'go': 10, 'could': 11, 'hand': 12, 'time': 13, 'know': 14, 'come': 15, 'upon': 16, 'see': 17, 'eye': 18, 'littl': 19, 'even': 20, 'thing': 21, 'room': 22, 'day': 23, 'came': 24, 'thought': 25, 'old': 26, 'think': 27, 'never': 28, 'face': 29, 'say': 30, 'door': 31, 'well': 32, 'seem': 33, 'made': 34, 'life': 35, 'back': 36, 'two': 37, 'good': 38, 'hous': 39, 'must': 40, 'long': 41, 'went': 42, 'turn': 43, 'gatsbi': 44, 'night': 45, 'us': 46, 'head': 47, 'want': 48, 'away': 49, 'last': 50, 'make': 51, 'get': 52, 'way': 53, 'first': 54, 'someth': 55, 'moment': 56, 'though': 57, 'noth': 58, 'much': 59, 'ask': 60, 'love': 61, 'ye': 62, 'open': 63, 'year': 64, 'dorian': 65, 'take': 66, 'still': 67, 'great': 68, 'daisi': 69, 'everi': 70, 'yet': 71, 'tell': 72, 'began': 73, 'alway': 74, 'light': 75, 'word': 76, 'lord': 77, 'might': 78, 'young': 79, 'pass': 80, 'put': 81, 'cri': 82, 'voi

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = fragment_size
X_padded = pad_sequences(X_tokenised, maxlen=max_length, padding='post', truncating='post')

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size = 0.2, random_state = 0)

### Training Model

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import KFold

X = np.array(X_padded)
y = np.array(y)

# K-fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [12]:
accuracies = []  # List to store accuracy for each fold

for train_index, test_index in kf.split(X):
    # Split the data into training and test sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Initialise model
    model = Sequential()
    model.add(Embedding(input_dim=20000, output_dim=64))  # Adjust input_dim based on your vocabulary size
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2, strides=2))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(np.unique(y_train)), activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))
    
    # Evaluate the model on the test set
    loss, accuracy = model.evaluate(X_test, y_test)
    
    # Save the accuracy for this fold
    accuracies.append(accuracy)

# Calculate the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy: {average_accuracy:.4f}')

Epoch 1/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.1612 - loss: 1.9521 - val_accuracy: 0.1768 - val_loss: 1.9135
Epoch 2/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2182 - loss: 1.8715 - val_accuracy: 0.1768 - val_loss: 1.8607
Epoch 3/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2826 - loss: 1.6880 - val_accuracy: 0.4085 - val_loss: 1.4993
Epoch 4/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5110 - loss: 1.2957 - val_accuracy: 0.6159 - val_loss: 1.1511
Epoch 5/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6051 - loss: 0.9532 - val_accuracy: 0.7134 - val_loss: 0.8193
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7447 - loss: 0.8165 
Epoch 1/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy

## Evaluate New Text

In [13]:
tests_path = os.path.join(current_dir, "Tests")

count = 0
passed = 0
for author in os.listdir(tests_path):
    test_file_path = os.path.join(tests_path, author)

    with open(test_file_path, 'r', encoding='utf-8') as file:
        new_text = file.read()

    # Process text for model
    processed_new_text = preprocess_text(new_text, ps, all_stopwords)
    tokenised_new_text = tokenizer.texts_to_sequences([processed_new_text])
    padded_new_text = pad_sequences(tokenised_new_text, maxlen=fragment_size, padding='post', truncating='post')

    # Make prediction
    predicted_class = model.predict(padded_new_text)

    predicted_class_label = np.argmax(predicted_class, axis=1)
    author_names = dataset['Author'].unique()
    author_mapping = {index: author for index, author in enumerate(author_names)}
    predicted_author = author_mapping[predicted_class_label[0]]
    print(f"{author[:-4]} vs {predicted_author}")

    # Index counts
    count += 1
    if author[:-4] == predicted_author:
        passed += 1

print(f"Passed {passed} out of {count}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Herman Melville vs Robert Louise Stevenson
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Oscar Wilde vs Charles Dickens
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Robert Louise Stevenson vs Mary Shelley
Passed 0 out of 3
