# Artificial Neural Network

### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
tf.__version__




'2.18.0'

## Data Preprocessing

### Importing the dataset

In [2]:
import csv

# Read the raw corpus and discard the "Chapter" column, which the later cells never read.
dataset = pd.read_csv(
    'Author Dataset.csv',
    encoding='latin1',
    sep=',',
    quoting=csv.QUOTE_ALL,
).drop(columns="Chapter")
print(dataset.head())

           Book        Author  \
0  Frankenstein  Mary Shelley   
1  Frankenstein  Mary Shelley   
2  Frankenstein  Mary Shelley   
3  Frankenstein  Mary Shelley   
4  Frankenstein  Mary Shelley   

                                                Text  
0  To Mrs. Saville, England.\n\nSt. Petersburgh, ...  
1  To Mrs. Saville, England.\n\nArchangel, 28th M...  
2  To Mrs. Saville, England.\n\nJuly 7th, 17.\n\...  
3  To Mrs. Saville, England.\n\nAugust 5th, 17.\...  
4  I am by birth a Genevese, and my family is one...  


### Splitting and Cleaning

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Number of words in each text fragment; also reused later as the padded sequence length.
fragment_size = 200
# Number of words shared between two consecutive fragments.
overlap = 50

def preprocess_text(text, ps, all_stopwords):
    """Normalise raw text into a space-separated string of stemmed words.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, stop words are removed and each
    remaining word is stemmed.

    Args:
        text: Raw input string.
        ps: Stemmer object exposing a ``stem(word)`` method.
        all_stopwords: Iterable of (lowercase) words to drop before stemming.

    Returns:
        The surviving stemmed words joined by single spaces ("" if none remain).
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # every word against a list scans the whole stop-word list each time.
    stopword_set = set(all_stopwords)

    # Punctuation, digits and other symbols become spaces so that the words
    # around them stay separated (e.g. "it's" -> "it s").
    letters_only = re.sub(r'[^a-zA-Z\s]', " ", text.lower())

    # str.split() with no argument splits on any run of whitespace (newlines
    # included) and ignores leading/trailing whitespace, so no separate
    # newline or space-collapsing pass is required.
    words = [ps.stem(word) for word in letters_only.split() if word not in stopword_set]

    return " ".join(words)

def fragment_text(text, fragment_size, overlap):
    """Split text into overlapping fragments of at most ``fragment_size`` words.

    Consecutive fragments start ``fragment_size - overlap`` words apart, so
    each fragment repeats the last ``overlap`` words of the previous one.
    The final fragment may be shorter than ``fragment_size``.

    Args:
        text: Whitespace-separated input string.
        fragment_size: Maximum number of words per fragment (must be positive).
        overlap: Number of words shared by consecutive fragments (must be
            smaller than ``fragment_size``).

    Returns:
        A list of fragment strings; empty if ``text`` contains no words.

    Raises:
        ValueError: If ``fragment_size`` is not positive or ``overlap`` is not
            smaller than ``fragment_size`` (the window could never advance).
    """
    step_size = fragment_size - overlap
    if fragment_size <= 0 or step_size <= 0:
        raise ValueError("fragment_size must be positive and overlap must be smaller than fragment_size")

    words = text.split()
    current_text_fragments = []

    for i in range(0, len(words), step_size):
        current_text_fragments.append(" ".join(words[i:i + fragment_size]))

        # Stop as soon as this window reaches the last word: any further window
        # would hold only words already contained in this one. The comparison
        # must include equality, otherwise a text whose length lands exactly on
        # a window boundary yields a redundant trailing fragment.
        if i + fragment_size >= len(words):
            break

    return current_text_fragments

In [4]:
ps = PorterStemmer()
# Take "not" out of the stop-word list so it is kept during cleaning.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

# One record per fragment; each record carries the book and author of the row it was cut from.
text_fragments = [
    {"Book": row["Book"], "Author": row["Author"], "Text": text_fragment}
    for _, row in dataset.iterrows()
    for text_fragment in fragment_text(
        preprocess_text(row["Text"], ps, all_stopwords), fragment_size, overlap
    )
]

# Convert the data fragments into a Pandas DataFrame and replace the original
dataset = pd.DataFrame(text_fragments)

### Encoding the Author Labels

In [5]:
# Features: the preprocessed text fragments. Labels: the author name attached to each fragment.
X = dataset["Text"].values
y = dataset["Author"].values

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode author names as integer class ids. The fitted encoder keeps the
# id -> name correspondence in le.classes_ (id i is le.classes_[i]), and
# le.inverse_transform turns ids back into names.
# Fit from the DataFrame column rather than from `y` itself: `y` is overwritten
# with integers below, so fitting on it would make a second run of this cell
# replace the author names in le.classes_ with those integers.
le = LabelEncoder()
y = le.fit_transform(dataset["Author"].values)

In [7]:
# Show the integer-encoded author label of every fragment.
print(y)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Text Encoding

### Tokenisation and Padding

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Build the word index from every fragment; "<OOV>" is passed as the out-of-vocabulary token.
# NOTE(review): the tokenizer is fitted on all of X before any train/test split,
# so its vocabulary also covers the fragments later used for evaluation.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X)
# Turn each fragment into a list of integer word ids.
X_tokenised = tokenizer.texts_to_sequences(X)
# Optional debugging output (disabled):
# print("Word Index:", tokenizer.word_index)
# print("Sequences:", X)

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad (or cut) every sequence at its end to one common length, set to the fragment size in words.
max_length = fragment_size
X_padded = pad_sequences(X_tokenised, maxlen=max_length, padding='post', truncating='post')

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the padded fragments, with a fixed random_state.
# NOTE(review): the K-fold loop further down reassigns X_train, X_test, y_train
# and y_test, so this split is overwritten before the visible cells use it.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size = 0.2, random_state = 0)

### Training Model

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import KFold

# From here on X is the padded integer matrix (it previously held the text fragments).
X = np.array(X_padded)
y = np.array(y)

# K-fold Cross Validation: 5 shuffled folds with a fixed random_state
kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [12]:
accuracies = []  # List to store accuracy for each fold

# Seed Python, NumPy and TensorFlow so initial weights, batch shuffling and
# dropout masks are repeatable from one run of this cell to the next.
tf.keras.utils.set_random_seed(1)

# Size the embedding table from the fitted tokenizer instead of a guessed
# constant: word ids run from 1 to len(word_index) and 0 is the padding value,
# hence the +1. A fixed size smaller than the vocabulary would produce
# out-of-range ids; a larger one only wastes parameters.
vocab_size = len(tokenizer.word_index) + 1

# Count the classes on the full label vector so every fold builds the same
# output layer, even if a training split happens to miss one class.
num_classes = len(np.unique(y))

for train_index, test_index in kf.split(X):
    # Split the data into training and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Initialise a fresh model so no weights carry over between folds
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=64))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=3, strides=3))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model. The fold's test split doubles as validation data, so the
    # val_* metrics and the evaluate() call below are measured on the same rows.
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

    # Evaluate the model on the test set
    loss, accuracy = model.evaluate(X_test, y_test)

    # Save the accuracy for this fold
    accuracies.append(accuracy)

# After the loop `model` is the network trained on the last fold only;
# the prediction cell below uses that model.

# Calculate the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy: {average_accuracy:.4f}')

Epoch 1/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.4908 - loss: 1.0737 - val_accuracy: 0.4930 - val_loss: 1.0299
Epoch 2/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.4661 - loss: 0.9973 - val_accuracy: 0.4930 - val_loss: 0.9986
Epoch 3/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5218 - loss: 0.8537 - val_accuracy: 0.4930 - val_loss: 0.8707
Epoch 4/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7585 - loss: 0.6475 - val_accuracy: 0.6761 - val_loss: 0.6523
Epoch 5/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9276 - loss: 0.3753 - val_accuracy: 0.8732 - val_loss: 0.3622
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8429 - loss: 0.4391 
Epoch 1/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.4708 - l

## Evaluate New Text

In [27]:
# Unseen passage whose author the trained model is asked to predict.
new_text = "There was at present nothing to be learned from the Piccadilly side, and nothing could be done; so I went round to the back to see if anything could be gathered from this quarter. The mews were active, the Piccadilly houses being mostly in occupation. I asked one or two of the grooms and helpers whom I saw around if they could tell me anything about the empty house. One of them said that he heard it had lately been taken, but he couldn’t say from whom. He told me, however, that up to very lately there had been a notice-board of"

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Process text for model: same cleaning function, the already-fitted tokenizer
# and the same padding length/settings as the training fragments.
processed_new_text = preprocess_text(new_text, ps, all_stopwords)
tokenised_new_text = tokenizer.texts_to_sequences([processed_new_text])
padded_new_text = pad_sequences(tokenised_new_text, maxlen=fragment_size, padding='post', truncating='post')

In [31]:
# Class probabilities for the padded passage (one row per input text).
predicted_class = model.predict(padded_new_text)

# Index of the most probable class for each row.
predicted_class_label = np.argmax(predicted_class, axis=1)

# Derive the id -> author mapping from the fitted LabelEncoder instead of
# writing it by hand: the encoder decides which integer each author receives,
# and a manual dictionary can silently disagree with it (and mislabel every
# prediction) or misspell a name.
author_mapping = dict(enumerate(le.classes_))
predicted_author = author_mapping[predicted_class_label[0]]
print(f"Predicted Author: {predicted_author}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Predicted Author: Lewis Carol
