# Artificial Neural Network

### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
tf.__version__




'2.18.0'

## Data Preprocessing

### Importing the dataset

In [2]:
import csv
dataset = pd.read_csv('Author Dataset.csv', encoding='latin1', sep=',', quoting=csv.QUOTE_ALL)
dataset = dataset.drop("Chapter", axis=1)
print(dataset.head())

           Book        Author  \
0  Frankenstein  Mary Shelley   
1  Frankenstein  Mary Shelley   
2  Frankenstein  Mary Shelley   
3  Frankenstein  Mary Shelley   
4  Frankenstein  Mary Shelley   

                                                Text  
0  To Mrs. Saville, England.\n\nSt. Petersburgh, ...  
1  To Mrs. Saville, England.\n\nArchangel, 28th M...  
2  To Mrs. Saville, England.\n\nJuly 7th, 17.\n\...  
3  To Mrs. Saville, England.\n\nAugust 5th, 17.\...  
4  I am by birth a Genevese, and my family is one...  


### Splitting and Cleaning

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

for i in range(0, len(dataset)):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Text'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

dataset['Text'] = corpus

In [4]:
fragment_size = 200
overlap = 50

def clean_text(text):
    # Clean text and apply corpus
    text = text.lower()  # Lowercase
    text = re.sub(r'\n', ' ', text)  # Newlines
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)  # Punctuation and special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Extra spaces
    return text

def fragment_text(text, fragment_size, overlap):
    # Split text into fragments of fragment_size length, returns array of fragments
    words = text.split()
    current_text_fragments = []
    
    step_size = fragment_size - overlap  
    
    for i in range(0, len(words), step_size):
        current_fragment = " ".join(words[i:i + fragment_size])
        current_text_fragments.append(current_fragment)

        # Handle situation where final chapter fragment is already contained in the previous fragment
        if len(words) - i < fragment_size:
            break
        
    return current_text_fragments
        
text_fragments = []
for index, row in dataset.iterrows():
    text = row["Text"]
    text = clean_text(text)
    current_text_fragments = fragment_text(text, fragment_size, overlap)
    
    for text_fragment in current_text_fragments:
        text_fragments.append({
            "Book": row["Book"],
            "Author": row["Author"],
            "Text": text_fragment
        })

# Convert the data fragments into a Pandas DataFrame
dataset = pd.DataFrame(text_fragments)

In [5]:
print(dataset.iloc[8, 2])

mr savil england juli th dear sister write line hast say safe well advanc voyag letter reach england merchantman homeward voyag archangel fortun may not see nativ land perhap mani year howev good spirit men bold appar firm purpos float sheet ice continu pass us indic danger region toward advanc appear dismay alreadi reach high latitud height summer although not warm england southern gale blow us speedili toward shore ardent desir attain breath degre renov warmth not expect incid hitherto befallen us would make figur letter one two stiff gale spring leak accid experienc navig scarc rememb record shall well content noth wors happen us voyag adieu dear margaret assur sake well not rashli encount danger cool persev prudent success shall crown endeavour wherefor not thu far gone trace secur way pathless sea star wit testimoni triumph not still proceed untam yet obedi element stop determin heart resolv man swell heart involuntarili pour thu must finish heaven bless belov sister r w


### Encoding

In [6]:
X = dataset["Text"].values
y = dataset["Author"].values

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [8]:
print(y)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Encoding

### Tokenisation and Padding

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
print("Word Index:", tokenizer.word_index)
print("Sequences:", X)

Word Index: {'<OOV>': 1, 'not': 2, 'said': 3, 'alic': 4, 'one': 5, 'could': 6, 'would': 7, 'look': 8, 'time': 9, 'seem': 10, 'know': 11, 'like': 12, 'see': 13, 'come': 14, 'littl': 15, 'must': 16, 'went': 17, 'shall': 18, 'well': 19, 'go': 20, 'thing': 21, 'thought': 22, 'say': 23, 'day': 24, 'night': 25, 'may': 26, 'hand': 27, 'dear': 28, 'came': 29, 'think': 30, 'eye': 31, 'way': 32, 'good': 33, 'get': 34, 'even': 35, 'man': 36, 'great': 37, 'much': 38, 'make': 39, 'first': 40, 'back': 41, 'room': 42, 'luci': 43, 'door': 44, 'feel': 45, 'found': 46, 'tri': 47, 'ask': 48, 'friend': 49, 'saw': 50, 'tell': 51, 'might': 52, 'old': 53, 'us': 54, 'made': 55, 'long': 56, 'yet': 57, 'never': 58, 'last': 59, 'love': 60, 'fear': 61, 'quit': 62, 'place': 63, 'life': 64, 'two': 65, 'turn': 66, 'sleep': 67, 'upon': 68, 'away': 69, 'everi': 70, 'noth': 71, 'began': 72, 'poor': 73, 'count': 74, 'mind': 75, 'head': 76, 'got': 77, 'take': 78, 'took': 79, 'felt': 80, 'open': 81, 'without': 82, 'pass':

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define maximum sequence length (adjust based on your fragments)
max_length = 200  # Since your fragments are 200 words, this aligns well

# Apply padding
X_padded = pad_sequences(X, maxlen=max_length, padding='post', truncating='post')

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size = 0.2, random_state = 0)

### Training Model

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import KFold

X = np.array(X_padded)
y = np.array(y)

kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation

In [14]:
accuracies = []  # List to store accuracy for each fold

for train_index, test_index in kf.split(X):
    # Split the data into training and test (validation) sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Initialize your model (CNN or other)
    model = Sequential()
    model.add(Embedding(input_dim=20000, output_dim=64))  # Adjust input_dim based on your vocabulary size
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(np.unique(y_train)), activation='softmax'))  # Number of classes in y_train
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_test, y_test))
    
    # Evaluate the model on the test set (validation)
    loss, accuracy = model.evaluate(X_test, y_test)
    
    # Save the accuracy for this fold
    accuracies.append(accuracy)

# Calculate the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy: {average_accuracy:.4f}')


Epoch 1/3
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.4942 - loss: 1.0563 - val_accuracy: 0.4085 - val_loss: 1.0664
Epoch 2/3
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5141 - loss: 0.9977 - val_accuracy: 0.4085 - val_loss: 1.0601
Epoch 3/3
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4683 - loss: 0.8974 - val_accuracy: 0.4085 - val_loss: 0.9998
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2902 - loss: 1.1161     
Epoch 1/3
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.4474 - loss: 1.0861 - val_accuracy: 0.5493 - val_loss: 0.9901
Epoch 2/3
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4869 - loss: 1.0271 - val_accuracy: 0.5493 - val_loss: 1.0154
Epoch 3/3
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5202