# 1. Downloading the Dataset

In [4]:
import requests
import os

In [5]:
books_urls = {
    "austen_pride": "https://www.gutenberg.org/cache/epub/1342/pg1342.txt",
    "austen_emma": "https://www.gutenberg.org/cache/epub/158/pg158.txt",
    "austen_sense": "https://www.gutenberg.org/cache/epub/161/pg161.txt",

    "dickens_two_cities": "https://www.gutenberg.org/cache/epub/98/pg98.txt",
    "dickens_expectations": "https://www.gutenberg.org/cache/epub/1400/pg1400.txt",
    "dickens_twist": "https://www.gutenberg.org/cache/epub/730/pg730.txt",

    "twain_tom": "https://www.gutenberg.org/cache/epub/74/pg74.txt",
    "twain_huck": "https://www.gutenberg.org/cache/epub/76/pg76.txt",
    "twain_prince": "https://www.gutenberg.org/cache/epub/1837/pg1837.txt",
}

In [6]:
for name, url in books_urls.items():
	response = requests.get(url)
	path = f"../data/{name}.txt"

	with open(path, "w", encoding = "utf-8") as f:
		f.write(response.text)
	print(f"Book downloaded: {path}")

Book downloaded: ../data/austen_pride.txt
Book downloaded: ../data/austen_emma.txt
Book downloaded: ../data/austen_sense.txt
Book downloaded: ../data/dickens_two_cities.txt
Book downloaded: ../data/dickens_expectations.txt
Book downloaded: ../data/dickens_twist.txt
Book downloaded: ../data/twain_tom.txt
Book downloaded: ../data/twain_huck.txt
Book downloaded: ../data/twain_prince.txt


# 2. Data Preprocessing and Loading

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
import re

In [8]:
nltk.download("punkt")
nltk.download("punkt_tab")

stop_words = set(stopwords.words("english"))

token_pattern = re.compile(r"^[a-z]+(?:'[a-z]+)?$")

[nltk_data] Downloading package punkt to /home/nico/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/nico/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
def strip_gutenberg_headers(text):
	start_match = re.search(r"\*\*\*\s*START OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*", text, re.IGNORECASE)
	start_index = start_match.end() if start_match else 0

	end_match = re.search(r"\*\*\*\s*END OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*", text, re.IGNORECASE)
	end_index = end_match.start() if end_match else len(text)

	clean_text = text[start_index:end_index].strip()
	return clean_text.strip()

In [10]:
def text_to_sent_token_lists(text):
	sentences = sent_tokenize(text)
	tokenized = []

	for sentence in sentences:
		sentence = sentence.lower()
		sentence = sentence.replace("-", " ")

		words = word_tokenize(sentence)

		cleaned = [word for word in words if token_pattern.match(word)]
		cleaned = [word for word in cleaned if word and not word.isdigit()]
		cleaned = [word for word in cleaned if not word in stop_words]

		if len(cleaned) >= 2:
			tokenized.append(cleaned)
    
	return tokenized

In [11]:
def build_corpus_from_books(book_dir):
    all_sentences = []
    files = sorted([file for file in os.listdir(book_dir)])
    
    for file_name in files:
        path = os.path.join(book_dir, file_name)
        print(f"Processing {path} ...")
        
        with open(path, "r", encoding = "utf-8") as f:
            raw = f.read()
            
        stripped = strip_gutenberg_headers(raw)
        sentences = text_to_sent_token_lists(stripped)
        all_sentences.extend(sentences)
        
    print(f"\nTotal sentences in corpus: {len(all_sentences)}")
    return all_sentences

In [12]:
all_sentences = build_corpus_from_books("../data")

Processing ../data/austen_emma.txt ...
Processing ../data/austen_pride.txt ...
Processing ../data/austen_sense.txt ...
Processing ../data/dickens_expectations.txt ...
Processing ../data/dickens_twist.txt ...
Processing ../data/dickens_two_cities.txt ...
Processing ../data/twain_huck.txt ...
Processing ../data/twain_prince.txt ...
Processing ../data/twain_tom.txt ...

Total sentences in corpus: 42516


# 3. Training Embedding Models

In [13]:
from gensim.models import Word2Vec
import os

In [14]:
dimensions = [50, 100, 200]
group_code = "G01"

In [15]:
def train_and_save_models(sentences, dimensions, group_code, output_dir):
    
    for dimension in dimensions:
        print(f"\nTraining model with dimension {dimension} ...")
        
        model = Word2Vec(
            sentences = sentences,
            vector_size = dimension,
            window = 5,
            negative = 20, 
            min_count = 2,
            workers = 8,
            sg = 1,
            epochs = 40
        )
        
        model_path = os.path.join(output_dir, f"Books_{dimension}_{group_code}.model")
        vec_path = os.path.join(output_dir, f"Books_{dimension}_{group_code}.vec")
        
        model.save(model_path)
        model.wv.save_word2vec_format(vec_path, binary = False)
        
        print(f"Model saved in:\n  - {model_path}\n  - {vec_path}")

In [16]:
train_and_save_models(all_sentences, dimensions, group_code, "../models")


Training model with dimension 50 ...
Model saved in:
  - ../models/Books_50_G01.model
  - ../models/Books_50_G01.vec

Training model with dimension 100 ...
Model saved in:
  - ../models/Books_100_G01.model
  - ../models/Books_100_G01.vec

Training model with dimension 200 ...
Model saved in:
  - ../models/Books_200_G01.model
  - ../models/Books_200_G01.vec


# 4. Loading the Trained Models

In [17]:
from gensim.models import Word2Vec

In [18]:
model_dir = "../models"

model_paths = {
    50: f"{model_dir}/Books_50_G01.model",
    100: f"{model_dir}/Books_100_G01.model",
    200: f"{model_dir}/Books_200_G01.model"
}

models = {}
for dimension, path in model_paths.items():
    print(f"Loading model {dimension}D from {path} ...")
    models[dimension] = Word2Vec.load(path)
    print(models[dimension])


Loading model 50D from ../models/Books_50_G01.model ...
Word2Vec<vocab=16108, vector_size=50, alpha=0.025>
Loading model 100D from ../models/Books_100_G01.model ...
Word2Vec<vocab=16108, vector_size=100, alpha=0.025>
Loading model 200D from ../models/Books_200_G01.model ...
Word2Vec<vocab=16108, vector_size=200, alpha=0.025>


# 5. 2D Visualization of Embeddings

In [19]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

In [20]:
def plot_similar_words(model, target_word, topn = 5):
    
    similar_words = [w for w, _ in model.wv.most_similar(target_word, topn = topn)]
    all_words = [target_word] + similar_words

    vectors = np.array([model.wv[w] for w in all_words])

    tsne = TSNE(n_components = 2, random_state = 42, perplexity = 5)
    reduced = tsne.fit_transform(vectors)

    plt.figure(figsize=(10, 8))
    plt.scatter(reduced[:, 0], reduced[:, 1])

    for i, word in enumerate(all_words):
        if i == 0:
            plt.annotate(word, xy = (reduced[i, 0], reduced[i, 1]), fontsize = 14, color = 'red', fontweight = 'bold')
        else:
            plt.annotate(word, xy = (reduced[i, 0], reduced[i, 1]), fontsize = 12)

    plt.title(f"Words most similar to '{target_word}'")
    plt.show()

In [21]:
main_characters = {
    "austen_pride": ["elizabeth", "darcy"],       
    "austen_emma": ["emma", "knightley"],         
    "austen_sense": ["elinor", "marianne"],       

    "dickens_two_cities": ["darnay", "carton"],   
    "dickens_expectations": ["pip", "havisham"],  
    "dickens_twist": ["oliver", "fagin"],         

    "twain_tom": ["tom", "becky"],               
    "twain_huck": ["huck", "jim"],               
    "twain_prince": ["prince", "pauper"],        
}


In [22]:
def plot_and_save_similar_words(model, target_word, book_id, dimension, topn, output_dir):

    similar_words = model.wv.most_similar(target_word, topn = topn)
    words = [target_word] + [w for w, _ in similar_words]
    vectors = np.array([model.wv[w] for w in words])

    tsne = TSNE(n_components = 2, random_state = 42, perplexity = min(5, len(words)-1))
    reduced = tsne.fit_transform(vectors)

    plt.figure(figsize=(8, 6))
    plt.scatter(reduced[0, 0], reduced[0, 1], color = 'red')  
    plt.text(reduced[0, 0] + 1, reduced[0, 1] + 1, target_word, fontsize = 12, color = 'red', weight = 'bold')

    for i, word in enumerate(words[1:], start = 1):
        x, y = reduced[i, 0], reduced[i, 1]
        plt.scatter(x, y)
        plt.text(x + 1, y + 1, word, fontsize=10)

    plt.title(f"Most similar to '{target_word}' ({book_id}, dim = {dimension})")
    plt.tight_layout()

    filename = f"{target_word}_{dimension}d.png"
    filepath = os.path.join(output_dir, filename)
    plt.savefig(filepath)
    plt.close()  
    print(f"Saved: {filepath}")

In [23]:
for book_id, characters in main_characters.items():
	for name in characters:
		plot_and_save_similar_words(models[50], name, book_id, 50, 4, "../figures")

Saved: ../figures/elizabeth_50d.png
Saved: ../figures/darcy_50d.png
Saved: ../figures/emma_50d.png
Saved: ../figures/knightley_50d.png
Saved: ../figures/elinor_50d.png
Saved: ../figures/marianne_50d.png
Saved: ../figures/darnay_50d.png
Saved: ../figures/carton_50d.png
Saved: ../figures/pip_50d.png
Saved: ../figures/havisham_50d.png
Saved: ../figures/oliver_50d.png
Saved: ../figures/fagin_50d.png
Saved: ../figures/tom_50d.png
Saved: ../figures/becky_50d.png
Saved: ../figures/huck_50d.png
Saved: ../figures/jim_50d.png
Saved: ../figures/prince_50d.png
Saved: ../figures/pauper_50d.png


# 6. Dataset Preparation for Classification

In [24]:
from sklearn.model_selection import train_test_split
from collections import Counter
import random
import os

In [25]:
books_to_author = {
    "austen_pride.txt": "austen",
    "austen_emma.txt": "austen",
    "austen_sense.txt": "austen",
    "dickens_two_cities.txt": "dickens",
    "dickens_expectations.txt": "dickens",
    "dickens_twist.txt": "dickens",
    "twain_tom.txt": "twain",
    "twain_huck.txt": "twain",
    "twain_prince.txt": "twain",
}

labels_to_ids = {
    "austen": 0,
    "dickens": 1,
    "twain": 2
}

In [26]:
def read_text(path):
    with open(path, "r", encoding = "utf-8") as f:
        return f.read()

In [27]:
def preprocess_and_chunks(text, chunk_size, min_chunk_size):
    
    sentences = sent_tokenize(text)
    all_tokens = []

    for sentence in sentences:
        sentence = sentence.lower()
        sentence = sentence.replace("-", " ")

        words = word_tokenize(sentence)

        cleaned = [word for word in words if token_pattern.match(word)]
        cleaned = [word for word in cleaned if not word.isdigit()]
        cleaned = [word for word in cleaned if word not in stop_words]

        if len(cleaned) >= 1:
            all_tokens.extend(cleaned)

    chunks = []
    for i in range(0, len(all_tokens), chunk_size):
        chunk_tokens = all_tokens[i:i+chunk_size]
        if len(chunk_tokens) >= min_chunk_size:
            chunk_text = " ".join(chunk_tokens)
            chunks.append(chunk_text)

    return chunks

In [28]:
def build_clasification_dataset(books_dir, test_size, validation_size, random_seed):
	random.seed(random_seed)
	texts, labels = [], []

	for file_name in sorted(os.listdir(books_dir)):
		author = books_to_author.get(file_name)
		
		raw_text = read_text(os.path.join(books_dir, file_name))
		chunks = preprocess_and_chunks(raw_text, 200, 200)
		print(f"{len(chunks)} chunks generated for {file_name}")

		texts.extend(chunks)
		labels.extend([author] * len(chunks))

	y = np.array([labels_to_ids[label] for label in labels])

	X_train, X_temp, y_train, y_temp = train_test_split(texts, y, test_size = test_size, random_state = random_seed, stratify = y)
	X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size = validation_size, random_state = random_seed, stratify = y_temp)

	def class_counts(y_arr):
		c = Counter(y_arr)
		return {label: c[idx] for label, idx in labels_to_ids.items()}

	summary = {
		"train": class_counts(y_train),
		"validation": class_counts(y_validation),
		"test": class_counts(y_test),
	}

	print("\nSummary samples by split and class:\n")
	for split, counts in summary.items():
		print(f"  {split}: {counts}")

	return (X_train, y_train), (X_validation, y_validation), (X_test, y_test)


In [29]:
(X_train, y_train), (X_validation, y_validation), (X_test, y_test) = build_clasification_dataset("../data", 0.3, 0.5, 42)

348 chunks generated for austen_emma.txt
287 chunks generated for austen_pride.txt
268 chunks generated for austen_sense.txt
411 chunks generated for dickens_expectations.txt
384 chunks generated for dickens_twist.txt
322 chunks generated for dickens_two_cities.txt
252 chunks generated for twain_huck.txt
185 chunks generated for twain_prince.txt
175 chunks generated for twain_tom.txt

Summary samples by split and class:

  train: {'austen': 632, 'dickens': 782, 'twain': 428}
  validation: {'austen': 135, 'dickens': 168, 'twain': 92}
  test: {'austen': 136, 'dickens': 167, 'twain': 92}


In [30]:
for split_name, X_split, y_split in [("Train", X_train, y_train), ("Validation", X_validation, y_validation), ("Test", X_test, y_test),]:
    print(f"\n--- {split_name} examples ---")
    for i in range(2): 
        print(f"Label: {list(labels_to_ids.keys())[list(labels_to_ids.values()).index(y_split[i])]}")
        print("Text:", "".join(X_split[i][:50]), "...\n")



--- Train examples ---
Label: austen
Text: rather dark darker narrower one could wish miss sm ...

Label: dickens
Text: stood drinking little counter conversation defarge ...


--- Validation examples ---
Label: twain
Text: office child play wherefore last ladies visit draw ...

Label: dickens
Text: condition declared peril sake altering way living  ...


--- Test examples ---
Label: austen
Text: well satisfied consider present campbells may knig ...

Label: dickens
Text: though would shake hands let go room said boy retr ...



# 7. Building Embeddings Matrix

In [101]:
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

In [102]:
sequence_length = 200

tokenizer = Tokenizer(oov_token = "<OOV>")
tokenizer.fit_on_texts(X_train)

vocabulary_size = len(tokenizer.word_index) + 1
print("Training vocabulary size:", vocabulary_size)

Training vocabulary size: 21023


In [103]:
def build_embedding_matrix(embeddings_model, tokenizer, embedding_dimension):
    matrix = np.zeros((vocabulary_size, embedding_dimension))
    for word, i in tokenizer.word_index.items():
        if word in embeddings_model.wv:
            matrix[i] = embeddings_model.wv[word]
    return matrix

In [104]:
embedding_matrices = {}
for dimension, embedding_model in models.items():
    embedding_matrices[dimension] = build_embedding_matrix(embedding_model, tokenizer, dimension)

In [107]:
X_train_sequences = np.array(tokenizer.texts_to_sequences(X_train))
X_validation_sequences = np.array(tokenizer.texts_to_sequences(X_validation))
X_test_sequences  = np.array(tokenizer.texts_to_sequences(X_test))

# 8. Training Feed-Forward Model for Classification (Trained Embeddings)

In [119]:
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Flatten, Dense
from sklearn.metrics import precision_score, recall_score, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np

In [120]:
num_authors = 3

In [121]:
def make_embedding_layer(embedding_matrix):
	embedding_dimension = embedding_matrix.shape[1]

	return Embedding(
		input_dim = vocabulary_size,
		output_dim = embedding_dimension,
		weights = [embedding_matrix],
		trainable = False
	)

In [122]:
def build_model_a(embedding_layer):
    model = Sequential([
        embedding_layer,
        GlobalAveragePooling1D(),
        Dense(32, activation = 'relu'),
        Dense(num_authors, activation = 'softmax')
    ])
    return model

In [123]:
def build_model_b(embedding_layer):
    model = Sequential([
        embedding_layer,
        GlobalAveragePooling1D(),
        Dense(128, activation = 'relu'),
        Dense(64, activation = 'relu'),
        Dense(num_authors, activation = 'softmax')
    ])
    return model

In [124]:
def build_model_c(embedding_layer):
    model = Sequential([
        embedding_layer,
        Flatten(),
        Dense(256, activation = 'relu'),
        Dense(128, activation = 'relu'),
        Dense(num_authors, activation = 'softmax')
    ])
    return model

In [125]:
architectures = {
    "A": build_model_a,
    "B": build_model_b,
    "C": build_model_c
}

In [126]:
results = {}

for embedding_dimension, embedding_matrix in embedding_matrices.items():
    for arch_name, build_fn in architectures.items():
        print(f"Training model {arch_name} with embedding {embedding_dimension}D")

        embedding_layer = make_embedding_layer(embedding_matrix)
        model = build_fn(embedding_layer)

        model.compile(
            optimizer = Adam(),
            loss = 'sparse_categorical_crossentropy',
            metrics = ['accuracy']
        )

        history = model.fit(
            X_train_sequences, y_train,
            validation_data = (X_validation_sequences, y_validation),
            epochs = 10,
            batch_size = 32,
            verbose = 1
        )

        y_pred_probs = model.predict(X_test_sequences, verbose = 0)
        y_pred = np.argmax(y_pred_probs, axis = 1)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average = 'macro', zero_division = 0)
        recall = recall_score(y_test, y_pred, average = 'macro', zero_division = 0)

        key = f"{arch_name}_{embedding_dimension}D"
        results[key] = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall
        }

Training model A with embedding 50D
Epoch 1/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5119 - loss: 0.9810 - val_accuracy: 0.7747 - val_loss: 0.8658
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 922us/step - accuracy: 0.8084 - loss: 0.7719 - val_accuracy: 0.8481 - val_loss: 0.6644
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 904us/step - accuracy: 0.8692 - loss: 0.5838 - val_accuracy: 0.8962 - val_loss: 0.4956
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 910us/step - accuracy: 0.9083 - loss: 0.4407 - val_accuracy: 0.9241 - val_loss: 0.3844
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 920us/step - accuracy: 0.9305 - loss: 0.3485 - val_accuracy: 0.9241 - val_loss: 0.3098
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 942us/step - accuracy: 0.9376 - loss: 0.2896 - val_accuracy: 0.9418 - val_loss: 0

In [129]:
df = pd.DataFrame(results).T 

df_formatted = df.map(lambda x: f"{x:.3f}")

print(df_formatted)

       accuracy precision recall
A_50D     0.954     0.955  0.954
B_50D     0.967     0.961  0.973
C_50D     0.949     0.946  0.950
A_100D    0.970     0.971  0.967
B_100D    0.962     0.954  0.969
C_100D    0.962     0.966  0.956
A_200D    0.962     0.963  0.961
B_200D    0.982     0.980  0.985
C_200D    0.934     0.943  0.922
