# 1. Section Marker

In [1]:
import requests
import os

In [2]:
books_urls = {
    "austen_pride": "https://www.gutenberg.org/cache/epub/1342/pg1342.txt",
    "austen_emma": "https://www.gutenberg.org/cache/epub/158/pg158.txt",
    "austen_sense": "https://www.gutenberg.org/cache/epub/161/pg161.txt",

    "dickens_two_cities": "https://www.gutenberg.org/cache/epub/98/pg98.txt",
    "dickens_expectations": "https://www.gutenberg.org/cache/epub/1400/pg1400.txt",
    "dickens_twist": "https://www.gutenberg.org/cache/epub/730/pg730.txt",

    "twain_tom": "https://www.gutenberg.org/cache/epub/74/pg74.txt",
    "twain_huck": "https://www.gutenberg.org/cache/epub/76/pg76.txt",
    "twain_prince": "https://www.gutenberg.org/cache/epub/1837/pg1837.txt",
}

In [3]:
for name, url in books_urls.items():
	response = requests.get(url)
	path = f"../data/{name}.txt"

	with open(path, "w", encoding = "utf-8") as f:
		f.write(response.text)
	print(f"Book downloaded: {path}")

Book downloaded: ../data/austen_pride.txt
Book downloaded: ../data/austen_emma.txt
Book downloaded: ../data/austen_sense.txt
Book downloaded: ../data/dickens_two_cities.txt
Book downloaded: ../data/dickens_expectations.txt
Book downloaded: ../data/dickens_twist.txt
Book downloaded: ../data/twain_tom.txt
Book downloaded: ../data/twain_huck.txt
Book downloaded: ../data/twain_prince.txt


# 2. Section Marker

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import re

In [5]:
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /home/nico/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/nico/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
def strip_gutenberg_headers(text):
	start_match = re.search(r"\*\*\*\s*START OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*", text, re.IGNORECASE)
	start_index = start_match.end() if start_match else 0

	end_match = re.search(r"\*\*\*\s*END OF (THE|THIS) PROJECT GUTENBERG EBOOK.*\*\*\*", text, re.IGNORECASE)
	end_index = end_match.start() if end_match else len(text)

	clean_text = text[start_index:end_index].strip()
	return clean_text.strip()

In [7]:
def text_to_sent_token_lists(text):
	sentences = sent_tokenize(text)
	tokenized = []

	for sentence in sentences:
		sentence = sentence.lower()
		sentence = sentence.replace("-", " ")

		words = word_tokenize(sentence)

		cleaned = [re.sub(r"[^a-z']+", "", w) for w in words]
		cleaned = [w for w in cleaned if w and not w.isdigit()]

		if len(cleaned) >= 2:
			tokenized.append(cleaned)
    
	return tokenized

In [12]:
def build_corpus_from_books(book_dir):
    all_sentences = []
    files = sorted([file for file in os.listdir(book_dir)])
    
    for file_name in files:
        path = os.path.join(book_dir, file_name)
        print(f"Processing {path} ...")
        
        with open(path, "r", encoding = "utf-8") as f:
            raw = f.read()
            
        stripped = strip_gutenberg_headers(raw)
        sentences = text_to_sent_token_lists(stripped)
        all_sentences.extend(sentences)
        
    print(f"\nTotal sentences in corpus: {len(all_sentences)}")
    return all_sentences

In [13]:
all_sentences = build_corpus_from_books("../data")

Processing ../data/austen_emma.txt ...
Processing ../data/austen_pride.txt ...
Processing ../data/austen_sense.txt ...
Processing ../data/dickens_expectations.txt ...
Processing ../data/dickens_twist.txt ...
Processing ../data/dickens_two_cities.txt ...
Processing ../data/twain_huck.txt ...
Processing ../data/twain_prince.txt ...
Processing ../data/twain_tom.txt ...

Total sentences in corpus: 44139


# 3. Section Marker

In [16]:
from gensim.models import Word2Vec
import os

In [17]:
dimensions = [50, 100, 200]
group_code = "G01"

In [19]:
def train_and_save_models(sentences, dimensions, group_code, output_dir):
    
    for dimension in dimensions:
        print(f"\nTraining model with dimension {dimension} ...")
        
        model = Word2Vec(
            sentences = sentences,
            vector_size = dimension,
            window = 5,
            min_count = 2,
            workers = 8,
            sg = 1,
            epochs = 30
        )
        
        model_path = os.path.join(output_dir, f"Books_{dimension}_{group_code}.model")
        vec_path = os.path.join(output_dir, f"Books_{dimension}_{group_code}.vec")
        
        model.save(model_path)
        model.wv.save_word2vec_format(vec_path, binary = False)
        
        print(f"Model saved in:\n  - {model_path}\n  - {vec_path}")

In [20]:
train_and_save_models(all_sentences, dimensions, group_code, "../models")


Training model with dimension 50 ...
Model saved in:
  - ../models/Books_50_G01.model
  - ../models/Books_50_G01.vec

Training model with dimension 100 ...
Model saved in:
  - ../models/Books_100_G01.model
  - ../models/Books_100_G01.vec

Training model with dimension 200 ...
Model saved in:
  - ../models/Books_200_G01.model
  - ../models/Books_200_G01.vec


# 4. Section Marker

In [67]:
from gensim.models import Word2Vec

In [68]:
model_dir = "../models"

model_paths = {
    50: f"{model_dir}/Books_50_G01.model",
    100: f"{model_dir}/Books_100_G01.model",
    200: f"{model_dir}/Books_200_G01.model"
}

models = {}
for dimension, path in model_paths.items():
    print(f"Loading model {dimension}D from {path} ...")
    models[dimension] = Word2Vec.load(path)
    print(models[dimension])


Loading model 50D from ../models/Books_50_G01.model ...
Word2Vec<vocab=16669, vector_size=50, alpha=0.025>
Loading model 100D from ../models/Books_100_G01.model ...
Word2Vec<vocab=16669, vector_size=100, alpha=0.025>
Loading model 200D from ../models/Books_200_G01.model ...
Word2Vec<vocab=16669, vector_size=200, alpha=0.025>


# 5. Section Marker

In [69]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

In [70]:
def plot_similar_words(model, target_word, topn = 5):
    
    similar_words = [w for w, _ in model.wv.most_similar(target_word, topn = topn)]
    all_words = [target_word] + similar_words

    vectors = np.array([model.wv[w] for w in all_words])

    tsne = TSNE(n_components = 2, random_state = 42, perplexity = 5)
    reduced = tsne.fit_transform(vectors)

    plt.figure(figsize=(10, 8))
    plt.scatter(reduced[:, 0], reduced[:, 1])

    for i, word in enumerate(all_words):
        if i == 0:
            plt.annotate(word, xy = (reduced[i, 0], reduced[i, 1]), fontsize = 14, color = 'red', fontweight = 'bold')
        else:
            plt.annotate(word, xy = (reduced[i, 0], reduced[i, 1]), fontsize = 12)

    plt.title(f"Words most similar to '{target_word}'")
    plt.show()

In [78]:
main_characters = {
    "austen_pride": ["elizabeth", "darcy"],
    "austen_emma": ["emma", "harriet", "elton"],
    "austen_sense": ["elinor", "marianne"],
    "dickens_two_cities": ["darnay", "carton", "lucie"],
    "dickens_expectations": ["pip", "havisham", "magwitch"],
    "dickens_twist": ["oliver", "fagin", "nancy"],
    "twain_tom": ["tom", "becky", "sid"],
    "twain_huck": ["huck", "jim", "duke"],
    "twain_prince": ["prince", "pauper", "edward"],
}

In [84]:
def plot_and_save_similar_words(model, target_word, book_id, dimension, topn, output_dir):

    similar_words = model.wv.most_similar(target_word, topn = topn)
    words = [target_word] + [w for w, _ in similar_words]
    vectors = np.array([model.wv[w] for w in words])

    tsne = TSNE(n_components = 2, random_state = 42, perplexity = min(5, len(words)-1))
    reduced = tsne.fit_transform(vectors)

    plt.figure(figsize=(8, 6))
    plt.scatter(reduced[0, 0], reduced[0, 1], color = 'red')  
    plt.text(reduced[0, 0] + 1, reduced[0, 1] + 1, target_word, fontsize = 12, color = 'red', weight = 'bold')

    for i, word in enumerate(words[1:], start = 1):
        x, y = reduced[i, 0], reduced[i, 1]
        plt.scatter(x, y)
        plt.text(x + 1, y + 1, word, fontsize=10)

    plt.title(f"Most similar to '{target_word}' ({book_id}, dim = {dimension})")
    plt.tight_layout()

    filename = f"{target_word}_{dimension}d.png"
    filepath = os.path.join(output_dir, filename)
    plt.savefig(filepath)
    plt.close()  
    print(f"Saved: {filepath}")

In [85]:
for book_id, characters in main_characters.items():
	for name in characters:
		plot_and_save_similar_words(models[50], name, book_id, 50, 7, "../figures")

Saved: ../figures/elizabeth_50d.png
Saved: ../figures/darcy_50d.png
Saved: ../figures/emma_50d.png
Saved: ../figures/harriet_50d.png
Saved: ../figures/elton_50d.png
Saved: ../figures/elinor_50d.png
Saved: ../figures/marianne_50d.png
Saved: ../figures/darnay_50d.png
Saved: ../figures/carton_50d.png
Saved: ../figures/lucie_50d.png
Saved: ../figures/pip_50d.png
Saved: ../figures/havisham_50d.png
Saved: ../figures/magwitch_50d.png
Saved: ../figures/oliver_50d.png
Saved: ../figures/fagin_50d.png
Saved: ../figures/nancy_50d.png
Saved: ../figures/tom_50d.png
Saved: ../figures/becky_50d.png
Saved: ../figures/sid_50d.png
Saved: ../figures/huck_50d.png
Saved: ../figures/jim_50d.png
Saved: ../figures/duke_50d.png
Saved: ../figures/prince_50d.png
Saved: ../figures/pauper_50d.png
Saved: ../figures/edward_50d.png
