In [14]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [15]:
import nltk
import numpy as np
import pandas as pd
import gensim
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [16]:
nltk.download('gutenberg')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords') 
nltk.corpus.gutenberg.fileids()

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/johnpaultamburro/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/johnpaultamburro/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnpaultamburro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnpaultamburro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

Use this NLTK corpus:
Option A (recommended): Gutenberg - 'austen-emma.txt' or 'carroll-alice.txt'


In [17]:
corpus = nltk.corpus.gutenberg.raw('austen-emma.txt')
#used to read and gain a better understanding of the corpus.
#print(corpus)

In [18]:
#A1
raw_text = nltk.corpus.gutenberg.raw('austen-emma.txt')
sentences = nltk.corpus.gutenberg.sents('austen-emma.txt')
num_characters = len(raw_text)
num_sentences = len(sentences)
print(f"Number of characters: {num_characters}")
print(f"Number of sentences: {num_sentences}")
print(f"Total number of tokens BEFORE preprocessing: {len(nltk.word_tokenize(raw_text))}")

Number of characters: 887071
Number of sentences: 7752
Total number of tokens BEFORE preprocessing: 191855


In [19]:
#A2
def preprocess(text):
    # accept either raw string or a token list (e.g., a single sentence)
    if isinstance(text, str):
        text = text.lower()
        tokens = nltk.word_tokenize(text)
    else:
        # treat as an iterable of tokens (e.g., list or sentence from corpus)
        tokens = [str(token).lower() for token in text]
    stop = nltk.corpus.stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    tokens = [token for token in tokens if token.isalpha()]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

vocabulary = preprocess(raw_text)
# preprocess each sentence separately to produce a list of token lists
sent_vocab = [preprocess(sent) for sent in sentences]
vocabulary_size = len(set(vocabulary))
sent_vocabulary_size = len(set([token for sentence in sent_vocab for token in sentence]))
freqtoken = nltk.FreqDist(vocabulary)
print(f"Total number of characters (raw text): {len(raw_text)}")
print(f"Total number of tokens AFTER preprocessing: {len(vocabulary)}")
#This not changing makes sense, as we are working in a character based model.
print(f"Total number of tokens in sentences AFTER preprocessing: {len(sent_vocab)}")
print(f"Vocabulary size (unique tokens): {vocabulary_size}")
print(f"Top 20 most frequent tokens: {freqtoken.most_common(20)}")


KeyboardInterrupt: 

A3

My preprocessing choices will influence various tasks done in other parts. The mandatory preprocessing has already reduced the number of unqiue tokens present, reducing dimensionality and thus preventing overfitting. The initial plan was to keep stop words due to the story-like nature of the corpus, as these stop words contribute to the flow and gramatical refinement of the sentences that make up the story. However, initial running proved this to logic to be impractical and thus stop-words were removed. Using lemmatization instead of stemming will make processes slower, although the end results will be more accurate.

In [None]:
#B1
chapters = nltk.corpus.gutenberg.fileids()
documents = []
document_labels = []

for chapter_file_id in chapters:
    book_raw_text = nltk.corpus.gutenberg.raw(chapter_file_id)
    chapter_strings = book_raw_text.split('CHAPTER')[1:]


    for i, chapter_text in enumerate(chapter_strings):
        documents.append(chapter_text)
        document_labels.append(f"{chapter_file_id.replace('.txt', '')} - Chapter {i+1}")

processed_documents = [' '.join(preprocess(doc)) for doc in documents]

print(chapters)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [None]:
#B2
print(chapters)
BagofWords = CountVectorizer()
TFIDF = TfidfVectorizer()
BagofWords_matrix = BagofWords.fit_transform(processed_documents)
TFIDF_matrix = TFIDF.fit_transform(processed_documents)
print(f"Shape of Bag-of-Words matrix: {BagofWords_matrix.shape}")
print(f"Shape of TF-IDF matrix: {TFIDF_matrix.shape}")

blake15 = TFIDF_matrix[0].toarray()[0].argsort()[-15:][::-1]
blake15_string = TFIDF.get_feature_names_out()[blake15]
bryant15 = TFIDF_matrix[1].toarray()[0].argsort()[-15:][::-1]
bryant15_string = TFIDF.get_feature_names_out()[bryant15]
print(f"Top 15 TF-IDF terms for first document:",blake15_string)
print(f"Top 15 TF-IDF terms for second document:",bryant15_string)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Shape of Bag-of-Words matrix: (292, 19422)
Shape of TF-IDF matrix: (292, 19422)
Top 15 TF-IDF terms for first document: ['taylor' 'emma' 'miss' 'weston' 'knightley' 'papa' 'match' 'woodhouse'
 'father' 'success' 'james' 'every' 'year' 'always' 'must']
Top 15 TF-IDF terms for second document: ['churchill' 'weston' 'miss' 'highbury' 'taylor' 'woodhouse' 'letter'
 'frank' 'perry' 'marriage' 'never' 'randalls' 'brother' 'could' 'enough']


In [None]:
#B3
cosine_sim = np.dot(TFIDF_matrix, TFIDF_matrix.T).toarray()
np.fill_diagonal(cosine_sim, 0)
max_sim_index = np.unravel_index(np.argmax(cosine_sim), cosine_sim.shape)
max_sim_value = cosine_sim[max_sim_index]
rounded_value = int((round(max_sim_value, 2)) * 100)
print(f"Most similar pair of documents: {document_labels[max_sim_index[0]]} and {document_labels[max_sim_index[1]]} with a similarity of {rounded_value} percent.")

sim_table = pd.DataFrame(cosine_sim, index=document_labels, columns=document_labels)
stringed_sim_table = sim_table.applymap(lambda x: f"{int(round(x, 2) * 100)}%")
print("Similarity table:")
print(stringed_sim_table.iloc[:5, :5])

Most similar pair of documents: carroll-alice - Chapter 9 and carroll-alice - Chapter 10 with a similarity of 75 percent.


  stringed_sim_table = sim_table.applymap(lambda x: f"{int(round(x, 2) * 100)}%")


Similarity table:
                        austen-emma - Chapter 1 austen-emma - Chapter 2  \
austen-emma - Chapter 1                      0%                     44%   
austen-emma - Chapter 2                     44%                      0%   
austen-emma - Chapter 3                     35%                     30%   
austen-emma - Chapter 4                     36%                     27%   
austen-emma - Chapter 5                     48%                     27%   

                        austen-emma - Chapter 3 austen-emma - Chapter 4  \
austen-emma - Chapter 1                     35%                     36%   
austen-emma - Chapter 2                     30%                     27%   
austen-emma - Chapter 3                      0%                     33%   
austen-emma - Chapter 4                     33%                      0%   
austen-emma - Chapter 5                     33%                     42%   

                        austen-emma - Chapter 5  
austen-emma - Chapter 1       

B4 (Reflection)

Organizing by chapter was a logical thing to do as it reflects the corpus structure.

The top TF-IDF terms reflect the characters/topics that are focused on in the given chapter (which are linked to the actual in-book/corpus chapters). For instance, chapter 1 introduces and focuses most on Emma Woodhouse, Miss Taylor, Mr. Weston, and Mr. Knightly; while chapter 2 looks more into Mr. Weston and Miss Taylor.

For the most similar chapters via cosine comparison, chapter 10 directly explores the aftermath after the events of chapter 9, and is also the shortest chapter, which would explain their high similarity.


In [None]:
#C1
trainer = nltk.tokenize.sent_tokenize
sentences = [preprocess(sentence) for sentence in trainer(raw_text)]

In [None]:
#C2
model = gensim.models.Word2Vec(sentences, vector_size=100, window=5, min_count=3, sg=1, epochs=10)

In [None]:
#C3
targets = [word for word, _ in freqtoken.most_common(5)]
for word in targets:
    if word in model.wv:
        similar_words = model.wv.most_similar(word, topn=10)
        print(f"\nTop 10 most similar words to '{word}':")
        for sim_word, score in similar_words:
            print(f"  {sim_word}: {score:.4f}")
    else:
        print(f"\nWord '{word}' not in vocabulary.")


Top 10 most similar words to 'emma':
  smiling: 0.8791
  looked: 0.8436
  head: 0.8414
  weston: 0.8333
  tone: 0.8328
  warmly: 0.8300
  listened: 0.8291
  laughing: 0.8291
  trying: 0.8242
  voice: 0.8231

Top 10 most similar words to 'could':
  possible: 0.8472
  smallest: 0.8421
  less: 0.8376
  meaning: 0.8358
  reason: 0.8346
  sake: 0.8296
  spite: 0.8252
  degree: 0.8183
  answer: 0.8170
  difficulty: 0.8155

Top 10 most similar words to 'would':
  impossible: 0.8969
  justice: 0.8883
  ashamed: 0.8851
  try: 0.8849
  must: 0.8838
  assured: 0.8759
  differently: 0.8746
  whenever: 0.8741
  reason: 0.8725
  judge: 0.8718

Top 10 most similar words to 'miss':
  poor: 0.7723
  niece: 0.7510
  told: 0.7194
  madam: 0.7180
  jane: 0.7117
  hoped: 0.7046
  obliging: 0.7021
  inquiry: 0.6988
  oh: 0.6987
  thanks: 0.6987

Top 10 most similar words to 'must':
  chuse: 0.9114
  worth: 0.9070
  disagreeable: 0.9063
  whenever: 0.8995
  trouble: 0.8990
  agree: 0.8985
  dearest: 0.8979


In [None]:
#C4
analogyA = model.wv.most_similar(positive=['mr', 'weston'], negative=['emma'], topn=1)
analogyB = model.wv.most_similar(positive=['knightley', 'weston'], negative=['letter'], topn=1)
analogyC = model.wv.most_similar(positive=['miss', 'taylor'], negative=['woodhouse'], topn=1)
print(f"\nAnalogy A: {analogyA}")
print(f"\nAnalogy B: {analogyB}")
print(f"\nAnalogy C: {analogyC}")
#More to further test performance:
analogyD = model.wv.most_similar(positive=['churchill', 'emma'], negative=['never'], topn=1)
analogyE = model.wv.most_similar(positive=['taylor', 'woodhouse'], negative=['letter'], topn=1)
analogyF = model.wv.most_similar(positive=['elton', 'think'], negative=['miss'], topn=1)
print(f"\nAnalogy D: {analogyD}")
print(f"\nAnalogy E: {analogyE}")
print(f"\nAnalogy F: {analogyF}")


Analogy A: [('dancing', 0.8931943774223328)]

Analogy B: [('emma', 0.720059335231781)]

Analogy C: [('poor', 0.7635173797607422)]

Analogy D: [('looking', 0.6551715731620789)]

Analogy E: [('smith', 0.8085635900497437)]

Analogy F: [('speak', 0.8679779767990112)]


C5

Concerning neighbours, three of them are remeniscent of stop words and have logical neighbours; the model is correctly confident, given the high accuracy. miss has slightly lower numbers reflecting the fact the model may be getting confused by double meaning of the word (to not hit and the feminine title). There are a few sensable neighbours present here, such as madam and niece. This confusion is 'acknowledged' by the model with slightly lower scores. Emma is the worst offender; as a character with which no standard word is similar to, the model is left to its own devices linking her to other words, such as smiling and added (these may reflect Emma's actions or persona, but this is extremely unlikely and impractical to verify).

Analogies within this corpus seem solid, even as three extra were done out of skeptism. All have strong confidence values, and make sense to a certain extent based on vague researching of the corpus.