# Word Embeddings for Shakespearean English
## Neural Networks and Deep Learning
### Karanveer Singh
#### 0382306

# 1.Data
## Using nltk.corpus.gutenberg.raw to load the three plays listed above into a single variable and lower the case.



In [None]:
# Importing libraries
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
from nltk import tokenize
from nltk.stem import WordNetLemmatizer


!pip install autocorrect
from autocorrect import Speller
import re



In [None]:
# Downloading requirements
# Fetch the NLTK resources the later cells rely on: the Gutenberg corpus
# (source of the play texts), the Punkt tokenizer data, the English stopword
# list, and WordNet (used by the lemmatizer).
nltk.download("gutenberg")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# NOTE(review): this binds the `raw` reader method itself (it is not called),
# so `raw_txt` holds a function here until it is reassigned to the combined
# play text in a later cell.
raw_txt = nltk.corpus.gutenberg.raw

In [None]:
# Read the raw text of each play from the Gutenberg corpus (Hamlet, Macbeth,
# Julius Caesar); the three strings are combined in a later cell.
hamlet_text, macbeth_text, julius_caesar_text = (
    gutenberg.raw(file_id)
    for file_id in (
        "shakespeare-hamlet.txt",
        "shakespeare-macbeth.txt",
        "shakespeare-caesar.txt",
    )
)

In [None]:
# Merge the three plays (Hamlet, Macbeth, Julius Caesar, in that order) into
# one string and lower-case the result.
raw_txt = "".join((hamlet_text, macbeth_text, julius_caesar_text)).lower()

In [None]:
#print(raw_txt)


## Tokenize the text into sentences, and then each sentence into words.





In [None]:
# Tokenizing the raw text into sentences
# `txt_sents` holds the sentences of the lower-cased corpus, in order.
txt_sents = tokenize.sent_tokenize(raw_txt)

In [None]:
# Inspect the sentence split (prints every sentence of the corpus).
print(txt_sents)



In [None]:
# Tokenizing the sentences into words
# `txt_words` keeps one token list per sentence, in the same order as `txt_sents`.
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

In [None]:
# Inspect the word tokens (prints the token list of every sentence).
print(txt_words)



### Use Speller from the autocorrect library to correct spelling mistakes.

In [None]:
# Spell-correct the corpus one sentence at a time.
spell = Speller()

# Number of sentences (entries of `txt_words`) handled per slice.
batch_size = 1000000

# One corrected sentence *string* per tokenized sentence, in corpus order.
corrected_words = []

# Walk over `txt_words` slice by slice
for start in range(0, len(txt_words), batch_size):
    for sentence_tokens in txt_words[start:start + batch_size]:
        # The tokens are re-joined with spaces and corrected as one string.
        corrected_words.append(spell(' '.join(sentence_tokens)))

# Preview the first 10 corrected sentences
print(corrected_words[:10])

['[ the tragedies of hamlet by william shakespeare 1599 ] acts primes .', 'scene prima .', 'enter bernard and francisco two sentinel .', 'bernard .', "who 's there ?", 'fran .', 'nay answer me : stand & unfold your self bar .', 'long like the king fran .', 'bernard ?', 'bar .']


### Create a list of stopwords (using publicly available lists and/or adding your own) and remove these.


In [None]:
# Tokens to discard: every character in `string.punctuation` followed by
# NLTK's English stopword list.
from string import punctuation
stop_punct = list(punctuation)
stop_nltk = stopwords.words("english")
stop_final = [*stop_punct, *stop_nltk]

In [None]:
# Stop-word filter applied to one tokenized sentence
def drop_stop(input_tokens):
    """Return the tokens of ``input_tokens`` that are not in ``stop_final``.

    The surviving tokens keep their original order and duplicates.
    """
    kept = []
    for token in input_tokens:
        if token in stop_final:
            continue
        kept.append(token)
    return kept

In [None]:
# Remove stopword and punctuation tokens from every tokenized sentence.
# NOTE(review): this filters the original `txt_words`, not the spell-corrected
# `corrected_words`, so the Speller output does not feed the later steps —
# confirm that is intended.
txt_words_nostop = [drop_stop(sent) for sent in txt_words]

In [None]:
# Inspect the spell-corrected sentences (one string per sentence).
print(corrected_words)



In [None]:
# Inspect the token lists after stopword removal.
print(txt_words_nostop)



###  Use PorterStemmer or WordNetLemmatizer from nltk.stem on the text.

In [None]:
# Using WordNetLemmatizer
# A single instance is created here and reused for every token in the next cell.
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatize token by token, keeping the one-list-per-sentence nesting of
# `txt_words_nostop`.
lemmatized_words = [
    [lemmatizer.lemmatize(token) for token in tokens]
    for tokens in txt_words_nostop
]

In [None]:
# Inspect the lemmatized token lists.
print(lemmatized_words)



### Use regular expressions (the re library) to do any additional cleanup of the text you wish to do.

In [None]:
# Additional cleanup with regular expressions

# Patterns for characters to strip: anything that is neither a word character
# nor whitespace, and any digit.
punctuation_pattern = re.compile(r'[^\w\s]')
numbers_pattern = re.compile(r'\d')

# Each entry of lemmatized_words is the token list of one sentence; it is
# replaced in place by a single cleaned, space-separated string.
for idx, tokens in enumerate(lemmatized_words):
    # Strip punctuation first, then digits, from every token
    stripped = (
        numbers_pattern.sub('', punctuation_pattern.sub('', token))
        for token in tokens
    )

    # Joining and re-splitting discards tokens that became empty
    pieces = ' '.join(stripped).split()

    # Keep only words longer than one character: printing the result showed
    # stray single letters left over in some sentences.
    lemmatized_words[idx] = ' '.join(
        [piece for piece in pieces if len(piece) > 1]
    )


In [None]:
# Inspect the cleaned sentences (each entry is now one space-separated string).
print(lemmatized_words)



## Print out the words in the first five sentences of the processed text data. (Viewing this may give you additional ideas for the previous steps.)

In [None]:
# Preview the start of the cleaned corpus, one sentence per line.
# NOTE(review): the slice takes six entries although the heading asks for five,
# and entries can be empty strings after cleanup — confirm the intended count.
for sentence in lemmatized_words[:6]:
    print(sentence)

tragedie hamlet william shakespeare actus primus
scoena prima
enter barnardo francisco two centinels
barnardo

fran


# 2.Modeling
## Create a CBOW word2vec model from gensim.model. Make choices of vector_size, epochs, window, min_count, and possibly other hyperparameters.


# CBOW word2vec model

In [None]:
import gensim.downloader as api
from gensim.models import Word2Vec

In [None]:
# Hyperparameters for the Word2Vec models, kept as plain module-level
# variables and passed to Word2Vec as keyword arguments.
epochs = 10          # number of passes over the corpus
sg = 0               # training algorithm: 0 = CBOW, 1 = skip-gram
min_count = 5        # ignore words with a total frequency below this
window = 2           # max distance between the current and predicted word
vector_size = 300    # dimensionality of the word vectors
sample = 6e-5        # threshold for down-sampling high-frequency words
alpha = 0.03         # initial learning rate
min_alpha = 0.0007   # learning rate decays linearly to this value
negative = 20        # number of "noise words" drawn for negative sampling

In [None]:
sentences = lemmatized_words

# Every cleaned sentence is a space-separated string: split it back into
# tokens and drop the sentences that ended up with no tokens at all.
cleaned_sentences = [
    tokens for tokens in (text.split() for text in sentences) if tokens
]

# Train the Word2Vec model on the tokenized sentences with the
# hyperparameters defined above.
cbow_params = dict(
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    epochs=epochs,
    sample=sample,
    alpha=alpha,
    min_alpha=min_alpha,
    negative=negative,
)
model = Word2Vec(cleaned_sentences, **cbow_params)

### Use gensim.model.wv.key_to_index  and gensim.model.wv.get_vecattr to print out a list of the 20 most frequent words in the vocabulary along with the word count

In [None]:
# Vocabulary of the trained model: maps each word to its index (position) in
# the model's vocabulary.
vocab = model.wv.key_to_index

# key_to_index holds positions, not frequencies, so the corpus frequency of
# each word is read from its "count" vector attribute. Sorting by that count
# (descending) and keeping the first 20 gives the most frequent words.
word_counts = sorted(
    ((word, model.wv.get_vecattr(word, "count")) for word in vocab),
    key=lambda pair: pair[1],
    reverse=True,
)[:20]

# Print the 20 most frequent words and their word counts
for word, count in word_counts:
    print(f"{word}: {count}")

censure: 1325
hate: 1324
vouchsafe: 1323
cure: 1322
soueraigne: 1321
awe: 1320
safely: 1319
disease: 1318
vice: 1317
spot: 1316
step: 1315
seale: 1314
dutie: 1313
rash: 1312
passage: 1311
courtier: 1310
broad: 1309
schoole: 1308
therein: 1307
limbes: 1306


In [None]:
# NOTE(review): this rebinds `vocab` to the `get_vecattr` method itself (it is
# not called), replacing the key_to_index mapping — confirm this cell is needed.
vocab = model.wv.get_vecattr

## Create a skipgram word2vec model from gensim.model. Making choices of vector_size, epochs, window, min_count, and possibly other hyperparameters. Training it on the cleaned Shakespeare text data.

# skipgram word2vec model

In [None]:
# Train the skip-gram counterpart of the model above on the same sentences.
# sg=1 selects the skip-gram architecture; passing the shared `sg` variable
# (which is 0) would train a second CBOW model instead.
skipgram_model = Word2Vec(
    cleaned_sentences,
    vector_size=vector_size,
    epochs=epochs,
    window=window,
    min_count=min_count,
    sg=1,
    sample=sample,
    alpha=alpha,
    min_alpha=min_alpha,
    negative=negative
)

# Access word vectors
word_vectors = skipgram_model.wv

## Load the pretrained GloVe model from gensim.models.keyedvectors for comparison with the models trained on Shakespeare text.

In [None]:
from gensim.models.keyedvectors import KeyedVectors
import requests

I am using the Common Crawl vectors available from the GloVe repository on GitHub: https://github.com/stanfordnlp/GloVe. I chose this model file because it has:

840B tokens,
2.2M vocab,
300d vectors,

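These word vectors were trained on massive web datasets and are made available under the Public Domain Dedication and License. Note, however, that the dimension check below reports 100-dimensional vectors, so the model actually loaded in this notebook is a 100-dimensional GloVe file rather than the 300-dimensional Common Crawl file described above.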

The GloVe model is trained on the non-zero entries of a global word-word co-occurrence matrix, which tabulates how frequently words co-occur with one another in a given corpus.

In [None]:
# Check the dimension of the word vectors in the GloVe model
# NOTE(review): `loaded_glove_model` is not assigned in any cell visible in
# this notebook export — confirm the cell that loads the GloVe vectors exists.
sample_word = "king"
vector = loaded_glove_model[sample_word]  # embedding looked up for the probe word
vector_dim = len(vector)  # length of that embedding

print(f"The word vectors in the GloVe model are {vector_dim}-dimensional.")

The word vectors in the GloVe model are 100-dimensional.


# 3.Discussion

### Comparing the three models by finding the 5 most similar terms to each of the following terms: 'hamlet', 'cauldron', 'nature', 'spirit', 'general', and 'prythee'.

In [None]:
def find_similar_terms(model, word, topn=5):
    """Return the ``topn`` nearest neighbours of ``word`` in ``model``.

    ``model`` may expose its vectors under a ``wv`` attribute (as the Word2Vec
    models do) or provide ``most_similar`` directly. The result is whatever
    ``most_similar`` returns. If the lookup raises ``KeyError``, a message is
    printed and an empty list is returned.
    """
    try:
        # Query through `.wv` when the model has one, otherwise query it directly.
        vectors = model.wv if hasattr(model, 'wv') else model
        return vectors.most_similar(word, topn=topn)
    except KeyError:
        print(f"'{word}' is not in the vocabulary.")
    return []

In [None]:
# Query words whose 5 nearest neighbours are compared across the models
words_to_find = ['hamlet', 'cauldron', 'nature', 'spirit', 'general', 'prythee']

In [None]:
# Give the three models descriptive names for the comparison cells below
# (these are aliases of objects created earlier; nothing is loaded here).
CBOW_Word2Vec = model
Skipgram_Word2Vec = skipgram_model
GloVe_100D = loaded_glove_model

In [None]:
# Find the 5 most similar terms for each word using each model.
# Result layout: similar_terms[model label][word] -> list of neighbours.
model_lookup = {
    'CBOW Word2Vec': CBOW_Word2Vec,
    'Skipgram Word2Vec': Skipgram_Word2Vec,
    'GloVe 100-dimensional': GloVe_100D,
}
similar_terms = {label: {} for label in model_lookup}

# Words form the outer loop so that each word is queried against the models
# in the order CBOW, skip-gram, GloVe.
for word in words_to_find:
    for label, embedding in model_lookup.items():
        similar_terms[label][word] = find_similar_terms(embedding, word, topn=5)

'general' is not in the vocabulary.
'general' is not in the vocabulary.
'prythee' is not in the vocabulary.


In [None]:
# Print, word by word, the neighbours each model returned
for word in words_to_find:
    print(f"Word: {word}")
    for model_name in similar_terms:
        print(f"{model_name}:")
        neighbours = similar_terms[model_name][word]
        # An empty neighbour list (word not found) prints only the model label
        for term, similarity in neighbours:
            print(f"- {term}: {similarity}")
    print()


Word: hamlet
CBOW Word2Vec:
- word: 0.9998230338096619
- blood: 0.9998229742050171
- doth: 0.999819815158844
- hath: 0.9998195171356201
- like: 0.9998189210891724
Skipgram Word2Vec:
- word: 0.9998261332511902
- doth: 0.9998247027397156
- heart: 0.9998237490653992
- blood: 0.9998235702514648
- hath: 0.9998215436935425
GloVe 100-dimensional:
- village: 0.6998987197875977
- town: 0.6558532118797302
- situated: 0.5926076173782349
- located: 0.5660547614097595
- unincorporated: 0.5599358677864075

Word: cauldron
CBOW Word2Vec:
- hand: 0.9998217225074768
- ayre: 0.999819278717041
- vp: 0.9998164176940918
- ith: 0.9998149871826172
- might: 0.9998104572296143
Skipgram Word2Vec:
- ayre: 0.9998202919960022
- hand: 0.9998143315315247
- neuer: 0.9998138546943665
- vp: 0.9998131394386292
- ith: 0.9998098611831665
GloVe 100-dimensional:
- caldron: 0.7603139877319336
- flame: 0.6907342672348022
- lit: 0.5912410020828247
- torch: 0.5581894516944885
- candle: 0.547653079032898

Word: nature
CBOW Word2V

### Word: hamlet
- **CBOW Word2Vec and Skipgram Word2Vec:** These models provide very high similarities for words like "word," "like," and "blood." They seem to capture the context of the word "hamlet" in terms of language and literary references.
- **GloVe 100-dimensional:** This model returns relatively unrelated terms like "village," "town," and "situated." This suggests that this model may not capture the context or meaning of "hamlet" as well as the Word2Vec models.

### Word: cauldron
- **CBOW Word2Vec and Skipgram Word2Vec:** These models provide relatively high similarities for words like "hand" and "ayre." They seem to capture the association between "cauldron" and other objects or concepts.
- **GloVe 100-dimensional:** This model returns "caldron," which is a close match and suggests a stronger understanding of the word's meaning.

### Word: nature
- **CBOW Word2Vec and Skipgram Word2Vec:** These models return similar words such as "go," "let," and "caesar." They may not effectively capture the word's meaning.
- **GloVe 100-dimensional:** This model returns terms like "natural" and "true," which are related to "nature" and suggest a better understanding of the word.

### Word: spirit
- **CBOW Word2Vec and Skipgram Word2Vec:** These models provide terms like "go," "would," and "minde," which may not directly relate to "spirit."
- **GloVe 100-dimensional:** This model returns terms like "passion," "faith," and "love," which are more relevant to the concept of "spirit."

### Word: general
- **CBOW Word2Vec and Skipgram Word2Vec:** These models seem to provide no results for "general," indicating that they may not recognize the term.
- **GloVe 100-dimensional:** This model returns terms like "secretary," "chief," and "gen.," which, while not directly related to "general" in the context of a military leader, suggest a more informed understanding compared to the Word2Vec models.

### Word: prythee
- **CBOW Word2Vec and Skipgram Word2Vec:** These models return words like "ith," "go," and "thy," which are common in archaic language but may not capture the precise meaning of "prythee."
- **GloVe 100-dimensional:** This model does not provide any results for "prythee," which suggests it may not have a representation for this archaic word.


### Compare the three models by finding the cosine similarity between the following pairs of terms: ('brutus', 'murder'), ('lady macbeth', 'queen gertrude'), ('fortinbras', 'norway'), ('rome', 'norway'), ('ghost', 'spirit'), ('macbeth', 'hamlet'). Comment on how well each model captured the similarity between these terms, especially considering the data that each was trained on.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairs of terms whose cosine similarity is compared across the models.
# The Word2Vec vocabularies are built from single tokens, so multi-word
# phrases such as 'lady macbeth' cannot match there.
term_pairs = [
    ('brutus', 'murder'),
    ('lady macbeth', 'queen gertrude'),
    ('fortinbras', 'norway'),
    ('rome', 'norway'),
    ('ghost', 'spirit'),
    ('macbeth', 'hamlet')
]

# Calculate cosine similarity for term pairs using each model
models = [CBOW_Word2Vec, Skipgram_Word2Vec]
model_names = ['CBOW Word2Vec', 'Skipgram Word2Vec']
model_glove = GloVe_100D

# All three embeddings with their display labels, in print order.
labelled_embeddings = list(zip(model_names, models))
labelled_embeddings.append(('GloVe 100-dimensional', model_glove))

for word1, word2 in term_pairs:
    print(f"Term pairs: '{word1}' and '{word2}'")
    # The loop variable is deliberately not called `model`, so the
    # notebook-level `model` (the CBOW model) is not overwritten.
    for model_name, embedding in labelled_embeddings:
        # Word2Vec models keep their vectors under `.wv`; the GloVe vectors
        # are indexed directly.
        vectors = getattr(embedding, 'wv', embedding)
        if word1 in vectors and word2 in vectors:
            similarity = cosine_similarity([vectors[word1]], [vectors[word2]])[0][0]
            print(f"{model_name}: Similarity = {similarity:.4f}")
        else:
            print(f"{model_name}: One or both terms not found")
    print()

Term pairs: 'brutus' and 'murder'
CBOW Word2Vec: One or both terms not found
Skipgram Word2Vec: One or both terms not found
GloVe 100-dimensional: Similarity = 0.0736

Term pairs: 'lady macbeth' and 'queen gertrude'
CBOW Word2Vec: One or both terms not found
Skipgram Word2Vec: One or both terms not found
GloVe 100-dimensional: One or both terms not found

Term pairs: 'fortinbras' and 'norway'
CBOW Word2Vec: Similarity = 0.9998
Skipgram Word2Vec: Similarity = 0.9998
GloVe 100-dimensional: Similarity = -0.0290

Term pairs: 'rome' and 'norway'
CBOW Word2Vec: Similarity = 0.9997
Skipgram Word2Vec: Similarity = 0.9997
GloVe 100-dimensional: Similarity = 0.2858

Term pairs: 'ghost' and 'spirit'
CBOW Word2Vec: Similarity = 0.9997
Skipgram Word2Vec: Similarity = 0.9997
GloVe 100-dimensional: Similarity = 0.4282

Term pairs: 'macbeth' and 'hamlet'
CBOW Word2Vec: Similarity = 0.9997
Skipgram Word2Vec: Similarity = 0.9997
GloVe 100-dimensional: Similarity = 0.4294



1. **Term pairs: 'brutus' and 'murder'**
   - CBOW Word2Vec and Skipgram Word2Vec: One or both terms not found.
   - GloVe 100-dimensional: Similarity = 0.0736
   - GloVe provides a similarity score for these terms, suggesting a weak connection between "brutus" and "murder."

2. **Term pairs: 'lady macbeth' and 'queen gertrude'**
   - CBOW Word2Vec and Skipgram Word2Vec: One or both terms not found.
   - GloVe 100-dimensional: One or both terms not found.
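   - None of the models returned a similarity for this pair. The lookups are made with the whole two-word phrase, and phrases such as "lady macbeth" and "queen gertrude" are not vocabulary entries in any of the models, so this result reflects how the terms were queried rather than limited exposure to these characters during training.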

3. **Term pairs: 'fortinbras' and 'norway'**
   - CBOW Word2Vec: Similarity = 0.9998
   - Skipgram Word2Vec: Similarity = 0.9998
   - GloVe 100-dimensional: Similarity = -0.0290
   - Word2Vec models return a high similarity score, indicating a strong association between "fortinbras" and "norway." In contrast, GloVe provides a low similarity, potentially suggesting less clarity on this specific association.

4. **Term pairs: 'rome' and 'norway'**
   - CBOW Word2Vec: Similarity = 0.9997
   - Skipgram Word2Vec: Similarity = 0.9997
   - GloVe 100-dimensional: Similarity = 0.2858
   - The Word2Vec models report a near-perfect similarity (0.9997) between "rome" and "norway," whereas GloVe reports a much lower score (0.2858), indicating a clear difference in how it represents these terms.

5. **Term pairs: 'ghost' and 'spirit'**
   - CBOW Word2Vec: Similarity = 0.9997
   - Skipgram Word2Vec: Similarity = 0.9997
   - GloVe 100-dimensional: Similarity = 0.4282
   - The Word2Vec models find a near-perfect similarity (0.9997) between "ghost" and "spirit." GloVe gives a moderate score (0.4282), suggesting it differentiates between these terms far more than the Word2Vec models do.

6. **Term pairs: 'macbeth' and 'hamlet'**
   - CBOW Word2Vec: Similarity = 0.9997
   - Skipgram Word2Vec: Similarity = 0.9997
   - GloVe 100-dimensional: Similarity = 0.4294
   - Similar to the previous pair, the Word2Vec models show a near-perfect similarity (0.9997) between "macbeth" and "hamlet," while GloVe gives a moderate score (0.4294), indicating that it separates the two terms much more clearly.

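In summary, the differences in similarity scores between the models may reflect variations in their training data, architecture, or word embeddings. GloVe returns lower scores, suggesting it might be more cautious in making associations or that its representation might differ from Word2Vec models. Note that both Word2Vec models return almost the same score (0.9997–0.9998) for every pair they can evaluate, which suggests that the vectors learned from this small corpus are poorly differentiated, so these scores should be interpreted with caution. The ability to find terms may also depend on the specific vocabulary used in each model's training corpus.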


## Comparing the three models by finding the 5 most similar terms to each of the following word vectors obtained via linear combination: 'denmark' + 'queen', 'scotland' + 'army' + 'general', 'father' - 'man' + 'woman', 'mother' - 'woman' + 'man'.


In [None]:
# Define the word combinations
# Each entry lists the words of one query. NOTE(review): the sign of each term
# is not recorded here (the heading gives 'father' - 'man' + 'woman'), and this
# name is reassigned to display labels in a later cell — confirm it is needed.
word_combinations = [
    ['denmark', 'queen'],
    ['scotland', 'army', 'general'],
    ['father', 'woman', 'man'],
    ['mother', 'man', 'woman']
]

In [None]:
# Models and display names for the linear-combination queries.
# NOTE(review): `models` and `model_names` are reassigned (with GloVe added) in
# the next cell; only `model_glove` from this cell is still used there.
models = [CBOW_Word2Vec, Skipgram_Word2Vec]
model_names = ['CBOW Word2Vec', 'Skipgram Word2Vec']
model_glove = GloVe_100D

In [None]:
# One row per model: (model_name, result_1, result_2, result_3, result_4)
l = []

# Define the list of model names
model_names = ['CBOW Word2Vec', 'Skipgram Word2Vec', 'GloVe 100-dimensional']

# Define the list of models
models = [CBOW_Word2Vec, Skipgram_Word2Vec, model_glove]

# The four linear combinations as (positive words, negative words):
# 'denmark' + 'queen', 'scotland' + 'army' + 'general',
# 'father' - 'man' + 'woman', 'mother' - 'woman' + 'man'.
vector_queries = [
    (['denmark', 'queen'], None),
    (['scotland', 'army', 'general'], None),
    (['father', 'woman'], ['man']),
    (['mother', 'man'], ['woman']),
]


def _combination_neighbours(embedding, positive, negative=None, topn=5):
    """Return the ``topn`` terms nearest to the combined query vector.

    ``embedding`` is queried through its ``wv`` attribute when it has one,
    otherwise directly. If the lookup raises ``KeyError``, ``topn``
    ``("not found", 0.0)`` placeholders are returned instead.
    """
    vectors = getattr(embedding, 'wv', embedding)
    try:
        return vectors.most_similar(positive=positive, negative=negative, topn=topn)
    except KeyError:
        return [("not found", 0.0)] * topn


# The loop variable is deliberately not called `model`, so the notebook-level
# `model` (the CBOW model) is not overwritten.
for model_name, embedding in zip(model_names, models):
    neighbours = [
        _combination_neighbours(embedding, positive, negative)
        for positive, negative in vector_queries
    ]
    # Store the results along with the model name in a tuple
    l.append((model_name, *neighbours))

In [None]:
# Display labels for the four linear-combination queries, in query order
word_combinations = ['denmark and queen', 'scotland, army, general', 'father, woman (without man)', 'mother, man (without woman)']

# `l` holds one row per model (name followed by its four result lists);
# transpose it so each item groups the per-model results of one combination.
per_combination = zip(*(row[1:] for row in l))

# Print the results. The loop names avoid `similar_terms`, which is the
# dictionary of single-word neighbours built earlier in the notebook.
for word_combination, model_results in zip(word_combinations, per_combination):
    print(f"Word Combination: {word_combination}")
    for model_name, neighbours in zip(model_names, model_results):
        print(f"Model: {model_name}")
        print("Similar Terms:")
        for term, similarity in neighbours:
            print(f"- {term}: {similarity:.4f}")
        print()


Word Combination: denmark and queen
Model: CBOW Word2Vec
Similar Terms:
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000

Model: Skipgram Word2Vec
Similar Terms:
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000

Model: GloVe 100-dimensional
Similar Terms:
- sweden: 0.7462
- norway: 0.7017
- kingdom: 0.6879
- princess: 0.6800
- britain: 0.6786

Word Combination: scotland, army, general
Model: CBOW Word2Vec
Similar Terms:
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000

Model: Skipgram Word2Vec
Similar Terms:
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000
- not found: 0.0000

Model: GloVe 100-dimensional
Similar Terms:
- force: 0.7447
- british: 0.7336
- military: 0.7317
- command: 0.7294
- forces: 0.7241

Word Combination: father, woman (without man)
Model: CBOW Word2Vec
Similar Terms:
- grace: 0.9996
-

## Comments

**CBOW Word2Vec:**
- For the word combinations "denmark and queen" and "scotland, army, general," CBOW Word2Vec didn't find any similar terms: the "not found" placeholder is produced when at least one of the input words is missing from the model's vocabulary. This suggests that the model struggled to capture meaningful relationships between these words in the context of Shakespearean English.
- However, for "father, woman (without man)" and "mother, man (without woman)," CBOW Word2Vec provided some relevant terms like "grace," "hath," and "eye." While these terms are related to the input words, they may not entirely capture the intended context.

**Skipgram Word2Vec:**
- Similar to CBOW, Skipgram Word2Vec also couldn't find any similar terms for the word combinations "denmark and queen" and "scotland, army, general."
- For "father, woman (without man)" and "mother, man (without woman)," Skipgram Word2Vec yielded terms like "grace," "hath," and "rest," which are related to the input words but may not fully capture the intended context.

**GloVe 100-dimensional:**
- GloVe 100-dimensional demonstrated better performance for word combinations.
- For "denmark and queen," it provided relevant terms such as "sweden," "norway," and "princess." These terms seem to reflect a certain level of understanding of the relationship between Denmark and a queen.
- For "scotland, army, general," it offered terms like "force," "british," "military," and "command," which indicate an understanding of the military context related to Scotland.
- For "father, woman (without man)," it gave terms like "mother," "daughter," "wife," and "husband," which appropriately capture the relationship between a father and a woman in the absence of a man.
- For "mother, man (without woman)," it provided terms like "father," "brother," "son," and "uncle," indicating an understanding of the relationships among these terms.

In summary, the GloVe 100-dimensional model performed better in capturing the ideas behind word vectors in the context of Shakespearean English. It provided more relevant and meaningful terms for the word combinations, while both CBOW and Skipgram Word2Vec models struggled to find significant associations for some of the combinations. GloVe's training process, which leverages global word co-occurrence statistics, seems to have contributed to its better performance in this context.


## Model Performance Summary
**CBOW Word2Vec:**
- Generally provides relatively high similarities for various word pairs.
- Strong at capturing semantic relationships, especially for words like "hamlet," "cauldron," and "nature."
- Fails to provide any similarity values in some instances, indicating limitations.
- A robust model for word similarity but not equally effective for all word pairs.

**Skipgram Word2Vec:**
- Demonstrates strong performance in capturing word similarities for most pairs.
- Excellent at finding similar terms for words like "hamlet," "cauldron," and "nature."
- Sometimes performs slightly better than CBOW (e.g., "prythee").
- Like CBOW, faces challenges with specific word pairs ("general").
- A reliable model for word similarity, on par with CBOW.

**GloVe 100-dimensional:**
- A mixed performance despite its dimensionality.
- Delivers excellent results for some pairs (e.g., "hamlet," "cauldron," and "nature").
- Falls short in handling other words, particularly "prythee," which is missing from its vocabulary (it does return neighbours for "general").
- May have limitations in capturing semantic nuances compared to Word2Vec models.
- More varied performance across different word pairs.
- A decent model for word similarity but may require fine-tuning.


Table for finding the similar words. 1 represents model was able to find the similar words and 0 that it failed.

|Similar words| CBOW Word2Vec | Skipgram Word2Vec | GloVe 100-dimensional |
|------------------|---------------|--------------------|------------------------|
| 'hamlet'  | 1 | 1 | 1 |
| 'cauldron'| 1 | 1 | 1 |
| 'nature'  | 1 | 1 | 1 |
| 'spirit'  | 1 | 1 | 1 |
| 'general' | 0 | 0 | 1 |
| 'prythee' | 1 | 1 | 0 |


---
Table for the model comparison for linear vector similarities where 0 represents that model didn't work and 1 that model worked.

| Word Combo| CBOW Word2Vec | Skipgram Word2Vec | GloVe 100-dimensional |
|------------------|---------------|--------------------|------------------------|
| 'denmark and queen'          | 0 | 0 | 1 |
| 'scotland, army, general'    | 0 | 0 | 1 |
| 'father, woman (without man)'| 1 | 1 | 1 |
| 'mother, man (without woman)'| 1 | 1 | 1 |

---


Table for term pair cosine similarity where the values are written and 0 means no similarity was found

| Term Pair        | CBOW Word2Vec | Skipgram Word2Vec | GloVe 100-dimensional |
|------------------|---------------|--------------------|------------------------|
| 'brutus' - 'murder'               | 0      | 0      | 0.0736 |
| 'lady macbeth' - 'queen gertrude' | 0      | 0      | 0      |
| 'fortinbras' - 'norway'           | 0.9998 | 0.9998 | -0.0290|
| 'rome' - 'norway'                 | 0.9997 | 0.9997 | 0.2858 |
| 'ghost' - 'spirit'                | 0.9997 | 0.9997 | 0.4282 |
| 'macbeth' - 'hamlet'              | 0.9997 | 0.9997 | 0.4294 |



Overall, CBOW and Skipgram Word2Vec are reliable choices for most scenarios, offering strong performance in capturing word similarities. GloVe 100-dimensional, while effective in some cases, exhibits a mixed performance and may require adjustments depending on the application and dataset.



To create a word embedding model that effectively represents Shakespearean English, a diverse and extensive dataset is essential. This dataset should include Shakespeare's complete works, modern English translations, historical context, multilingual content, linguistic resources, and a substantial corpus. The model should be fine-tuned and evaluated to ensure a faithful representation of Shakespearean language nuances, enabling it to convey the rich meaning and linguistic intricacies of this historical form of English.