In [1]:
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import re

In [6]:
paragraph = """
The Matter of Britain (French: matière de Bretagne; Welsh: Mater Prydain; Cornish: Mater Brythain; Breton: Afer Breizh-Veur) is the body of medieval literature and legendary material associated with Great Britain and Brittany and the legendary kings and heroes associated with it, particularly King Arthur. The 12th-century writer Geoffrey of Monmouth's Historia Regum Britanniae (History of the Kings of Britain) is a central component of the Matter of Britain.
"""

In [7]:
# Preprocessing the text
text = re.sub(r'\[[0-9]*\]', ' ', paragraph) # removing numbers in brackets
text = re.sub(r'\s+', ' ', text) # Remove extra spaces
text = text.lower() # Convert to lower case
text = re.sub(r'\d', ' ', text) # Remove numeric values
text = re.sub(r'\s+', ' ', text) # Remove extra spaces again


In [9]:
# tokenising sentences and words
sentences = nltk.sent_tokenize(text)
sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

In [10]:
# Removing stopwords
stop_words = set(stopwords.words("english"))
filter_sentences = [[word for word in sentence if word not in stop_words] for sentence in sentences]

In [11]:
# Training the word to vec model
model = Word2Vec(filter_sentences, vector_size = 100, window = 5, min_count = 1, workers = 4) 

In [12]:
# Words in vocabulary
words = model.wv.index_to_key # Updated way to access words and vocabulary

In [14]:
# Finding word vectors (ensure word exists before accessing)
if "medieval" in words:
    vector = model.wv["medieval"]
    print("Vectors for 'medieval' is: \n", vector)

Vectors for 'medieval' is: 
 [-0.0021954  -0.00970765  0.00929529  0.00203197 -0.00116118 -0.00550371
 -0.00850983 -0.00990347  0.00894438 -0.00249943  0.00459238 -0.00451736
  0.00995806  0.00365472  0.00102569 -0.0040441   0.00121062 -0.0026468
  0.00735074  0.00447825  0.00098668  0.00348309  0.00371435 -0.00678538
  0.00893076  0.00173623 -0.00578746  0.00865668 -0.00129148  0.00818704
 -0.00150687  0.00698765  0.0027264  -0.00435973 -0.00374793  0.00919178
  0.00159187 -0.00600701  0.00034962 -0.0019624   0.00158453 -0.00771732
  0.00738409  0.00131223  0.0078752   0.00445304 -0.00439387  0.00375268
 -0.00063848 -0.00985625  0.00824213  0.00964937  0.00965339 -0.0037998
 -0.00844962  0.0048277  -0.00765548  0.00853205  0.00276025  0.00560334
  0.00611649  0.00046481 -0.00209489  0.00077317  0.00983596 -0.00712454
 -0.00155245 -0.00235854  0.00486601  0.00645004 -0.00413204  0.00362367
 -0.00448113  0.00327047  0.00816847  0.00362861 -0.00457281 -0.00300916
  0.0078664   0.00960406

In [15]:
# Finding most similar words (ensure word exists before querying)
if 'french' in words:
    similar = model.wv.most_similar('french')
    print("Most similar words to 'french':\n", similar)

Most similar words to 'french':
 [("'s", 0.1961192935705185), ('medieval', 0.15878428518772125), (':', 0.12300866842269897), ('material', 0.10278216749429703), ('heroes', 0.08683586120605469), ('prydain', 0.08284296095371246), (')', 0.08007049560546875), ('regum', 0.06525752693414688), ('britain', 0.05378425866365433), ('historia', 0.03162921220064163)]
