In [1]:
# pip install gensim   
                      # gensim is capable of implementing algorithms for discovering topics in a collection of documents like 
                      # 1. Latent Semantic Analysis (LSA), 
                      # 2. Latent Dirichlet Allocation (LDA)
                      # 3. Hierarchical Dirichlet Process (HDP) 
                      # Offers tools for training and loading word vectors, including:
                            # 1. Word2Vec
                            # 2. FastText
                            # 3. Pretrained models like Google News Word2Vec
                      # Find the most similar words or documents using cosine similarity in vector space. Example Use Cases:
                      # 1. Document Clustering (e.g., grouping news articles by topic), 
                      # 2. Semantic Search (e.g., retrieving similar questions from a Q&A database), 
                      # 3. Recommendation Systems (e.g., finding similar products based on descriptions)
                      # 4. Word Similarity Tasks (e.g., “king” is to “queen” as “man” is to “woman”)

In [2]:
import gensim

In [3]:
import gensim.downloader # allows to easily download and load pre-trained models and datasets

In [4]:
gensim.downloader.info().keys() # returns keys of the metadata dictionary describing the models and datasets available 
                                 # for download via Gensim's downloader.

dict_keys(['corpora', 'models'])

In [5]:
gensim.downloader.info()['models']

{'fasttext-wiki-news-subwords-300': {'num_records': 999999,
  'file_size': 1005007116,
  'base_dataset': 'Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens)',
  'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/fasttext-wiki-news-subwords-300/__init__.py',
  'license': 'https://creativecommons.org/licenses/by-sa/3.0/',
  'parameters': {'dimension': 300},
  'description': '1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).',
  'read_more': ['https://fasttext.cc/docs/en/english-vectors.html',
   'https://arxiv.org/abs/1712.09405',
   'https://arxiv.org/abs/1607.01759'],
  'checksum': 'de2bb3a20c46ce65c9c131e1ad9a77af',
  'file_name': 'fasttext-wiki-news-subwords-300.gz',
  'parts': 1},
 'conceptnet-numberbatch-17-06-300': {'num_records': 1917247,
  'file_size': 1225497562,
  'base_dataset': 'ConceptNet, word2vec, GloVe, and OpenSubtitles 2016',
  'reader_code': 'https:/

In [6]:
gensim.downloader.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [9]:
gensim.downloader.info()['models']['word2vec-google-news-300']

{'num_records': 3000000,
 'file_size': 1743563840,
 'base_dataset': 'Google News (about 100 billion words)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/word2vec-google-news-300/__init__.py',
 'license': 'not found',
 'parameters': {'dimension': 300},
 'description': "Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality' (https://code.google.com/archive/p/word2vec/).",
 'read_more': ['https://code.google.com/archive/p/word2vec/',
  'https://arxiv.org/abs/1301.3781',
  'https://arxiv.org/abs/1310.4546',
  'https://www.microsoft.com/en-us/research/publication/linguistic-regularities-in-continuous-space-word-representations/?from=http%3A%2F%2Fresearch.microsoft.com%2Fpubs%2F189726%2Frvec

### Loading the Model

In [11]:
Word_2_Vec = gensim.downloader.load('word2vec-google-news-300') # Note: 300 represnts 300-dimensional

In [13]:
vector_representation = Word_2_Vec.get_vector('Continent')  # fetches the vector representation (a 300-dimensional NumPy array) of 
                                                            # the word 'continent' from the Word2Vec model.

In [14]:
Word_2_Vec.most_similar('Continent') # returns the top words most similar to 'emperor' based on cosine similarity of word embeddings 
                                     # in the word2vec_300 model 

[('continent', 0.7036794424057007),
 ('Europe', 0.5909315347671509),
 ('Africa', 0.5554481744766235),
 ('British_Isles', 0.5482714176177979),
 ('continental', 0.5253060460090637),
 ('Iberian_Peninsula', 0.5039645433425903),
 ('European', 0.48930081725120544),
 ('Scandinavia', 0.4872168004512787),
 ('Iberian_peninsula', 0.4815359115600586),
 ('continental_Europe', 0.4758695960044861)]

### Calculating Cosine Similarity Between 2 Vectors (Words---> Word Embeddings)

In [20]:
emperor = Word_2_Vec.get_vector('King')

In [21]:
empress = Word_2_Vec.get_vector('Queen')

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
cosine_similarity(emperor.reshape(1,300),empress.reshape(1,300))

array([[0.5157251]], dtype=float32)

In [34]:
knight = Word_2_Vec.get_vector('man')

In [35]:
lady = Word_2_Vec.get_vector('woman')

In [36]:
lady = knight-man+woman

In [37]:
cosine_similarity(knight.reshape(1,300), lady.reshape(1,300))

array([[0.76640135]], dtype=float32)

In [45]:
# Consider 2nd example
Rhine= Word_2_Vec.get_vector('Rhine') # Rhine is a major river in Germany
Siene = Word_2_Vec.get_vector('Siene') # Siene is a major river in France

In [46]:
Germany  = Word_2_Vec.get_vector('Germany')

In [47]:
France = Word_2_Vec.get_vector('France')

In [50]:
vect = Germany-Rhine + Siene
cosine_similarity(Germany.reshape(1,300), France.reshape(1,300))

array([[0.6270757]], dtype=float32)

### Calculating Cosine Similarity between 2 Sentences (Sentence Embeddings)

In [52]:
first_sentence = "India’s surgical strikes were a calculated and restrained response, rather than an act of full-scale warfare"
second_sentence = "India’s surgical strikes were a precise and restrained action, aimed at addressing specific threats without escalating into full-scale warfare"


In [53]:
first_sentence.split()

['India’s',
 'surgical',
 'strikes',
 'were',
 'a',
 'calculated',
 'and',
 'restrained',
 'response,',
 'rather',
 'than',
 'an',
 'act',
 'of',
 'full-scale',
 'warfare']

In [54]:
second_sentence.split()

['India’s',
 'surgical',
 'strikes',
 'were',
 'a',
 'precise',
 'and',
 'restrained',
 'action,',
 'aimed',
 'at',
 'addressing',
 'specific',
 'threats',
 'without',
 'escalating',
 'into',
 'full-scale',
 'warfare']

In [56]:
first_sentence_embedding = Word_2_Vec.get_mean_vector(first_sentence.split())
second_sentence_embedding = Word_2_Vec.get_mean_vector(second_sentence.split())
# .get_mean_vector looks up the Word2Vec embedding for each word following split. It then, 
# computes the mean (average) vector of all word embeddings in the sentence.
# Why use .get_mean_vector? Because, Word2Vec gives embeddings for individual words, but many tasks 
# (like sentence similarity or classification) require a single vector per sentence.
# So we take the average of all the word vectors in the sentence — a common technique called:
# Mean pooling or sentence embedding via averaging.

In [58]:
cosine_similarity(first_sentence_embedding.reshape(1,300), second_sentence_embedding.reshape(1,300))
# Reshaping since cosine-similarity requires 2-dimensional input

array([[0.7308821]], dtype=float32)

### Training your own model

In [59]:
from nltk.corpus import brown # The Brown Corpus is a standard, well-known collection of English text, created in the 1960s. 
                              # It was the first million-word electronic corpus of American English, and 
                              # it's widely used in linguistics and NLP research. It contains texts from 500 different sources, 
                              # grouped by genre, including:
                                                            # 1. News
                                                            # 2. Editorials
                                                            # 3. Fiction
                                                            # 4. Romance
                                                            # 5. Religion
                                                            # 6. Government
                                                            # 7. Science
# All categories (genres)
#    brown.categories()
# → ['adventure', 'editorial', 'fiction', 'government', 'hobbies', ...]

# Words from a specific category
# brown.words(categories='news')

# Sentences from the corpus
# brown.sents()


In [65]:
(brown.sents())[8]

['However',
 ',',
 'the',
 'jury',
 'said',
 'it',
 'believes',
 '``',
 'these',
 'two',
 'offices',
 'should',
 'be',
 'combined',
 'to',
 'achieve',
 'greater',
 'efficiency',
 'and',
 'reduce',
 'the',
 'cost',
 'of',
 'administration',
 "''",
 '.']

In [66]:
" ".join(brown.sents()[8])   # Retrieves the 8th sentence (Python uses 0-based indexing) from the Brown corpus.
                             # Joins the list of words in that sentence into a single string with spaces between words.

"However , the jury said it believes `` these two offices should be combined to achieve greater efficiency and reduce the cost of administration '' ."

In [69]:
# Creating your own model
my_model = gensim.models.Word2Vec(brown.sents(), vector_size = 25, epochs = 10, window = 10)

# What we are doing above is creating a Word2Vec model using the Brown corpus with specific hyperparameters. 
# Here's a detailed breakdown of this line:
                                            # brown.sents()	---> The training data: tokenized sentences from the Brown corpus
                                            # vector_size=20 ---> Each word will be represented by a 20-dimensional vector
                                            # epochs=10	---> The model will pass through the entire corpus 10 times (training cycles)
                                            # window=10	The context window: how many words before and after a target word to consider


In [71]:
my_model.wv.get_vector('faith')

array([-1.4107752 ,  2.0186083 , -1.7169677 ,  1.0043945 ,  0.5253399 ,
       -0.69897914, -1.1418812 ,  0.24403727, -0.6537245 , -0.7337003 ,
        0.33038673,  1.6087105 , -0.926237  , -0.70416117,  0.96861845,
        1.1036601 ,  0.17073087,  0.18519805,  0.21197456,  0.6128516 ,
        0.1080151 , -1.665362  , -0.27400574,  2.0222769 ,  0.51745445],
      dtype=float32)

In [72]:
my_model.wv.most_similar('worship')

[('defining', 0.8846367597579956),
 ('destiny', 0.8842817544937134),
 ('diplomacy', 0.8783668875694275),
 ('excellence', 0.8760520815849304),
 ('congregation', 0.8760414123535156),
 ('enterprise', 0.8731026649475098),
 ('participation', 0.8721271753311157),
 ('aesthetic', 0.8699518442153931),
 ('biology', 0.8696659803390503),
 ('aims', 0.8681924343109131)]