#### 01. Static Embeddings - Word2Vec

In [16]:
# Example: Using Word2Vec with a small sample corpus

# Import Word2Vec from gensim
from gensim.models import Word2Vec

# Basic tokenizer: splits sentences into lowercase words
def simple_tokenizer(text):
    """Split ``text`` into lowercase word tokens (very basic, for demo only).

    The text is lowercased, common sentence punctuation is deleted, and the
    remainder is split on runs of whitespace. Apostrophes and hyphens are
    kept so that words such as "don't" or "well-known" stay in one piece.

    Args:
        text: The sentence (or any string) to tokenize.

    Returns:
        A list of lowercase tokens; an empty list for empty or
        whitespace-only input.
    """
    # Deleting only '.' would leave tokens such as "bank," or "bank?" in the
    # vocabulary as words distinct from "bank", so the other common sentence
    # punctuation marks are dropped as well.
    punctuation_to_drop = '.,!?;:"()[]{}'
    cleaned = text.lower().translate(str.maketrans('', '', punctuation_to_drop))
    # split() with no argument splits on any whitespace and discards empties.
    return cleaned.split()

# Example sentences with the word 'bank' in different contexts
raw_sentences = [
    "He sat by the river bank.",
    "She deposited money in the bank.",
    "The bank was closed on Sunday."
]

# Tokenize sentences
tokenized_sentences = [simple_tokenizer(sent) for sent in raw_sentences]

# Train Word2Vec model
# vector_size: dimension of the embedding vectors (higher = more expressive, but needs more data)
# min_count: ignore words with total frequency lower than this (set to 1 to include all words in this tiny corpus)
# window: maximum distance between the current and predicted word within a sentence (context window size)
# seed / workers: training is stochastic and gensim trains with several worker threads by default,
#   so the numbers printed below would change from run to run. Fixing the seed and using a single
#   worker keeps the cell reproducible under "Restart Kernel -> Run All".
#   NOTE(review): the gensim docs also mention PYTHONHASHSEED for reproducibility across
#   interpreter launches - confirm whether that matters for the installed gensim version.
w2v_model = Word2Vec(
    tokenized_sentences,
    vector_size=10,
    min_count=1,
    window=3,
    seed=42,
    workers=1,
)

# Get embedding for 'bank'
# The vector for 'bank' will be the same regardless of which sentence/context it appears in
print("Word2Vec vector for 'bank':\n", w2v_model.wv['bank'])
print("Word2Vec : Bank's Word Vector Shape", w2v_model.wv['bank'].shape)

# These two results are computed but not displayed by this cell
words = list(w2v_model.wv.index_to_key)  # List all words in the vocabulary
similar = w2v_model.wv.most_similar('bank')  # Find words most similar to 'bank'

# NOTE(review): with only three short sentences the vectors have seen very little training,
# so treat the similarity scores below as an illustration of the API, not as meaningful values.
print("Similarity between 'bank' and 'river':", w2v_model.wv.similarity('bank', 'river'))
print("Similarity between 'bank' and 'money':", w2v_model.wv.similarity('bank', 'money'))

# NOTE:

# - The model uses a sliding window (set by the `window` parameter) to look at neighboring words
#   and learns that words appearing in similar contexts should have similar vectors.

# - The word 'bank' appears in both the "river bank" and the "money bank" contexts,
#   yet it has the same embedding in each of them.
#   The model tries to capture both, but since static embeddings can only assign one vector
#   per word, 'bank' gets a single vector that blends (roughly averages) all of its contexts.

# - This is a limitation of static embeddings:
#   they cannot distinguish between different meanings (senses) of a word.

Word2Vec vector for 'bank':
 [-0.00536227  0.00236431  0.0510335   0.09009273 -0.0930295  -0.07116809
  0.06458873  0.08972988 -0.05015428 -0.03763372]
Word2Vec : Bank's Word Vector Shape (10,)
Similarity between 'bank' and 'river': 0.43182474
Similarity between 'bank' and 'money': -0.1311161


#### 02. Static Embeddings - FastText

In [None]:
from gensim.models import FastText

# Train FastText model on your tokenized sentences
# Each word is broken into character n-grams (e.g., "bank" → <ba, ban, ank, nk>, etc.).
# The word vector is the sum (or average) of its n-gram vectors.
# Training is similar to Word2Vec (CBOW or Skip-gram), but on subwords.
# seed / workers: training is stochastic and gensim trains with several worker threads by default,
#   so a fixed seed and a single worker keep the printed numbers reproducible between runs.
ft_model = FastText(
    tokenized_sentences,
    vector_size=10,
    min_count=1,
    window=3,
    seed=42,
    workers=1,
)

# Get embedding for a known (in-vocabulary) word
print("FastText vector for 'bank':\n", ft_model.wv['bank'])

# FastText can handle OOV (out-of-vocabulary) words using subword information
print("FastText vector for OOV word 'banking':\n", ft_model.wv['banking'])

# Compare with a nonsense word (still gets a vector!)
print("FastText vector for OOV word 'bankzzz':\n", ft_model.wv['bankzzz'])

# Demo: Similarity between 'bank' and 'banking'
# NOTE(review): on a corpus this small the score is illustrative only.
print("Similarity between 'bank' and 'banking':", ft_model.wv.similarity('bank', 'banking'))

# NOTE:
# - FastText is especially useful when you expect to encounter new or rare words,
#   or when you work with morphologically rich languages.

FastText vector for 'bank':
 [-0.02681367 -0.00070708  0.01805622  0.00670038 -0.00706677 -0.01905808
  0.01269603  0.00208769  0.00945567 -0.02328101]
FastText vector for OOV word 'banking':
 [ 0.00135347 -0.00299125  0.00331595  0.01845975  0.0230606   0.00761079
  0.00130226  0.00841182  0.00700071 -0.01047717]
FastText vector for OOV word 'bankzzz':
 [-0.02014219  0.00406886 -0.00836108  0.01079835  0.01020753  0.00953312
  0.01484188 -0.01233316  0.0123117  -0.02921248]
Similarity between 'bank' and 'banking': 0.112472326
