<a href="https://colab.research.google.com/github/JunoJames-JJ/AI-ML-Learning/blob/main/Word_Embedding_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**One-hot vectors**

In [None]:
from sklearn.preprocessing import LabelBinarizer # Binarize labels in a one-vs-all fashion.

# LabelBinarizer documentation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html

# Closed vocabulary the encoder is fitted on.
vocab = ["the", "cat", "sat", "on", "mat", "dog", "log"]

# Words to encode; each entry becomes one row of the result.
word = ["sat", "log"]

# fit() learns the class list and hands back the fitted binarizer.
lb = LabelBinarizer().fit(vocab)

# One row per entry of `word`, one column per vocabulary class.
one_hot = lb.transform(word)
sat_vector, log_vector = one_hot

print("One-hot for 'sat':", sat_vector)
print("One-hot for 'log':", log_vector)
# classes_ gives the column order: the vocabulary sorted alphabetically.
print("Feature order:", lb.classes_)

One-hot for 'sat': [0 0 0 0 0 1 0]
One-hot for 'log': [0 0 1 0 0 0 0]
Feature order: ['cat' 'dog' 'log' 'mat' 'on' 'sat' 'the']


**Co-occurrence matrix**

In [None]:
import numpy as np
from collections import defaultdict
# defaultdict: It’s a subclass of Python’s normal dict (dictionary) with one special feature:
# if you try to access a key that doesn’t exist, it automatically creates it with a default value instead of raising a KeyError.

# Toy corpus: two pre-tokenized sentences.
corpus = [
    ["the", "cat", "drinks", "milk"],
    ["the", "dog", "drinks", "water"],
]

# Number of neighbours counted on each side of the centre word.
window_size = 2

# Sorted unique tokens fix the row/column order of the matrix.
vocab = sorted({token for sentence in corpus for token in sentence})
word2idx = dict(zip(vocab, range(len(vocab))))

# co_matrix[r, c] counts how often word c appears in the window around word r.
co_matrix = np.zeros((len(vocab), len(vocab)), dtype=int)

for sentence in corpus:
    sentence_len = len(sentence)
    for centre_pos, centre in enumerate(sentence):
        row = word2idx[centre]
        # Clip the window at the sentence boundaries.
        start = max(centre_pos - window_size, 0)
        stop = min(centre_pos + window_size + 1, sentence_len)
        for neighbour_pos in range(start, stop):
            if neighbour_pos == centre_pos:
                continue  # a word is not its own context
            co_matrix[row, word2idx[sentence[neighbour_pos]]] += 1

print("Vocabulary:", vocab)
print("Co-occurrence matrix:\n", co_matrix)


Vocabulary: ['cat', 'dog', 'drinks', 'milk', 'the', 'water']
Co-occurrence matrix:
 [[0 0 1 1 1 0]
 [0 0 1 0 1 1]
 [1 1 0 1 2 1]
 [1 0 1 0 0 0]
 [1 1 2 0 0 0]
 [0 1 1 0 0 0]]


In [None]:
# Show the word -> index mapping used for the rows/columns of co_matrix.
word2idx

{'cat': 0, 'dog': 1, 'drinks': 2, 'milk': 3, 'the': 4, 'water': 5}

**Bag of Words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# CountVectorizer Library: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# TfidfVectorizer Library: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

docs = [
    "The cat sat on the mat",
    "The dog sat on the log",
    "The mice sat on the log",
]

# --- Raw Counts ---
# Each cell is the number of times a term occurs in a document.
count_vec = CountVectorizer()
X_counts = count_vec.fit_transform(docs)
counts_dense = X_counts.toarray()
print("Raw counts: \n", counts_dense)
print("Vocabulary mapping:", count_vec.vocabulary_)
print("Feature order:\n", count_vec.get_feature_names_out())

# --- Binary presence ---
# binary=True records only whether a term occurs, not how often.
binary_vec = CountVectorizer(binary=True)
X_binary = binary_vec.fit_transform(docs)
binary_dense = X_binary.toarray()
print("\nBinary presence:\n", binary_dense)

# --- Term Frequency (TF) ---
# IDF switched off; the L1 norm turns counts into per-document proportions.
tf_vec = TfidfVectorizer(use_idf=False, norm="l1")
X_tf = tf_vec.fit_transform(docs)
tf_l1_dense = X_tf.toarray()
print("\nTF:\n", tf_l1_dense)

# --- Raw Term Frequency (TF) ---
# Neither IDF nor normalization: the counts come back as floats.
# Note: tf_vec and X_tf are deliberately rebound here, as in the step above.
tf_vec = TfidfVectorizer(use_idf=False, norm=None)
X_tf = tf_vec.fit_transform(docs)
tf_raw_dense = X_tf.toarray()
print("\nRaw TF:\n", tf_raw_dense)

# --- TF–IDF ---
# smooth_idf=True => idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
tfidf_vec = TfidfVectorizer(smooth_idf=True, norm=None)
X_tfidf = tfidf_vec.fit_transform(docs)
tfidf_dense = X_tfidf.toarray()
print("\nTF–IDF:\n", tfidf_dense)

# --- IDF values ---
# idf_ is aligned with the feature-name order, so walk both by position.
print("\nIDF values:")
feature_names = tfidf_vec.get_feature_names_out()
for position, term in enumerate(feature_names):
    val = tfidf_vec.idf_[position]
    print(f"{term}: {val:.3f}")


Raw counts: 
 [[1 0 0 1 0 1 1 2]
 [0 1 1 0 0 1 1 2]
 [0 0 1 0 1 1 1 2]]
Vocabulary mapping: {'the': 7, 'cat': 0, 'sat': 6, 'on': 5, 'mat': 3, 'dog': 1, 'log': 2, 'mice': 4}
Feature order:
 ['cat' 'dog' 'log' 'mat' 'mice' 'on' 'sat' 'the']

Binary presence:
 [[1 0 0 1 0 1 1 1]
 [0 1 1 0 0 1 1 1]
 [0 0 1 0 1 1 1 1]]

TF:
 [[0.16666667 0.         0.         0.16666667 0.         0.16666667
  0.16666667 0.33333333]
 [0.         0.16666667 0.16666667 0.         0.         0.16666667
  0.16666667 0.33333333]
 [0.         0.         0.16666667 0.         0.16666667 0.16666667
  0.16666667 0.33333333]]

Raw TF:
 [[1. 0. 0. 1. 0. 1. 1. 2.]
 [0. 1. 1. 0. 0. 1. 1. 2.]
 [0. 0. 1. 0. 1. 1. 1. 2.]]

TF–IDF:
 [[1.69314718 0.         0.         1.69314718 0.         1.
  1.         2.        ]
 [0.         1.69314718 1.28768207 0.         0.         1.
  1.         2.        ]
 [0.         0.         1.28768207 0.         1.69314718 1.
  1.         2.        ]]

IDF values:
cat: 1.693
dog: 1.693
log: 1.288
mat: 1.693
mice: 1.693
on: 1.000
sat: 1.000
the: 1.000

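Note: The textbook formula for IDF, as cited in the scikit-learn documentation, is idf(t) = log(n / (df(t) + 1)),
but scikit-learn uses idf(t) = log(n / df(t)) + 1 (`smooth_idf=False`) or the smoothed variant idf(t) = log((1 + n) / (1 + df(t))) + 1 (`smooth_idf=True`).
The smoothing avoids division by zero, and the added 1 keeps IDF values from becoming zero or negative for terms that occur in every document, making the weights more stable in practice.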

https://nlp.stanford.edu/IR-book/


**Working with n-grams**

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat"]

# ngram_range=(min_n, max_n) keeps every n-gram with min_n <= n <= max_n.
# The comment under each print shows the expected feature names.

cv1 = CountVectorizer(ngram_range=(1, 1))  # unigrams only
print("unigrams:", cv1.fit(docs).get_feature_names_out())
# ['cat' 'sat' 'the']

cv2 = CountVectorizer(ngram_range=(1, 2))  # unigrams + bigrams
print("unigrams + bigrams:", cv2.fit(docs).get_feature_names_out())
# ['cat' 'cat sat' 'sat' 'the' 'the cat']

cv3 = CountVectorizer(ngram_range=(2, 2))  # only bigrams
print("only bigrams:", cv3.fit(docs).get_feature_names_out())
# ['cat sat' 'the cat']

# (2, 3) yields bigrams AND trigrams; the label previously said just "trigram".
cv4 = CountVectorizer(ngram_range=(2, 3))  # bigrams + trigrams
print("bigrams + trigrams:", cv4.fit(docs).get_feature_names_out())
# ['cat sat' 'the cat' 'the cat sat']

# (3, 3) yields trigrams only; the label previously said "trigram +".
cv5 = CountVectorizer(ngram_range=(3, 3))  # only trigrams
print("only trigrams:", cv5.fit(docs).get_feature_names_out())
# ['the cat sat']



unigrams: ['cat' 'sat' 'the']
unigrams + bigrams: ['cat' 'cat sat' 'sat' 'the' 'the cat']
only bigrams: ['cat sat' 'the cat']
trigram:  ['cat sat' 'the cat' 'the cat sat']
trigram +:  ['the cat sat']


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = ["the cat sat", "the dog ran"]

# --- Bigram-only raw counts ---
cv3 = CountVectorizer(ngram_range=(2, 2))
X_counts = cv3.fit_transform(docs)
count_features = cv3.get_feature_names_out()
print("Bigrams:", count_features)
print("Counts:\n", X_counts.toarray())

# --- Bigram-only TF-IDF, with each document row L2-normalized ---
bigram_tfidf_options = {"ngram_range": (2, 2), "smooth_idf": True, "norm": "l2"}
tfidf_vec = TfidfVectorizer(**bigram_tfidf_options)
X_tfidf = tfidf_vec.fit_transform(docs)
tfidf_features = tfidf_vec.get_feature_names_out()
print("\nBigrams:", tfidf_features)
print("TF-IDF:\n", X_tfidf.toarray())


Bigrams: ['cat sat' 'dog ran' 'the cat' 'the dog']
Counts:
 [[1 0 1 0]
 [0 1 0 1]]

Bigrams: ['cat sat' 'dog ran' 'the cat' 'the dog']
TF-IDF:
 [[0.70710678 0.         0.70710678 0.        ]
 [0.         0.70710678 0.         0.70710678]]


**Manually calculating TF × IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

# Raw term counts for the current `docs` (defined in an earlier cell).
count_vec = CountVectorizer()
X_counts = count_vec.fit_transform(docs)

# TF: with use_idf=False only the L1 row normalization is applied.
tf_vec = TfidfTransformer(use_idf=False, norm="l1")
X_tf = tf_vec.fit_transform(X_counts)

# IDF: fit a second transformer just to read its learned per-term weights.
idf_only = TfidfTransformer(use_idf=True, norm=None).fit(X_counts)
idf_values = idf_only.idf_

# Manual TF * IDF: scale each term column of X_tf by that term's IDF.
X_tf_idf = X_tf.multiply(idf_values)
tf_idf_dense = X_tf_idf.toarray()
print("\nTF–IDF:\n", tf_idf_dense)



TF–IDF:
 [[0.46848837 0.         0.         0.46848837 0.33333333]
 [0.         0.46848837 0.46848837 0.         0.33333333]]
