In [35]:
# Toy corpus used by every vectorization example below: one sentence per document.
corpus = [
    "The cat drinks the milk",
    "The cow eats in the park",
    "The fish swims in the lake",
]

In [36]:
# Tokenize the corpus: lower-case every sentence, then split it on whitespace,
# giving one list of word tokens per document.
tokens = list(map(str.split, map(str.lower, corpus)))
tokens

[['the', 'cat', 'drinks', 'the', 'milk'],
 ['the', 'cow', 'eats', 'in', 'the', 'park'],
 ['the', 'fish', 'swims', 'in', 'the', 'lake']]

### One-Hot Encoding

One-Hot Encoding represents each word as a unique binary vector, where only one element is 1, and all others are 0.

⚙️ What does it do?
*   Identify unique words in the corpus.
*   Assign a unique index to each word.
*   Convert words into binary vectors, where each vector has a 1 in the corresponding index.

In [37]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [38]:
# Build the vocabulary: merge the tokens of all documents into one set
# (dropping duplicates), then sort alphabetically for a stable word order.
unique_words = sorted(set().union(*tokens))

In [39]:
# One-Hot Encoding using sklearn.
# The vocabulary is passed as a single-feature column of shape (n_words, 1);
# sparse_output=False asks for a dense array rather than a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
one_hot_vectors = encoder.fit_transform(np.reshape(unique_words, (-1, 1)))

OneHotEncoder(sparse_output=False): sparse_output=False ensures the output is a dense NumPy array instead of a sparse matrix. (In scikit-learn versions before 1.2 this parameter was named sparse.)

fit_transform(): Learns unique word indices and encodes them into one-hot vectors.

In [40]:
# Display the one-hot encoded vectors: row i of one_hot_vectors belongs to unique_words[i].
print("\nOne-Hot Encoding:")
for row_index, word in enumerate(unique_words):
    print(f"{word}: {one_hot_vectors[row_index]}")


One-Hot Encoding:
cat: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
cow: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
drinks: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
eats: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
fish: [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
in: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
lake: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
milk: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
park: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
swims: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
the: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


### Bag Of Words (BOW)
The Bag of Words (BoW) model represents text as a word frequency matrix, ignoring word order but counting occurrences.

⚙️ Mechanism
*   Tokenize sentences into words.
*   Create a vocabulary of unique words.
*   Count occurrences of each word in each document.




In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
# Bag of Words: learn the vocabulary from the corpus first, then turn the same
# corpus into a document-term count matrix (one row per document).
vectorizer = CountVectorizer().fit(corpus)
bow_matrix = vectorizer.transform(corpus)

# Vocabulary terms, in the same order as the columns of bow_matrix
feature_names = vectorizer.get_feature_names_out()

CountVectorizer(): Converts text into a word frequency matrix.

fit_transform(corpus): Learns vocabulary and creates the document-term matrix.

get_feature_names_out(): Retrieves feature names (words in the vocabulary).

In [43]:
# Display BoW representation: for every document, list each vocabulary term with its count.
print("\nBag of Words Representation:")
for doc_number, counts in enumerate(bow_matrix.toarray(), start=1):
    print(f"\nDocument {doc_number}:")
    for term_index, term in enumerate(feature_names):
        print(f"{term}: {counts[term_index]}")



Bag of Words Representation:

Document 1:
cat: 1
cow: 0
drinks: 1
eats: 0
fish: 0
in: 0
lake: 0
milk: 1
park: 0
swims: 0
the: 2

Document 2:
cat: 0
cow: 1
drinks: 0
eats: 1
fish: 0
in: 1
lake: 0
milk: 0
park: 1
swims: 0
the: 2

Document 3:
cat: 0
cow: 0
drinks: 0
eats: 0
fish: 1
in: 1
lake: 1
milk: 0
park: 0
swims: 1
the: 2


### N-Gram Model

An N-Gram Model captures word sequences (e.g., bigrams, trigrams) instead of individual words, preserving some context.

⚙️ Mechanism
*   Tokenize text into n-word sequences.
*   Count occurrences of n-grams in documents.
*   Represent text as a frequency matrix.


In [44]:
# N-Gram model: ngram_range=(2, 2) restricts the features to bigrams only
# (minimum and maximum n are both 2).
vectorizer_ngram = CountVectorizer(ngram_range=(2, 2))
vectorizer_ngram.fit(corpus)
ngram_matrix = vectorizer_ngram.transform(corpus)

# Bigram strings, in the same order as the columns of ngram_matrix
feature_names_ngram = vectorizer_ngram.get_feature_names_out()

ngram_range=(2,2): Extracts bigrams instead of single words.

fit_transform(): Converts text into an n-gram frequency matrix.

In [45]:
# Display N-Gram representation: per document, each bigram followed by its count.
print("\nN-Gram Representation (Bigrams):")
dense_bigram_counts = ngram_matrix.toarray()
for doc_id in range(len(dense_bigram_counts)):
    print(f"\nDocument {doc_id+1}:")
    for bigram, count in zip(feature_names_ngram, dense_bigram_counts[doc_id]):
        print(f"{bigram}: {count}")


N-Gram Representation (Bigrams):

Document 1:
cat drinks: 1
cow eats: 0
drinks the: 1
eats in: 0
fish swims: 0
in the: 0
swims in: 0
the cat: 1
the cow: 0
the fish: 0
the lake: 0
the milk: 1
the park: 0

Document 2:
cat drinks: 0
cow eats: 1
drinks the: 0
eats in: 1
fish swims: 0
in the: 1
swims in: 0
the cat: 0
the cow: 1
the fish: 0
the lake: 0
the milk: 0
the park: 1

Document 3:
cat drinks: 0
cow eats: 0
drinks the: 0
eats in: 0
fish swims: 1
in the: 1
swims in: 1
the cat: 0
the cow: 0
the fish: 1
the lake: 1
the milk: 0
the park: 0


### TF-IDF
TF-IDF measures word importance based on how frequently a word appears in a document and how rare it is across the corpus.

⚙️ Mechanism

Compute TF (Term Frequency):
*  ùëáùêπ(ùë§)= Number of times word appears in a document / Total words in the document

Compute IDF (Inverse Document Frequency):
*   ùêºùê∑ùêπ(ùë§)= log (Total number of documents / Number of documents containing word w)

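Multiply TF and IDF to get the TF-IDF score.

Note: scikit-learn's `TfidfVectorizer` (with default settings) uses a slightly different variant, so the numbers printed below will not match the textbook formulas exactly. It uses the raw term count as TF, a smoothed IDF $\ln\frac{1 + N}{1 + df(w)} + 1$ (where $N$ is the number of documents and $df(w)$ the number of documents containing $w$), and finally rescales each document vector to unit L2 norm.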

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
# TF-IDF: learn the vocabulary and IDF weights from the corpus, then score each document.
# NOTE: this rebinds `vectorizer` and `feature_names`, which the Bag-of-Words cell
# also assigns; cells that read those names see whichever cell ran last.
vectorizer = TfidfVectorizer().fit(corpus)
tfidf_matrix = vectorizer.transform(corpus)

# Vocabulary terms, in the same order as the columns of tfidf_matrix
feature_names = vectorizer.get_feature_names_out()

TfidfVectorizer(): Converts text into a weighted word frequency matrix.

fit_transform(corpus): Computes TF-IDF scores for each word.


In [48]:
# Display the TF-IDF vectors for each document, scores rounded to 4 decimal places.
print("\nTF-IDF Representation:")
for doc_number, scores in enumerate(tfidf_matrix.toarray(), start=1):
    print(f"\nDocument {doc_number}:")
    for term_index, term in enumerate(feature_names):
        print(f"{term}: {scores[term_index]:.4f}")


TF-IDF Representation:

Document 1:
cat: 0.4770
cow: 0.0000
drinks: 0.4770
eats: 0.0000
fish: 0.0000
in: 0.0000
lake: 0.0000
milk: 0.4770
park: 0.0000
swims: 0.0000
the: 0.5634

Document 2:
cat: 0.0000
cow: 0.4484
drinks: 0.0000
eats: 0.4484
fish: 0.0000
in: 0.3410
lake: 0.0000
milk: 0.0000
park: 0.4484
swims: 0.0000
the: 0.5297

Document 3:
cat: 0.0000
cow: 0.0000
drinks: 0.0000
eats: 0.0000
fish: 0.4484
in: 0.3410
lake: 0.4484
milk: 0.0000
park: 0.0000
swims: 0.4484
the: 0.5297


### Word2Vec (CBOW & Skip-Gram)

Word2Vec is a neural network model that generates dense word embeddings based on context.

⚙️ Mechanism

CBOW (Continuous Bag of Words): Predicts a word given its surrounding context.

Skip-Gram: Predicts surrounding words given a word.

In [49]:
from gensim.models import Word2Vec

#### CBOW

In [50]:
# Train Word2Vec model (CBOW)
# Training is stochastic, so the seed is stated explicitly (1 is gensim's default, so
# the initial vectors are unchanged) and training is limited to a single worker thread:
# with several workers the order of updates depends on thread scheduling, and a
# Restart & Run All is not guaranteed to reproduce the same vectors.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for full determinism — confirm
# whether that matters for the gensim version in use.
w2v_model_cbow = Word2Vec(
    sentences=tokens,
    vector_size=5,  # each word vector has 5 dimensions
    window=2,       # up to 2 context words on either side
    min_count=1,    # keep every word, even if it occurs only once
    sg=0,           # 0 = CBOW (1 would select Skip-Gram)
    seed=1,
    workers=1,
)

vector_size=5: Word vectors have 5 dimensions.

window=2: Considers 2 words on either side as context.

sg=0: Uses CBOW (if sg=1, uses Skip-Gram).

In [51]:
# Display word vectors for CBOW: look up each vocabulary word in the trained model's
# keyed vectors and print its embedding.
print("\nWord2Vec CBOW Representation:")
cbow_vectors = w2v_model_cbow.wv
for word in unique_words:
    print(f"{word}: {cbow_vectors[word]}")


Word2Vec CBOW Representation:
cat: [-0.16315834  0.08991597 -0.08274152  0.01649072  0.16997239]
cow: [-0.03875482  0.16154873 -0.11861791  0.00090325 -0.09507468]
drinks: [-0.00592363 -0.1532248   0.19229487  0.09964116  0.18466286]
eats: [-0.15023164 -0.01860085  0.19076237 -0.14638333 -0.04667537]
fish: [0.14623532 0.10140524 0.13515386 0.01525731 0.12701781]
in: [-0.14233617  0.12917745  0.17945977 -0.10030856 -0.07526743]
lake: [ 0.1476101  -0.03066943 -0.09073226  0.13108103 -0.09720321]
milk: [-0.192071    0.10014586 -0.17519172 -0.0878365  -0.000702  ]
park: [-0.06810732 -0.01892803  0.11537147 -0.15043275 -0.07872207]
swims: [-0.03632035  0.0575316   0.01983747 -0.1657043  -0.18897636]
the: [-0.01072454  0.00472863  0.10206699  0.18018547 -0.186059  ]


#### Skip-Gram

In [52]:
# Train Word2Vec model (Skip-Gram)
# Same hyper-parameters as the CBOW model except sg=1, which selects Skip-Gram.
# The seed is stated explicitly (1 is gensim's default) and training is limited to a
# single worker thread so that a Restart & Run All reproduces the same vectors.
# NOTE(review): the output recorded in this notebook for Skip-Gram is identical to the
# CBOW output, which suggests training barely moved the vectors away from their seeded
# initial values — presumably because the corpus is tiny. Verify with a larger corpus
# or more epochs before drawing conclusions from these numbers.
w2v_model_skipgram = Word2Vec(
    sentences=tokens,
    vector_size=5,  # each word vector has 5 dimensions
    window=2,       # up to 2 context words on either side
    min_count=1,    # keep every word, even if it occurs only once
    sg=1,           # 1 = Skip-Gram
    seed=1,
    workers=1,
)

# Display word vectors for Skip-Gram
print("\nWord2Vec Skip-Gram Representation:")
for word in unique_words:
    print(f"{word}: {w2v_model_skipgram.wv[word]}")


Word2Vec Skip-Gram Representation:
cat: [-0.16315834  0.08991597 -0.08274152  0.01649072  0.16997239]
cow: [-0.03875482  0.16154873 -0.11861791  0.00090325 -0.09507468]
drinks: [-0.00592363 -0.1532248   0.19229487  0.09964116  0.18466286]
eats: [-0.15023164 -0.01860085  0.19076237 -0.14638333 -0.04667537]
fish: [0.14623532 0.10140524 0.13515386 0.01525731 0.12701781]
in: [-0.14233617  0.12917745  0.17945977 -0.10030856 -0.07526743]
lake: [ 0.1476101  -0.03066943 -0.09073226  0.13108103 -0.09720321]
milk: [-0.192071    0.10014586 -0.17519172 -0.0878365  -0.000702  ]
park: [-0.06810732 -0.01892803  0.11537147 -0.15043275 -0.07872207]
swims: [-0.03632035  0.0575316   0.01983747 -0.1657043  -0.18897636]
the: [-0.01072454  0.00472863  0.10206699  0.18018547 -0.186059  ]
