<a href="https://colab.research.google.com/github/Mohamedh0/ML-Algorithms-from-scratch/blob/main/BOW_TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BOW**

## **Python Raw Implementation**

In [7]:
from collections import Counter

doc = ["I love NLP", "I want to be an NLP Engineer", "I enjoy mathematics", "Playing PS, and LOL"]

# Lower-case and whitespace-split every sentence once, up front.
# Note: plain whitespace splitting keeps punctuation attached (e.g. "ps,").
tokenized = [sentence.lower().split() for sentence in doc]

# Vocabulary = every distinct token, in alphabetical order
vocab = sorted({token for tokens in tokenized for token in tokens})

# One count vector per sentence, with columns aligned to `vocab`
bow = []
for tokens in tokenized:
  counts = Counter(tokens)  # a Counter yields 0 for tokens it has not seen
  bow.append([counts[word] for word in vocab])

print(f"Vocab:  {vocab}")
print(f"BOW Matrix: {bow}")

Vocab:  ['an', 'and', 'be', 'engineer', 'enjoy', 'i', 'lol', 'love', 'mathematics', 'nlp', 'playing', 'ps,', 'to', 'want']
BOW Matrix: [[0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0], [1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0]]


## **Using Pandas**

In [8]:
import pandas as pd

# Collect one {word: count} record per sentence, then build the frame in a
# single call (cheaper than growing a DataFrame row by row).
records = []
for sentence in doc:
  counts = Counter(sentence.lower().split())
  records.append({word: counts[word] for word in vocab})

# `columns=vocab` pins the column order to the sorted vocabulary
bow_pd = pd.DataFrame(records, columns=vocab)
bow_pd

Unnamed: 0,an,and,be,engineer,enjoy,i,lol,love,mathematics,nlp,playing,"ps,",to,want
0,0,0,0,0,0,1,0,1,0,1,0,0,0,0
1,1,0,1,1,0,1,0,0,0,1,0,0,1,1
2,0,0,0,0,1,1,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,1,0,0,0,1,1,0,0


## **Using Sklearn**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# With its default settings CountVectorizer lower-cases the text and keeps only
# tokens of two or more word characters, so "I" and the comma after "PS" do
# not appear among the features (unlike the hand-built vocabulary above).
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(doc)  # sparse document-term count matrix

feature_names = vectorizer.get_feature_names_out()
dense_counts = bow_matrix.toarray()  # densify only for display
print(feature_names)
print(dense_counts)

['an' 'and' 'be' 'engineer' 'enjoy' 'lol' 'love' 'mathematics' 'nlp'
 'playing' 'ps' 'to' 'want']
[[0 0 0 0 0 0 1 0 1 0 0 0 0]
 [1 0 1 1 0 0 0 0 1 0 0 1 1]
 [0 0 0 0 1 0 0 1 0 0 0 0 0]
 [0 1 0 0 0 1 0 0 0 1 1 0 0]]


# **TF-IDF**

## **Raw Python**

In [11]:
import math

def compute_tf(sent, vocabulary=None):
    """Compute the term frequency (TF) of every vocabulary word in a sentence.

    The sentence is lower-cased and split on whitespace. A word's TF is its
    number of occurrences divided by the total number of tokens in the
    sentence.

    Args:
        sent: The sentence (str) to analyse.
        vocabulary: Iterable of words to score. When omitted, the
            notebook-level ``vocab`` list is used.

    Returns:
        dict mapping each vocabulary word to its TF as a float. Words that do
        not occur get 0.0; a sentence without any tokens gives 0.0 for every
        word instead of raising ZeroDivisionError.
    """
    words = vocab if vocabulary is None else vocabulary
    tokens = sent.lower().split()  # tokenise once; reused for counts and length
    if not tokens:
        # Empty or whitespace-only sentence: nothing to count, and dividing
        # by its length would fail.
        return {word: 0.0 for word in words}
    word_count = Counter(tokens)
    sent_len = len(tokens)
    return {word: word_count[word] / sent_len for word in words}

def compute_idf(corpus, vocabulary=None):
    """Compute the inverse document frequency (IDF) of every vocabulary word.

    Each document is lower-cased and split on whitespace. For a word that
    occurs in ``df`` of the ``N`` documents the score is
    ``log(N / (1 + df))`` (natural logarithm).

    Note: because only the denominator is smoothed, a word present in all
    but one document scores 0 and a word present in every document scores
    below 0.

    Args:
        corpus: Sized collection of documents (str).
        vocabulary: Iterable of words to score. When omitted, the
            notebook-level ``vocab`` list is used.

    Returns:
        dict mapping each vocabulary word to its IDF as a float.

    Raises:
        ValueError: If ``corpus`` is empty (the IDF is undefined without
            documents).
    """
    words = vocab if vocabulary is None else vocabulary
    doc_count = len(corpus)
    if doc_count == 0:
        # Without this guard math.log(0) fails with an opaque "math domain error".
        raise ValueError("compute_idf() needs at least one document in the corpus")

    # Tokenise each document once instead of once per vocabulary word; sets
    # make the membership test below constant-time.
    doc_tokens = [set(text.lower().split()) for text in corpus]

    idf = {}
    for word in words:
        containing_docs = sum(1 for tokens in doc_tokens if word in tokens)
        idf[word] = math.log(doc_count / (1 + containing_docs))
    return idf

# Per-sentence term frequencies and corpus-wide inverse document frequencies
tf = list(map(compute_tf, doc))
idf = compute_idf(doc)

# TF-IDF: multiply each sentence's TF by the word's IDF, column order = vocab
tf_idf = [
    [term_freqs.get(word, 0) * idf.get(word, 1) for word in vocab]
    for term_freqs in tf
]

# Show the vocabulary next to the resulting vectors
print("Vocab:", vocab)
print("TF-IDF Vector:", tf_idf)

Vocab: ['an', 'and', 'be', 'engineer', 'enjoy', 'i', 'lol', 'love', 'mathematics', 'nlp', 'playing', 'ps,', 'to', 'want']
TF-IDF Vector: [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.23104906018664842, 0.0, 0.09589402415059362, 0.0, 0.0, 0.0, 0.0], [0.09902102579427789, 0.0, 0.09902102579427789, 0.09902102579427789, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04109743892168297, 0.0, 0.0, 0.09902102579427789, 0.09902102579427789], [0.0, 0.0, 0.0, 0.0, 0.23104906018664842, 0.0, 0.0, 0.0, 0.23104906018664842, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.17328679513998632, 0.0, 0.0, 0.0, 0.0, 0.17328679513998632, 0.0, 0.0, 0.0, 0.17328679513998632, 0.17328679513998632, 0.0, 0.0]]


## **Pandas**

In [12]:
# Tabular view of the raw TF-IDF vectors: one row per sentence, one column per word
tf_idf_df = pd.DataFrame(
    data=tf_idf,
    columns=vocab,
)
tf_idf_df

Unnamed: 0,an,and,be,engineer,enjoy,i,lol,love,mathematics,nlp,playing,"ps,",to,want
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231049,0.0,0.095894,0.0,0.0,0.0,0.0
1,0.099021,0.0,0.099021,0.099021,0.0,0.0,0.0,0.0,0.0,0.041097,0.0,0.0,0.099021,0.099021
2,0.0,0.0,0.0,0.0,0.231049,0.0,0.0,0.0,0.231049,0.0,0.0,0.0,0.0,0.0
3,0.0,0.173287,0.0,0.0,0.0,0.0,0.173287,0.0,0.0,0.0,0.173287,0.173287,0.0,0.0


## **Using Sklearn**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# These values differ from the raw implementation: by default scikit-learn
# uses a smoothed IDF, ln((1 + n) / (1 + df)) + 1, L2-normalises every row,
# and its tokenizer drops one-character tokens and punctuation.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(doc)  # sparse TF-IDF matrix

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
dense_tfidf = tfidf_matrix.toarray()  # densify only for display
print(tfidf_feature_names)
print(dense_tfidf)

['an' 'and' 'be' 'engineer' 'enjoy' 'lol' 'love' 'mathematics' 'nlp'
 'playing' 'ps' 'to' 'want']
[[0.         0.         0.         0.         0.         0.
  0.78528828 0.         0.6191303  0.         0.         0.
  0.        ]
 [0.42176478 0.         0.42176478 0.42176478 0.         0.
  0.         0.         0.3325242  0.         0.         0.42176478
  0.42176478]
 [0.         0.         0.         0.         0.70710678 0.
  0.         0.70710678 0.         0.         0.         0.
  0.        ]
 [0.         0.5        0.         0.         0.         0.5
  0.         0.         0.         0.5        0.5        0.
  0.        ]]
