<a href="https://colab.research.google.com/github/Mohamedh0/Amit/blob/main/BOW_%26_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bag of Words (BoW) in Python

## Raw Python

In [None]:
from collections import Counter
# Toy corpus: three short sentences.
doc = ['I love NLP', 'I want to be an NLP Engineer', 'I enjoy mathematics']

# Lower-case and whitespace-split every sentence once, then reuse the tokens
# for both the vocabulary and the per-sentence counts.
tokenized = [sentence.lower().split() for sentence in doc]

# Vocabulary: every distinct token, in alphabetical order.
vocab = sorted({token for tokens in tokenized for token in tokens})

# One row per sentence, one column per vocabulary entry. A Counter returns 0
# for tokens it has never seen, so plain indexing is enough.
bow_matrix = [
    [counts[term] for term in vocab]
    for counts in map(Counter, tokenized)
]

print(f'Vocab : {vocab}')
print(f'BOW : {bow_matrix}')

Vocab : ['an', 'be', 'engineer', 'enjoy', 'i', 'love', 'mathematics', 'nlp', 'to', 'want']
BOW : [[0, 0, 0, 0, 1, 1, 0, 1, 0, 0], [1, 1, 1, 0, 1, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 0, 1, 0, 0, 0]]


## Using Pandas

In [None]:
from collections import Counter

import pandas as pd

# Toy corpus: three short sentences.
text = ['I love NLP', 'I want to be an NLP Engineer', 'I enjoy mathematics']

# Build the vocabulary from `text` itself so this cell does not depend on a
# `doc` variable left behind by another cell.
vocab = sorted(set(word for sent in text for word in sent.lower().split()))

bow_data = []  # one {word: count} dict per sentence
for sent in text:
    word_count = Counter(sent.lower().split())
    bow_data.append({word: word_count.get(word, 0) for word in vocab})

# Passing `columns=vocab` pins the column order to the sorted vocabulary.
bow_df = pd.DataFrame(bow_data, columns=vocab)
bow_df

Unnamed: 0,an,be,engineer,enjoy,i,love,mathematics,nlp,to,want
0,0,0,0,0,1,1,0,1,0,0
1,1,1,1,0,1,0,0,1,1,1
2,0,0,0,1,1,0,1,0,0,0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Same three sentences, now counted by scikit-learn instead of by hand.
doc = [
    'I love NLP',
    'I want to be an NLP Engineer',
    'I enjoy mathematics',
]

# NOTE: CountVectorizer's default token pattern only keeps tokens of two or more
# characters, which is why the printed vocabulary has no 'i' column.
vectorizer = CountVectorizer().fit(doc)      # learn the vocabulary
bow_matrix = vectorizer.transform(doc)       # sparse document-term count matrix
feature_names = vectorizer.get_feature_names_out()

# Dense counts first, then the column labels, one per line.
print(bow_matrix.toarray(), feature_names, sep='\n')

[[0 0 0 0 1 0 1 0 0]
 [1 1 1 0 0 0 1 1 1]
 [0 0 0 1 0 1 0 0 0]]
['an' 'be' 'engineer' 'enjoy' 'love' 'mathematics' 'nlp' 'to' 'want']


# TF-IDF

## Raw Python

In [None]:
import math
from collections import Counter

# Toy corpus used for the TF-IDF computation.
documents = [
    'I love NLP',
    'I want to be an NLP Engineer',
    'I enjoy mathematics',
]

# Distinct lower-cased, whitespace-separated tokens in alphabetical order.
vocab = sorted({token for sentence in documents for token in sentence.lower().split()})

def compute_tf(doc, vocabulary=None):
    """Return the term frequency of every vocabulary word in one document.

    Args:
        doc: Document text; it is lower-cased and split on whitespace.
        vocabulary: Iterable of words to score. Defaults to the notebook-level
            ``vocab`` so existing one-argument calls keep working.

    Returns:
        dict mapping each vocabulary word to
        ``occurrences in doc / number of tokens in doc``. Every value is 0.0
        when the document contains no tokens.
    """
    if vocabulary is None:
        vocabulary = vocab
    tokens = doc.lower().split()
    if not tokens:
        # An empty or whitespace-only document would otherwise divide by zero.
        return {word: 0.0 for word in vocabulary}
    word_count = Counter(tokens)
    doc_length = len(tokens)
    # Counter yields 0 for unseen words, so absent vocabulary words score 0.0.
    return {word: word_count[word] / doc_length for word in vocabulary}

def compute_idf(corpus, vocabulary=None):
    """Return the inverse document frequency of every vocabulary word.

    The weight is ``log(N / (1 + df))`` where ``N`` is the number of documents
    and ``df`` the number of documents containing the word. With this variant a
    word present in every document gets a negative weight, and a word missing
    from exactly one document gets 0.

    Args:
        corpus: Sequence of document strings; each is lower-cased and split on
            whitespace.
        vocabulary: Iterable of words to score. Defaults to the notebook-level
            ``vocab`` so existing one-argument calls keep working.

    Returns:
        dict mapping each vocabulary word to its IDF weight.

    Raises:
        ValueError: if ``corpus`` is empty while the vocabulary is not
            (``math.log`` of zero).
    """
    if vocabulary is None:
        vocabulary = vocab
    # Tokenise each document once, as a set, instead of re-splitting it for
    # every vocabulary word.
    token_sets = [set(doc.lower().split()) for doc in corpus]
    doc_count = len(token_sets)
    idf = {}
    for word in vocabulary:
        containing_docs = sum(word in tokens for tokens in token_sets)
        idf[word] = math.log(doc_count / (1 + containing_docs))
    return idf

# Corpus-wide IDF weights and one term-frequency dict per document.
idf = compute_idf(documents)
tf = list(map(compute_tf, documents))

# TF-IDF: one row per document, columns in vocabulary order.
tf_idf = [[doc_tf[word] * idf[word] for word in vocab] for doc_tf in tf]

print(vocab)
print(tf_idf)


['an', 'be', 'engineer', 'enjoy', 'i', 'love', 'mathematics', 'nlp', 'to', 'want']
[[0.0, 0.0, 0.0, 0.0, -0.09589402415059363, 0.13515503603605478, 0.0, 0.0, 0.0, 0.0], [0.05792358687259491, 0.05792358687259491, 0.05792358687259491, 0.0, -0.04109743892168299, 0.0, 0.0, 0.0, 0.05792358687259491, 0.05792358687259491], [0.0, 0.0, 0.0, 0.13515503603605478, -0.09589402415059363, 0.0, 0.13515503603605478, 0.0, 0.0, 0.0]]


## Using Pandas

In [None]:
import pandas as pd
import math
from collections import Counter

# Toy corpus: three short sentences.
documents = [
    'I love NLP',
    'I want to be an NLP Engineer',
    'I enjoy mathematics',
]

# Vocabulary: distinct lower-cased, whitespace-separated tokens, alphabetically sorted.
vocab = sorted({token for sentence in documents for token in sentence.lower().split()})

def compute_tf(doc, vocabulary=None):
    """Return the term frequency (TF) of every vocabulary word in one document.

    Args:
        doc: Document text; it is lower-cased and split on whitespace.
        vocabulary: Iterable of words to score. Defaults to the notebook-level
            ``vocab`` so existing one-argument calls keep working.

    Returns:
        dict mapping each vocabulary word to
        ``occurrences in doc / number of tokens in doc``. Every value is 0.0
        when the document contains no tokens.
    """
    if vocabulary is None:
        vocabulary = vocab
    word_counts = Counter(doc.lower().split())
    doc_length = sum(word_counts.values())
    if doc_length == 0:
        # An empty or whitespace-only document would otherwise divide by zero.
        return {word: 0.0 for word in vocabulary}
    # Counter yields 0 for unseen words, so absent vocabulary words score 0.0.
    return {word: word_counts[word] / doc_length for word in vocabulary}

# Total number of documents
doc_count = len(documents)

# Tokenise every document once. The sets avoid re-lowering and re-splitting
# each document for every vocabulary word and make membership checks cheap.
doc_token_sets = [set(doc.lower().split()) for doc in documents]

# Inverse document frequency (IDF) for each word: log(N / (1 + df)), where df is
# the number of documents containing the word. A word present in every document
# therefore gets a negative weight.
idf = {
    word: math.log(doc_count / (1 + sum(word in tokens for tokens in doc_token_sets)))
    for word in vocab
}

# TF-IDF rows: weight each document's term frequencies by the corpus-wide IDF.
tf_idf_data = [
    {word: tf_row[word] * idf[word] for word in vocab}
    for tf_row in map(compute_tf, documents)
]

# One row per document, one column per vocabulary word.
tf_idf_df = pd.DataFrame(tf_idf_data)

# Show the table as plain text.
print(tf_idf_df)


         an        be  engineer     enjoy         i      love  mathematics  \
0  0.000000  0.000000  0.000000  0.000000 -0.095894  0.135155     0.000000   
1  0.057924  0.057924  0.057924  0.000000 -0.041097  0.000000     0.000000   
2  0.000000  0.000000  0.000000  0.135155 -0.095894  0.000000     0.135155   

   nlp        to      want  
0  0.0  0.000000  0.000000  
1  0.0  0.057924  0.057924  
2  0.0  0.000000  0.000000  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Toy corpus: three short sentences.
documents = [
    'I love NLP',
    'I want to be an NLP Engineer',
    'I enjoy mathematics',
]

# Vectorizer with scikit-learn's default settings.
# NOTE: those defaults (smoothed IDF, L2-normalised rows, one-letter tokens such
# as 'i' ignored) differ from the log(N / (1 + df)) formula, so the scores are
# not expected to match a hand-computed table based on it.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights, then score every document (sparse matrix).
tf_idf_matrix = vectorizer.fit_transform(documents)

# Densify the scores and label the columns with the learned terms for readability.
dense_scores = tf_idf_matrix.toarray()
term_labels = vectorizer.get_feature_names_out()
tf_idf_df = pd.DataFrame(data=dense_scores, columns=term_labels)

# Show the table as plain text.
print(tf_idf_df)


         an        be  engineer     enjoy      love  mathematics       nlp  \
0  0.000000  0.000000  0.000000  0.000000  0.795961     0.000000  0.605349   
1  0.423394  0.423394  0.423394  0.000000  0.000000     0.000000  0.322002   
2  0.000000  0.000000  0.000000  0.707107  0.000000     0.707107  0.000000   

         to      want  
0  0.000000  0.000000  
1  0.423394  0.423394  
2  0.000000  0.000000  
