In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import math
import pandas as pd

In [8]:
# Toy corpus used by the cells below: three short, lowercase sentences whose
# words are separated by single spaces.
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

class TFIDF():
    """Compute TF-IDF scores for single terms against a fixed corpus.

    Documents are plain strings tokenised on whitespace; matching is exact and
    case-sensitive (no lower-casing, stemming or punctuation stripping).

    Args:
        corpus: Sequence of document strings used for the IDF statistics.
    """

    def __init__(self,corpus):
        self.corpus=corpus

    def td(self, term, doc):
        """Return the term frequency of ``term`` in ``doc``.

        TF(word) = (number of times word appears in document) / (total words in document)

        Args:
            term: Token to count (exact match against whitespace-split tokens).
            doc: Document string.

        Returns:
            The frequency as a float in [0, 1]; 0.0 for a document with no tokens.
        """
        # split() with no argument collapses runs of whitespace, so repeated
        # spaces do not add empty "words" to the denominator.
        words = doc.split()
        if not words:
            return 0.0
        return words.count(term) / len(words)

    def idf(self,term):
        """Return the inverse document frequency of ``term`` over ``self.corpus``.

        IDF(word) = log(total number of documents / number of documents containing the word)

        Args:
            term: Token to look up (exact match against whitespace-split tokens).

        Returns:
            The natural-log IDF, or 0.0 when no document contains ``term``
            (the ratio is undefined there, and the term's TF is 0 anyway).
        """
        # Use the instance's corpus, not a module-level variable, and compare
        # whole tokens so that e.g. "a" does not match inside "star" or "and".
        doc_count = sum(1 for doc in self.corpus if term in doc.split())
        if doc_count == 0:
            return 0.0
        return math.log(len(self.corpus) / doc_count)

    def tf_idf(self , term , doc ):
        """Return TF(term, doc) * IDF(term) for ``doc`` against ``self.corpus``."""
        return self.td(term, doc) * self.idf(term)



In [10]:
# Build a scorer over the toy corpus.
tfidf=TFIDF(corpus)
# Score the term "sun" against the first document; as the cell's last
# expression, the returned value is what the notebook displays.
tfidf.tf_idf("sun",corpus[0])

0.08109302162163289

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import math
import pandas as pd

# Manual TF-IDF implementation
class ManualTFIDF:
    """Hand-rolled TF-IDF over a list of whitespace-tokenised documents.

    Term and inverse document frequencies are computed once at construction
    and stored on the instance. Keys of every returned dict follow the order
    in which words first appear, so results (and any DataFrame built from
    them) have the same column order on every run instead of depending on
    set iteration order.

    Attributes:
        corpus: The documents passed to the constructor.
        tf: List with one ``{word: term frequency}`` dict per document.
        idf: ``{word: inverse document frequency}`` for the whole corpus.
    """

    def __init__(self, corpus):
        self.corpus = corpus
        self.tf = self.compute_tf()
        self.idf = self.compute_idf()

    def compute_tf(self):
        """Return one ``{word: count / total words}`` dict per document.

        A document with no tokens yields an empty dict.
        """
        tf_list = []
        for doc in self.corpus:
            words = doc.split()
            # Single pass over the tokens; dict insertion order gives
            # first-occurrence ordering of the keys.
            counts = {}
            for word in words:
                counts[word] = counts.get(word, 0) + 1
            total = len(words)
            tf_list.append({word: count / total for word, count in counts.items()})
        return tf_list

    def compute_idf(self):
        """Return ``{word: log(N / number of documents containing word)}``.

        ``N`` is the number of documents in the corpus. Every key comes from
        the corpus itself, so the document count is always at least 1.
        """
        N = len(self.corpus)
        doc_freq = {}
        for doc in self.corpus:
            # dict.fromkeys de-duplicates within the document while keeping
            # order, so each document contributes at most 1 per word.
            for word in dict.fromkeys(doc.split()):
                doc_freq[word] = doc_freq.get(word, 0) + 1
        return {word: math.log(N / containing) for word, containing in doc_freq.items()}

    def compute_tfidf(self):
        """Return one ``{word: tf * idf}`` dict per document, in corpus order."""
        tfidf = []
        for doc_tf in self.tf:
            tfidf_doc = {word: doc_tf[word] * self.idf[word] for word in doc_tf}
            tfidf.append(tfidf_doc)
        return tfidf

# Hand-rolled TF-IDF: compute_tfidf() returns one dict per document, which
# pandas turns into one row per document and one column per word.
manual_tfidf = ManualTFIDF(corpus).compute_tfidf()
# A word absent from a document's dict shows up as NaN; treat it as a score of 0.
manual_df = pd.DataFrame(manual_tfidf).fillna(0)
print("Manual TF-IDF:\n", manual_df)

# CountVectorizer: per-document term counts, one column per vocabulary entry.
# NOTE(review): the printed table has no column for 'a' - presumably the
# default tokenizer drops single-character tokens; confirm in the scikit-learn docs.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(corpus)
cv_df = pd.DataFrame(cv_matrix.toarray(), columns=cv.get_feature_names_out())
print("\nCountVectorizer:\n", cv_df)

# TfidfVectorizer: scikit-learn's TF-IDF weights for the same corpus.
# NOTE(review): these values differ from the manual table above - presumably
# the defaults apply IDF smoothing and per-row normalisation; confirm in the
# scikit-learn docs before comparing the two tables number for number.
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(corpus)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vec.get_feature_names_out())
print("\nTfidfVectorizer:\n", tfidf_df)

Manual TF-IDF:
    the        is       sun      star         a      moon  satellite    bodies  \
0  0.0  0.081093  0.081093  0.219722  0.081093  0.000000   0.000000  0.000000   
1  0.0  0.081093  0.000000  0.000000  0.081093  0.081093   0.219722  0.000000   
2  0.0  0.000000  0.057924  0.000000  0.000000  0.057924   0.000000  0.156945   

   celestial       are       and  
0   0.000000  0.000000  0.000000  
1   0.000000  0.000000  0.000000  
2   0.156945  0.156945  0.156945  

CountVectorizer:
    and  are  bodies  celestial  is  moon  satellite  star  sun  the
0    0    0       0          0   1     0          0     1    1    1
1    0    0       0          0   1     1          1     0    0    1
2    1    1       1          1   0     1          0     0    1    1

TfidfVectorizer:
         and       are    bodies  celestial        is      moon  satellite  \
0  0.000000  0.000000  0.000000   0.000000  0.480458  0.000000   0.000000   
1  0.000000  0.000000  0.000000   0.000000  0.480458  0