In [1]:
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
vectors = vectorizer.transform(corpus)

In [47]:
d = vectorizer.vocabulary_
for i in sorted(d):
    print(i, ":", d[i]) 

and : 0
are : 1
bodies : 2
celestial : 3
is : 4
moon : 5
satellite : 6
star : 7
sun : 8
the : 9


In [48]:
print(vectors.toarray())

[[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]


In [49]:
vectors

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 15 stored elements and shape (3, 10)>

In [50]:
import pandas as pd
df = pd.DataFrame(
    vectors.toarray(), 
    columns=vectorizer.get_feature_names_out()
)
print(df)

   and  are  bodies  celestial  is  moon  satellite  star  sun  the
0    0    0       0          0   1     0          0     1    1    1
1    0    0       0          0   1     1          1     0    0    1
2    1    1       1          1   0     1          0     0    1    1


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)

In [52]:
d = vectorizer.vocabulary_
for i in sorted(d):
    print(i, ":", d[i]) 

and : 0
are : 1
bodies : 2
celestial : 3
is : 4
moon : 5
satellite : 6
star : 7
sun : 8
the : 9


In [53]:
print(vectorizer.idf_)

[1.69314718 1.69314718 1.69314718 1.69314718 1.28768207 1.28768207
 1.69314718 1.69314718 1.28768207 1.        ]


In [54]:
print(vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15 stored elements and shape (3, 10)>
  Coords	Values
  (0, 9)	0.3731188059313277
  (0, 8)	0.4804583972923858
  (0, 4)	0.4804583972923858
  (0, 7)	0.6317450542765208
  (1, 9)	0.3731188059313277
  (1, 4)	0.4804583972923858
  (1, 5)	0.4804583972923858
  (1, 6)	0.6317450542765208
  (2, 9)	0.2517108425440014
  (2, 8)	0.3241235393856436
  (2, 5)	0.3241235393856436
  (2, 0)	0.42618350336974425
  (2, 1)	0.42618350336974425
  (2, 3)	0.42618350336974425
  (2, 2)	0.42618350336974425


In [55]:
print(vectors.toarray())

[[0.         0.         0.         0.         0.4804584  0.
  0.         0.63174505 0.4804584  0.37311881]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.32412354
  0.         0.         0.32412354 0.25171084]]


In [56]:
import pandas as pd
df = pd.DataFrame(
    vectors.toarray(), 
    columns=vectorizer.get_feature_names_out()
)
print(df)

        and       are    bodies  celestial        is      moon  satellite  \
0  0.000000  0.000000  0.000000   0.000000  0.480458  0.000000   0.000000   
1  0.000000  0.000000  0.000000   0.000000  0.480458  0.480458   0.631745   
2  0.426184  0.426184  0.426184   0.426184  0.000000  0.324124   0.000000   

       star       sun       the  
0  0.631745  0.480458  0.373119  
1  0.000000  0.000000  0.373119  
2  0.000000  0.324124  0.251711  


In [60]:
print(" ".join(corpus).split())

['the', 'sun', 'is', 'a', 'star', 'the', 'moon', 'is', 'a', 'satellite', 'the', 'sun', 'and', 'moon', 'are', 'celestial', 'bodies']


In [65]:
vocab = list(set(" ".join(corpus).split()))
print(vocab)

['bodies', 'sun', 'satellite', 'star', 'the', 'moon', 'are', 'a', 'celestial', 'is', 'and']


In [66]:
d = {}
for i, word in enumerate(vocab):
    d[word] = i
print(d)

{'bodies': 0, 'sun': 1, 'satellite': 2, 'star': 3, 'the': 4, 'moon': 5, 'are': 6, 'a': 7, 'celestial': 8, 'is': 9, 'and': 10}


In [81]:
def tf(word, doc):
    count = 0
    for i in doc.split():
        if word==i:
            count+=1
    return count/len(doc.split())

In [82]:
import math
def idf(word,corpus):
    count = 0
    for doc in corpus:
        if word in doc.split():
            count+=1
    return math.log(len(corpus)/count)

In [83]:
df = pd.DataFrame(columns=vocab)
vector = []
for i in range(len(corpus)):
    for j in vocab:
        vector.append(tf(j, corpus[i])*idf(j,corpus))
    df.loc[i]=vector
    vector=[]
print(df)

     bodies       sun  satellite      star  the      moon       are         a  \
0  0.000000  0.081093   0.000000  0.219722  0.0  0.000000  0.000000  0.081093   
1  0.000000  0.000000   0.219722  0.000000  0.0  0.081093  0.000000  0.081093   
2  0.156945  0.057924   0.000000  0.000000  0.0  0.057924  0.156945  0.000000   

   celestial        is       and  
0   0.000000  0.081093  0.000000  
1   0.000000  0.081093  0.000000  
2   0.156945  0.000000  0.156945  
