### Corpus

In [26]:
## SkLearn# Collection of string documents

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

### SkLearn Implementation

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [28]:
# sklearn feature names, they are sorted in alphabetic order by default.

print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']




In [29]:
# Here we will print the sklearn tfidf vectorizer idf values after applying the fit method
# After using the fit function on the corpus the vocab has 9 words in it, and each has its idf value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [30]:
# shape of sklearn tfidf vectorizer output after applying transform method.

skl_output.shape

(4, 9)

In [31]:
# sklearn tfidf values for first line of the above corpus.
# Here the output is a sparse matrix

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [32]:
# sklearn tfidf values for first line of the above corpus.
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [33]:
# Compare your results with the above sklearn tfidf vectorizer


from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

In [34]:
# create a function 

def fit(dataset):
  unique_words= set()

  if isinstance(dataset,(list,)): #isinstance is used to check first argument is of class of secon arg-it is true or false
    for row in dataset: #reviews
      for word in row.split():
        if len(word)<2: # to check any punctuations
          continue
        unique_words.add(word)
    unique_words = sorted(list(unique_words))
    vocab = {j:i for i,j in enumerate(unique_words)}

    return vocab
  else:
    print("you need to pass list of sentance")

In [35]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

vocab=fit(corpus)
print(vocab)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [36]:
# create a transform function for it

def transform(dataset,vocab):
  rows = []
  columns = []
  values = []
  idf= 0
  final_tf = []
  if isinstance(dataset, (list,)):
 
    for idx, row in enumerate(tqdm(dataset)):
      b=len(row.split())
      c=len(dataset)
      word_freq = dict((Counter(row.split())))
      for word, freq in word_freq.items():
        tf=(freq/b)
        for i in dataset:
          if word in i:
            idf =idf+ 1
        Num = 1 + c
        Den = 1 + idf
        idf_1 = (1 +(np.log(Num/Den)))
        tf_idf = ((tf)*(idf_1))
        idf=0 
        if len(word) < 2:
          continue
        col_index = vocab.get(word, -1)

        if col_index !=-1:
          rows.append(idx)
          columns.append(col_index)
          values.append(tf_idf)
    spa_mat= csr_matrix((values, (rows,columns)), shape=(len(dataset),len(vocab)))
    normalised_sparse= normalize(spa_mat, norm='l2', axis=1, copy=True, return_norm=False)
    return normalised_sparse
  else:
    print("you need to pass list of strings")       
print(transform(corpus, vocab))


100%|██████████| 4/4 [00:00<00:00, 8359.35it/s]

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149





In [37]:
print(transform(corpus, vocab).toarray())

100%|██████████| 4/4 [00:00<00:00, 12255.09it/s]

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]





In [38]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type
from google.colab import drive
drive.mount("/content/drive")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
import pickle
with open('/content/drive/MyDrive/cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [40]:
from operator import itemgetter
def fit_50(dataset):
  idf= 0
  final_idf = {}
  if isinstance(dataset, (list,)):
    c=len(dataset)
    for idx, row in enumerate(tqdm(dataset)): 
      word_freq = dict(Counter(row.split()))
      for word, freq in word_freq.items():
        for i in dataset:
          if word in i:
            idf =idf+ 1
        Num = 1 + c
        Den = 1 + idf
        idf_1 = (1 +(np.log(Num/Den)))
        final_idf.update({word:idf_1})
        idf=0 
    res = dict(sorted(final_idf.items(), key = itemgetter(1), reverse = True)[:50])
    print(res)
    vocab = {j:i for i,j in enumerate(res)}
    return vocab
  else:
    print("you need to pass list of strings") 

print(fit_50(corpus))

100%|██████████| 746/746 [00:00<00:00, 967.24it/s]

{'aimless': 6.922918004572872, 'distressed': 6.922918004572872, 'drifting': 6.922918004572872, 'nearly': 6.922918004572872, 'attempting': 6.922918004572872, 'artiness': 6.922918004572872, 'gerardo': 6.922918004572872, 'emptiness': 6.922918004572872, 'messages': 6.922918004572872, 'buffet': 6.922918004572872, 'science': 6.922918004572872, 'teacher': 6.922918004572872, 'owls': 6.922918004572872, 'florida': 6.922918004572872, 'muppets': 6.922918004572872, 'overdue': 6.922918004572872, 'screenplay': 6.922918004572872, 'post': 6.922918004572872, 'practically': 6.922918004572872, 'structure': 6.922918004572872, 'tightly': 6.922918004572872, 'constructed': 6.922918004572872, 'vitally': 6.922918004572872, 'occurs': 6.922918004572872, 'content': 6.922918004572872, 'dozen': 6.922918004572872, 'highest': 6.922918004572872, 'superlative': 6.922918004572872, 'require': 6.922918004572872, 'puzzle': 6.922918004572872, 'solving': 6.922918004572872, 'fit': 6.922918004572872, 'pulls': 6.922918004572872,




In [41]:
vocab = fit_50(corpus)
print(transform(corpus, vocab))

100%|██████████| 746/746 [00:00<00:00, 938.01it/s]


{'aimless': 6.922918004572872, 'distressed': 6.922918004572872, 'drifting': 6.922918004572872, 'nearly': 6.922918004572872, 'attempting': 6.922918004572872, 'artiness': 6.922918004572872, 'gerardo': 6.922918004572872, 'emptiness': 6.922918004572872, 'messages': 6.922918004572872, 'buffet': 6.922918004572872, 'science': 6.922918004572872, 'teacher': 6.922918004572872, 'owls': 6.922918004572872, 'florida': 6.922918004572872, 'muppets': 6.922918004572872, 'overdue': 6.922918004572872, 'screenplay': 6.922918004572872, 'post': 6.922918004572872, 'practically': 6.922918004572872, 'structure': 6.922918004572872, 'tightly': 6.922918004572872, 'constructed': 6.922918004572872, 'vitally': 6.922918004572872, 'occurs': 6.922918004572872, 'content': 6.922918004572872, 'dozen': 6.922918004572872, 'highest': 6.922918004572872, 'superlative': 6.922918004572872, 'require': 6.922918004572872, 'puzzle': 6.922918004572872, 'solving': 6.922918004572872, 'fit': 6.922918004572872, 'pulls': 6.922918004572872,

100%|██████████| 746/746 [00:00<00:00, 945.28it/s]

  (0, 0)	0.5773502691896257
  (0, 1)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (1, 3)	1.0
  (2, 4)	0.7071067811865476
  (2, 5)	0.7071067811865476
  (4, 6)	1.0
  (5, 7)	1.0
  (7, 8)	1.0
  (9, 9)	0.5773502691896257
  (9, 10)	0.5773502691896257
  (9, 11)	0.5773502691896257
  (10, 12)	1.0
  (11, 13)	1.0
  (12, 14)	1.0
  (16, 15)	1.0
  (17, 16)	0.7071067811865475
  (17, 17)	0.7071067811865475
  (18, 18)	1.0
  (19, 19)	0.14142135623730948
  (19, 20)	0.14142135623730948
  (19, 21)	0.14142135623730948
  (19, 22)	0.14142135623730948
  (19, 23)	0.14142135623730948
  (19, 24)	0.14142135623730948
  (19, 25)	0.14142135623730948
  (19, 26)	0.14142135623730948
  (19, 27)	0.14142135623730948
  (19, 28)	0.14142135623730948
  (19, 29)	0.14142135623730948
  (19, 30)	0.14142135623730948
  (19, 31)	0.28284271247461895
  (19, 32)	0.14142135623730948
  (19, 33)	0.14142135623730948
  (19, 34)	0.4242640687119284
  (19, 35)	0.14142135623730948
  (19, 36)	0.4242640687119284
  (19, 37)	0.14142135623730948



