# Assignment 5 : Implementing TFIDF vectorizer

## Task 1 : Build a TFIDF Vectorizer & compare its results with Sklearn

In [94]:
# Toy corpus for Task 1: four short documents, one string per document.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Reference result from scikit-learn: learn vocabulary and idf on the corpus,
# then vectorize that same corpus (fit() returns the fitted vectorizer).
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit(corpus).transform(corpus)

### Custom implementation

In [96]:
import pickle
import operator
import warnings
import numpy as np
from tqdm import tqdm
from collections import Counter
warnings.filterwarnings("ignore")
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer


def fit(corpus):
    """Build the vocabulary of a corpus.

    Parameters
    ----------
    corpus : list of str
        One string per document.

    Returns
    -------
    dict or None
        ``{word: column_index}`` with words in sorted order, or ``None``
        (after printing a message) when ``corpus`` is not a list.
    """
    if not isinstance(corpus, list):
        print("you need to pass list of sentance")
        return None

    unique_words = set()
    for row in corpus:
        # split() with no argument splits on any whitespace run, which is how
        # transform() tokenizes; split(" ") would keep "a\tb" as a single word
        # that transform() could never look up.
        # Tokens shorter than two characters are not part of the vocabulary.
        unique_words.update(word for word in row.split() if len(word) >= 2)

    return {word: index for index, word in enumerate(sorted(unique_words))}

# Learn the Task 1 vocabulary (word -> column index) from the toy corpus.
vocab = fit(corpus)

def Word_Count(vocab, documents=None):
    """Count, for every vocabulary word, how many documents contain it.

    Parameters
    ----------
    vocab : iterable of str
        Words to count (iterating the dict returned by ``fit`` yields them).
    documents : list of str, optional
        Documents to scan. When omitted, the module-level ``corpus`` is used,
        so existing ``Word_Count(vocab)`` calls keep working.

    Returns
    -------
    dict
        ``{word: number_of_documents_containing_word}``.
    """
    if documents is None:
        documents = corpus

    word_count = dict.fromkeys(vocab, 0)
    for row in documents:
        # Match whole tokens only: a substring test (`word in row`) would
        # count "is" for a document that merely contains "this".
        # set() makes each document contribute at most once per word.
        for word in set(row.split()):
            if word in word_count:
                word_count[word] += 1

    return word_count
    
# Document frequency of every vocabulary word, stored in alphabetical key order.
word_countDict = dict(sorted(Word_Count(vocab).items()))
        
def IDF():
    """Return the smoothed idf of every word in the module-level ``word_countDict``.

    Each value is ``1 + ln((N + 1) / (df + 1))`` rounded to 8 decimal places,
    where ``N`` is the number of documents in the module-level ``corpus`` and
    ``df`` is the count stored for the word in ``word_countDict``.
    """
    n_docs = len(corpus)
    return {
        term: round(1 + np.log((n_docs + 1) / (doc_freq + 1)), 8)
        for term, doc_freq in word_countDict.items()
    }

def transform(corpus,vocab):
    """Convert documents into a row-normalized sparse TF-IDF matrix.

    Parameters
    ----------
    corpus : list of str
        Documents to vectorize, one string per document.
    vocab : dict
        ``{word: column_index}`` as returned by ``fit``.

    Returns
    -------
    scipy.sparse matrix or None
        Matrix of shape ``(len(corpus), len(vocab))`` passed through
        ``sklearn.preprocessing.normalize``, or ``None`` (after printing a
        message) when ``corpus`` is not a list.

    Notes
    -----
    idf weights come from the module-level ``IDF()`` helper. A word that is in
    ``vocab`` but missing from those weights raises ``KeyError``.
    """
    # Validate before doing any work.
    if not isinstance(corpus, list):
        print("you need to pass list of strings")
        return None

    idf = IDF()
    rows, columns, values = [], [], []
    for idx, row in enumerate(tqdm(corpus)):
        tokens = row.split()
        for word, freq in Counter(tokens).items():
            col_index = vocab.get(word, -1)
            # Skip short tokens and out-of-vocabulary words *before* touching
            # the idf table; looking up the weight first made unseen words
            # fail with ``float * None``.
            if len(word) < 2 or col_index == -1:
                continue
            rows.append(idx)
            columns.append(col_index)
            # term frequency (count / document length) times idf
            values.append(freq / len(tokens) * idf[word])

    return normalize(csr_matrix((values, (rows,columns)), shape=(len(corpus),len(vocab))))


In [97]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    # Convert the returned array of names to plain str so the printed list
    # looks the same as before.
    skl_feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
else:
    skl_feature_names = vectorizer.get_feature_names()

print("From Sklearn Implementation : feature names, they are sorted in alphabetic order by default ")
print(skl_feature_names)
print()
print("From Custom Implementation : feature names, they are sorted in alphabetic order by default ")
print(list(vocab.keys()))

From Sklearn Implementation : feature names, they are sorted in alphabetic order by default 
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

From Custom Implementation : feature names, they are sorted in alphabetic order by default 
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [98]:
# Compare the idf weights learned by scikit-learn with the custom ones.
idf = IDF()
print("From Sklearn Implementation : idf values after applying the fit method",
      vectorizer.idf_, sep="\n", end="\n\n")
print("From Custom Implementation : idf values after applying the fit method",
      idf.values(), sep="\n")

From Sklearn Implementation : idf values after applying the fit method
[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]

From Custom Implementation : idf values after applying the fit method
dict_values([1.91629073, 1.22314355, 1.51082562, 1.0, 1.91629073, 1.91629073, 1.0, 1.91629073, 1.0])


In [99]:
# Both matrices should have one row per document and one column per vocabulary word.
print("From Sklearn Implementation : shape of sklearn tfidf vectorizer output after applying transform method")
print(skl_output.shape)
print()
# The label previously said "sklearn" for the custom result (copy-paste slip).
print("From Custom Implementation : shape of custom tfidf vectorizer output after applying transform method")
X= transform(corpus, vocab)
print(X.shape)


100%|██████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 4005.06it/s]

From Sklearn Implementation : shape of sklearn tfidf vectorizer output after applying transform method
(4, 9)

From Custom Implementation : shape of sklearn tfidf vectorizer output after applying transform method
(4, 9)





In [100]:
# Show the first document's non-zero entries from each implementation.
print("From Sklearn Implementation : output is a sparse matrix",
      skl_output[0], sep="\n", end="\n\n")
print("From Custom Implementation : output is a sparse matrix",
      X[0], sep="\n")

From Sklearn Implementation : output is a sparse matrix
  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045

From Custom Implementation : output is a sparse matrix
  (0, 1)	0.46979138558088085
  (0, 2)	0.5802858228626505
  (0, 3)	0.3840852413282814
  (0, 6)	0.3840852413282814
  (0, 8)	0.3840852413282814


In [101]:
# Same first-document comparison, densified so the two rows line up column by column.
print("From Sklearn Implementation : converting the sparse output matrix to dense matrix",
      skl_output[0].toarray(), sep="\n", end="\n\n")
print("From Custom Implementation : converting the sparse output matrix to dense matrix",
      X[0].toarray(), sep="\n")

From Sklearn Implementation : converting the sparse output matrix to dense matrix
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

From Custom Implementation : converting the sparse output matrix to dense matrix
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## Task - 2 : Implement max features functionality

In [102]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type

import pickle
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load 'cleaned_strings' when it comes from a trusted source.
# NOTE(review): this rebinds the module-level name `corpus` that the Task 1
# cells above also use, so re-running those cells afterwards sees this data.
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [103]:
def fit(corpus):
    """Build the vocabulary of a corpus.

    Parameters
    ----------
    corpus : list of str
        One string per document.

    Returns
    -------
    dict or None
        ``{word: column_index}`` with words in sorted order, or ``None``
        (after printing a message) when ``corpus`` is not a list.
    """
    if not isinstance(corpus, list):
        print("you need to pass list of sentance")
        return None

    unique_words = set()
    for row in corpus:
        # split() with no argument splits on any whitespace run, which is how
        # IDF() and transform() tokenize; split(" ") would keep "a\tb" as a
        # single word that they could never match.
        # Tokens shorter than two characters are not part of the vocabulary.
        unique_words.update(word for word in row.split() if len(word) >= 2)

    return {word: index for index, word in enumerate(sorted(unique_words))}
        
def IDF(corpus,vocab,max_features=50):
    """Keep the ``max_features`` vocabulary words with the highest idf.

    Parameters
    ----------
    corpus : list of str
        Documents, one string per document. Document frequencies are only
        counted when this is a list; otherwise every word is treated as
        appearing in no document.
    vocab : dict
        ``{word: column_index}`` as returned by ``fit``.
    max_features : int or None, default 50
        Number of words to keep. ``None`` keeps every word. When the
        vocabulary has fewer words than this, all of them are kept.

    Returns
    -------
    (dict, dict)
        ``vocab``: ``{word: column_index}`` for the kept words, indexed in
        sorted word order.
        ``idf``: ``{word: idf}`` for the kept words, ordered by decreasing idf
        (ties keep the order of the input ``vocab``).

    Raises
    ------
    ValueError
        If ``max_features`` is negative.
    """
    if max_features is not None and max_features < 0:
        raise ValueError("max_features must be a non-negative integer or None")

    words = list(vocab.keys())

    # One pass over the corpus instead of re-splitting every document for
    # every word; set() makes each document count at most once per word.
    doc_freq = Counter()
    if isinstance(corpus, list):
        for row in corpus:
            doc_freq.update(set(row.split()))

    n_docs = len(corpus)
    # Smoothed idf: 1 + ln((1 + N) / (1 + df)).
    scores = [1 + np.log((1 + n_docs) / (1 + doc_freq[word])) for word in words]

    # sorted() is stable even with reverse=True, so equal idf values keep
    # their vocabulary order. Slicing (rather than indexing a fixed 50
    # entries) tolerates vocabularies smaller than max_features.
    ranked = sorted(range(len(words)), key=lambda k: scores[k], reverse=True)[:max_features]

    idf = {words[k]: scores[k] for k in ranked}
    vocab = {word: index for index, word in enumerate(sorted(idf))}
    return vocab,idf

def transform(corpus,vocab):
    """Vectorize documents using only the top-idf words chosen by ``IDF``.

    Parameters
    ----------
    corpus : list of str
        Documents to vectorize, one string per document.
    vocab : dict
        Full ``{word: column_index}`` vocabulary as returned by ``fit``; it is
        reduced by ``IDF(corpus, vocab)`` before use.

    Returns
    -------
    scipy.sparse matrix or None
        Matrix of shape ``(len(corpus), number_of_kept_words)`` passed through
        ``sklearn.preprocessing.normalize``, or ``None`` (after printing a
        message) when ``corpus`` is not a list.

    Side effects
    ------------
    Prints the kept words with their idf values and the reduced vocabulary.
    """
    # Validate first so invalid input does not trigger the IDF pass.
    if not isinstance(corpus, list):
        print("you need to pass list of strings")
        return None

    vocab,idf = IDF(corpus,vocab)
    print('\nTop idf valued words in the corpus and their idf values :\n',idf.items())
    print('\nThe updated vocabulary with only top 50 idf valued words :\n',vocab.items())

    rows, columns, values = [], [], []
    for idx, row in enumerate(tqdm(corpus)):
        tokens = row.split()
        for word, freq in Counter(tokens).items():
            # Direct dict lookup; rebuilding list(vocab.keys()) for every
            # token made each membership test linear in the vocabulary size.
            col_index = vocab.get(word)
            if col_index is None or len(word) < 2:
                continue
            rows.append(idx)
            columns.append(col_index)
            # term frequency (count / document length) times idf; IDF()
            # returns idf and vocab with the same keys.
            values.append(freq / len(tokens) * idf[word])

    return normalize(csr_matrix((values, (rows,columns)), shape=(len(corpus),len(vocab))))
        
# Task 2 driver: learn the full vocabulary, vectorize with the reduced one,
# then show the whole matrix followed by the first document's shape and dense row.
vocab = fit(corpus)
X = transform(corpus, vocab)
first_document = X[0]
print()
print(X, '\n\n', first_document.shape, end='\n\n')
print(first_document.toarray())

100%|█████████████████████████████████████████████████████████████████████████████| 746/746 [00:00<00:00, 14666.50it/s]


Top idf valued words in the corpus and their idf values :
 dict_items([('aailiyah', 6.922918004572872), ('abandoned', 6.922918004572872), ('abroad', 6.922918004572872), ('abstruse', 6.922918004572872), ('academy', 6.922918004572872), ('accents', 6.922918004572872), ('accessible', 6.922918004572872), ('acclaimed', 6.922918004572872), ('accolades', 6.922918004572872), ('accurate', 6.922918004572872), ('accurately', 6.922918004572872), ('achille', 6.922918004572872), ('ackerman', 6.922918004572872), ('actions', 6.922918004572872), ('adams', 6.922918004572872), ('add', 6.922918004572872), ('added', 6.922918004572872), ('admins', 6.922918004572872), ('admiration', 6.922918004572872), ('admitted', 6.922918004572872), ('adrift', 6.922918004572872), ('adventure', 6.922918004572872), ('aesthetically', 6.922918004572872), ('affected', 6.922918004572872), ('affleck', 6.922918004572872), ('afternoon', 6.922918004572872), ('aged', 6.922918004572872), ('ages', 6.922918004572872), ('agree', 6.922918


