### CORPUS-1

In [1]:
# Toy corpus: four short, already-cleaned documents used to compare
# the custom vectorizer against scikit-learn's TfidfVectorizer.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

### SkLearn Implementation

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reference result: learn the vocabulary/IDF from the corpus and vectorize
# the same corpus in one step (equivalent to fit() followed by transform()).
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit_transform(corpus)

In [3]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# .tolist() keeps the printed form a plain list of Python strings.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
# Not the last expression of the cell, so this value is evaluated but not displayed.
skl_output.shape
# First document: sparse (row, column) -> value listing, then the dense row.
print(skl_output[0])
print(skl_output[0].toarray())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Custom implementation

In [4]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

class cust_tfidf_vectorizer:
    '''
    Basic custom implementation of a TF-IDF vectorizer.

    The input corpus must be a clean list of strings; documents are tokenized
    on whitespace only. transform() returns a sparse, L2-normalized matrix.

    The fitted vocabulary and IDF vector are stored on the instance, so several
    vectorizers can be used side by side without overwriting each other.
    '''
    def __init__(self):
        # Populated by fit(); None means "not fitted yet".
        self._vocabulary=None
        self._idf=None

    def featurize(self,text):
        '''Return the alphabetically sorted unique tokens of the tokenized documents `text`.'''
        return sorted({token for doc in text for token in doc})

    def getfeatures(self):
        '''Return the vocabulary learned by fit() (None before fitting).'''
        return self._vocabulary

    def get_idf(self):
        '''Return the IDF vector learned by fit(), aligned with getfeatures() (None before fitting).'''
        return self._idf

    def term_freq(self,final,inpt,vector):
        '''
        Fill `inpt` in place with term frequencies and return it.

        inpt[i][j] becomes (count of vector[j] in document i) / (total number of
        tokens in document i). `final` holds one token->count mapping per document.
        A document without tokens gets 0.0 everywhere instead of dividing by zero.
        '''
        for i in range(len(inpt)):
            total=sum(final[i].values())
            for j in range(len(inpt[i])):
                inpt[i][j]=final[i][vector[j]]/total if total else 0.0
        return inpt

    def inverse_doc_freq(self,vector,final,text):
        '''
        Return a numpy array with the smoothed IDF of every term in `vector`:
        1 + ln((1 + number of documents) / (1 + documents containing the term)).
        '''
        n_docs=len(text)
        idf=[]
        for term in vector:
            doc_count=sum(1 for counts in final if counts[term]>0)
            idf.append(1+math.log((1+n_docs)/(1+doc_count)))
        return np.array(idf)

    def fit(self,corpus):
        '''
        Build the vocabulary and the IDF vector from `corpus` (a list of strings).
        Returns the vectorizer itself so calls can be chained.
        '''
        text=[doc.split() for doc in corpus]
        final=[Counter(tokens) for tokens in text]
        self._vocabulary=self.featurize(text)
        self._idf=self.inverse_doc_freq(self._vocabulary,final,text)
        return self

    def transform(self,corpus):
        '''
        Return a sparse (csr) matrix of L2-normalized tf-idf vectors, one row per
        document of `corpus` and one column per fitted vocabulary term.

        The fitted vocabulary/IDF are reused, so tokens unseen during fit() are
        ignored. If fit() was never called, the vectorizer is fitted on `corpus` first.
        '''
        if self._vocabulary is None:
            self.fit(corpus)
        text=[doc.split() for doc in corpus]
        final=[Counter(tokens) for tokens in text]
        tf=self.term_freq(final,np.zeros((len(text),len(self._vocabulary))),self._vocabulary)
        # The IDF vector broadcasts across the rows of the tf matrix.
        tfidf=tf*self._idf
        return csr_matrix(normalize(tfidf,norm='l2'))
        
   

In [5]:
# Run the custom vectorizer on the 1st corpus and print the same facts that
# were printed for scikit-learn above, so the two can be compared by eye.
v = cust_tfidf_vectorizer()
v.fit(corpus)

separator = '---------------------------------------------------------------------------------------------------------------------------'

print('The unique features are:', v.getfeatures())
print(separator)
print('The IDF vector is: ', v.get_idf())
print(separator)

o = v.transform(corpus)
print('The shape of output sparse matrix: ', o.shape)
print(separator)

# First document: sparse listing followed by the dense row.
first_doc = o[0]
print('The first element/document after transformation:')
print(first_doc)
print(first_doc.toarray())

The unique features are: ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
---------------------------------------------------------------------------------------------------------------------------
The IDF vector is:  [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]
---------------------------------------------------------------------------------------------------------------------------
The shape of output sparse matrix:  (4, 9)
---------------------------------------------------------------------------------------------------------------------------
The first element/document after transformation:
  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### CORPUS-2

In [6]:
# Load the provided `cleaned_strings` pickle file; the unpickled corpus is a list.
# NOTE: pickle.load can execute arbitrary code - only unpickle files from a trusted source.

import pickle
with open('cleaned_strings', 'rb') as pickle_file:
    corpus = pickle.load(pickle_file)

# Report how many documents were loaded
document_count = len(corpus)
print("Number of documents in corpus = ", document_count)

Number of documents in corpus =  746


In [7]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

class cust_tfidf_vectorizer:
    '''
    Custom implementation of a TF-IDF vectorizer with a limit on the number of features.

    The input corpus must be a clean list of strings; documents are tokenized on
    whitespace only. `no_ft` is the maximum number of features to keep: None (the
    default) keeps every feature, and a value larger than the number of unique
    features also keeps all of them.

    The top features are chosen by their IDF values (ties keep alphabetical order),
    unlike sklearn where features are ranked based on their TF values. Each document
    is then represented by a tf-idf vector over the selected features only.

    transform() returns a sparse normalized matrix of shape
    (number of documents, number of selected features).

    The fitted state is stored on the instance, so several vectorizers can be
    used side by side without overwriting each other.
    '''
    def __init__(self,no_ft=None):
        self.no_ft=no_ft
        # Populated by fit(); None means "not fitted yet".
        self._vocabulary=None
        self._idf=None
        self._top_features=None

    def featurize(self,text):
        '''Return the alphabetically sorted unique tokens of the tokenized documents `text`.'''
        return sorted({token for doc in text for token in doc})

    def getfeatures(self):
        '''Return the selected features, highest IDF first (None before fitting).'''
        return self._vocabulary

    def get_idf(self):
        '''Return the list of IDF values aligned with getfeatures() (None before fitting).'''
        return self._idf

    def get_topfeatures(self):
        '''Return the selected (feature, idf) pairs, highest IDF first (None before fitting).'''
        return self._top_features

    def term_freq(self,final,inpt,vector):
        '''
        Fill `inpt` in place with term frequencies and return it.

        inpt[i][j] becomes (count of vector[j] in document i) / (total number of
        tokens in document i). `final` holds one token->count mapping per document.
        A document without tokens gets 0.0 everywhere instead of dividing by zero.
        '''
        for i in range(len(inpt)):
            total=sum(final[i].values())
            for j in range(len(inpt[i])):
                inpt[i][j]=final[i][vector[j]]/total if total else 0.0
        return inpt

    def inverse_doc_freq(self,vector,final,text):
        '''
        Return a numpy array with the smoothed IDF of every term in `vector`:
        1 + ln((1 + number of documents) / (1 + documents containing the term)).
        '''
        n_docs=len(text)
        idf=[]
        for term in vector:
            doc_count=sum(1 for counts in final if counts[term]>0)
            idf.append(1+math.log((1+n_docs)/(1+doc_count)))
        return np.array(idf)

    def fit(self,corpus):
        '''
        Build the vocabulary from `corpus` (a list of strings), compute the IDF of
        every term and keep the `no_ft` terms with the highest IDF.
        Returns the vectorizer itself so calls can be chained.
        '''
        text=[doc.split() for doc in corpus]
        final=[Counter(tokens) for tokens in text]
        vocabulary=self.featurize(text)
        idf_all=self.inverse_doc_freq(vocabulary,final,text)
        # .tolist() stores plain Python floats rather than numpy scalars.
        frame=list(zip(vocabulary,idf_all.tolist()))
        # The sort is stable, so features with equal IDF stay in alphabetical order.
        frame.sort(key=operator.itemgetter(1),reverse=True)      # https://stackoverflow.com/questions/14466068/sort-a-list-of-tuples-by-second-value-reverse-true-and-then-by-key-reverse-fal
        if self.no_ft is not None:
            # Slicing (instead of indexing up to no_ft) tolerates no_ft > len(frame).
            frame=frame[:max(self.no_ft,0)]
        self._top_features=frame
        self._vocabulary=[feature for feature,_ in frame]
        self._idf=[weight for _,weight in frame]
        return self

    def transform(self,corpus):
        '''
        Return a sparse (csr) matrix of L2-normalized tf-idf vectors, one row per
        document of `corpus` and one column per selected feature.

        The fitted features/IDF are reused, so other tokens are ignored. If fit()
        was never called, the vectorizer is fitted on `corpus` first.
        '''
        if self._vocabulary is None:
            self.fit(corpus)
        text=[doc.split() for doc in corpus]
        final=[Counter(tokens) for tokens in text]
        tf=self.term_freq(final,np.zeros((len(text),len(self._vocabulary))),self._vocabulary)
        # The IDF vector broadcasts across the rows of the tf matrix.
        tfidf=tf*np.asarray(self._idf)
        return csr_matrix(normalize(tfidf,norm='l2'))

In [8]:
# Run the feature-limited vectorizer on the loaded corpus with n=50 and
# print the selected features, their IDF values and one transformed document.
v = cust_tfidf_vectorizer(50)
v.fit(corpus)

separator = '---------------------------------------------------------------------------------------------------------------------------'

print('Top 50 features based on their IDF values: ')
print(v.getfeatures())
print(separator)

print('IDF values of top 50 features:')
print(v.get_idf())
print(separator)

print('Top features alongwith their IDF values:')
print(v.get_topfeatures())
print(separator)

o = v.transform(corpus)
print('Shape of output matrix:', o.shape)
print(separator)

# Document 135: sparse listing followed by the dense row.
sample_doc = o[135]
print('One document of corpus after transformation:')
print(sample_doc)
print(sample_doc.toarray())

Top 50 features based on their IDF values: 
['aailiyah', 'abandoned', 'abroad', 'abstruse', 'academy', 'accents', 'accessible', 'acclaimed', 'accolades', 'accurate', 'accurately', 'achille', 'ackerman', 'actions', 'adams', 'add', 'added', 'admins', 'admiration', 'admitted', 'adrift', 'adventure', 'aesthetically', 'affected', 'affleck', 'afternoon', 'aged', 'ages', 'agree', 'agreed', 'aimless', 'aired', 'akasha', 'akin', 'alert', 'alike', 'allison', 'allow', 'allowing', 'alongside', 'amateurish', 'amaze', 'amazed', 'amazingly', 'amusing', 'amust', 'anatomist', 'angel', 'angela', 'angelina']
---------------------------------------------------------------------------------------------------------------------------
IDF values of top 50 features:
[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 