Build a TFIDF Vectorizer & compare its results with Sklearn

In [1]:
import pandas as pd
import numpy as np
import os
import tqdm 
import math
import scipy
from collections import Counter
from scipy.sparse import lil_matrix
from sklearn.preprocessing import normalize

In [2]:
## Sample corpus: a small collection of string documents used to compare against sklearn

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [3]:
def fit(corpus):
    '''
    Build the vocabulary: the unique tokens of ``corpus`` in first-seen order.

    Parameters
    ----------
    corpus : iterable of str
        Documents; each one is tokenised on whitespace.

    Returns
    -------
    list of str
        Unique tokens of length >= 2, ordered by first appearance. The position
        of a token in this list is its column index in the output sparse matrix.
    '''
    unique_words = []  # keeps first-seen order (this is the column order)
    seen = set()       # O(1) membership test instead of scanning the list per token
    for doc in corpus:
        # split() with no argument matches the tokenisation used by IDF/TF and
        # treats tabs, newlines and repeated spaces as separators.
        for word in doc.split():
            # Tokens shorter than 2 characters (stray punctuation, single letters)
            # are dropped.
            if len(word) < 2 or word in seen:
                continue
            seen.add(word)
            unique_words.append(word)
    return unique_words

In [4]:
# Build the vocabulary of the sample corpus; the list order fixes the matrix column order
vocab = fit(corpus) 
print(vocab)

['this', 'is', 'the', 'first', 'document', 'second', 'and', 'third', 'one']


In [5]:
def IDF(corpus, vocabulary=None):
    '''
    Compute the smoothed inverse document frequency of every vocabulary word.

    idf(t) = ln((1 + N) / (1 + df(t))) + 1, where N is the number of documents
    and df(t) is the number of documents whose whitespace tokens include t.

    Parameters
    ----------
    corpus : sequence of str
        Documents to count document frequencies over.
    vocabulary : iterable of str, optional
        Words to compute IDF for. Defaults to the notebook-level ``vocab``.

    Returns
    -------
    dict
        word -> IDF value, in vocabulary order. Every vocabulary word gets an
        entry, including when ``corpus`` is empty (df = 0 for all words).
    '''
    if vocabulary is None:
        vocabulary = vocab
    N = len(corpus)
    # Tokenise each document once; sets give O(1) "word occurs in doc" checks.
    doc_tokens = [set(doc.split()) for doc in corpus]
    IDF_dict = {}
    for word in vocabulary:
        count = sum(1 for tokens in doc_tokens if word in tokens)
        # Computed once per word, after the document frequency is complete.
        IDF_dict[word] = math.log((1 + N) / (count + 1)) + 1

    return IDF_dict

In [6]:
# Show the IDF value of each vocabulary word for the sample corpus
IDF(corpus)

{'this': 1.0,
 'is': 1.0,
 'the': 1.0,
 'first': 1.5108256237659907,
 'document': 1.2231435513142097,
 'second': 1.916290731874155,
 'and': 1.916290731874155,
 'third': 1.916290731874155,
 'one': 1.916290731874155}

### Verification

The IDF values above were checked against scikit-learn's implementation and match it.

In [7]:
def TF(corpus, vocabulary=None):
    '''
    Compute term frequencies (occurrences / document length) of vocabulary words.

    Parameters
    ----------
    corpus : iterable of str
        Documents, tokenised on whitespace.
    vocabulary : iterable of str, optional
        Words to keep. Defaults to the notebook-level ``vocab``.

    Returns
    -------
    dict
        word -> term frequency. The dict is keyed by word only, so when a word
        occurs in several documents the value from the LAST such document is
        kept. Pass a one-document corpus (``TF([doc])``) to get the term
        frequencies of that document alone.
    '''
    if vocabulary is None:
        vocabulary = vocab
    allowed = set(vocabulary)  # O(1) membership instead of a list scan per token
    tf_values = {}
    for doc in corpus:
        word_list = doc.split()  # tokenise once per document
        total_words_count = len(word_list)
        # Counter keeps first-occurrence order, so key order matches a token-by-token walk.
        for word, occurrences in Counter(word_list).items():
            if word in allowed:
                tf_values[word] = occurrences / total_words_count

    return tf_values
            

In [8]:
# NOTE: TF returns one value per word, not per document; a word that occurs in
# several documents keeps the value computed from the last of them.
TF(corpus)

{'this': 0.2,
 'is': 0.2,
 'the': 0.2,
 'first': 0.2,
 'document': 0.2,
 'second': 0.16666666666666666,
 'and': 0.16666666666666666,
 'third': 0.16666666666666666,
 'one': 0.16666666666666666}

In [9]:
#Source : https://analyticsindiamag.com/hands-on-implementation-of-tf-idf-from-scratch-in-python/


def transform(corpus):
    '''
    Build the L2-normalised TF-IDF matrix of ``corpus``.

    Parameters
    ----------
    corpus : sequence of str
        Documents to vectorise. Columns follow the notebook-level ``vocab``.

    Returns
    -------
    scipy sparse matrix of shape (len(corpus), len(vocab))
        Row i holds tf(word, document i) * idf(word) for every vocabulary word
        in document i, scaled so each non-empty row has unit L2 norm.
    '''
    # Column index of each vocabulary word (first position wins, like list.index).
    column_of = {}
    for col, word in enumerate(vocab):
        column_of.setdefault(word, col)

    # IDF is a corpus-level quantity: compute it once, not once per token.
    idf = IDF(corpus)

    sparse_matrix = lil_matrix((len(corpus), len(vocab)), dtype=np.float64)
    for row, document in enumerate(corpus):
        # TF on a one-document corpus yields this document's own term
        # frequencies. TF(corpus) is keyed by word only, so using it here would
        # apply the TF of the last document containing the word to every row.
        for word, tf in TF([document]).items():
            sparse_matrix[row, column_of[word]] = tf * idf[word]

    # Performing L2 normalization for output sparse matrix
    norm = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)

    return norm

In [10]:
# L2-normalised TF-IDF matrix of the sample corpus (one row per document)
output= transform(corpus)

In [11]:
# Non-zero entries of the first document's row, printed as (row, column) value
print(output[0])

  (0, 0)	0.3840852409148149
  (0, 1)	0.3840852409148149
  (0, 2)	0.3840852409148149
  (0, 3)	0.580285823684436
  (0, 4)	0.4697913855799205


Implementing max features functionality

In [12]:
import pickle

# Raw string: '\D' and '\c' are not valid escape sequences, so the non-raw
# literal only worked by accident and triggers a warning on newer Python.
# The resulting path is unchanged.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(r'F:\Downloads\cleaned_strings', 'rb') as f:
    corpus_cleaned_strings = pickle.load(f)

# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus_cleaned_strings))

Number of documents in corpus =  746


In [13]:
def fit_cleaned_strings(corpus):
    '''
    Build the vocabulary: the unique tokens of ``corpus`` in first-seen order.

    Parameters
    ----------
    corpus : iterable of str
        Documents; each one is tokenised on whitespace.

    Returns
    -------
    list of str
        Unique tokens of length >= 2, ordered by first appearance.
    '''
    unique_words = []  # keeps first-seen order
    seen = set()       # O(1) membership test instead of scanning the list per token
    for doc in corpus:
        # split() with no argument matches the tokenisation used by the IDF/TF
        # helpers and treats tabs, newlines and repeated spaces as separators.
        for word in doc.split():
            # Tokens shorter than 2 characters (stray punctuation, single letters)
            # are dropped.
            if len(word) < 2 or word in seen:
                continue
            seen.add(word)
            unique_words.append(word)
    return unique_words

In [14]:
# Vocabulary of the loaded corpus; only its size is printed
vocab_cleaned_strings = fit_cleaned_strings(corpus_cleaned_strings) 
print(len(vocab_cleaned_strings))

2886


In [15]:
def IDF_cleaned_strings(corpus, vocabulary=None, max_features=50):
    '''
    Compute smoothed IDF values and keep the words with the highest IDF.

    idf(t) = ln((1 + N) / (1 + df(t))) + 1, where N is the number of documents
    and df(t) is the number of documents whose whitespace tokens include t.

    Parameters
    ----------
    corpus : sequence of str
        Documents to count document frequencies over.
    vocabulary : iterable of str, optional
        Words to compute IDF for. Defaults to the notebook-level
        ``vocab_cleaned_strings``.
    max_features : int or None, optional
        Number of highest-IDF words to keep (default 50). ``None`` keeps all.

    Returns
    -------
    dict
        word -> IDF value, sorted by descending IDF. Words with equal IDF keep
        their vocabulary order because the sort is stable.
    '''
    if vocabulary is None:
        vocabulary = vocab_cleaned_strings
    N = len(corpus)
    # Tokenise each document once; sets give O(1) "word occurs in doc" checks.
    doc_tokens = [set(doc.split()) for doc in corpus]
    IDF_dict = {}
    for word in vocabulary:
        count = sum(1 for tokens in doc_tokens if word in tokens)
        # Computed once per word, after the document frequency is complete.
        IDF_dict[word] = math.log((1 + N) / (count + 1)) + 1

    # Sorting and taking the words with the highest IDF values
    ranked = sorted(IDF_dict.items(), key=lambda item: item[1], reverse=True)
    sorted_IDF_dict = dict(ranked[:max_features])

    return sorted_IDF_dict

In [29]:
# Highest-IDF words of the loaded corpus with their IDF values.
# NOTE: the name `output` is reassigned further down to the TF-IDF matrix.
output = IDF_cleaned_strings(corpus_cleaned_strings)
print(output)

{'aimless': 6.922918004572872, 'distressed': 6.922918004572872, 'drifting': 6.922918004572872, 'nearly': 6.922918004572872, 'attempting': 6.922918004572872, 'artiness': 6.922918004572872, 'existent': 6.922918004572872, 'gerardo': 6.922918004572872, 'emptiness': 6.922918004572872, 'effort': 6.922918004572872, 'messages': 6.922918004572872, 'buffet': 6.922918004572872, 'science': 6.922918004572872, 'teacher': 6.922918004572872, 'baby': 6.922918004572872, 'owls': 6.922918004572872, 'florida': 6.922918004572872, 'muppets': 6.922918004572872, 'person': 6.922918004572872, 'overdue': 6.922918004572872, 'screenplay': 6.922918004572872, 'post': 6.922918004572872, 'practically': 6.922918004572872, 'structure': 6.922918004572872, 'tightly': 6.922918004572872, 'constructed': 6.922918004572872, 'vitally': 6.922918004572872, 'occurs': 6.922918004572872, 'content': 6.922918004572872, 'fill': 6.922918004572872, 'dozen': 6.922918004572872, 'highest': 6.922918004572872, 'superlative': 6.922918004572872,

In [30]:
# Sanity check: number of words retained by IDF_cleaned_strings
len(output)

50

In [31]:
# The order of this list is the column order used by transform_cleaned_strings
top_vocab = list(output.keys()) #Taking list of unique words of corpus with top 50 IDF values
print(top_vocab)

['aimless', 'distressed', 'drifting', 'nearly', 'attempting', 'artiness', 'existent', 'gerardo', 'emptiness', 'effort', 'messages', 'buffet', 'science', 'teacher', 'baby', 'owls', 'florida', 'muppets', 'person', 'overdue', 'screenplay', 'post', 'practically', 'structure', 'tightly', 'constructed', 'vitally', 'occurs', 'content', 'fill', 'dozen', 'highest', 'superlative', 'require', 'puzzle', 'solving', 'fit', 'pulls', 'punches', 'graphics', 'number', 'th', 'insane', 'massive', 'unlockable', 'properly', 'aye', 'rocks', 'doomed', 'conception']


In [19]:
def TF_cleaned_strings(corpus, vocabulary=None):
    '''
    Compute term frequencies (occurrences / document length) of selected words.

    Parameters
    ----------
    corpus : iterable of str
        Documents, tokenised on whitespace.
    vocabulary : iterable of str, optional
        Words to keep. Defaults to the notebook-level ``top_vocab``.

    Returns
    -------
    dict
        word -> term frequency. The dict is keyed by word only, so when a word
        occurs in several documents the value from the LAST such document is
        kept. Pass a one-document corpus to get the term frequencies of that
        document alone.
    '''
    if vocabulary is None:
        vocabulary = top_vocab
    allowed = set(vocabulary)  # O(1) membership instead of a list scan per token
    tf_values = {}
    for doc in corpus:
        word_list = doc.split()  # tokenise once per document
        total_words_count = len(word_list)
        # Counter keeps first-occurrence order, so key order matches a token-by-token walk.
        for word, occurrences in Counter(word_list).items():
            if word in allowed:
                tf_values[word] = occurrences / total_words_count

    return tf_values
            

In [20]:
# One value per retained word; for a word present in several documents the
# value comes from the last document containing it.
TF_cleaned_strings(corpus_cleaned_strings)

{'aimless': 0.125,
 'distressed': 0.125,
 'drifting': 0.125,
 'nearly': 0.1111111111111111,
 'attempting': 0.05263157894736842,
 'artiness': 0.05263157894736842,
 'existent': 0.05263157894736842,
 'gerardo': 0.1,
 'emptiness': 0.1,
 'effort': 0.1111111111111111,
 'messages': 0.1111111111111111,
 'buffet': 0.16666666666666666,
 'science': 0.16666666666666666,
 'teacher': 0.16666666666666666,
 'baby': 0.3333333333333333,
 'owls': 0.3333333333333333,
 'florida': 0.125,
 'muppets': 0.25,
 'person': 0.125,
 'overdue': 0.07692307692307693,
 'screenplay': 0.07142857142857142,
 'post': 0.07142857142857142,
 'practically': 0.14285714285714285,
 'structure': 0.002277904328018223,
 'tightly': 0.002277904328018223,
 'constructed': 0.002277904328018223,
 'vitally': 0.002277904328018223,
 'occurs': 0.002277904328018223,
 'content': 0.002277904328018223,
 'fill': 0.002277904328018223,
 'dozen': 0.002277904328018223,
 'highest': 0.002277904328018223,
 'superlative': 0.002277904328018223,
 'require': 0

In [21]:
def transform_cleaned_strings(corpus):
    '''
    Build the L2-normalised TF-IDF matrix of ``corpus`` restricted to ``top_vocab``.

    Parameters
    ----------
    corpus : sequence of str
        Documents to vectorise. Columns follow the notebook-level ``top_vocab``.

    Returns
    -------
    scipy sparse matrix of shape (len(corpus), len(top_vocab))
        Row i holds tf(word, document i) * idf(word) for every retained word in
        document i, scaled so each non-empty row has unit L2 norm. Documents
        containing none of the retained words give an all-zero row.
    '''
    # Column index of each retained word (first position wins, like list.index).
    column_of = {}
    for col, word in enumerate(top_vocab):
        column_of.setdefault(word, col)

    # IDF is a corpus-level quantity: compute it once, not once per token.
    idf = IDF_cleaned_strings(corpus)

    sparse_matrix = lil_matrix((len(corpus), len(top_vocab)), dtype=np.float64)
    for row, document in enumerate(corpus):
        # TF on a one-document corpus yields this document's own term
        # frequencies. The corpus-wide TF dict is keyed by word only, so using
        # it here would apply the TF of the last document containing the word.
        for word, tf in TF_cleaned_strings([document]).items():
            sparse_matrix[row, column_of[word]] = tf * idf[word]

    # Performing L2 normalization for output sparse matrix
    norm = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)

    return norm

In [23]:
# TF-IDF matrix of the loaded corpus; this rebinds `output`, which previously held the IDF dict
output= transform_cleaned_strings(corpus_cleaned_strings)


In [27]:
# Prints every stored entry of the sparse matrix as (row, column) value
print(output)

  (0, 0)	0.5773502691896257
  (0, 1)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (1, 3)	1.0
  (2, 4)	0.5773502691896258
  (2, 5)	0.5773502691896258
  (2, 6)	0.5773502691896258
  (4, 7)	1.0
  (5, 8)	1.0
  (7, 9)	0.7071067811865475
  (7, 10)	0.7071067811865475
  (9, 11)	0.5773502691896257
  (9, 12)	0.5773502691896257
  (9, 13)	0.5773502691896257
  (10, 14)	0.7071067811865476
  (10, 15)	0.7071067811865476
  (11, 16)	1.0
  (12, 17)	1.0
  (15, 18)	1.0
  (16, 19)	1.0
  (17, 20)	0.7071067811865475
  (17, 21)	0.7071067811865475
  (18, 22)	1.0
  (19, 23)	0.14744195615489714
  (19, 24)	0.14744195615489714
  (19, 25)	0.14744195615489714
  (19, 26)	0.14744195615489714
  (19, 27)	0.14744195615489714
  (19, 28)	0.14744195615489714
  (19, 29)	0.14744195615489714
  (19, 30)	0.14744195615489714
  (19, 31)	0.14744195615489714
  (19, 32)	0.14744195615489714
  (19, 33)	0.14744195615489714
  (19, 34)	0.14744195615489714
  (19, 35)	0.14744195615489714
  (19, 36)	0.29488391230979427
  (19, 37)	0.1474419

In [24]:
# Row of the first document only
output_one_doc = output[0]
print(output_one_doc)

  (0, 0)	0.5773502691896257
  (0, 1)	0.5773502691896257
  (0, 2)	0.5773502691896257


In [25]:
# Convert the single-row sparse result to a dense matrix
dense_matrix = output_one_doc.todense()

In [26]:
# Shape check: one row, one column per word in top_vocab
print(dense_matrix.shape)

(1, 50)
