In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
from tqdm import tqdm
import os

In [2]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [4]:
print(vectorizer.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [5]:
print(vectorizer.idf_)
sklvectorizer= list(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [6]:
skl_output.shape

(4, 9)

In [7]:
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [8]:
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


#### Build a TFIDF Vectorizer & compare its results with Sklearn

In [9]:
from collections import Counter
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

In [10]:
def fit(dataset):    
    unique_words = set() 
    if isinstance(dataset, (list,)):
        for row in dataset: 
            for word in row.split(" "): 
                if len(word) < 2:
                    continue
                unique_words.add(word)
        unique_words = sorted(list(unique_words))
        vocab = {j:i for i,j in enumerate(unique_words)}
        
        return vocab
    else:
        print("you need to pass list of sentance")

In [11]:
Dict1 = fit(corpus)
Dict1_keys = list(Dict1.keys())
print(Dict1)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [12]:
def count_of_word_in_corpus(dataset, word):
  count = 0
  for row in dataset:
    if word in row:
      count = count+1
  return count

In [13]:
def transform(dataset, vocab):
    rows = []
    columns = []
    values = []
    tf_val = []
    idf_val = []
    if isinstance(dataset, (list,)):
        for idx, row in enumerate(dataset):
            word_freq = dict(Counter(row.split()))
            for word, freq in word_freq.items():
                if len(word) < 2:
                    continue
                col_index = vocab.get(word, -1)
                if col_index!=-1:
                    rows.append(idx)
                    columns.append(col_index)
                    tf_idf_value = (freq/len(row.split()))*(1 + (  np.log( (1 + len(dataset) ) / (1 + count_of_word_in_corpus(dataset, word) ))))
                    values.append(tf_idf_value)
                    sparse_matrix = csr_matrix((values, (rows,columns)), shape=(len(dataset), len(vocab)))
                    final_normalized_output = normalize(sparse_matrix)
        return final_normalized_output
    else:
        print("you need to pass list of strings")

In [14]:
print(transform(corpus, Dict1))

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


In [15]:
tf_idf_vectorized  = transform(corpus, Dict1)
print(tf_idf_vectorized.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


#### SkLearn Implementation

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_tf_idf = vectorizer.transform(corpus)
print(skl_tf_idf.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## Task-2. Implement max features functionality:

In [17]:
import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [18]:
def fit_top_50(dataset):
    unique_word_list = []
    idf_values_list = []
    for row in dataset:
        for word in row.split(" "):
            if (len(word) >=2) and word not in unique_word_list:
                unique_word_list.append(word)
    for word in unique_word_list:
        idf_values = 1 + (  np.log( (1 + len(dataset) ) / (1 + count_of_word_in_corpus(dataset, word) )))
        idf_values_list.append(idf_values)

    ziped_list = zip(idf_values_list, unique_word_list )
    sorted_ziped_list = sorted(ziped_list, reverse=True)
    sorted_unique_word_list = [element for _, element in sorted_ziped_list]

    word_dict_top_50 = {j:i for i,j in enumerate(sorted_unique_word_list[:50]) }
    return word_dict_top_50

In [19]:
word_dict_top_50 = fit_top_50(corpus)
print(word_dict_top_50)

{'zombiez': 0, 'zillion': 1, 'yun': 2, 'youtube': 3, 'youthful': 4, 'younger': 5, 'yelps': 6, 'yawn': 7, 'yardley': 8, 'wrote': 9, 'writers': 10, 'wrap': 11, 'wow': 12, 'woven': 13, 'wouldnt': 14, 'worthwhile': 15, 'worthless': 16, 'worry': 17, 'worked': 18, 'wont': 19, 'wong': 20, 'wondered': 21, 'woa': 22, 'witticisms': 23, 'within': 24, 'wily': 25, 'willie': 26, 'william': 27, 'wild': 28, 'wih': 29, 'wife': 30, 'widmark': 31, 'wide': 32, 'whoever': 33, 'whites': 34, 'whine': 35, 'whenever': 36, 'went': 37, 'welsh': 38, 'weight': 39, 'wedding': 40, 'website': 41, 'weaving': 42, 'weariness': 43, 'weaker': 44, 'wayne': 45, 'waylaid': 46, 'wave': 47, 'wasting': 48, 'waster': 49}


In [20]:
def transform_top_50(dataset, word_dict_top_50):
    rows = []
    columns = []
    values = []
    tf_val = []
    idf_val = []
    for idx, row in enumerate(dataset):
        word_freq = dict(Counter(row.split()))
        for word, freq in word_freq.items():
            if len(word) < 2:
                continue
            col_index = word_dict_top_50.get(word, -1)
            if col_index!=-1:
                rows.append(idx)
                columns.append(col_index)
                tf_idf_value = (freq/len(row.split()))*(1 + (np.log((1 + len(dataset)) / (1 + count_of_word_in_corpus(dataset,word)))))
                values.append(tf_idf_value)
                sparse_matrix = csr_matrix((values, (rows,columns)), shape=(len(dataset), len(word_dict_top_50)))
                final_normalized_output = normalize(sparse_matrix)
    return final_normalized_output

In [21]:
print(transform_top_50(corpus,word_dict_top_50))

  (19, 4)	0.5773502691896258
  (19, 17)	0.5773502691896258
  (19, 33)	0.5773502691896258
  (55, 44)	1.0
  (68, 30)	1.0
  (70, 40)	1.0
  (80, 35)	1.0
  (109, 0)	1.0
  (134, 45)	1.0
  (135, 11)	0.408248290463863
  (135, 22)	0.408248290463863
  (135, 23)	0.408248290463863
  (135, 29)	0.408248290463863
  (135, 41)	0.408248290463863
  (135, 43)	0.408248290463863
  (148, 7)	0.5773502691896257
  (148, 32)	0.5773502691896257
  (148, 46)	0.5773502691896257
  (155, 10)	1.0
  (191, 25)	1.0
  (222, 6)	1.0
  (251, 12)	1.0
  (270, 2)	1.0
  (321, 1)	1.0
  (326, 31)	1.0
  (337, 34)	1.0
  (340, 14)	1.0
  (341, 26)	1.0
  (350, 20)	0.7071067811865476
  (350, 27)	0.7071067811865476
  (361, 9)	1.0
  (366, 18)	1.0
  (378, 39)	1.0
  (421, 38)	1.0
  (452, 13)	1.0
  (464, 37)	1.0
  (495, 19)	1.0
  (514, 8)	1.0
  (518, 3)	1.0
  (521, 49)	1.0
  (525, 48)	1.0
  (535, 36)	1.0
  (562, 28)	1.0
  (633, 24)	1.0
  (634, 21)	1.0
  (644, 5)	1.0
  (680, 15)	1.0
  (719, 47)	1.0
  (720, 16)	1.0
  (734, 42)	1.0


In [22]:
tf_idf_vectorized_max_feature = transform_top_50(corpus, word_dict_top_50)
print(tf_idf_vectorized_max_feature.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### SkLearn Implementation for max feature

In [23]:
vectorizer.fit(corpus)
skl_tf_idf_max_feature_vectorized = vectorizer.transform(corpus)
print(skl_tf_idf_max_feature_vectorized.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
