In [1]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np
import sklearn

### Custom implementation of TF-IDF 


In [2]:
def fit(dataset):
    unique_words = set()
    
    if isinstance(dataset, (list,)):
        for row in dataset:
            for word in row.split():
                if len(word)<2:
                    continue
                
                unique_words.add(word)
        unique_words = sorted(list(unique_words))
        vocab = {i:j for i,j in enumerate(unique_words)}
        
        return vocab
                
        

In [3]:
## SkLearn# Collection of string documents

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [4]:
vocab = fit(corpus)
vocab

{0: 'and',
 1: 'document',
 2: 'first',
 3: 'is',
 4: 'one',
 5: 'second',
 6: 'the',
 7: 'third',
 8: 'this'}

In [5]:
def fit_transform(dataset , vocab):
    idf=[]
    for i in vocab.values():
        g=0
        for j in corpus:
            if i in j:
                g+=1
        numerator = 1+len(corpus)
        denominator = 1+g
        idf.append((1+(np.log(numerator/denominator))))
        
    Row=[]
    column=[]
    element=[]
    if isinstance(dataset, (list,)):
        for index,row in enumerate(dataset):
            word_freq=dict(Counter(row.split()))
            tf = [word_freq[word]/len(row.split()) if word in row else 0  for word in vocab.values()]
            
            for i in range(len(tf)):
                tf_idf = tf[i] *idf[i]
                
                if tf_idf>0:
                    Row.append(index)
                    column.append(i)
                    element.append(tf_idf)
                    
    sparse_matrix = csr_matrix((element, (Row,column)), shape=(len(dataset),len(vocab)))
    normalized=sklearn.preprocessing.normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    
    return normalized

In [6]:
print(fit_transform(corpus,vocab).toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [7]:
#Sklearns implementation of tf-idf

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [8]:
print(skl_output.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Implementing max features functionality (top 50 featurese with large idf value)

In [9]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type

import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [10]:
corpus

['slow moving aimless movie distressed drifting young man',
 'not sure lost flat characters audience nearly half walked',
 'attempting artiness black white clever camera angles movie disappointed became even ridiculous acting poor plot lines almost non existent',
 'little music anything speak',
 'best scene movie gerardo trying find song keeps running head',
 'rest movie lacks art charm meaning emptiness works guess empty',
 'wasted two hours',
 'saw movie today thought good effort good messages kids',
 'bit predictable',
 'loved casting jimmy buffet science teacher',
 'baby owls adorable',
 'movie showed lot florida best made look appealing',
 'songs best muppets hilarious',
 'cool',
 'right case movie delivers everything almost right face',
 'average acting main person low budget clearly see',
 'review long overdue since consider tale two sisters single greatest film ever made',
 'put gem movie terms screenplay cinematography acting post production editing directing aspect film makin

In [11]:
def fit(dataset):
    unique_words = set()
    
    if isinstance(dataset, (list,)):
        for row in dataset:
            for word in row.split():
                if len(word)<2:
                    continue
                
                unique_words.add(word)
        idf=[]
        for i in unique_words:
            g=0
            for j in dataset:
                if i in j:
                    g+=1
            numerator = 1+len(corpus)
            denominator = 1+g
            idf.append([1+(np.log(numerator/denominator)),i])
            
        #unique_words = sorted(list(unique_words))
        vocab = {i:j for i,j in enumerate(sorted(idf)[:-51:-1])}
        
        return vocab
                

In [12]:
vocab = fit(corpus)
vocab

{0: [6.922918004572872, 'zombiez'],
 1: [6.922918004572872, 'zillion'],
 2: [6.922918004572872, 'yun'],
 3: [6.922918004572872, 'youtube'],
 4: [6.922918004572872, 'youthful'],
 5: [6.922918004572872, 'younger'],
 6: [6.922918004572872, 'yelps'],
 7: [6.922918004572872, 'yawn'],
 8: [6.922918004572872, 'yardley'],
 9: [6.922918004572872, 'wrote'],
 10: [6.922918004572872, 'writers'],
 11: [6.922918004572872, 'wrap'],
 12: [6.922918004572872, 'wow'],
 13: [6.922918004572872, 'woven'],
 14: [6.922918004572872, 'wouldnt'],
 15: [6.922918004572872, 'worthwhile'],
 16: [6.922918004572872, 'worthless'],
 17: [6.922918004572872, 'worry'],
 18: [6.922918004572872, 'worked'],
 19: [6.922918004572872, 'wont'],
 20: [6.922918004572872, 'wong'],
 21: [6.922918004572872, 'wondered'],
 22: [6.922918004572872, 'woa'],
 23: [6.922918004572872, 'witticisms'],
 24: [6.922918004572872, 'within'],
 25: [6.922918004572872, 'wily'],
 26: [6.922918004572872, 'willie'],
 27: [6.922918004572872, 'william'],
 2

In [13]:
vocab = { i[0]:i[1][1] for i in vocab.items()}
vocab

{0: 'zombiez',
 1: 'zillion',
 2: 'yun',
 3: 'youtube',
 4: 'youthful',
 5: 'younger',
 6: 'yelps',
 7: 'yawn',
 8: 'yardley',
 9: 'wrote',
 10: 'writers',
 11: 'wrap',
 12: 'wow',
 13: 'woven',
 14: 'wouldnt',
 15: 'worthwhile',
 16: 'worthless',
 17: 'worry',
 18: 'worked',
 19: 'wont',
 20: 'wong',
 21: 'wondered',
 22: 'woa',
 23: 'witticisms',
 24: 'within',
 25: 'wily',
 26: 'willie',
 27: 'william',
 28: 'wild',
 29: 'wih',
 30: 'wife',
 31: 'widmark',
 32: 'wide',
 33: 'whoever',
 34: 'whites',
 35: 'whine',
 36: 'whenever',
 37: 'went',
 38: 'welsh',
 39: 'weight',
 40: 'wedding',
 41: 'website',
 42: 'weaving',
 43: 'weariness',
 44: 'weaker',
 45: 'wayne',
 46: 'waylaid',
 47: 'wave',
 48: 'wasting',
 49: 'waster'}

In [14]:
def fit_transform(dataset , vocab):
    idf=[]
    for i in vocab.values():
        g=0
        for j in corpus:
            if i in j:
                g+=1
        numerator = 1+len(corpus)
        denominator = 1+g
        idf.append((1+(np.log(numerator/denominator))))
        
    Row=[]
    column=[]
    element=[]
    if isinstance(dataset, (list,)):
        for index,row in enumerate(dataset):
            word_freq=dict(Counter(row.split()))
            tf = [word_freq[word]/len(row.split()) if word in row else 0  for word in vocab.values()]
            
            for i in range(len(tf)):
                tf_idf = tf[i] *idf[i]
                
                if tf_idf>0:
                    Row.append(index)
                    column.append(i)
                    element.append(tf_idf)
                    
    sparse_matrix = csr_matrix((element, (Row,column)), shape=(len(dataset),len(vocab)))
    normalized=sklearn.preprocessing.normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    
    return normalized

In [15]:
print(fit_transform(corpus , vocab))

  (19, 4)	0.5773502691896258
  (19, 17)	0.5773502691896258
  (19, 33)	0.5773502691896258
  (55, 44)	1.0
  (68, 30)	1.0
  (70, 40)	1.0
  (80, 35)	1.0
  (109, 0)	1.0
  (134, 45)	1.0
  (135, 11)	0.408248290463863
  (135, 22)	0.408248290463863
  (135, 23)	0.408248290463863
  (135, 29)	0.408248290463863
  (135, 41)	0.408248290463863
  (135, 43)	0.408248290463863
  (148, 7)	0.5773502691896257
  (148, 32)	0.5773502691896257
  (148, 46)	0.5773502691896257
  (155, 10)	1.0
  (191, 25)	1.0
  (222, 6)	1.0
  (251, 12)	1.0
  (270, 2)	1.0
  (321, 1)	1.0
  (326, 31)	1.0
  (337, 34)	1.0
  (340, 14)	1.0
  (341, 26)	1.0
  (350, 20)	0.7071067811865476
  (350, 27)	0.7071067811865476
  (361, 9)	1.0
  (366, 18)	1.0
  (378, 39)	1.0
  (421, 38)	1.0
  (452, 13)	1.0
  (464, 37)	1.0
  (495, 19)	1.0
  (514, 8)	1.0
  (518, 3)	1.0
  (521, 49)	1.0
  (525, 48)	1.0
  (535, 36)	1.0
  (562, 28)	1.0
  (633, 24)	1.0
  (634, 21)	1.0
  (644, 5)	1.0
  (680, 15)	1.0
  (719, 47)	1.0
  (720, 16)	1.0
  (734, 42)	1.0


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [17]:
print(skl_output)

  (0, 2878)	0.35781145622317734
  (0, 2287)	0.3377679916467555
  (0, 1653)	0.35781145622317734
  (0, 1651)	0.16192317905848022
  (0, 1545)	0.30566026894803877
  (0, 720)	0.4123943870778812
  (0, 688)	0.4123943870778812
  (0, 53)	0.4123943870778812
  (1, 2764)	0.3766212179754884
  (1, 2446)	0.359997059084325
  (1, 1712)	0.16744040526541593
  (1, 1676)	0.4000516539584003
  (1, 1511)	0.34710235966502884
  (1, 1132)	0.32765878476171706
  (1, 966)	0.3766212179754884
  (1, 374)	0.2398332744620992
  (1, 149)	0.3365666231014131
  (2, 2812)	0.22138111197246843
  (2, 2085)	0.2379459683681101
  (2, 1889)	0.23164830224699004
  (2, 1873)	0.17804028847989228
  (2, 1704)	0.2453945264455564
  (2, 1651)	0.11105027844840966
  (2, 1482)	0.22619301571578695
  (2, 853)	0.28282863381171675
  :	:
  (741, 2785)	0.29098436017585044
  (741, 2471)	0.35318031934226285
  (741, 1422)	0.4059564651452501
  (741, 1354)	0.4312119115047646
  (741, 1096)	0.3375265480660245
  (741, 429)	0.4312119115047646
  (741, 268)	0.3