In [0]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

In [0]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [0]:
def fit(dataset):    
    unique_words = set() # at first we will initialize an empty set
    # check if its list type or not
    if isinstance(dataset, (list,)):
        for row in dataset: # for each review in the dataset
            for word in row.split(" "): # for each word in the review. #split method converts a string into list of words
                if len(word) < 2:
                    continue
                unique_words.add(word)
        unique_words = sorted(list(unique_words))
        vocab = {j:i for i,j in enumerate(unique_words)}
        return vocab
    else:
        print("you need to pass list of sentance")

In [86]:
vocab = fit(corpus)
print(vocab)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [87]:
#Unique Words are 
print(list(vocab.keys()))

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


Calculate IDF

In [88]:
n_documents = len(corpus)
print(n_documents)

4


In [89]:
 
def transform(dataset,vocab):
  #Calculating IDF
  idf_={}
  tfdic=[]
  di={}
  for d in range (len(dataset)):
    doc=list(dataset[d].split())
    for word in doc:
      if word in vocab.keys():
        tf_cnt = doc.count(word)
      
      tf = (tf_cnt/len(doc))
      di.update({word:tf})
    
    tfdic.append(di)
    di={}
     
  
  for word in vocab:
      count = 0
      idf=0
      for doc in dataset:
        if word in doc:
          count = count +1
      idf=1+(math.log((1+n_documents)/(1+count)))   
      idf_[word]=idf
  rows = []
  columns = []
  values=[]
  l1=[]
  for idx, row in enumerate(tqdm(dataset)): #Loop through the corpus
    l1=list(row.split(" ")) #Get all the words for current row
    for word in l1:
      col_index = vocab.get(word, -1)#Find the index of word in vocab
      if col_index !=-1:
        
        df=idf_.get(word,-1)#Get the IDF value of the word
        tf1=tfdic[idx] #Getting the TF scores corresponding to the document
        tf2=tf1.get(word,-1)#Getting TF value
        val=tf2*df#Calculating TF-IDF
        # we are storing the dimensions of the word
        rows.append(idx)
        # we are storing the index of the document
        columns.append(col_index)           
        values.append(val)
  return csr_matrix((values, (rows,columns)), shape=(len(dataset),len(vocab))) 

print(transform(corpus,vocab).toarray())
X=transform(corpus,vocab)


100%|██████████| 4/4 [00:00<00:00, 3856.83it/s]
100%|██████████| 4/4 [00:00<00:00, 2426.91it/s]

[[0.         0.24462871 0.30216512 0.2        0.         0.
  0.2        0.         0.2       ]
 [0.         0.81542903 0.         0.16666667 0.         0.31938179
  0.16666667 0.         0.16666667]
 [0.31938179 0.         0.         0.16666667 0.31938179 0.
  0.16666667 0.31938179 0.16666667]
 [0.         0.24462871 0.30216512 0.2        0.         0.
  0.2        0.         0.2       ]]





In [0]:
nl2=normalize(X, norm='l2', axis=1, copy=True, return_norm=False)#L2 normalisation

In [91]:
nl2.shape #Rows in corpus and Columns are vocab

(4, 9)

In [92]:
print(nl2[0]) #Document1 sparse matrix

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


In [93]:
print(nl2[0].toarray()) #Dense Matrix

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [94]:
print(nl2)#Displaying whole Sparse Matrix

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.8843203931656719
  (1, 3)	0.18074746668441155
  (1, 5)	0.3463646952170705
  (1, 6)	0.18074746668441155
  (1, 8)	0.18074746668441155
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


#TASK 2

In [95]:
import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [0]:
def fit(dataset):
    unique_words = set() # at first we will initialize an empty set
    # check if its list type or not
    if isinstance(dataset, (list,)):
        for row in dataset: # for each review in the dataset
            for word in row.split(" "): # for each word in the review. #split method converts a string into list of words
                if len(word) < 2:
                    continue
                unique_words.add(word)
        unique_words = sorted(list(unique_words))
        vocab = {j:i for i,j in enumerate(unique_words)}
        return vocab
    else:
        print("you need to pass list of sentance")

In [0]:
vocab = fit(corpus)
n_documents = len(corpus)

In [0]:
def transform(dataset,vocab):
  unique= []
  tf_={}
  idf_={}
  tfdic=[]
  di={}
  
  for word in vocab:
      count = 0
      idf=0
      for doc in dataset:
        if word in doc:
          count = count +1
      idf=1+(math.log((1+n_documents)/(1+count)))   
      idf_[word]=idf
      
  #Sorting the IDF Dictionary based on top IDF values
  l=list(idf_.items())
  l.sort(reverse=True)
  idf_=dict(l)
  idf_50={}
  cnt=0
  vocab_50={}
  #Generating top 50 vocab and IDF
  for key,value in idf_.items():
    if cnt<50:
      idf_50[key]=value
      unique.append(key)
      unique = sorted(list(unique))
      vocab_50 = {j:i for i,j in enumerate(unique)}
      cnt = cnt+1
  print("Top Features and IDF Scores")
  for item in idf_50.items():
    print(item)
  
  #Generating a list of dictionary to store the TF values for each document
  for d in range (len(dataset)):
    doc=list(dataset[d].split())
    for word in doc:
      if word in vocab_50.keys():
        tf_cnt = doc.count(word)
        tf = (tf_cnt/len(doc)) 
        di.update({word:tf})
    
    tfdic.append(di)
    di={}        
  rows = []
  columns = []
  values=[]
  l1=[]
  for idx, row in enumerate(tqdm(dataset)):
    l1=list(row.split(" "))
    for word in l1:
      col_index = vocab_50.get(word, -1)
      if col_index !=-1:
        # we are storing the index of the document
        df=idf_50.get(word,-1)#Getting IDF value
        tf1=tfdic[idx] #Getting the TF scores corresponding to the document
        tf2=tf1.get(word,-1)#Getting TF value
        val=tf2*df#Calculating TF-IDF
        rows.append(idx)     
        columns.append(col_index)            
        values.append(val)

  return csr_matrix((values, (rows,columns)), shape=(len(dataset),len(vocab_50))) 



In [99]:
#print(transform(corpus,vocab).toarray())
X=transform(corpus,vocab)

100%|██████████| 746/746 [00:00<00:00, 246471.11it/s]

Top Features and IDF Scores
('zombiez', 6.922918004572872)
('zombie', 6.229770824012927)
('zillion', 6.922918004572872)
('yun', 6.922918004572872)
('youtube', 6.922918004572872)
('youthful', 6.922918004572872)
('younger', 6.922918004572872)
('young', 5.824305715904762)
('yet', 5.824305715904762)
('yes', 5.536623643452981)
('yelps', 6.922918004572872)
('years', 5.05111582767128)
('year', 4.843476462893037)
('yeah', 6.517452896464707)
('yawn', 6.922918004572872)
('yardley', 6.922918004572872)
('wrote', 6.922918004572872)
('wrong', 6.229770824012927)
('written', 5.536623643452981)
('writing', 4.9770078555175585)
('writers', 6.922918004572872)
('writer', 5.536623643452981)
('write', 5.536623643452981)
('wrap', 6.922918004572872)
('wow', 6.922918004572872)
('woven', 6.922918004572872)
('wouldnt', 6.922918004572872)
('would', 4.248769355146344)
('worthy', 6.229770824012927)
('worthwhile', 6.922918004572872)
('worthless', 6.922918004572872)
('worth', 4.620332911578826)
('worst', 5.21816991233




In [0]:
nl2=normalize(X, norm='l2', axis=1, copy=True, return_norm=False)

In [101]:
nl2.shape

(746, 50)

In [102]:
print(nl2[0])

  (0, 42)	1.0


In [103]:
print(nl2)

  (0, 42)	1.0
  (5, 13)	1.0
  (19, 9)	0.13340045056356076
  (19, 10)	0.3837896897173447
  (19, 15)	0.15375023937679422
  (19, 17)	0.11588969746575903
  (19, 18)	0.10261240862400675
  (19, 22)	0.8492435913311929
  (19, 30)	0.11053375883703274
  (19, 31)	0.12296219743724018
  (19, 40)	0.12296219743724018
  (19, 44)	0.15375023937679422
  (20, 38)	1.0
  (21, 8)	1.0
  (26, 31)	1.0
  (29, 18)	1.0
  (45, 18)	1.0
  (48, 30)	1.0
  (49, 30)	1.0
  (62, 22)	1.0
  (66, 14)	1.0
  (68, 22)	1.0
  (79, 10)	0.595755244213343
  (79, 14)	0.8031660407364719
  (82, 18)	1.0
  :	:
  (644, 43)	0.2467542120355904
  (644, 48)	0.22204830243926363
  (654, 17)	1.0
  (658, 7)	1.0
  (661, 2)	1.0
  (674, 2)	0.5168710901842515
  (674, 31)	0.6324212867344452
  (674, 38)	0.5769641169231339
  (677, 10)	1.0
  (678, 2)	1.0
  (679, 2)	1.0
  (680, 20)	1.0
  (693, 41)	1.0
  (696, 28)	0.7071067811865476
  (696, 31)	0.7071067811865476
  (698, 41)	1.0
  (700, 22)	1.0
  (710, 28)	1.0
  (713, 18)	1.0
  (714, 3)	1.0
  (717, 3)	1.0
 