# Demo 2, Term Frequency - Inverse Document Frequency

TF-IDF is similar to bag of words, however the final counts are divided by the number of documents in the corpus.

It can be useful for identifying words that are not well represented by your corpus of data, that may need more samples provided.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import os
try:
    os.chdir("Sonnets")
except:
    pass

Use the same data-loader as before.

In [12]:
file_list = os.listdir('.')
for file in file_list:
    if file.startswith("Sonnet") and file.endswith(".txt"):
        pass
    else:
        print(file)
        file_list.remove(file)
file_list.sort()

shakespeares-sonnets_TXT_FolgerShakespeare.txt
stopwords.txt


We must tell the TF-IDF vectorizer algorithm what kind of data to expect, in this case, we're using files, so we give the algorithm "filename" as an input.

In [13]:
tf_idf_algorithm = TfidfVectorizer("filename")
tfidf = tf_idf_algorithm.fit_transform(file_list)

In [14]:
print(type(tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


In [15]:
numpy_tfidf = tfidf.toarray()

In [16]:
print(numpy_tfidf.shape)

(154, 3074)


In [17]:
first_sonnet = numpy_tfidf[0]
print(first_sonnet)
print(first_sonnet.shape)

[0. 0. 0. ... 0. 0. 0.]
(3074,)


In [18]:
#Find non-zero
np.nonzero(numpy_tfidf[0])

(array([   7,  101,  137,  139,  197,  199,  210,  350,  362,  371,  376,
         378,  442,  516,  520,  556,  570,  617,  669,  692,  771,  800,
         812,  909,  919,  930,  964, 1003, 1020, 1081, 1085, 1091, 1108,
        1143, 1167, 1262, 1270, 1290, 1360, 1365, 1512, 1515, 1588, 1593,
        1631, 1644, 1731, 1738, 1757, 1791, 1798, 1801, 1825, 1902, 2168,
        2183, 2262, 2315, 2438, 2442, 2515, 2562, 2604, 2616, 2617, 2618,
        2628, 2636, 2644, 2648, 2669, 2670, 2677, 2685, 2697, 2897, 2908,
        2937, 2993, 2996, 3023], dtype=int64),)

In [9]:
#Un-tokenize word
print(tf_idf_algorithm.get_feature_names()[151])

astonished


In [23]:
#print out the tf-idf value for a word
print(np.nonzero(numpy_tfidf[0]))

(array([   7,  101,  137,  139,  197,  199,  210,  350,  362,  371,  376,
        378,  442,  516,  520,  556,  570,  617,  669,  692,  771,  800,
        812,  909,  919,  930,  964, 1003, 1020, 1081, 1085, 1091, 1108,
       1143, 1167, 1262, 1270, 1290, 1360, 1365, 1512, 1515, 1588, 1593,
       1631, 1644, 1731, 1738, 1757, 1791, 1798, 1801, 1825, 1902, 2168,
       2183, 2262, 2315, 2438, 2442, 2515, 2562, 2604, 2616, 2617, 2618,
       2628, 2636, 2644, 2648, 2669, 2670, 2677, 2685, 2697, 2897, 2908,
       2937, 2993, 2996, 3023], dtype=int64),)
(154,)


In [27]:
print(numpy_tfidf[:,7].shape)
print(numpy_tfidf[:,7])
print(np.sum(numpy_tfidf[:,7]))

(154,)
[0.11337761 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.11211088 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.11779593 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.  

In [31]:
TF_IDF = []
for i in range(0,153):
    TF_IDF.append(np.sum(numpy_tfidf[:,i]))

In [35]:
print(min(TF_IDF))
print(TF_IDF.index(min(TF_IDF)))

0.10969418531007252
52


In [36]:
print(tf_idf_algorithm.get_feature_names()[52])

afar


In [37]:
print(numpy_tfidf[:,52])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

## Gensim

In [49]:
from gensim.models import TfidfModel
from gensim import corpora
from gensim.utils import simple_preprocess, deaccent

Copied from Demo #1, we need to first put the data into a Bag of Words, then convert it over.

In [65]:
tokenized_sonnets = []
for sonnet in file_list:
    tokenized_sonnets.append((simple_preprocess(deaccent(open(sonnet, 'r').read()))))
    
dictionary = corpora.Dictionary()
BoW = []
for sonnet in tokenized_sonnets: 
    BoW.append(dictionary.doc2bow(sonnet, allow_update=True))

bow_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_sonnets]
print(bow_corpus[0])

[(0, 1), (1, 3), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 2), (63, 2), (64, 6), (65, 1), (66, 1), (67, 2), (68, 1), (69, 2), (70, 4), (71, 1), (72, 1), (73, 4), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 3)]


In [66]:
gensim_tfidf = TfidfModel(bow_corpus)
print(gensim_tfidf)
vector = gensim_tfidf[bow_corpus[0]]

TfidfModel(num_docs=154, num_nnz=12166)


In [67]:
print(vector)

[(0, 0.12136892489684474), (1, 0.004639780481169176), (2, 0.04740961375271943), (3, 0.024817735990948636), (4, 0.022615224507605128), (5, 0.08218363822383518), (6, 0.043996797901172585), (7, 0.09832471682821257), (8, 0.1444131329654769), (9, 0.16745734103410906), (10, 0.022979786414189754), (11, 0.05838418524463214), (12, 0.1444131329654769), (13, 0.1444131329654769), (14, 0.1444131329654769), (15, 0.1444131329654769), (16, 0.09832471682821257), (17, 0.1309331353865112), (18, 0.09440892973891331), (19, 0.08484471924924684), (20, 0.10788892731787901), (21, 0.1444131329654769), (22, 0.113950346895122), (23, 0.049257075948727934), (24, 0.1309331353865112), (25, 0.16745734103410906), (26, 0.1444131329654769), (27, 0.1309331353865112), (28, 0.16745734103410906), (29, 0.102764070087715), (30, 0.03133772511025979), (31, 0.16745734103410906), (32, 0.16745734103410906), (33, 0.16745734103410906), (34, 0.1309331353865112), (35, 0.1309331353865112), (36, 0.16745734103410906), (37, 0.0672628917636

['Sonnet_001.txt', 'Sonnet_002.txt', 'Sonnet_003.txt', 'Sonnet_004.txt', 'Sonnet_005.txt', 'Sonnet_006.txt', 'Sonnet_007.txt', 'Sonnet_008.txt', 'Sonnet_009.txt', 'Sonnet_010.txt', 'Sonnet_011.txt', 'Sonnet_012.txt', 'Sonnet_013.txt', 'Sonnet_014.txt', 'Sonnet_015.txt', 'Sonnet_016.txt', 'Sonnet_017.txt', 'Sonnet_018.txt', 'Sonnet_019.txt', 'Sonnet_020.txt', 'Sonnet_021.txt', 'Sonnet_022.txt', 'Sonnet_023.txt', 'Sonnet_024.txt', 'Sonnet_025.txt', 'Sonnet_026.txt', 'Sonnet_027.txt', 'Sonnet_028.txt', 'Sonnet_029.txt', 'Sonnet_030.txt', 'Sonnet_031.txt', 'Sonnet_032.txt', 'Sonnet_033.txt', 'Sonnet_034.txt', 'Sonnet_035.txt', 'Sonnet_036.txt', 'Sonnet_037.txt', 'Sonnet_038.txt', 'Sonnet_039.txt', 'Sonnet_040.txt', 'Sonnet_041.txt', 'Sonnet_042.txt', 'Sonnet_043.txt', 'Sonnet_044.txt', 'Sonnet_045.txt', 'Sonnet_046.txt', 'Sonnet_047.txt', 'Sonnet_048.txt', 'Sonnet_049.txt', 'Sonnet_050.txt', 'Sonnet_051.txt', 'Sonnet_052.txt', 'Sonnet_053.txt', 'Sonnet_054.txt', 'Sonnet_055.txt', 'Sonnet_0