# TF-IDF
Natural Language Processing (NLP) is a sub-field of artificial intelligence that deals with understanding and processing human language. In light of new advancements in machine learning, many organizations have begun applying natural language processing for translation, chatbots and candidate filtering.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# Two-document corpus used throughout the notebook.
# A tiny alternative corpus for quick experiments:
#   documentA = 'the man went out for a walk'
#   documentB = 'the children sat around the fire'

documentA = (
    "Data science is an interdisciplinary field focused on extracting "
    "knowledge from data sets, which are typically large (see big data), "
    "and applying the knowledge and actionable insights from data to solve "
    "problems in a wide range of application domains.[6] The field "
    "encompasses preparing data for analysis, formulating data science "
    "problems, analyzing data, developing data-driven solutions, and "
    "presenting findings to inform high-level decisions in a broad range "
    "of application domains"
)
documentB = (
    "Machine learning involves computers discovering how they can perform "
    "tasks without being explicitly programmed to do so. It involves "
    "computers learning from data provided so that they carry out certain "
    "tasks. For simple tasks assigned to computers, it is possible to "
    "program algorithms telling the machine how to execute all steps "
    "required to solve the problem at hand; on the computer's part, no "
    "learning is needed. For more advanced tasks, it can be challenging "
    "for a human to manually create the needed algorithms"
)

Machine learning algorithms cannot work with raw text directly. Rather, the text must be converted into vectors of numbers. In natural language processing, a common technique for extracting features from text is to place all of the words that occur in the text in a bucket. This approach is called a bag of words model or BoW for short. It’s referred to as a “bag” of words because any information about the structure of the sentence is lost.

In [43]:
# Tokenise both documents on single spaces; punctuation and case are left
# untouched, so tokens such as 'sets,' and 'The' stay distinct words.
bagOfWordsA, bagOfWordsB = (doc.split(' ') for doc in (documentA, documentB))

In [44]:
# Shared vocabulary: every distinct token seen in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [47]:
# Give both documents a zeroed counter over the shared vocabulary, so the two
# dictionaries have identical keys, then tally each document's tokens.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for counts, tokens in ((numOfWordsA, bagOfWordsA), (numOfWordsB, bagOfWordsB)):
    for word in tokens:
        counts[word] += 1

# Show both tallies, separated by a blank line, a row of stars and a blank line.
print(numOfWordsA, "", "*" * 50, "", numOfWordsB, sep="\n")


{'possible': 0, 'sets,': 1, 'developing': 1, 'how': 0, 'can': 0, 'domains': 1, 'without': 0, 'certain': 0, 'big': 1, 'needed': 0, 'of': 2, 'application': 2, 'required': 0, 'out': 0, 'machine': 0, 'decisions': 1, 'an': 1, 'domains.[6]': 1, 'programmed': 0, 'hand;': 0, 'discovering': 0, 'more': 0, 'program': 0, 'wide': 1, 'knowledge': 2, 'findings': 1, 'computers': 0, 'assigned': 0, 'inform': 1, 'learning': 0, 'solve': 1, 'part,': 0, 'focused': 1, 'tasks,': 0, 'to': 2, 'they': 0, 'being': 0, 'needed.': 0, 'interdisciplinary': 1, 'and': 3, 'do': 0, 'human': 0, 'insights': 1, 'applying': 1, 'field': 2, 'provided': 0, 'large': 1, 'challenging': 0, 'computers,': 0, 'solutions,': 1, 'so.': 0, 'the': 1, 'advanced': 0, 'are': 1, 'at': 0, 'on': 1, 'typically': 1, 'telling': 0, 'steps': 0, 'problem': 0, 'The': 1, '(see': 1, 'science': 2, 'broad': 1, 'be': 0, "computer's": 0, 'it': 0, 'simple': 0, 'Data': 1, 'tasks': 0, 'tasks.': 0, 'data,': 1, 'which': 1, 'problems,': 1, 'Machine': 0, 'data),': 1

Another problem with the bag of words approach is that it doesn’t account for noise. In other words, certain words are used to formulate sentences but do not add any semantic meaning to the text. For example, the most commonly used word in the English language is 'the', which represents 7% of all words written or spoken. You couldn’t deduce anything about a text given the fact that it contains the word 'the'. On the other hand, words like 'good' and 'awesome' could be used to determine whether a rating was positive or not.

In natural language processing, such uninformative words are referred to as stop words. The Python Natural Language Toolkit (NLTK) library provides a list of English stop words.

In [48]:
# Load NLTK's stop-word corpus reader and display its English word list
# (the notebook echoes the value of the cell's last expression).
# NOTE(review): presumably the corpus has to be downloaded beforehand,
# e.g. with nltk.download('stopwords') — confirm for your environment.
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Term Frequency (TF)
The number of times a word appears in a document divided by the total number of words in the document. Every document has its own term frequency.
<img src="tf.png">

In [49]:
def computeTF(wordDict, bagOfWords, *, verbose=True):
    """Compute the term frequency (TF) of every word for one document.

    The term frequency of a word is its occurrence count divided by the
    total number of tokens in the document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens; only its length is used.
        verbose: When true (the default), print the bag length and every
            ``word count`` pair while processing. Pass ``False`` to silence
            the trace.

    Returns:
        A dict mapping each key of ``wordDict`` to
        ``count / len(bagOfWords)``. If ``bagOfWords`` is empty, every
        frequency is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    if verbose:
        print("*"*20)
        print('length of bag is ', bagOfWordsCount)
        print("*"*20)
    for word, count in wordDict.items():
        if verbose:
            print(word, count)
        # An empty document has no tokens to divide by; report a frequency of
        # zero rather than failing.
        if bagOfWordsCount:
            tfDict[word] = count / float(bagOfWordsCount)
        else:
            tfDict[word] = 0.0
    return tfDict

In [50]:
# Term frequencies for document A, then document B, over the shared vocabulary.
tfA, tfB = (
    computeTF(wordCounts, bag)
    for wordCounts, bag in ((numOfWordsA, bagOfWordsA), (numOfWordsB, bagOfWordsB))
)
# Print both tables with a row of stars between them.
print(tfA, "*" * 30, tfB, sep="\n")


********************
length of bag is  69
********************
possible 0
sets, 1
developing 1
how 0
can 0
domains 1
without 0
certain 0
big 1
needed 0
of 2
application 2
required 0
out 0
machine 0
decisions 1
an 1
domains.[6] 1
programmed 0
hand; 0
discovering 0
more 0
program 0
wide 1
knowledge 2
findings 1
computers 0
assigned 0
inform 1
learning 0
solve 1
part, 0
focused 1
tasks, 0
to 2
they 0
being 0
needed. 0
interdisciplinary 1
and 3
do 0
human 0
insights 1
applying 1
field 2
provided 0
large 1
challenging 0
computers, 0
solutions, 1
so. 0
the 1
advanced 0
are 1
at 0
on 1
typically 1
telling 0
steps 0
problem 0
The 1
(see 1
science 2
broad 1
be 0
computer's 0
it 0
simple 0
Data 1
tasks 0
tasks. 0
data, 1
which 1
problems, 1
Machine 0
data), 1
formulating 1
algorithms 0
presenting 1
data-driven 1
create 0
for 1
no 0
is 1
a 2
that 0
explicitly 0
so 0
manually 0
data 4
analyzing 1
in 2
preparing 1
execute 0
range 2
perform 0
encompasses 1
all 0
extracting 1
It 0
problems 1
involves

# Inverse Document Frequency (IDF)
The log of the number of documents divided by the number of documents that contain the word w. Inverse document frequency determines the weight of rare words across all documents in the corpus.
<img src="idf.png">

In [51]:
def computeIDF(documents, *, verbose=True):
    """Compute the inverse document frequency (IDF) of every word in a corpus.

    For each word, IDF is ``log(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents in which the word's
    count is greater than zero.

    Args:
        documents: Sequence of dicts, one per document, each mapping
            word -> occurrence count in that document.
        verbose: When true (the default), print the intermediate tables
            while processing. Pass ``False`` to silence the trace.

    Returns:
        A dict mapping every word found in any document to its IDF. A word
        whose count is zero in every document gets ``0.0`` instead of
        raising ``ZeroDivisionError``. An empty ``documents`` yields ``{}``.
    """
    import math

    N = len(documents)
    # Seed the table from every document (first-seen order), not just the
    # second one, so corpora of any size and documents with differing
    # vocabularies are handled.
    idfDict = {}
    for document in documents:
        for word in document:
            idfDict.setdefault(word, 0)
    if verbose:
        print(N)
        print("*"*30)
        print(idfDict)
        print("*"*30)
    # First pass: count in how many documents each word actually occurs.
    for document in documents:
        for word, count in document.items():
            if verbose:
                print(word, count)
            if count > 0:
                idfDict[word] += 1
        if verbose:
            print("*"*10)
    if verbose:
        print(idfDict)
        print("*"*30)
    # Second pass: replace each document frequency by its log-scaled inverse.
    for word, docCount in idfDict.items():
        if docCount:
            idfDict[word] = math.log(N / float(docCount))
        else:
            # The word occurs in no document, so its TF is zero everywhere;
            # a zero weight keeps TF-IDF at zero without dividing by zero.
            idfDict[word] = 0.0
    return idfDict

In [52]:
# Build the IDF table from both documents' word-count dictionaries and show it.
idfs = computeIDF([numOfWordsA, numOfWordsB])
print(idfs)

2
******************************
{'possible': 0, 'sets,': 0, 'developing': 0, 'how': 0, 'can': 0, 'domains': 0, 'without': 0, 'certain': 0, 'big': 0, 'needed': 0, 'of': 0, 'application': 0, 'required': 0, 'out': 0, 'machine': 0, 'decisions': 0, 'an': 0, 'domains.[6]': 0, 'programmed': 0, 'hand;': 0, 'discovering': 0, 'more': 0, 'program': 0, 'wide': 0, 'knowledge': 0, 'findings': 0, 'computers': 0, 'assigned': 0, 'inform': 0, 'learning': 0, 'solve': 0, 'part,': 0, 'focused': 0, 'tasks,': 0, 'to': 0, 'they': 0, 'being': 0, 'needed.': 0, 'interdisciplinary': 0, 'and': 0, 'do': 0, 'human': 0, 'insights': 0, 'applying': 0, 'field': 0, 'provided': 0, 'large': 0, 'challenging': 0, 'computers,': 0, 'solutions,': 0, 'so.': 0, 'the': 0, 'advanced': 0, 'are': 0, 'at': 0, 'on': 0, 'typically': 0, 'telling': 0, 'steps': 0, 'problem': 0, 'The': 0, '(see': 0, 'science': 0, 'broad': 0, 'be': 0, "computer's": 0, 'it': 0, 'simple': 0, 'Data': 0, 'tasks': 0, 'tasks.': 0, 'data,': 0, 'which': 0, 'problem

Lastly, the TF-IDF is simply the TF multiplied by IDF.
<img src="tfidf.png">

In [53]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the word's inverse document frequency.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> IDF; it must contain every word of
            ``tfBagOfWords`` (a missing word raises ``KeyError``).

    Returns:
        A new dict mapping each word of ``tfBagOfWords`` to ``tf * idf``,
        in the same key order.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

In [54]:
# TF-IDF weights for document A and document B, laid out as one DataFrame
# row per document and one column per vocabulary word.
tfidfA, tfidfB = (computeTFIDF(tf, idfs) for tf in (tfA, tfB))
df = pd.DataFrame([tfidfA, tfidfB])
print(df)

# Bare string left as the cell's final expression (the notebook echoes it);
# it records the small alternative corpus.
'''
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'
'''

   possible     sets,  developing       how       can   domains   without  \
0  0.000000  0.010046    0.010046  0.000000  0.000000  0.010046  0.000000   
1  0.008351  0.000000    0.000000  0.016702  0.016702  0.000000  0.008351   

    certain       big    needed  ...  extracting        It  problems  \
0  0.000000  0.010046  0.000000  ...    0.010046  0.000000  0.010046   
1  0.008351  0.000000  0.008351  ...    0.000000  0.008351  0.000000   

   involves       For     carry  actionable  analysis,  from  high-level  
0  0.000000  0.000000  0.000000    0.010046   0.010046   0.0    0.010046  
1  0.016702  0.016702  0.008351    0.000000   0.000000   0.0    0.000000  

[2 rows x 108 columns]


"\ndocumentA = 'the man went out for a walk'\ndocumentB = 'the children sat around the fire'\n"

In [56]:
# Using Sklearn pkg
# With its default settings TfidfVectorizer lowercases the text, tokenises
# with its own pattern, smooths the IDF and L2-normalises each row, so its
# numbers and column set differ from the hand-computed table.

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
print(df)

   actionable  advanced  algorithms       all        an  analysis  analyzing  \
0    0.093049  0.000000    0.000000  0.000000  0.093049  0.093049   0.093049   
1    0.000000  0.083353    0.166707  0.083353  0.000000  0.000000   0.000000   

        and  application  applying  ...     tasks   telling      that  \
0  0.279146     0.186097  0.093049  ...  0.000000  0.000000  0.000000   
1  0.000000     0.000000  0.000000  ...  0.333413  0.083353  0.083353   

        the      they       to  typically     which      wide   without  
0  0.132410  0.000000  0.13241   0.093049  0.093049  0.093049  0.000000  
1  0.237226  0.166707  0.35584   0.000000  0.000000  0.000000  0.083353  

[2 rows x 94 columns]
