# Introduction to NLP with Python - BoW and tfidf

Written by Sumithra Velupillai, June 2019


## Working with datasets and packages

Importing packages you need for a project is the first step.

pandas is a very useful package for working with datasets.

In [1]:
import pandas as pd
import numpy as np

spaCy has a default language model for English that we will load into the variable 'nlp'.

In [2]:
try:
    import spacy
except ImportError as e:
    !pip install spacy
    import spacy
try:
    nlp = spacy.load('en_core_web_sm')
except Error as e:
    !python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm')

In [3]:
# Fetch the NLTK 'stopwords' data package, then import the corpus reader
# that provides the stopword lists.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sumithra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Bag-of-words and tf-idf
From: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [4]:
# Let pandas show up to 1000 characters per cell so the full document text
# stays visible when a DataFrame is displayed.
pd.options.display.max_colwidth = 1000

In [5]:
import string
def get_spacy_tokens(row):
    """Return the non-punctuation tokens of a parsed document as strings.

    Args:
        row: Object exposing a ``doc`` attribute that iterates over tokens
            (the notebook passes the result of ``nlp(text)``).

    Returns:
        list: ``str(token)`` for each token, in document order, skipping
        empty tokens and tokens made up entirely of ``string.punctuation``
        characters.
    """
    punctuation = set(string.punctuation)
    texts = (str(token) for token in row.doc)
    # Test character by character: a substring check against
    # string.punctuation would let runs such as "..." or "--" through.
    return [
        text for text in texts
        if not all(char in punctuation for char in text)
    ]



In [6]:
# Four short clinical-style notes that make up the toy corpus.
documentA, documentB, documentC, documentD = (
    'patient with abdominal pain. she has taken aspirin.',
    'she has had abdominal pain in the past. pain free today.',
    'no abdominal pain.',
    'takes aspirin for pain. has no pain today.',
)

In [7]:
# Parse each document with spaCy and keep its non-punctuation tokens.
# (An earlier variant of this cell called a get_spacy_lemmas helper in place
# of get_spacy_tokens.)
bagOfWordsA, bagOfWordsB, bagOfWordsC, bagOfWordsD = (
    get_spacy_tokens(nlp(text))
    for text in (documentA, documentB, documentC, documentD)
)

In [8]:
bagOfWordsA

['patient', 'with', 'abdominal', 'pain', 'she', 'has', 'taken', 'aspirin']

In [9]:
# Vocabulary: every distinct token that occurs in any of the four documents.
uniqueWords = {*bagOfWordsA, *bagOfWordsB, *bagOfWordsC, *bagOfWordsD}

In [10]:
uniqueWords

{'abdominal',
 'aspirin',
 'for',
 'free',
 'had',
 'has',
 'in',
 'no',
 'pain',
 'past',
 'patient',
 'she',
 'taken',
 'takes',
 'the',
 'today',
 'with'}

In [11]:
def getBow(uniqueWords, bow):
    """Count how often each vocabulary word occurs in one document.

    Args:
        uniqueWords: Iterable of vocabulary words; each becomes a key.
        bow: Iterable of the document's tokens.

    Returns:
        dict: word -> number of occurrences in ``bow``. Tokens that are not
        in the vocabulary are ignored; unseen vocabulary words map to 0.
    """
    counts = {term: 0 for term in uniqueWords}
    for token in bow:
        if token not in counts:
            continue
        counts[token] = counts[token] + 1
    return counts

In [12]:
# Bag-of-words counts for each document over the shared vocabulary.
numOfWordsA, numOfWordsB, numOfWordsC, numOfWordsD = (
    getBow(uniqueWords, tokens)
    for tokens in (bagOfWordsA, bagOfWordsB, bagOfWordsC, bagOfWordsD)
)

In [13]:
# One row per document, one column per vocabulary word, plus the raw text
# as the last column for reference.
dfbow = pd.DataFrame(
    [numOfWordsA, numOfWordsB, numOfWordsC, numOfWordsD]
).assign(text=[documentA, documentB, documentC, documentD])
dfbow

Unnamed: 0,free,abdominal,the,in,no,today,takes,taken,past,had,patient,with,pain,has,aspirin,for,she,text
0,0,1,0,0,0,0,0,1,0,0,1,1,1,1,1,0,1,patient with abdominal pain. she has taken aspirin.
1,1,1,1,1,0,1,0,0,1,1,0,0,2,1,0,0,1,she has had abdominal pain in the past. pain free today.
2,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,no abdominal pain.
3,0,0,0,0,1,1,1,0,0,0,0,0,2,1,1,1,0,takes aspirin for pain. has no pain today.


In [14]:
# Remove English stopwords from the vocabulary. The stopword list is read
# once and subtracted as a set instead of being re-read and scanned for
# every vocabulary word.
uniqueWords = set(uniqueWords).difference(stopwords.words('english'))

In [15]:
# Recount each document against the stopword-filtered vocabulary.
numOfWordsA, numOfWordsB, numOfWordsC, numOfWordsD = (
    getBow(uniqueWords, tokens)
    for tokens in (bagOfWordsA, bagOfWordsB, bagOfWordsC, bagOfWordsD)
)

In [16]:
# Rebuild the bag-of-words table from the filtered counts, with the raw
# text as the last column.
dfbow = pd.DataFrame(
    [numOfWordsA, numOfWordsB, numOfWordsC, numOfWordsD]
).assign(text=[documentA, documentB, documentC, documentD])
dfbow

Unnamed: 0,today,takes,taken,free,past,abdominal,patient,pain,aspirin,text
0,0,0,1,0,0,1,1,1,1,patient with abdominal pain. she has taken aspirin.
1,1,0,0,1,1,1,0,2,0,she has had abdominal pain in the past. pain free today.
2,0,0,0,0,0,1,0,1,0,no abdominal pain.
3,1,1,0,0,0,0,0,2,1,takes aspirin for pain. has no pain today.


In [17]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of each vocabulary word in one document.

    Args:
        wordDict: Mapping of word -> raw count in the document.
        bagOfWords: The document's token list; its length is the
            denominator, so tokens outside ``wordDict`` still count
            towards the document length.

    Returns:
        dict: word -> count / len(bagOfWords). Every frequency is 0.0 when
        ``bagOfWords`` is empty.
    """
    totalTokens = len(bagOfWords)
    if totalTokens == 0:
        # An empty document contains no terms; avoid dividing by zero.
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / totalTokens for word, count in wordDict.items()}

In [18]:
# Term frequencies per document: filtered counts divided by the full token
# count of the same document.
tfA, tfB, tfC, tfD = (
    computeTF(counts, tokens)
    for counts, tokens in (
        (numOfWordsA, bagOfWordsA),
        (numOfWordsB, bagOfWordsB),
        (numOfWordsC, bagOfWordsC),
        (numOfWordsD, bagOfWordsD),
    )
)

In [19]:
tfA

{'today': 0.0,
 'takes': 0.0,
 'taken': 0.125,
 'free': 0.0,
 'past': 0.0,
 'abdominal': 0.125,
 'patient': 0.125,
 'pain': 0.125,
 'aspirin': 0.125}

In [20]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: Sequence of dicts mapping word -> count, one dict per
            document.

    Returns:
        dict: word -> log(N / df), where N is the number of documents and
        df is the number of documents in which the word's count is > 0.
        Words that occur in no document get 0.0. An empty ``documents``
        yields an empty dict.
    """
    import math
    N = len(documents)

    # Collect words from every document, not only the first, so dicts with
    # differing keys do not raise KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    # A word with zero document frequency has no defined IDF; use 0.0 so
    # its tf-idf weight is 0 rather than dividing by zero.
    return {
        word: math.log(N / df) if df else 0.0
        for word, df in docFreq.items()
    }

In [21]:
# Inverse document frequency of each word across the four per-document
# count dicts.
idfs = computeIDF([numOfWordsA, numOfWordsB, numOfWordsC, numOfWordsD])

In [22]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the word's inverse document frequency.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; must contain
            every word in ``tfBagOfWords``.

    Returns:
        dict: word -> tf * idf, rounded to three decimal places.
    """
    return {
        term: round(frequency * idfs[term], 3)
        for term, frequency in tfBagOfWords.items()
    }

In [23]:
# tf-idf weights per document, collected into one row per document.
tfidfA, tfidfB, tfidfC, tfidfD = (
    computeTFIDF(tf, idfs) for tf in (tfA, tfB, tfC, tfD)
)
dftfidf = pd.DataFrame([tfidfA, tfidfB, tfidfC, tfidfD])

In [24]:

# Attach the raw document text as a column so each row of weights can be
# read next to its source sentence, then display the table.
dftfidf['text'] = [documentA, documentB, documentC, documentD]
dftfidf

Unnamed: 0,today,takes,taken,free,past,abdominal,patient,pain,aspirin,text
0,0.0,0.0,0.173,0.0,0.0,0.036,0.173,0.0,0.087,patient with abdominal pain. she has taken aspirin.
1,0.063,0.0,0.0,0.126,0.126,0.026,0.0,0.0,0.0,she has had abdominal pain in the past. pain free today.
2,0.0,0.0,0.0,0.0,0.0,0.096,0.0,0.0,0.0,no abdominal pain.
3,0.087,0.173,0.0,0.0,0.0,0.0,0.0,0.0,0.087,takes aspirin for pain. has no pain today.
