# Bag of Words

## Scoring: Binary

In [1]:
from nltk import word_tokenize
from collections import defaultdict
import string

## Step 1: Collect Data

In [2]:
# Toy corpus: four short documents that share most of their vocabulary.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
]

## Step 2: Preprocessing

In [3]:
# Lower-case every document, then split it into word and punctuation tokens.
corpus_tokens = [word_tokenize(document.lower()) for document in corpus]

In [4]:
corpus_tokens

[['this', 'is', 'the', 'first', 'document', '.'],
 ['this', 'is', 'the', 'second', 'second', 'document', '.'],
 ['and', 'the', 'third', 'one', '.'],
 ['is', 'this', 'the', 'first', 'document', '?']]

## Step 3: Design the Vocabulary

In [5]:
# Map each token to the number of times it occurs across the whole corpus.
vocab = defaultdict(int)
for word in (tok for doc in corpus_tokens for tok in doc):
    vocab[word] += 1

In [6]:
vocab

defaultdict(int,
            {'this': 3,
             'is': 3,
             'the': 4,
             'first': 2,
             'document': 3,
             '.': 3,
             'second': 2,
             'and': 1,
             'third': 1,
             'one': 1,
             '?': 1})

### Step 3.1: Removing punctuation from vocab and tokens

In [7]:
# Individual punctuation characters; a set gives constant-time lookups.
punctuation_chars = set(string.punctuation)


def _is_punctuation(token):
    """Return True when ``token`` consists only of punctuation characters.

    ``token in string.punctuation`` is a substring test, so multi-character
    tokens such as '...' or '--' are not found in the punctuation string and
    would be kept. Checking every character catches them. An empty token is
    also reported as punctuation, which matches the substring test (the empty
    string is contained in every string).
    """
    return all(char in punctuation_chars for char in token)


# Drop punctuation-only tokens from every tokenised document.
corpus_tokens = [
    [token for token in tokens if not _is_punctuation(token)]
    for tokens in corpus_tokens
]

# Keep the vocabulary in step with the tokens (the result is a plain dict).
vocab = {term: count for term, count in vocab.items() if not _is_punctuation(term)}

In [8]:
vocab

{'this': 3,
 'is': 3,
 'the': 4,
 'first': 2,
 'document': 3,
 'second': 2,
 'and': 1,
 'third': 1,
 'one': 1}

In [9]:
corpus_tokens

[['this', 'is', 'the', 'first', 'document'],
 ['this', 'is', 'the', 'second', 'second', 'document'],
 ['and', 'the', 'third', 'one'],
 ['is', 'this', 'the', 'first', 'document']]

## Step 4: Creating Document Vectors

In [10]:
class BoW(object):
    """Binary bag-of-words vectorizer over a fixed vocabulary.

    Each document becomes a list of 0/1 flags, one per vocabulary term. Terms
    are taken in sorted order so that column positions are reproducible.
    """

    def __init__(self,vocab: dict):
        """Store the vocabulary terms in sorted order.

        Args:
            vocab: Mapping whose keys are the vocabulary terms. Only the keys
                are used; the values are ignored.
        """
        self.vocab_keys = sorted(vocab.keys())

    def vectorizer(self,sentence: list):
        """Return the binary presence vector of a tokenised sentence.

        Args:
            sentence: The tokens of one document (any iterable of hashable
                tokens; a list of strings in this notebook).

        Returns:
            A list of ints with one entry per term in ``self.vocab_keys``:
            1 if the term occurs among the tokens, otherwise 0.
        """
        # Build the lookup set once: membership checks become O(1) instead of
        # scanning the token list for every vocabulary term, and one-shot
        # iterators are consumed exactly once instead of being exhausted by
        # the first lookup.
        present = set(sentence)
        return [1 if key in present else 0 for key in self.vocab_keys]

In [11]:
# Build the vectorizer from the punctuation-free vocabulary; BoW sorts the
# terms, which fixes the column order of every vector produced below.
bow = BoW(vocab)

In [12]:
# One binary vector per document, in corpus order.
vectors = [bow.vectorizer(document_tokens) for document_tokens in corpus_tokens]
vectors

[[0, 1, 1, 1, 0, 0, 1, 0, 1],
 [0, 1, 0, 1, 0, 1, 1, 0, 1],
 [1, 0, 0, 0, 1, 0, 1, 1, 0],
 [0, 1, 1, 1, 0, 0, 1, 0, 1]]

## Step 5: Testing

In [13]:
# Unseen sentence: none of its words occur in the corpus vocabulary.
sentence = 'Something completely new.'

In [14]:
tokens = word_tokenize(sentence)
print("Tokens")
print(tokens)
# Drop tokens made up solely of punctuation characters, then lower-case the
# rest. A per-character check is used because `token in string.punctuation`
# is a substring test and would keep multi-character tokens such as '...'.
punctuation_chars = set(string.punctuation)
tokens = [
    token.lower()
    for token in tokens
    if not all(char in punctuation_chars for char in token)
]
print("Preprocessed tokens")
print(tokens)

Tokens
['Something', 'completely', 'new', '.']
Preprocessed tokens
['something', 'completely', 'new']


In [15]:
bow.vectorizer(tokens)

[0, 0, 0, 0, 0, 0, 0, 0, 0]

# CountVectorizer

## Scoring: Count

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Default settings. The repr displayed in the next cell shows lowercase=True
# and a token pattern that keeps only runs of two or more word characters, so
# lower-casing and punctuation removal no longer need to be done by hand.
vectorizer = CountVectorizer()

In [18]:
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [19]:
# Same four documents as in the hand-written Bag of Words section, restated
# so that this section can be read on its own.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
]

In [20]:
# Learn the vocabulary (term -> column index) from the corpus.
vectorizer.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [21]:
vectorizer.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [22]:
# Count matrix for the corpus. NOTE: this rebinds `vectors`, which until now
# held the binary vectors produced by the hand-written BoW class.
vectors = vectorizer.transform(corpus)

In [23]:
vectors.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [24]:
# New text is mapped onto the vocabulary learned by fit(): only the
# 'document' (column 1) and 'first' (column 2) counts are non-zero.
vectorizer.transform(['first document']).toarray()

array([[0, 1, 1, 0, 0, 0, 0, 0, 0]])

## Scoring: Frequency

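#### Calculate how often each word appears in a document relative to all the words in that document.

Note: `TfidfTransformer(use_idf=False)` keeps its default `norm='l2'`, so the vectors printed below are term counts scaled to unit Euclidean length rather than proportions that sum to 1. Pass `norm='l1'` to obtain proportions.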

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer

In [26]:
# use_idf=False switches off the IDF weighting. The repr shown after fit()
# reports norm='l2', so each output row is scaled to unit Euclidean length
# rather than to proportions that sum to 1.
tf_transformer = TfidfTransformer(use_idf=False)

In [27]:
# NOTE(review): with use_idf=False there are presumably no IDF weights to
# learn, so this fit() is kept for API symmetry — confirm against the
# scikit-learn documentation.
tf_transformer.fit(vectors)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=False)

In [28]:
# Normalise the CountVectorizer counts currently held in `vectors`.
freq_vectors = tf_transformer.transform(vectors)

In [29]:
# Print the dense form one document (row) at a time.
dense_rows = freq_vectors.toarray()
for row in dense_rows:
    print(row)

[0.        0.4472136 0.4472136 0.4472136 0.        0.        0.4472136
 0.        0.4472136]
[0.         0.35355339 0.         0.35355339 0.         0.70710678
 0.35355339 0.         0.35355339]
[0.5 0.  0.  0.  0.5 0.  0.5 0.5 0. ]
[0.        0.4472136 0.4472136 0.4472136 0.        0.        0.4472136
 0.        0.4472136]


## TF-IDF

In [30]:
# Textbook TF-IDF definitions, printed for reference.
# NOTE(review): the TfidfTransformer fitted in the following cells reports
# smooth_idf=True and norm='l2', so its numbers presumably follow
# scikit-learn's smoothed, L2-normalised variant rather than these exact
# formulas — confirm against the scikit-learn documentation.
print("TF-IDF(t,d) = TF(t,d) × IDF(t)\n")

print("TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).\n")
print("IDF(t) = log_e(Total number of documents / Number of documents with term t in it).")

TF-IDF(t,d) = TF(t,d) × IDF(t)

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

IDF(t) = log_e(Total number of documents / Number of documents with term t in it).


In [31]:
# Rebinds `tf_transformer`, this time with the defaults (the repr shown after
# fit() reports use_idf=True, i.e. IDF weighting is enabled).
tf_transformer = TfidfTransformer()

In [32]:
# Fit on the unigram count matrix produced by CountVectorizer.
tf_transformer.fit(vectors)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [33]:
# TF-IDF weighted rows for the same unigram count matrix.
tf_freq_vectors = tf_transformer.transform(vectors)

In [34]:
print(tf_freq_vectors.toarray())

[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]


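## Limitations of the Bag of Words representation

A bag of words discards word order: the first and fourth documents ('This is the first document.' and 'Is this the first document?') receive identical vectors above. Adding n-grams such as bi-grams keeps some of the local word order.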

## bi-gram

In [35]:
# ngram_range=(1, 2) extracts both unigrams and bigrams. The custom
# token_pattern also accepts single-character words, unlike the default
# pattern shown in the earlier CountVectorizer repr (two or more word
# characters). min_df=1 equals the default value shown in that repr.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)

In [36]:
# The analyzer is the callable that turns raw text into the final feature
# list; it is used in the next cell to preview the n-grams before fitting.
analyze = bigram_vectorizer.build_analyzer()

In [37]:
# The text is lower-cased, 'Bi-grams' is split at the hyphen ('-' is not a
# word character) and the trailing '!' is dropped; unigrams come first,
# followed by the bigrams.
analyze('Bi-grams are cool!')

['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool']

In [38]:
# Learn the combined unigram + bigram vocabulary from the corpus.
bigram_vectorizer.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='\\b\\w+\\b', tokenizer=None,
                vocabulary=None)

In [39]:
bigram_vectorizer.vocabulary_

{'this': 18,
 'is': 5,
 'the': 12,
 'first': 3,
 'document': 2,
 'this is': 19,
 'is the': 6,
 'the first': 13,
 'first document': 4,
 'second': 9,
 'the second': 14,
 'second second': 11,
 'second document': 10,
 'and': 0,
 'third': 16,
 'one': 8,
 'and the': 1,
 'the third': 15,
 'third one': 17,
 'is this': 7,
 'this the': 20}

In [40]:
# Unigram + bigram count matrix. NOTE: rebinds `vectors` once more; it no
# longer holds the unigram-only counts.
vectors = bigram_vectorizer.transform(corpus)

In [41]:
# Same TF-IDF steps as in the TF-IDF section, now applied to the n-gram
# counts; rebinds `tf_transformer`.
tf_transformer = TfidfTransformer()

In [42]:
# Fit on the unigram + bigram count matrix.
tf_transformer.fit(vectors)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [43]:
# TF-IDF weighted rows for the n-gram counts; rebinds `tf_freq_vectors`.
tf_freq_vectors = tf_transformer.transform(vectors)

In [44]:
print(tf_freq_vectors.toarray())

[[0.         0.         0.29752161 0.36749838 0.36749838 0.29752161
  0.36749838 0.         0.         0.         0.         0.
  0.24324341 0.36749838 0.         0.         0.         0.
  0.29752161 0.36749838 0.        ]
 [0.         0.         0.20454415 0.         0.         0.20454415
  0.25265271 0.         0.         0.64091586 0.32045793 0.32045793
  0.16722824 0.         0.32045793 0.         0.         0.
  0.20454415 0.25265271 0.        ]
 [0.39928771 0.39928771 0.         0.         0.         0.
  0.         0.         0.39928771 0.         0.         0.
  0.20836489 0.         0.         0.39928771 0.39928771 0.39928771
  0.         0.         0.        ]
 [0.         0.         0.27571531 0.34056326 0.34056326 0.27571531
  0.         0.43196131 0.         0.         0.         0.
  0.22541533 0.34056326 0.         0.         0.         0.
  0.27571531 0.         0.43196131]]
