# Get data

In [3]:
doc1 = "The quick brown fox jumps over the lazy dog."
doc2 = "A journey of a thousand miles begins with a single step."
doc3 = "Honesty is the best policy."
doc4 = "The early bird catches the worm."
doc5 = "A watched pot never boils."
doc6 = "Actions speak louder than words."
doc7 = "Beauty is in the eye of the beholder."
doc8 = "Better late than never."
doc9 = "Birds of a feather flock together."
doc10 = "Cleanliness is next to godliness."

docs = [doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8, doc9, doc10]

# Tokenize sentences

In [18]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Show a tokenized example.
print('Tokens from doc1:')
print(tokenizer.tokenize(doc1))

# Show the token IDs from the BERT tokenizer.
print('\nToken IDs from doc1:')
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(doc1)))

Tokens from doc1:
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']

Token IDs from doc1:
[1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899, 1012]


In [46]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
doc_tokens = tokenizer(docs, add_special_tokens=False, padding=True).input_ids
doc_tokens

[[1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899, 1012, 0, 0],
 [1037, 4990, 1997, 1037, 4595, 2661, 4269, 2007, 1037, 2309, 3357, 1012],
 [16718, 2003, 1996, 2190, 3343, 1012, 0, 0, 0, 0, 0, 0],
 [1996, 2220, 4743, 11269, 1996, 15485, 1012, 0, 0, 0, 0, 0],
 [1037, 3427, 8962, 2196, 26077, 2015, 1012, 0, 0, 0, 0, 0],
 [4506, 3713, 10989, 2084, 2616, 1012, 0, 0, 0, 0, 0, 0],
 [5053, 2003, 1999, 1996, 3239, 1997, 1996, 27541, 2121, 1012, 0, 0],
 [2488, 2397, 2084, 2196, 1012, 0, 0, 0, 0, 0, 0, 0],
 [5055, 1997, 1037, 15550, 19311, 2362, 1012, 0, 0, 0, 0, 0],
 [4550, 20942, 2003, 2279, 2000, 2643, 20942, 1012, 0, 0, 0, 0]]

In [56]:
# Store the tokenized sequences in a dataframe.
df_doc_tokens = pd.DataFrame(doc_tokens, columns=['w'+str(i) for i in range(len(doc_tokens[0]))])
df_doc_tokens

Unnamed: 0,w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11
0,1996,4248,2829,4419,14523,2058,1996,13971,3899,1012,0,0
1,1037,4990,1997,1037,4595,2661,4269,2007,1037,2309,3357,1012
2,16718,2003,1996,2190,3343,1012,0,0,0,0,0,0
3,1996,2220,4743,11269,1996,15485,1012,0,0,0,0,0
4,1037,3427,8962,2196,26077,2015,1012,0,0,0,0,0
5,4506,3713,10989,2084,2616,1012,0,0,0,0,0,0
6,5053,2003,1999,1996,3239,1997,1996,27541,2121,1012,0,0
7,2488,2397,2084,2196,1012,0,0,0,0,0,0,0
8,5055,1997,1037,15550,19311,2362,1012,0,0,0,0,0
9,4550,20942,2003,2279,2000,2643,20942,1012,0,0,0,0


# Apply TF-IDF

In [73]:
# Get all unique tokens
unique_tokens_ids = list(set([token for doc in doc_tokens for token in doc]))

# Count occurence of each tokens for each documents
df_count = pd.DataFrame(columns=unique_tokens_ids)
for index, row in df_doc_tokens.iterrows():
    df_count.loc[index] = df_doc_tokens.loc[index].value_counts() 
df_count.fillna(0, inplace=True)

# Change column names to token names
unique_tokens = tokenizer.convert_ids_to_tokens(unique_tokens_ids)
df_count.columns = unique_tokens

# Remove [PAD] token
df_count = df_count.drop('[PAD]', axis=1)
df_count

Unnamed: 0,speak,pot,single,catches,bird,over,brown,a,best,policy,...,##s,watched,miles,next,louder,flock,thousand,.,worm,journey
0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [74]:
# Divide each value by the number of tokens in the document
df_freq = df_count.div(df_count.sum(axis=1), axis=0)
df_freq

Unnamed: 0,speak,pot,single,catches,bird,over,brown,a,best,policy,...,##s,watched,miles,next,louder,flock,thousand,.,worm,journey
0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
1,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.083333,0.0,0.0,0.0,0.083333,0.083333,0.0,0.083333
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0
3,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0
4,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,...,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
5,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.125,0.0,0.0


In [84]:
# Multiply each value by the inverse document frequency
import math
df_freq.apply(lambda x: x * math.log(len(docs) / sum(x)), axis=1)

Unnamed: 0,speak,pot,single,catches,bird,over,brown,a,best,policy,...,##s,watched,miles,next,louder,flock,thousand,.,worm,journey
0,0.0,0.0,0.0,0.0,0.0,0.230259,0.230259,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230259,0.0,0.0
1,0.0,0.0,0.191882,0.0,0.0,0.0,0.0,0.575646,0.0,0.0,...,0.0,0.0,0.191882,0.0,0.0,0.0,0.191882,0.191882,0.0,0.191882
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383764,0.383764,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383764,0.0,0.0
3,0.0,0.0,0.0,0.328941,0.328941,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328941,0.328941,0.0
4,0.0,0.328941,0.0,0.0,0.0,0.0,0.0,0.328941,0.0,0.0,...,0.328941,0.328941,0.0,0.0,0.0,0.0,0.0,0.328941,0.0,0.0
5,0.383764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.383764,0.0,0.0,0.383764,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230259,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.460517,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328941,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.328941,0.0,0.328941,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.287823,0.0,0.0,0.0,0.287823,0.0,0.0
