<a href="https://colab.research.google.com/github/Nawapon19/NLP/blob/main/TFIDF_From_Scratch_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TF-IDF From Scratch**

In [1]:
# Download the BBC news CSV. The file is fetched from a mirror hosted on
# lazyprogrammer.me (the dataset itself is said to originate from Kaggle).
# -nc ("no clobber") skips the download when the file already exists locally,
# so re-running this cell is cheap.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

--2023-09-15 19:52:44--  https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5085081 (4.8M) [text/csv]
Saving to: ‘bbc_text_cls.csv’


2023-09-15 19:52:45 (69.3 MB/s) - ‘bbc_text_cls.csv’ saved [5085081/5085081]



In [3]:
# import pandas, numpy and nltk libraries
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [4]:
# Download the tokenizer models that word_tokenize needs.
# Older NLTK releases load the 'punkt' package; newer releases look for
# 'punkt_tab' instead and raise LookupError when only 'punkt' is installed.
# Requesting both keeps the notebook runnable on either version. On a release
# that does not know 'punkt_tab', the downloader reports the error and returns
# False rather than raising, so 'punkt' is requested last to keep this cell's
# displayed result meaningful.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Load the downloaded CSV into a pandas DataFrame.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# looks like a row index that was saved into the file — confirm, and consider
# index_col=0 if it is not needed as data.
df = pd.read_csv('bbc_text_cls.csv')

In [6]:
# Preview the first rows to check the available columns
# (the article body is in 'text', the category in 'labels').
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [7]:
# Build the vocabulary and encode the corpus in a single pass:
#   word2idx       maps each distinct lower-cased token to an integer id,
#                  assigned in order of first appearance (0, 1, 2, ...)
#   tokenized_docs holds one list of token ids per document, in corpus order
word2idx = {}
tokenized_docs = []
for doc in df['text']:
  # setdefault returns the existing id, or registers the token under the next
  # free id, which is always the current vocabulary size
  doc_as_int = [
    word2idx.setdefault(word, len(word2idx))
    for word in word_tokenize(doc.lower())
  ]
  tokenized_docs.append(doc_as_int)

# next unused id == number of distinct tokens seen
idx = len(word2idx)

In [8]:
# Reverse lookup (integer id -> token), used to turn column indices back into words.
# Ids are unique, so swapping keys and values loses nothing.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [9]:
# N = number of documents, i.e. the number of rows of the TF matrix
N = df['text'].shape[0]

In [10]:
# V = vocabulary size (number of distinct tokens), i.e. the number of
# columns of the TF matrix
V = len(word2idx)

In [11]:
# Allocate the dense term-frequency matrix: one row per document, one column
# per vocabulary entry, all counts starting at zero.
tf = np.zeros(shape=(N, V), dtype=float)

In [12]:
# Fill in the raw term counts: tf[d, t] ends up as the number of times
# token id t occurs in document d.
for doc_row, token_ids in enumerate(tokenized_docs):
  for token_col in token_ids:
    # each occurrence of a token adds one to its cell
    tf[doc_row, token_col] += 1

In [13]:
# Inverse document frequency.
# document_freq[t] = number of documents that contain token t at least once:
# (tf > 0) is a boolean (N, V) mask, and summing it down axis 0 counts the
# True entries per column, giving an array of shape (V,).
document_freq = (tf > 0).sum(axis=0)
# idf[t] = log(N / df[t]); rarer tokens receive larger weights
idf = np.log(N / document_freq)

In [14]:
# Compute TF-IDF: scale every term count by that term's IDF weight.
# tf is (N, V) and idf is (V,), so NumPy broadcasting multiplies each
# row of tf element-wise by idf.
tf_idf = tf * idf

In [15]:
# Seed NumPy's global random generator so the random document picks that
# follow are repeatable when the notebook is run from the top.
np.random.seed(123)

In [17]:
# Sample one document at random and list its five highest-scoring TF-IDF terms.
i = np.random.choice(N)  # random row index in [0, N)
row = df.iloc[i]  # the matching record (text + label)
headline = row['text'].partition("\n")[0]  # everything before the first newline
print("Label:", row['labels'])
print("Text", headline)
print("Top 5 terms:")

scores = tf_idf[i]  # TF-IDF weights of the sampled document, one per vocabulary entry
indices = (-scores).argsort()  # negating makes the ascending sort run from highest to lowest score

# map the five best column indices back to their tokens
top_five = indices[:5]
for j in top_five:
  print(idx2word[j])

Label: politics
Text Clarke faces ID cards rebellion
Top 5 terms:
cards
clarke
rebellion
id
bill
