In [1]:
# Download the BBC text-classification CSV into the working directory.
# wget's -nc (no-clobber) flag skips the download when the file already exists,
# so re-running this cell is cheap.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

--2023-07-19 00:58:41--  https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.21.23.210, 172.67.213.166, 2606:4700:3030::ac43:d5a6, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.21.23.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5085081 (4.8M) [text/csv]
Saving to: ‘bbc_text_cls.csv’


2023-07-19 00:58:41 (246 MB/s) - ‘bbc_text_cls.csv’ saved [5085081/5085081]



In [2]:
import pandas as pd
import nltk
import numpy as np

from nltk import word_tokenize


In [3]:
# word_tokenize depends on NLTK's Punkt tokenizer data.
# 'punkt' is the resource older NLTK releases load; NLTK 3.8.2 and later look up
# 'punkt_tab' instead, so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Before implementation, we hypothesise that the words with the highest TF-IDF values in a document should have two attributes:
### 1. They occur frequently within that document (high term frequency, TF).
### 2. They are rare across the rest of the corpus: a word that occurs in most documents behaves like a stopword and is down-weighted (high inverse document frequency, IDF).

In [4]:
# Load the BBC news dataset downloaded above.
# As the preview below shows, the frame has a 'text' column (the article) and a
# 'labels' column (its category), plus an 'Unnamed: 0' column from the CSV.
df = pd.read_csv('bbc_text_cls.csv')

In [None]:
# Preview the first five rows to check the column names and contents.
df.head()



Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [15]:
# Build the vocabulary: map every distinct (lower-cased) token to an integer id.
# That id is also the token's column in the tf / tf-idf matrices built later.
word2idx = {}
tokenized_docs = []  # one list of token ids per document, in corpus order
for doc in df['text']:
  doc_as_int = []
  for word in word_tokenize(doc.lower()):
    # first sighting of a token: give it the next unused id (= current vocab size)
    doc_as_int.append(word2idx.setdefault(word, len(word2idx)))
  tokenized_docs.append(doc_as_int)
idx = len(word2idx)  # next unused id, i.e. the value a running counter would end on

In [22]:
# Inspect the integer encoding of the last document in the corpus.
# Index tokenized_docs explicitly instead of relying on a loop variable that
# happens to be left over from another cell.
print(tokenized_docs[-1],end=" ")

[4586, 18792, 40, 104, 2054, 104, 2338, 5486, 2980, 410, 30305, 27, 76, 28087, 549, 28, 6651, 31, 76, 410, 299, 1217, 290, 758, 9040, 1639, 986, 6597, 2010, 145, 2077, 8084, 654, 37, 32575, 238, 23, 7674, 40, 229, 540, 1150, 7484, 5037, 40, 16, 23, 1253, 3494, 16, 2380, 758, 17788, 356, 46, 16, 1543, 63, 1058, 31, 160, 195, 1101, 16, 63, 8084, 935, 312, 34, 1049, 292, 7516, 40, 3, 87, 347, 7364, 367, 34682, 356, 370, 4454, 37, 3, 27, 173, 5037, 16206, 13625, 27, 40, 2309, 31, 299, 37, 23, 338, 57, 17722, 463, 160, 312, 8084, 34, 441, 615, 288, 8550, 452, 1616, 16, 1880, 72, 27, 910, 92, 1444, 221, 288, 367, 1810, 23, 1554, 288, 12699, 1227, 5486, 72, 27, 173, 5037, 7557, 16218, 27, 28, 3091, 31, 76, 299, 1282, 63, 292, 794, 6590, 273, 23, 3473, 31326, 104, 2338, 5486, 8084, 19, 34683, 21, 463, 160, 19014, 385, 16, 1880, 308, 292, 130, 367, 368, 23, 8469, 37, 347, 34683, 31, 8470, 465, 410, 898, 292, 1455, 27, 191, 1976, 2429, 312, 2807, 34, 763, 1616, 16, 34684, 27, 173, 5037, 4928, 34

In [16]:
# Inverse lookup (integer id -> token), so matrix column indices can be turned
# back into the words they stand for.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [17]:
# N: number of documents in the corpus (one per entry of the 'text' column)
n = df['text'].shape[0]
n

2225

In [18]:
# V: vocabulary size, i.e. the number of distinct tokens recorded in word2idx
v=len(word2idx)
v

34762

In [19]:
# Allocate the term-frequency matrix, all zeros: one row per document and one
# column per vocabulary word (N x V).
# Note: a count vectorizer could have produced this matrix as well.
tf = np.zeros(shape=(n, v), dtype=np.float64)
tf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Fill the term-frequency matrix: tf[d, t] = number of times token t occurs in document d.
for row_idx, token_ids in enumerate(tokenized_docs):
  # np.add.at accumulates over repeated indices, so a token that occurs k times
  # in the document adds k to its column (plain fancy-index += would add only 1).
  np.add.at(tf[row_idx], np.asarray(token_ids, dtype=np.intp), 1)
tf

array([[1., 4., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [27]:
# Inverse document frequency: idf(t) = log(N / df(t)), where df(t) is the
# number of documents in which token t occurs at least once (no smoothing).
document_freq = np.count_nonzero(tf > 0, axis=0)
idf = np.log(n / document_freq)


In [28]:
# TF-IDF: scale each term count by that term's idf.
# idf has one entry per vocabulary column, so it broadcasts across every row of tf.
tf_idf = np.multiply(tf, idf)

In [29]:
# Seed NumPy's global RNG so the random document picked below is repeatable.
# Re-running only the sampling cell (without this one) advances the RNG and
# selects a different document each time.
np.random.seed(123)

In [33]:
#pick a random document, show the top 5 terms (in terms of tf_idf score)
i = np.random.choice(n)  # random document index in [0, n)
row = df.iloc[i]
print("Label: ",row['labels'])
# only show the text up to the first newline (the headline in the preview above)
print("Text: ",row['text'].split("\n",1)[0])
print("Top 5 terms: ")

# argsort sorts ascending, so negate the scores to get the highest tf-idf first
scores = tf_idf[i]
indices=(-scores).argsort()

for j in indices[:5]:
  print(idx2word[j])



Label:  tech
Text:  IBM puts cash behind Linux push
Top 5 terms: 
linux
ibm
workplace
software
programs


In [None]:
#doing this through a count vectorizer would have been easier, as it does the word-to-index mapping itself.