In [12]:
import pandas as pd

# Load the dataset
df = pd.read_csv('abcnews-date-text.csv')

# Explore the first few rows to understand the data
print(df.head())

# Check the unique topics/categories
unique_topics = df['headline_text'].unique()
print("Unique Topics: ", unique_topics)

# Check the distribution of articles across topics
topic_distribution = df['headline_text'].value_counts()
print("Topic Distribution: \n", topic_distribution)

   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers
Unique Topics:  ['aba decides against community broadcasting licence'
 'act fire witnesses must be aware of defamation'
 'a g calls for infrastructure protection summit' ...
 'wa delays adopting new close contact definition'
 'western ringtail possums found badly dehydrated in heatwave'
 'what makes you a close covid contact here are the new rules']
Topic Distribution: 
 national rural news                                            983
abc sport                                                      718
abc weather                                                    714
abc business news and m

# Bag-of-Words (BoW)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
text=['aba decides against community broadcasting licence','act fire witnesses must be aware of defamation',
      'ambitious olsson wins triple jump']
#Initialize the vector
vectorizer=CountVectorizer()
#transform
x=vectorizer.fit_transform(text)
#get feature names(words)
feature_names=vectorizer.get_feature_names_out()
#Display BOW-Matrix
print('Feature names:\n',feature_names)
print()
print('BOW Matrix:\n')
print(x.toarray())


Feature names:
 ['aba' 'act' 'against' 'ambitious' 'aware' 'be' 'broadcasting' 'community'
 'decides' 'defamation' 'fire' 'jump' 'licence' 'must' 'of' 'olsson'
 'triple' 'wins' 'witnesses']

BOW Matrix:

[[1 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 1]
 [0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0]]


# TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
text=['aba decides against community broadcasting licence','act fire witnesses must be aware of defamation',
      'ambitious olsson wins triple jump']

tfidf=TfidfVectorizer()
x_tfidf=tfidf.fit_transform(text)
features_names_tfidf=tfidf.get_feature_names_out()
print('Feature Names(TF-IDF):\n',features_names_tfidf)
print()
print('TF-IDF Matrix:\n',x_tfidf.toarray())

Feature Names(TF-IDF):
 ['aba' 'act' 'against' 'ambitious' 'aware' 'be' 'broadcasting' 'community'
 'decides' 'defamation' 'fire' 'jump' 'licence' 'must' 'of' 'olsson'
 'triple' 'wins' 'witnesses']

TF-IDF Matrix:
 [[0.40824829 0.         0.40824829 0.         0.         0.
  0.40824829 0.40824829 0.40824829 0.         0.         0.
  0.40824829 0.         0.         0.         0.         0.
  0.        ]
 [0.         0.35355339 0.         0.         0.35355339 0.35355339
  0.         0.         0.         0.35355339 0.35355339 0.
  0.         0.35355339 0.35355339 0.         0.         0.
  0.35355339]
 [0.         0.         0.         0.4472136  0.         0.
  0.         0.         0.         0.         0.         0.4472136
  0.         0.         0.         0.4472136  0.4472136  0.4472136
  0.        ]]


In [19]:
from nltk.tokenize import sent_tokenize

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
text=input('Enter a statement:\n\t')
sent_token=sent_tokenize(text)
#print('Sentences:\n',sent_token)
for sent in sent_token:
 print(sent)
 print(len(sent))
 
print()
tfidf=TfidfVectorizer()
x_tfidf=tfidf.fit_transform(sent_token)
features_names_tfidf=tfidf.get_feature_names_out()
print('Feature Names(TF-IDF):\n',features_names_tfidf)
print()
print('TF-IDF Matrix:\n',x_tfidf.toarray())


Enter a statement:
	commonwealth bank cuts fixed home loan rates .community urged to help homeless youth
commonwealth bank cuts fixed home loan rates .community urged to help homeless youth
84

Feature Names(TF-IDF):
 ['bank' 'commonwealth' 'community' 'cuts' 'fixed' 'help' 'home' 'homeless'
 'loan' 'rates' 'to' 'urged' 'youth']

TF-IDF Matrix:
 [[0.2773501 0.2773501 0.2773501 0.2773501 0.2773501 0.2773501 0.2773501
  0.2773501 0.2773501 0.2773501 0.2773501 0.2773501 0.2773501]]


# N-grams:

In [21]:
import nltk
from nltk import ngrams
text='aba decides against community broadcasting licence'
tokens=nltk.word_tokenize(text)
#Generate bigrams
n=2
bigrams=list(ngrams(tokens,n))
print(f"Original text:{text}")
print()
print(f"Generated {n}-grams:")
for gram in bigrams:
 print(gram)

Original text:aba decides against community broadcasting licence

Generated 2-grams:
('aba', 'decides')
('decides', 'against')
('against', 'community')
('community', 'broadcasting')
('broadcasting', 'licence')


In [22]:
import nltk
from nltk import ngrams
text='aba decides against community broadcasting licence'
tokens=nltk.word_tokenize(text)
#Generate bigrams
n=3
bigrams=list(ngrams(tokens,n))
print(f"Original text:{text}")
print()
print(f"Generated {n}-grams:")
for gram in bigrams:
 print(gram)

Original text:aba decides against community broadcasting licence

Generated 3-grams:
('aba', 'decides', 'against')
('decides', 'against', 'community')
('against', 'community', 'broadcasting')
('community', 'broadcasting', 'licence')


In [24]:
import nltk
from nltk import ngrams
text='aba decides against community broadcasting licence'
tokens=nltk.word_tokenize(text)
#Generate bigrams
n=4
bigrams=list(ngrams(tokens,n))
print(f"Original text:{text}")
print()
print(f"Generated {n}-grams:")
for gram in bigrams:
 print(gram)

Original text:aba decides against community broadcasting licence

Generated 4-grams:
('aba', 'decides', 'against', 'community')
('decides', 'against', 'community', 'broadcasting')
('against', 'community', 'broadcasting', 'licence')


In [26]:
#generate n-grams with list comprehension
n=2
text='aba decides against community broadcasting licence'
tokens=nltk.word_tokenize(text)
ngrams=[tokens[i:i+n] for i in range(len(tokens)-n+1)]
print(f"Original text:{text}")
print()
print(f"Generated {n}-grams:")
for gram in ngrams:
 print(gram)

Original text:aba decides against community broadcasting licence

Generated 2-grams:
['aba', 'decides']
['decides', 'against']
['against', 'community']
['community', 'broadcasting']
['broadcasting', 'licence']


# onehot encoding

In [28]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
text=['aba decides against community broadcasting licence','act fire witnesses must be aware of defamation',
      'ambitious olsson wins triple jump']
#step 1- tokens
tokens=[word for sent in text for word in sent.lower().split()]
#step 2- vocabulary
vocabulary=list(set(tokens)) # unique words in the text
#initialize encoder
encoder=OneHotEncoder(categories=[vocabulary],sparse=False)
#Perform the one-hot encoding
one_hot_encoded=[]
for sent in text:
    sent_encoded=[]
    for word in sent.lower().split():
        word_index=vocabulary.index(word)
        word_vector=np.zeros(len(vocabulary))
        word_vector[word_index]=1
        sent_encoded.append(word_vector)
        one_hot_encoded.append(sent_encoded)
 
for sent in one_hot_encoded:
    print(sent)

[array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0.])]
[array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1.]), array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
   

In [29]:
sent[0]


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0.])

In [30]:
sent[1]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])

In [31]:

len(sent)


5

In [32]:

type(sent)

list

# Word2vec

In [33]:
import gensim

In [37]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
#sample text
text=['freedom records net profit for third successive',
'funds allocated for domestic violence victims',
'funds allocated for youth at risk']
tokenized_text=[word_tokenize(sentence.lower()) for sentence in text]
#Train Word2vec model
model=Word2Vec(sentences=tokenized_text,vector_size=20,window=5,min_count=1,workers=4)
#Find word vectors
vector_freedom=model.wv['freedom']
vector_risk=model.wv['risk']
#similarity b/w words
similarity=model.wv.similarity('youth','funds')
print(f"Vector for 'freedom':{vector_freedom}")
print('='*100)
print(f"Vector for 'risk':{vector_risk}")
print('='*100)
print(f"Similarity b/w 'youth' and 'funds':{similarity}")


Vector for 'freedom':[ 0.00609409 -0.04229162 -0.04111972 -0.00115508  0.00618644 -0.0287169
 -0.02362637 -0.03673037  0.04164308  0.00060649 -0.022547    0.02850853
  0.04590008 -0.02049936  0.03982341  0.02687717  0.02939562  0.00256295
  0.04106542 -0.0350952 ]
Vector for 'risk':[-0.00788826  0.00160686 -0.02070315 -0.03841344 -0.00754004  0.01234897
 -0.00444013  0.02766831 -0.01371489  0.01130033  0.02727897  0.04172977
 -0.0072687  -0.04604071  0.02185276  0.00285892  0.03720954 -0.00406641
 -0.01319207 -0.04376505]
Similarity b/w 'youth' and 'funds':-0.014377474784851074


# Doc2Vec

In [38]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
#sample text
documents=['freedom records net profit for third successive',
'funds allocated for domestic violence victims',
'funds allocated for youth at risk']

#Tokenize & tag documents
tagged_data=[TaggedDocument(words=word_tokenize(doc.lower()),
 tags=[str(i)]) for i,doc in enumerate(documents)]
print(tagged_data)

[TaggedDocument(words=['freedom', 'records', 'net', 'profit', 'for', 'third', 'successive'], tags=['0']), TaggedDocument(words=['funds', 'allocated', 'for', 'domestic', 'violence', 'victims'], tags=['1']), TaggedDocument(words=['funds', 'allocated', 'for', 'youth', 'at', 'risk'], tags=['2'])]


In [39]:
#Train Doc2vec model
model=Doc2Vec(vector_size=100,window=2,min_count=1,workers=5,epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data,total_examples=model.corpus_count,epochs=model.epochs)


In [40]:
vector_doc_1=model.infer_vector(word_tokenize("freedom records net profit for third successive"))

In [41]:
vector_doc_1


array([-4.8438748e-03, -2.5978431e-03, -3.8807129e-03,  8.7385590e-04,
       -4.5379894e-03,  2.7983743e-03, -7.7920483e-04, -1.5675016e-03,
        1.7300955e-03,  2.8467223e-03,  1.9490751e-03,  2.1480031e-03,
       -4.8568421e-03,  2.2760492e-03, -3.2177817e-03,  3.0648448e-03,
        8.6565455e-04, -1.1514719e-03, -4.6044379e-03, -3.8735222e-03,
       -3.6619022e-03,  3.0280990e-03,  4.1800435e-03, -1.7276757e-03,
        2.7898829e-03, -3.2570173e-03, -3.6517088e-03,  3.5115185e-03,
       -6.1682286e-04, -1.1673481e-03, -4.7423951e-03,  2.5605732e-03,
        4.8454008e-03, -4.6623601e-03, -4.1753789e-03,  4.6036257e-03,
       -3.9505102e-03, -3.3979402e-03, -4.6186447e-03, -2.7303833e-03,
        2.9597878e-03,  1.7572048e-03,  2.9991469e-03,  3.3683283e-03,
       -3.6163135e-03,  4.4444315e-03, -4.9657328e-03, -1.0408321e-03,
        1.5926962e-03, -1.3073049e-03,  6.5732212e-04,  4.8791706e-03,
        1.0177080e-03,  4.3808823e-03, -4.2612716e-03, -2.5693392e-03,
      

In [42]:
#find the most similar document
similar_doc=model.docvecs.most_similar(positive=[vector_doc_1])
print(f"vector for 'freedom records net profit for third successive':{vector_doc_1}")
print()
print(f"Most similar document:{similar_doc}")


vector for 'freedom records net profit for third successive':[-4.8438748e-03 -2.5978431e-03 -3.8807129e-03  8.7385590e-04
 -4.5379894e-03  2.7983743e-03 -7.7920483e-04 -1.5675016e-03
  1.7300955e-03  2.8467223e-03  1.9490751e-03  2.1480031e-03
 -4.8568421e-03  2.2760492e-03 -3.2177817e-03  3.0648448e-03
  8.6565455e-04 -1.1514719e-03 -4.6044379e-03 -3.8735222e-03
 -3.6619022e-03  3.0280990e-03  4.1800435e-03 -1.7276757e-03
  2.7898829e-03 -3.2570173e-03 -3.6517088e-03  3.5115185e-03
 -6.1682286e-04 -1.1673481e-03 -4.7423951e-03  2.5605732e-03
  4.8454008e-03 -4.6623601e-03 -4.1753789e-03  4.6036257e-03
 -3.9505102e-03 -3.3979402e-03 -4.6186447e-03 -2.7303833e-03
  2.9597878e-03  1.7572048e-03  2.9991469e-03  3.3683283e-03
 -3.6163135e-03  4.4444315e-03 -4.9657328e-03 -1.0408321e-03
  1.5926962e-03 -1.3073049e-03  6.5732212e-04  4.8791706e-03
  1.0177080e-03  4.3808823e-03 -4.2612716e-03 -2.5693392e-03
 -3.2427472e-03  9.8913454e-04 -1.8309279e-03  1.5332844e-03
  4.1352110e-03  5.15141

  similar_doc=model.docvecs.most_similar(positive=[vector_doc_1])
