In [1]:
import re

import nltk
import pandas as pd
from nltk import PorterStemmer, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')
import numpy as np
news_data = pd.read_csv('../abcnews-date-text.csv')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
news = pd.read_csv('../abcnews-date-text.csv')
top_news_part1 = news[0:25]
top_news_part2 = pd.DataFrame({'headline_text': top_news_part1['headline_text']})

top_news_part2.to_csv('top_news_25.csv', index=False)



In [3]:
top_news_part1.duplicated() == True

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
dtype: bool

In [4]:
top_news_part1 = top_news_part1.drop(columns='publish_date')

In [5]:
top_news_part1

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers
5,ambitious olsson wins triple jump
6,antic delighted with record breaking barca
7,aussie qualifier stosur wastes four memphis match
8,aust addresses un security council over iraq
9,australia is locked into war timetable opp


In [6]:
top_news_part1.insert(0, "doc_id", range(1, 1 + len(top_news_part1)))

In [7]:
top_news_part1

Unnamed: 0,doc_id,headline_text
0,1,aba decides against community broadcasting lic...
1,2,act fire witnesses must be aware of defamation
2,3,a g calls for infrastructure protection summit
3,4,air nz staff in aust strike for pay rise
4,5,air nz strike to affect australian travellers
5,6,ambitious olsson wins triple jump
6,7,antic delighted with record breaking barca
7,8,aussie qualifier stosur wastes four memphis match
8,9,aust addresses un security council over iraq
9,10,australia is locked into war timetable opp


In [8]:
def stemmer_sentence(headline):
    stemmer = PorterStemmer()
    headline = re.sub(r'[^a-zA-Z0-9\s]', '', headline).lower()
    tokens = word_tokenize(headline)
    tokens = [stemmer.stem(w) for w in tokens if len(w) > 3]
    return " ".join(tokens)

In [9]:
top_news_part1.loc[:, 'headline_text'] = top_news_part1['headline_text'].apply(stemmer_sentence)

In [10]:
top_news_part1

Unnamed: 0,doc_id,headline_text
0,1,decid against commun broadcast licenc
1,2,fire wit must awar defam
2,3,call infrastructur protect summit
3,4,staff aust strike rise
4,5,strike affect australian travel
5,6,ambiti olsson win tripl jump
6,7,antic delight with record break barca
7,8,aussi qualifi stosur wast four memphi match
8,9,aust address secur council over iraq
9,10,australia lock into timet


# Transform corpus into vector

In [11]:
count_vectorizer = CountVectorizer(stop_words='english',
                                   max_features=5000)
doc_term_matrixed = count_vectorizer.fit_transform(top_news_part1['headline_text'])

In [12]:

doc_term_matrixed.shape

(25, 101)

# transform text documents into numerical vectors

In [13]:
tf_idf_vectorizerd = TfidfVectorizer(stop_words='english', max_features=5000)
doc_term_matrixed_tf_idf = tf_idf_vectorizerd.fit_transform(top_news_part1['headline_text'])
print(doc_term_matrixed_tf_idf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 110 stored elements and shape (25, 101)>
  Coords	Values
  (0, 39)	0.5
  (0, 34)	0.5
  (0, 22)	0.5
  (0, 62)	0.5
  (1, 100)	0.5773502691896257
  (1, 12)	0.5773502691896257
  (1, 40)	0.5773502691896257
  (2, 54)	0.5773502691896257
  (2, 74)	0.5773502691896257
  (2, 87)	0.5773502691896257
  (3, 83)	0.5291875165150637
  (3, 8)	0.4689995440985196
  (3, 86)	0.4689995440985196
  (3, 78)	0.5291875165150637
  (4, 86)	0.45551568364217737
  (4, 1)	0.5139732359945084
  (4, 10)	0.5139732359945084
  (4, 91)	0.5139732359945084
  (5, 3)	0.4472135954999579
  (5, 69)	0.4472135954999579
  (5, 99)	0.4472135954999579
  (5, 92)	0.4472135954999579
  (5, 56)	0.4472135954999579
  (6, 4)	0.4677328981134654
  (6, 42)	0.4677328981134654
  :	:
  (19, 95)	0.5
  (19, 31)	0.5
  (20, 26)	0.5
  (20, 73)	0.5
  (20, 89)	0.5
  (20, 6)	0.5
  (21, 27)	0.408248290463863
  (21, 11)	0.408248290463863
  (21, 47)	0.408248290463863
  (21, 41)	0.408248290463863
  (21, 

In [14]:
doc_term_matrixed_tf_idf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 110 stored elements and shape (25, 101)>

In [15]:
kata = count_vectorizer.vocabulary_.keys()
kata_counts = doc_term_matrixed.sum(axis=0).A1

kata_counts_df = pd.DataFrame({'word': kata, 'count': kata_counts})
kata_counts_df = kata_counts_df.sort_values(by='count', ascending=False)

In [16]:
kata_counts_df

Unnamed: 0,word,count
2,broadcast,2
9,summit,2
13,rise,2
8,protect,2
55,suppli,2
...,...,...
96,roma,1
97,ruin,1
98,cemeteri,1
99,miss,1


In [17]:
headline_query_1 = top_news_part1['headline_text'][1]
headline_query_2 = top_news_part1['headline_text'][2]
query_random = "Australian police sentenced two years"
headline_query_3 ="war and violence"

headline_query_1

'fire wit must awar defam'

In [18]:
top_news_part1

Unnamed: 0,doc_id,headline_text
0,1,decid against commun broadcast licenc
1,2,fire wit must awar defam
2,3,call infrastructur protect summit
3,4,staff aust strike rise
4,5,strike affect australian travel
5,6,ambiti olsson win tripl jump
6,7,antic delight with record break barca
7,8,aussi qualifi stosur wast four memphi match
8,9,aust address secur council over iraq
9,10,australia lock into timet


In [19]:
query_vector_1 = tf_idf_vectorizerd.transform([headline_query_1])
query_vector_2 = tf_idf_vectorizerd.transform([headline_query_2])
query_vector_3 = tf_idf_vectorizerd.transform([headline_query_3])
print(query_vector_1)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 101)>
  Coords	Values
  (0, 12)	0.5773502691896257
  (0, 40)	0.5773502691896257
  (0, 100)	0.5773502691896257


In [20]:
# Calculate cosine similarity between query vector and all documents
query_1_similarities = cosine_similarity(query_vector_1, doc_term_matrixed).flatten()


In [31]:
print(top_news_part1.count())

doc_id           25
headline_text    25
dtype: int64


In [21]:
print(doc_term_matrixed_tf_idf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 110 stored elements and shape (25, 101)>
  Coords	Values
  (0, 39)	0.5
  (0, 34)	0.5
  (0, 22)	0.5
  (0, 62)	0.5
  (1, 100)	0.5773502691896257
  (1, 12)	0.5773502691896257
  (1, 40)	0.5773502691896257
  (2, 54)	0.5773502691896257
  (2, 74)	0.5773502691896257
  (2, 87)	0.5773502691896257
  (3, 83)	0.5291875165150637
  (3, 8)	0.4689995440985196
  (3, 86)	0.4689995440985196
  (3, 78)	0.5291875165150637
  (4, 86)	0.45551568364217737
  (4, 1)	0.5139732359945084
  (4, 10)	0.5139732359945084
  (4, 91)	0.5139732359945084
  (5, 3)	0.4472135954999579
  (5, 69)	0.4472135954999579
  (5, 99)	0.4472135954999579
  (5, 92)	0.4472135954999579
  (5, 56)	0.4472135954999579
  (6, 4)	0.4677328981134654
  (6, 42)	0.4677328981134654
  :	:
  (19, 95)	0.5
  (19, 31)	0.5
  (20, 26)	0.5
  (20, 73)	0.5
  (20, 89)	0.5
  (20, 6)	0.5
  (21, 27)	0.408248290463863
  (21, 11)	0.408248290463863
  (21, 47)	0.408248290463863
  (21, 41)	0.408248290463863
  (21, 

In [22]:
query_2_similarities = cosine_similarity(query_vector_2, doc_term_matrixed).flatten()

query_2_similarities

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [23]:
# query_random_similarities = cosine_similarity(query_vector_random, doc_term_matrixed).flatten()
#
# query_vector_random

In [24]:
query_3_similarities = cosine_similarity(query_vector_3, doc_term_matrixed).flatten()

print(query_3_similarities)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]


In [25]:
ranked_indices = np.argsort(query_3_similarities)[::-1]

ranked_indices

array([24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,  9,  8,
        7,  6,  5,  4,  3,  2,  1,  0])

In [26]:
top_news_part1

Unnamed: 0,doc_id,headline_text
0,1,decid against commun broadcast licenc
1,2,fire wit must awar defam
2,3,call infrastructur protect summit
3,4,staff aust strike rise
4,5,strike affect australian travel
5,6,ambiti olsson win tripl jump
6,7,antic delight with record break barca
7,8,aussi qualifi stosur wast four memphi match
8,9,aust address secur council over iraq
9,10,australia lock into timet


In [27]:
top_news_genap_024 = top_news_part1[top_news_part1.index % 2 == 0]
top_news_genap_024 = top_news_genap_024.reset_index(drop=True)

In [28]:
len_news_p1 = len(top_news_genap_024)
for i in range(len_news_p1):
    filenames = '../docs/'f'top_news{i}.txt'

    with open(filenames, 'w') as file:
        file.write(top_news_genap_024.loc[i, 'headline_text'])