In [1]:

import re

import nltk
import pandas as pd
from nltk import SnowballStemmer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')

news = pd.read_csv('abcnews-date-text.csv')
news

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1244179,20211231,two aged care residents die as state records 2...
1244180,20211231,victoria records 5;919 new cases and seven deaths
1244181,20211231,wa delays adopting new close contact definition
1244182,20211231,western ringtail possums found badly dehydrate...


In [2]:
top_news = news[0:25]

In [3]:
top_news

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
5,20030219,ambitious olsson wins triple jump
6,20030219,antic delighted with record breaking barca
7,20030219,aussie qualifier stosur wastes four memphis match
8,20030219,aust addresses un security council over iraq
9,20030219,australia is locked into war timetable opp


In [4]:
top_news.isnull()

Unnamed: 0,publish_date,headline_text
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


In [5]:
top_news.dropna()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
5,20030219,ambitious olsson wins triple jump
6,20030219,antic delighted with record breaking barca
7,20030219,aussie qualifier stosur wastes four memphis match
8,20030219,aust addresses un security council over iraq
9,20030219,australia is locked into war timetable opp


In [6]:
top_news = top_news.drop(columns='publish_date')

In [7]:
top_news.insert(0, "doc_id", range(1, 1 + len(top_news)))

In [8]:
top_news

Unnamed: 0,doc_id,headline_text
0,1,aba decides against community broadcasting lic...
1,2,act fire witnesses must be aware of defamation
2,3,a g calls for infrastructure protection summit
3,4,air nz staff in aust strike for pay rise
4,5,air nz strike to affect australian travellers
5,6,ambitious olsson wins triple jump
6,7,antic delighted with record breaking barca
7,8,aussie qualifier stosur wastes four memphis match
8,9,aust addresses un security council over iraq
9,10,australia is locked into war timetable opp


In [9]:
top_news_dropped = top_news[top_news.index % 2 == 0]
top_news_dropped

Unnamed: 0,doc_id,headline_text
0,1,aba decides against community broadcasting lic...
2,3,a g calls for infrastructure protection summit
4,5,air nz strike to affect australian travellers
6,7,antic delighted with record breaking barca
8,9,aust addresses un security council over iraq
10,11,australia to contribute 10 million in aid to iraq
12,13,bathhouse plans move ahead
14,15,big plan to boost paroo water supplies
16,17,brigadier dismisses reports troops harassed in
18,19,bryant leads lakers to double overtime win


In [10]:
from nltk.tokenize import word_tokenize


def stemmer_sentence(headline):
    stemmer = SnowballStemmer('english')
    headline = re.sub(r'[^a-zA-Z0-9\s]', '', headline).lower()
    tokens = word_tokenize(headline)
    tokens = [stemmer.stem(w) for w in tokens if len(w) > 3]
    return " ".join(tokens)

In [11]:
top_news_dropped.loc[:, 'headline_text'] = top_news_dropped['headline_text'].apply(stemmer_sentence)

top_news_dropped

Unnamed: 0,doc_id,headline_text
0,1,decid against communiti broadcast licenc
2,3,call infrastructur protect summit
4,5,strike affect australian travel
6,7,antic delight with record break barca
8,9,aust address secur council over iraq
10,11,australia contribut million iraq
12,13,bathhous plan move ahead
14,15,plan boost paroo water suppli
16,17,brigadi dismiss report troop harass
18,19,bryant lead laker doubl overtim


In [12]:
top_news_dropped_reset_index = top_news_dropped.reset_index(drop=True)

print(top_news_dropped.index)

top_news_dropped_reset_index

Index([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24], dtype='int64')


Unnamed: 0,doc_id,headline_text
0,1,decid against communiti broadcast licenc
1,3,call infrastructur protect summit
2,5,strike affect australian travel
3,7,antic delight with record break barca
4,9,aust address secur council over iraq
5,11,australia contribut million iraq
6,13,bathhous plan move ahead
7,15,plan boost paroo water suppli
8,17,brigadi dismiss report troop harass
9,19,bryant lead laker doubl overtim


# Export to .txt


In [13]:
len_news = len(top_news_dropped)

for i in range(len_news):
    filename = './documents/'f'top_news{i}.txt'

    with open(filename, 'w') as file:
        file.write(top_news_dropped_reset_index.loc[i, 'headline_text'])
