In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package wordnet to /Users/imogen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/imogen/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/imogen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
news = pd.read_csv('../../csv_files/news/abcnews-date-text.csv')

In [36]:
pd.set_option('display.max_colwidth', None)

In [37]:
news.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting licence
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [38]:
news.shape

(1186018, 2)

In [39]:
news.dtypes

publish_date      int64
headline_text    object
dtype: object

### Change publish_date to datetime format

In [40]:
news['publish_date'] = pd.to_datetime(news['publish_date'].astype(str), format='%Y%m%d')

In [41]:
news.dtypes

publish_date     datetime64[ns]
headline_text            object
dtype: object

In [42]:
news_copy = news.copy()

### Make new column: financial year categories

My second dataset has immigration numbers by Australian financial year, which starts 1 July and ends 30 June, so I will make a column in the same format to make the data comparable.  

financial_year:
- 2003-07-01: 2003-07-01 to 2004-06-30
- 2004-07-01: 2004-07-01 to 2005-06-30
- 2005-07-01: 2005-07-01 to 2006-06-30
- etc.

Note that the 2002-03 and 2019-20 financial years are incomplete (the headlines run from 2003-02-19 to 2019-12-31), so their headline counts should not be compared with those of other years.

In [43]:
news_copy['publish_date'].min()

Timestamp('2003-02-19 00:00:00')

In [44]:
news_copy['publish_date'].max()

Timestamp('2019-12-31 00:00:00')

#### 1. Make new empty column

In [47]:
news_copy = news_copy.assign(financial_year=pd.NA)

In [48]:
news_copy

Unnamed: 0,publish_date,headline_text,financial_year
0,2003-02-19,aba decides against community broadcasting licence,
1,2003-02-19,act fire witnesses must be aware of defamation,
2,2003-02-19,a g calls for infrastructure protection summit,
3,2003-02-19,air nz staff in aust strike for pay rise,
4,2003-02-19,air nz strike to affect australian travellers,
...,...,...,...
1186013,2019-12-31,vision of flames approaching corryong in victoria,
1186014,2019-12-31,wa police and government backflip on drug amnesty bins,
1186015,2019-12-31,we have fears for their safety: victorian premier,
1186016,2019-12-31,when do the 20s start,


#### 2. Add values to new column

In [49]:
# Label every headline published in the financial years starting 2002 through
# 2009 with the first day of its Australian financial year (1 July - 30 June).
# One loop replaces eight copy-pasted assignments that differed only by year.
for fy_start_year in range(2002, 2010):
    fy_start = f"{fy_start_year}-07-01"
    fy_end = f"{fy_start_year + 1}-06-30"
    # between() is inclusive on both ends, so 1 July and 30 June both belong
    # to the financial year being labelled.
    in_financial_year = news_copy["publish_date"].between(
        pd.to_datetime(fy_start), pd.to_datetime(fy_end)
    )
    # The label is stored as a 'YYYY-07-01' string here.
    news_copy.loc[in_financial_year, "financial_year"] = fy_start



In [50]:
# Label every headline published in the financial years starting 2010 through
# 2019 with the first day of its Australian financial year (1 July - 30 June).
# One loop replaces ten copy-pasted assignments that differed only by year.
for fy_start_year in range(2010, 2020):
    fy_start = f"{fy_start_year}-07-01"
    fy_end = f"{fy_start_year + 1}-06-30"
    # between() is inclusive on both ends, so 1 July and 30 June both belong
    # to the financial year being labelled.
    in_financial_year = news_copy["publish_date"].between(
        pd.to_datetime(fy_start), pd.to_datetime(fy_end)
    )
    # The label is stored as a 'YYYY-07-01' string here.
    news_copy.loc[in_financial_year, "financial_year"] = fy_start

In [51]:
news_copy['financial_year'].value_counts()

2012-07-01    93574
2013-07-01    86497
2011-07-01    81546
2007-07-01    79743
2008-07-01    78062
2010-07-01    76406
2014-07-01    76302
2009-07-01    75475
2005-07-01    73499
2015-07-01    73027
2004-07-01    72856
2003-07-01    72753
2006-07-01    67690
2016-07-01    53234
2017-07-01    44629
2018-07-01    36344
2002-07-01    27436
2019-07-01    16945
Name: financial_year, dtype: int64

In [52]:
news_copy.dtypes

publish_date      datetime64[ns]
headline_text             object
financial_year            object
dtype: object

In [53]:
# The financial-year labels are 'YYYY-07-01' strings, so parse them with the
# matching explicit format rather than relying on format inference.
news_copy['financial_year'] = pd.to_datetime(news_copy['financial_year'],
                                             format='%Y-%m-%d')

In [54]:
news_copy.dtypes

publish_date      datetime64[ns]
headline_text             object
financial_year    datetime64[ns]
dtype: object

In [55]:
news_copy

Unnamed: 0,publish_date,headline_text,financial_year
0,2003-02-19,aba decides against community broadcasting licence,2002-07-01
1,2003-02-19,act fire witnesses must be aware of defamation,2002-07-01
2,2003-02-19,a g calls for infrastructure protection summit,2002-07-01
3,2003-02-19,air nz staff in aust strike for pay rise,2002-07-01
4,2003-02-19,air nz strike to affect australian travellers,2002-07-01
...,...,...,...
1186013,2019-12-31,vision of flames approaching corryong in victoria,2019-07-01
1186014,2019-12-31,wa police and government backflip on drug amnesty bins,2019-07-01
1186015,2019-12-31,we have fears for their safety: victorian premier,2019-07-01
1186016,2019-12-31,when do the 20s start,2019-07-01


In [56]:
news = news_copy

In [57]:
news_copy = news.copy()

### Check for missing values

In [58]:
news_copy.isna().sum()

publish_date      0
headline_text     0
financial_year    0
dtype: int64

In [59]:
# Display any rows whose headline is exactly a single space or an empty string.
news_copy[news_copy['headline_text'].isin([' ', ''])]

Unnamed: 0,publish_date,headline_text,financial_year


In [60]:
# Keep only the rows whose headline is not a single space.
news_copy = news_copy.loc[news_copy['headline_text'].ne(' ')]


In [61]:
# Keep only the rows whose headline is not the empty string.
news_copy = news_copy.loc[news_copy['headline_text'].ne('')]

In [62]:
news_copy[(news_copy['headline_text'] == '')]

Unnamed: 0,publish_date,headline_text,financial_year


In [63]:
# Confirm that no single-space or empty-string headlines remain.
news_copy[news_copy['headline_text'].isin([' ', ''])]

Unnamed: 0,publish_date,headline_text,financial_year


In [66]:
news = news_copy

In [67]:
news_copy = news.copy()

## Text pre-processing

The text is already in lower case, with no square brackets or quotes. Numbers and punctuation need to be removed prior to lemmatization and stemming. 

In [68]:
news_copy.tail()
# Examples of what still has to be removed from the headlines:
# numbers: 2019, 29, 4000
# punctuation: ; : $

Unnamed: 0,publish_date,headline_text,financial_year
1186013,2019-12-31,vision of flames approaching corryong in victoria,2019-07-01
1186014,2019-12-31,wa police and government backflip on drug amnesty bins,2019-07-01
1186015,2019-12-31,we have fears for their safety: victorian premier,2019-07-01
1186016,2019-12-31,when do the 20s start,2019-07-01
1186017,2019-12-31,yarraville shooting woman dead man critically injured,2019-07-01


In [69]:
news_copy[news_copy["headline_text"].str.contains('[%s]' % re.escape(string.punctuation))]

Unnamed: 0,publish_date,headline_text,financial_year
26046,2003-06-24,egon schiele painting sells for record usd 20.93 million,2002-07-01
94005,2004-06-01,greene races to 100 meters in 9.78 seconds,2003-07-01
170396,2005-06-17,us current account deficit surges to record usd 195.1 billion,2004-07-01
181062,2005-08-09,primary health care nets 27.9 million profit,2005-07-01
186139,2005-09-03,us congress passes usd 10.5 billion hurricane aid,2005-07-01
...,...,...,...
1185701,2019-12-26,as facebook pivots to private; family group chats fire up,2019-07-01
1185939,2019-12-30,us strikes in iraq; syria target iranian backed shiite group,2019-07-01
1185976,2019-12-31,house saved; shed gone; everyones alive: daniel marshall,2019-07-01
1186004,2019-12-31,the fire is engulfing the house: tracey corbin,2019-07-01


In [70]:
# Show headlines containing at least one digit-bearing word. The pattern is a
# raw string so that \w and \d are passed to the regex engine rather than being
# read as (invalid) Python string escape sequences.
news_copy[news_copy["headline_text"].str.contains(r'\w*\d\w*')]

Unnamed: 0,publish_date,headline_text,financial_year
10,2003-02-19,australia to contribute 10 million in aid to iraq,2002-07-01
38,2003-02-19,de villiers to learn fate on march 5,2002-07-01
42,2003-02-19,dog mauls 18 month old toddler in nsw,2002-07-01
56,2003-02-19,german court to give verdict on sept 11 accused,2002-07-01
84,2003-02-19,korean subway fire 314 still missing,2002-07-01
...,...,...,...
1186002,2019-12-31,tasmania new years eve guide 2019,2019-07-01
1186005,2019-12-31,these are the top stories of 2019 as decided by abc news readers,2019-07-01
1186006,2019-12-31,the year that was 2019 highlights memorable popular stories,2019-07-01
1186009,2019-12-31,up to 4000 people are seeking refuge on a beach in mallacoota,2019-07-01


In [71]:
news_copy[news_copy["headline_text"].str.contains('\n')]

Unnamed: 0,publish_date,headline_text,financial_year


In [72]:
news_copy.tail()

Unnamed: 0,publish_date,headline_text,financial_year
1186013,2019-12-31,vision of flames approaching corryong in victoria,2019-07-01
1186014,2019-12-31,wa police and government backflip on drug amnesty bins,2019-07-01
1186015,2019-12-31,we have fears for their safety: victorian premier,2019-07-01
1186016,2019-12-31,when do the 20s start,2019-07-01
1186017,2019-12-31,yarraville shooting woman dead man critically injured,2019-07-01


## Stemming & lemmatization with NLTK

In [73]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
porter = PorterStemmer()
wnl = WordNetLemmatizer()

In [74]:
w = 'profile'
porter.stem(w)

'profil'

In [75]:
wnl.lemmatize(w)

'profile'

In [76]:
wnl.lemmatize(w) if wnl.lemmatize(w).endswith('e') else porter.stem(w)

'profile'

In [77]:
def clean_text(text, lemmatizer=None, stemmer=None):
    """Remove punctuation and digit-bearing words from ``text`` and normalise the rest.

    Each remaining word is lemmatized; if the lemma ends in 'e' the lemma is
    kept, otherwise the original word is stemmed instead.

    Parameters
    ----------
    text : str
        The headline to clean.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``wnl``.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the notebook-level ``porter``.

    Returns
    -------
    str
        The normalised words joined by single spaces.
    """
    if lemmatizer is None:
        lemmatizer = wnl
    if stemmer is None:
        stemmer = porter

    # Remove punctuation characters.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove every word that contains a digit (raw string: \w and \d are regex
    # classes, not Python string escapes).
    text = re.sub(r'\w*\d\w*', '', text)

    normalised_words = []
    for word in text.split():
        # Lemmatize once per word and reuse the result for both the test and
        # the kept value.
        lemma = lemmatizer.lemmatize(word)
        if lemma.endswith('e'):
            normalised_words.append(lemma)
        else:
            # The stemmer is applied to the original word, not to the lemma.
            normalised_words.append(stemmer.stem(word))

    return " ".join(normalised_words)

In [78]:
# Clean every headline and keep the result as a one-column DataFrame.
headlines_clean = news_copy["headline_text"].apply(clean_text).to_frame()

In [79]:
headlines_clean

Unnamed: 0,headline_text
0,aba decid against commun broadcast licence
1,act fire wit must be aware of defam
2,a g call for infrastructure protect summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travel
...,...
1186013,vision of flame approach corryong in victoria
1186014,wa police and govern backflip on drug amnesti bin
1186015,we have fear for their safeti victorian premier
1186016,when do the start


#### Replace original "headline_text" with clean version

In [80]:
news_copy["headline_text"] = headlines_clean["headline_text"]

In [81]:
news_copy

Unnamed: 0,publish_date,headline_text,financial_year
0,2003-02-19,aba decid against commun broadcast licence,2002-07-01
1,2003-02-19,act fire wit must be aware of defam,2002-07-01
2,2003-02-19,a g call for infrastructure protect summit,2002-07-01
3,2003-02-19,air nz staff in aust strike for pay rise,2002-07-01
4,2003-02-19,air nz strike to affect australian travel,2002-07-01
...,...,...,...
1186013,2019-12-31,vision of flame approach corryong in victoria,2019-07-01
1186014,2019-12-31,wa police and govern backflip on drug amnesti bin,2019-07-01
1186015,2019-12-31,we have fear for their safeti victorian premier,2019-07-01
1186016,2019-12-31,when do the start,2019-07-01


## Remove stop words

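Common English stop words (a, the, and, etc.) are dropped by `CountVectorizer(stop_words='english')` when the document-term matrix is built. The `headline_text` column itself is not changed by this step.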

In [84]:
cv = CountVectorizer(stop_words='english')

In [85]:
data_cv = cv.fit_transform(news_copy.headline_text)

In [86]:
# Build the document-term matrix as a sparse-backed DataFrame. Calling
# .toarray() would materialise one integer per (headline, vocabulary term)
# pair, almost all of them zero.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())
news_dtm = pd.DataFrame.sparse.from_spmatrix(data_cv, columns=feature_names)

In [87]:
news_dtm.index = news_copy.index

In [88]:
news_dtm.head()

Unnamed: 0,aa,aaa,aaahhh,aac,aacc,aaco,aacta,aad,aadhaar,aadmi,...,zydelig,zygar,zygief,zygier,zyl,zylvest,zynga,zyngier,zz,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
news = news_copy

# Export to csv

In [58]:
news.to_csv('abc_news_clean.csv', sep=',', index=False, date_format='%Y-%m-%d')