In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package wordnet to /Users/imogen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/imogen/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/imogen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
news = pd.read_csv("abcnews-date-text.csv")

In [3]:
pd.set_option('display.max_colwidth', None)

In [4]:
news.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting licence
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [5]:
news.tail()

Unnamed: 0,publish_date,headline_text
1186013,20191231,vision of flames approaching corryong in victoria
1186014,20191231,wa police and government backflip on drug amnesty bins
1186015,20191231,we have fears for their safety: victorian premier
1186016,20191231,when do the 20s start
1186017,20191231,yarraville shooting woman dead man critically injured


In [6]:
news.shape

(1186018, 2)

In [7]:
news.dtypes

publish_date      int64
headline_text    object
dtype: object

## Change publish_date to datetime format

In [8]:
news['publish_date'] = pd.to_datetime(news['publish_date'].astype(str), format='%Y%m%d')
news

#another way:
#df_real['date'] = pd.to_datetime(df_real['date'], errors='coerce')

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting licence
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers
...,...,...
1186013,2019-12-31,vision of flames approaching corryong in victoria
1186014,2019-12-31,wa police and government backflip on drug amnesty bins
1186015,2019-12-31,we have fears for their safety: victorian premier
1186016,2019-12-31,when do the 20s start


In [9]:
news.dtypes

publish_date     datetime64[ns]
headline_text            object
dtype: object

In [10]:
news_copy = news.copy()

## Make financial year categories as new column

My other dataset has immigration numbers by Australian financial year, which starts 1 July and ends 30 June, so I will make a column in the same format to make the data easy to compare.  

financial_year
- 2003-07-01: 2003-07-01 to 2004-06-30
- 2004-07-01: 2004-07-01 to 2005-06-30
- 2005-07-01: 2005-07-01 to 2006-06-30
- 2006-07: 2006-07-01 to 2007-06-30
- 2007-08: 2007-07-01 to 2008-06-30
- 2008-09: 2008-07-01 to 2009-06-30
- 2009-10: 2009-07-01 to 2010-06-30
- 2010-11: 2010-07-01 to 2011-06-30
- 2011-12: 2011-07-01 to 2012-06-30
- 2012-13: 2012-07-01 to 2013-06-30
- 2013-14: 2013-07-01 to 2014-06-30
- 2014-15: 2014-07-01 to 2015-06-30
- 2015-16: 2015-07-01 to 2016-06-30
- 2016-17: 2016-07-01 to 2017-06-30
- 2017-18: 2017-07-01 to 2018-06-30
- 2018-19: 2018-07-01 to 2019-06-30

Gaps: 
- can delete (or ignore? would be null values) data from 2003-02-19 to 2003-07-01
- 2019-06-30 to 2019-12-31: would rather not delete


In [11]:
news_copy['publish_date'].min()

Timestamp('2003-02-19 00:00:00')

In [12]:
news_copy['publish_date'].max()

Timestamp('2019-12-31 00:00:00')

### 1. Make new empty column

In [14]:
news_copy

Unnamed: 0,publish_date,headline_text,financial_year
0,2003-02-19,aba decides against community broadcasting licence,
1,2003-02-19,act fire witnesses must be aware of defamation,
2,2003-02-19,a g calls for infrastructure protection summit,
3,2003-02-19,air nz staff in aust strike for pay rise,
4,2003-02-19,air nz strike to affect australian travellers,
...,...,...,...
1186013,2019-12-31,vision of flames approaching corryong in victoria,
1186014,2019-12-31,wa police and government backflip on drug amnesty bins,
1186015,2019-12-31,we have fears for their safety: victorian premier,
1186016,2019-12-31,when do the 20s start,


### 2. add values to new column

In [11]:
news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2002-07-01"), 
                        pd.to_datetime("2003-06-30")), 'financial_year')] = '2002-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2003-07-01"), 
                        pd.to_datetime("2004-06-30")), 'financial_year')] = '2003-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2004-07-01"), 
                        pd.to_datetime("2005-06-30")), 'financial_year')] = '2004-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2005-07-01"), 
                        pd.to_datetime("2006-06-30")), 'financial_year')] = '2005-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2006-07-01"), 
                        pd.to_datetime("2007-06-30")), 'financial_year')] = '2006-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2007-07-01"), 
                        pd.to_datetime("2008-06-30")), 'financial_year')] = '2007-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2008-07-01"), 
                        pd.to_datetime("2009-06-30")), 'financial_year')] = '2008-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2009-07-01"), 
                        pd.to_datetime("2010-06-30")), 'financial_year')] = '2009-07-01'



In [12]:
news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2010-07-01"), 
                        pd.to_datetime("2011-06-30")), 'financial_year')] = '2010-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2011-07-01"), 
                        pd.to_datetime("2012-06-30")), 'financial_year')] = '2011-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2012-07-01"), 
                        pd.to_datetime("2013-06-30")), 'financial_year')] = '2012-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2013-07-01"), 
                        pd.to_datetime("2014-06-30")), 'financial_year')] = '2013-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2014-07-01"), 
                        pd.to_datetime("2015-06-30")), 'financial_year')] = '2014-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2015-07-01"), 
                        pd.to_datetime("2016-06-30")), 'financial_year')] = '2015-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2016-07-01"), 
                        pd.to_datetime("2017-06-30")), 'financial_year')] = '2016-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2017-07-01"), 
                        pd.to_datetime("2018-06-30")), 'financial_year')] = '2017-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2018-07-01"), 
                        pd.to_datetime("2019-06-30")), 'financial_year')] = '2018-07-01'

news_copy.loc[(news_copy["publish_date"]
               .between(pd.to_datetime("2019-07-01"), 
                        pd.to_datetime("2020-06-30")), 'financial_year')] = '2019-07-01'

## Check that it worked

Note that the 2002-3 and 2019-20 are incomplete and numbers of values should not be compared with other years

In [13]:
news_copy['financial_year'].value_counts()

2012-07-01    93574
2013-07-01    86497
2011-07-01    81546
2007-07-01    79743
2008-07-01    78062
2010-07-01    76406
2014-07-01    76302
2009-07-01    75475
2005-07-01    73499
2015-07-01    73027
2004-07-01    72856
2003-07-01    72753
2006-07-01    67690
2016-07-01    53234
2017-07-01    44629
2018-07-01    36344
2002-07-01    27436
2019-07-01    16945
Name: financial_year, dtype: int64

In [14]:
news_copy.dtypes

publish_date      datetime64[ns]
headline_text             object
financial_year            object
dtype: object

In [18]:
#news_copy['financial_year'] = pd.to_datetime(news_copy['financial_year'].astype(str), format='%Y%m%d')

news_copy['financial_year'] = pd.to_datetime(news_copy['financial_year'])
                                             #, format='%Y%m%d')

#pd.to_datetime(df)
#pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
#news_copy

In [19]:
news_copy.dtypes

publish_date      datetime64[ns]
headline_text             object
financial_year    datetime64[ns]
dtype: object

In [20]:
news_copy

Unnamed: 0,publish_date,headline_text,financial_year
0,2003-02-19,aba decides against community broadcasting licence,2002-07-01
1,2003-02-19,act fire witnesses must be aware of defamation,2002-07-01
2,2003-02-19,a g calls for infrastructure protection summit,2002-07-01
3,2003-02-19,air nz staff in aust strike for pay rise,2002-07-01
4,2003-02-19,air nz strike to affect australian travellers,2002-07-01
...,...,...,...
1186013,2019-12-31,vision of flames approaching corryong in victoria,2019-07-01
1186014,2019-12-31,wa police and government backflip on drug amnesty bins,2019-07-01
1186015,2019-12-31,we have fears for their safety: victorian premier,2019-07-01
1186016,2019-12-31,when do the 20s start,2019-07-01


In [21]:
news = news_copy

In [22]:
news_copy = news.copy()

### Check for missing values

In [23]:
news_copy.isna().sum()

publish_date      0
headline_text     0
financial_year    0
dtype: int64

In [24]:
news_copy.loc[(news_copy['headline_text'] == ' ') 
        | (news_copy['headline_text'] == '')]

Unnamed: 0,publish_date,headline_text,financial_year


In [25]:
news_copy = news_copy[~(news_copy['headline_text'] == ' ')]


In [26]:
news_copy = news_copy[~(news_copy['headline_text'] == '')]

In [31]:
news_copy[(news_copy['headline_text'] == '')]

Unnamed: 0,publish_date,headline_text,financial_year


In [32]:
news_copy.loc[(news_copy['headline_text'] == ' ') 
        | (news_copy['headline_text'] == '')]

Unnamed: 0,publish_date,headline_text,financial_year


In [33]:
news = news_copy

In [34]:
news_copy = news.copy()

## text pre-processing

My text is already in lower case, no square brackets or quotes. 
Need to remove numbers and punctuation.

In [35]:
news_copy.tail()
# numbers: 2019, 29, 4000
# puntuation: ; : $

Unnamed: 0,publish_date,headline_text,financial_year
1186013,2019-12-31,vision of flames approaching corryong in victoria,2019-07-01
1186014,2019-12-31,wa police and government backflip on drug amnesty bins,2019-07-01
1186015,2019-12-31,we have fears for their safety: victorian premier,2019-07-01
1186016,2019-12-31,when do the 20s start,2019-07-01
1186017,2019-12-31,yarraville shooting woman dead man critically injured,2019-07-01


In [36]:
news_copy[news_copy["headline_text"].str.contains('[%s]' % re.escape(string.punctuation))]

Unnamed: 0,publish_date,headline_text,financial_year
26046,2003-06-24,egon schiele painting sells for record usd 20.93 million,2002-07-01
94005,2004-06-01,greene races to 100 meters in 9.78 seconds,2003-07-01
170396,2005-06-17,us current account deficit surges to record usd 195.1 billion,2004-07-01
181062,2005-08-09,primary health care nets 27.9 million profit,2005-07-01
186139,2005-09-03,us congress passes usd 10.5 billion hurricane aid,2005-07-01
...,...,...,...
1185701,2019-12-26,as facebook pivots to private; family group chats fire up,2019-07-01
1185939,2019-12-30,us strikes in iraq; syria target iranian backed shiite group,2019-07-01
1185976,2019-12-31,house saved; shed gone; everyones alive: daniel marshall,2019-07-01
1186004,2019-12-31,the fire is engulfing the house: tracey corbin,2019-07-01


In [37]:
news_copy[news_copy["headline_text"].str.contains('\w*\d\w*')]

Unnamed: 0,publish_date,headline_text,financial_year
10,2003-02-19,australia to contribute 10 million in aid to iraq,2002-07-01
38,2003-02-19,de villiers to learn fate on march 5,2002-07-01
42,2003-02-19,dog mauls 18 month old toddler in nsw,2002-07-01
56,2003-02-19,german court to give verdict on sept 11 accused,2002-07-01
84,2003-02-19,korean subway fire 314 still missing,2002-07-01
...,...,...,...
1186002,2019-12-31,tasmania new years eve guide 2019,2019-07-01
1186005,2019-12-31,these are the top stories of 2019 as decided by abc news readers,2019-07-01
1186006,2019-12-31,the year that was 2019 highlights memorable popular stories,2019-07-01
1186009,2019-12-31,up to 4000 people are seeking refuge on a beach in mallacoota,2019-07-01


In [38]:
news_copy[news_copy["headline_text"].str.contains('\n')]

Unnamed: 0,publish_date,headline_text,financial_year


In [39]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
porter = PorterStemmer()
wnl = WordNetLemmatizer()

In [40]:
w = 'profile'
porter.stem(w)

'profil'

In [41]:
wnl.lemmatize(w)

'profile'

In [42]:
wnl.lemmatize(w) if wnl.lemmatize(w).endswith('e') else porter.stem(w)

'profile'

In [43]:
def clean_text(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) #remove punctuation
    text = re.sub('\w*\d\w*', '', text) # remove words with numbers
    
    # Stemming
    text_list = text.split() # Convert to list
    
    text_list = [wnl.lemmatize(word) if wnl.lemmatize(word).endswith('e') 
                 else porter.stem(word) for word in text_list] # Stemming the data 

    text = " ".join(str(i) for i in text_list) # Convert to string
    
    #issue with this stemming is it 
    #cuts off the e on a lot of words. e.g. licenc, decid, awar, polic
    return text



In [44]:
headlines_clean = pd.DataFrame(news_copy.headline_text.apply(clean_text))

In [45]:
headlines_clean

Unnamed: 0,headline_text
0,aba decid against commun broadcast licence
1,act fire wit must be aware of defam
2,a g call for infrastructure protect summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travel
...,...
1186013,vision of flame approach corryong in victoria
1186014,wa police and govern backflip on drug amnesti bin
1186015,we have fear for their safeti victorian premier
1186016,when do the start


### Replace original "headline_text" with clean version

In [46]:
news_copy["headline_text"] = headlines_clean["headline_text"]

In [47]:
news_copy

Unnamed: 0,publish_date,headline_text,financial_year
0,2003-02-19,aba decid against commun broadcast licence,2002-07-01
1,2003-02-19,act fire wit must be aware of defam,2002-07-01
2,2003-02-19,a g call for infrastructure protect summit,2002-07-01
3,2003-02-19,air nz staff in aust strike for pay rise,2002-07-01
4,2003-02-19,air nz strike to affect australian travel,2002-07-01
...,...,...,...
1186013,2019-12-31,vision of flame approach corryong in victoria,2019-07-01
1186014,2019-12-31,wa police and govern backflip on drug amnesti bin,2019-07-01
1186015,2019-12-31,we have fear for their safeti victorian premier,2019-07-01
1186016,2019-12-31,when do the start,2019-07-01


In [48]:
news_copy.dtypes

publish_date      datetime64[ns]
headline_text             object
financial_year    datetime64[ns]
dtype: object

In [49]:
news_copy.isna().sum()

publish_date      0
headline_text     0
financial_year    0
dtype: int64

## Remove stop words

In [50]:
cv = CountVectorizer(stop_words='english')

In [51]:
data_cv = cv.fit_transform(news_copy.headline_text)


In [52]:
news_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())

In [53]:
news_dtm.index = news_copy.index

In [54]:
news_dtm.head()

Unnamed: 0,aa,aaa,aaahhh,aac,aacc,aaco,aacta,aad,aadhaar,aadmi,...,zydelig,zygar,zygief,zygier,zyl,zylvest,zynga,zyngier,zz,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
news = news_copy

In [56]:
news.isna().sum()

publish_date      0
headline_text     0
financial_year    0
dtype: int64

In [57]:
news[(news['headline_text'] == '')]

Unnamed: 0,publish_date,headline_text,financial_year
235989,2006-05-11,,2005-07-01
236757,2006-05-15,,2005-07-01
243358,2006-06-15,,2005-07-01
254803,2006-08-10,,2006-07-01
274837,2006-12-19,,2006-07-01
603318,2011-03-25,,2010-07-01
658067,2011-12-07,,2011-07-01
771294,2013-03-18,,2012-07-01
782023,2013-04-26,,2012-07-01
784872,2013-05-08,,2012-07-01


# Checkpoint: export to csv

In [58]:
news.to_csv('abc_news_clean.csv', sep=',', index=False, date_format='%Y-%m-%d')

In [102]:
hello

NameError: name 'hello' is not defined

# EXTRA - NOT SURE WHAT TO DO WITH YET

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 

In [None]:

news['headline_text'].apply(word_tokenize)
 Resource punkt not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt')

In [None]:
news['headlines_tokenized'] = news['headline_text'].apply(tokenizer.tokenize)

In [None]:
# split() returns list of all the words in the string 
split_it = news['headline_text'[0]].split() 

In [None]:

  
# Pass the split_it list to instance of Counter class. 
Counter = Counter(split_it) 
  
# most_common() produces k frequently encountered 
# input values and their respective counts. 
most_occur = Counter.most_common(4) 
  
print(most_occur) 

In [None]:

words, word_values = get_top_n_words(n_top_words=15,
                                     count_vectorizer=count_vectorizer, 
                                     text_data=reindexed_data)

fig, ax = plt.subplots(figsize=(16,8))
ax.bar(range(len(words)), word_values);
ax.set_xticks(range(len(words)));
ax.set_xticklabels(words, rotation='vertical');
ax.set_title('Top words in headlines dataset (excluding stop words)');
ax.set_xlabel('Word');
ax.set_ylabel('Number of occurences');
plt.show()

In [None]:
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)

In [None]:
for token in doc:
    print(token.text)

In [None]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

In [None]:
spacy.explain("SYM")

In [None]:
for token in doc:
    print(token.lemma_)

In [None]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
spacy.explain("GPE")

In [None]:
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)


# DIDN'T USE

In [None]:
#to display/show more columns
#pd.set_option('display.max_columns', 50)

### change data types to integers - didn't work, maybe don't need?

In [None]:
news_copy.dtypes

In [None]:
#news_copy['financial_year' == '2003–04'] = news_copy['financial_year' == '2003–04'].astype('Int64')
#error: False

In [None]:
#news_copy.astype({'financial_year': 'int32'}).dtypes
#ValueError: cannot convert float NaN to integer

In [None]:
#news_copy.astype({'financial_year': 'float'}).dtypes
#ValueError: invalid literal for int() with base 10: '2002-03'
#ValueError: could not convert string to float: '2002-03'

In [None]:
'''
news_copy['2003–04'] = news_copy['2003–04'].astype('Int64')
news_copy['2004–05'] = news_copy['2004–05'].astype('Int64')
news_copy['2005–06'] = news_copy['2005–06'].astype('Int64')
news_copy['2006–07'] = news_copy['2006–07'].astype('Int64')
news_copy['2007–08'] = news_copy['2007–08'].astype('Int64')
news_copy['2008–09'] = news_copy['2008–09'].astype('Int64')
news_copy['2009–10'] = news_copy['2009–10'].astype('Int64')
news_copy['2010–11'] = news_copy['2010–11'].astype('Int64')
news_copy['2011–12'] = news_copy['2011–12'].astype('Int64')
news_copy['2012–13'] = news_copy['2012–13'].astype('Int64')
news_copy['2013–14'] = news_copy['2013–14'].astype('Int64')
news_copy['2014–15'] = news_copy['2014–15'].astype('Int64')
news_copy['2015–16'] = news_copy['2015–16'].astype('Int64')
news_copy['2016–17'] = news_copy['2016–17'].astype('Int64')
news_copy['2017–18'] = news_copy['2017–18'].astype('Int64')
news_copy['2018–19'] = news_copy['2018–19'].astype('Int64') '''
#this didn't work, these were columns, I'm changing individual values
#come back to later