In [48]:
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
from IPython.display import HTML, display
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spacy import displacy


In [49]:
# Load the BBC news dataset; the CSV path is relative to the notebook's working directory.
bbc = pd.read_csv(filepath_or_buffer='./datasets/bbc_news.csv')

In [50]:
# Print a summary of the frame: entry count, columns, non-null counts, dtypes and memory usage.
bbc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [51]:
# Keep only the headline column, as its own DataFrame, for the cleaning steps.
title = bbc['title'].to_frame()
title.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


**Clean Data**

In [52]:
# Lower-case every headline and store the result next to the original text.
lowercased_titles = title['title'].str.lower()
title['lower'] = lowercased_titles
title.head()

Unnamed: 0,title,lower
0,Can I refuse to work?,can i refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...


In [53]:
# Stop-word removal.
# `en_stopwords` stays a list (as returned by NLTK); membership is tested
# against a frozenset so each lookup is O(1) instead of a linear scan of the
# list for every word of every headline.
en_stopwords = stopwords.words('english')
en_stopword_set = frozenset(en_stopwords)

# Words are obtained by splitting on whitespace, so a word with punctuation
# attached (e.g. "work?") is compared as-is against the stop-word list.
title['no_stopwords'] = title['lower'].apply(
    lambda text: " ".join(word for word in text.split() if word not in en_stopword_set)
)
title.head(3)

Unnamed: 0,title,lower,no_stopwords
0,Can I refuse to work?,can i refuse to work?,refuse work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community


In [54]:
# Punctuation removal: delete every character that is neither a word
# character nor whitespace. Characters are removed, not replaced by a space,
# so hyphenated words are joined (e.g. "off-grid" becomes "offgrid").
# Only one column is transformed, so the vectorised string accessor is used
# instead of a row-wise DataFrame.apply(axis=1).
title['no_stopwords_no_punctuations'] = title['no_stopwords'].str.replace(
    r"[^\w\s]", '', regex=True
)
title.head()

Unnamed: 0,title,lower,no_stopwords,no_stopwords_no_punctuations
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds


In [55]:
# Tokenize each cleaned headline into a list of word tokens.
# The tokenizer only needs the text of one column, so it is applied to that
# Series directly; a row-wise DataFrame.apply(..., axis=1) is not required.
title['tokenize'] = title['no_stopwords_no_punctuations'].apply(word_tokenize)

In [56]:
# Lemmatization: map every token of every headline to its WordNet lemma.
# lemmatize() is called with the token only, i.e. without a part-of-speech argument.
lemmatizer = WordNetLemmatizer()
title['lemmatised'] = title['tokenize'].map(
    lambda words: list(map(lemmatizer.lemmatize, words))
)
title.head()

Unnamed: 0,title,lower,no_stopwords,no_stopwords_no_punctuations,tokenize,lemmatised
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"


In [57]:
# Flatten the per-headline token lists into one flat list of tokens each.
# A nested comprehension walks every token once; sum(lists, []) produces the
# same result but rebuilds the growing list on every addition.
token_new_list = [token for tokens in title['tokenize'] for token in tokens]
token_clean_list = [lemma for lemmas in title['lemmatised'] for lemma in lemmas]

In [58]:
# Load the spaCy pipeline used below for POS tagging and named-entity recognition.
nlp = spacy.load(name='en_core_web_sm')

In [59]:
# Join all tokens into one space-separated string and run the pipeline on it once.
# NOTE(review): these tokens come from the lower-cased, stop-word-free,
# punctuation-free headlines, which may affect tagging quality — confirm
# whether the raw headlines should be analysed instead.
corpus_text = " ".join(token_new_list)
spacy_doc = nlp(corpus_text)

In [60]:
# Build the token / part-of-speech table in one step.
# One record is collected per spaCy token and the frame is constructed once;
# concatenating a one-row frame per token copies the accumulated frame on
# every iteration.
pos_records = [{'token': token.text, 'pos_tag': token.pos_} for token in spacy_doc]
# Passing `columns` keeps both columns present even when the document is empty.
pos_df = pd.DataFrame(pos_records, columns=['token', 'pos_tag'])

In [61]:
# Count how often each (token, POS tag) pair occurs, most frequent first.
pos_df_counts = (
    pos_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)
pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,counts
31,2022,NUM,47
1237,england,PROPN,40
935,cup,PROPN,36
3946,uk,PROPN,33
2544,new,ADJ,32
4096,war,NOUN,32
3262,says,VERB,30
3951,ukraine,VERB,28
4210,world,NOUN,28
4211,world,PROPN,26


In [62]:
# Ten most frequent tokens tagged as nouns (pos_df_counts is already sorted by count).
is_noun = pos_df_counts['pos_tag'].eq('NOUN')
nouns = pos_df_counts.loc[is_noun].head(10)
nouns

Unnamed: 0,token,pos_tag,counts
4096,war,NOUN,32
4210,world,NOUN,28
2275,man,NOUN,23
2707,papers,NOUN,18
3049,record,NOUN,17
1233,energy,NOUN,17
2845,police,NOUN,16
971,day,NOUN,15
4128,week,NOUN,15
934,cup,NOUN,14


In [63]:
# Ten most frequent tokens tagged as verbs (pos_df_counts is already sorted by count).
is_verb = pos_df_counts['pos_tag'].eq('VERB')
verb = pos_df_counts.loc[is_verb].head(10)
verb

Unnamed: 0,token,pos_tag,counts
3262,says,VERB,30
3951,ukraine,VERB,28
1471,found,VERB,13
4174,win,VERB,10
1562,get,VERB,9
2271,make,VERB,8
4184,wins,VERB,8
1763,hits,VERB,8
3691,take,VERB,8
3261,say,VERB,8


In [64]:
# NER: one row per entity span found by spaCy (entity text and its label).
# Records are gathered first and the frame is built once, instead of
# concatenating a one-row frame per entity.
ner_records = [
    {'token': ent.text, 'ner_tag': ent.label_}
    for ent in spacy_doc.ents
    # Defensive filter kept from the original; `not ...` replaces the identity
    # test `is False`, which only passes for the exact `False` singleton.
    if not pd.isna(ent.label_)
]
# Passing `columns` keeps both columns present even when no entity is found.
ner_df = pd.DataFrame(ner_records, columns=['token', 'ner_tag'])

In [65]:
# Visualise the recognised entities.
# jupyter=False makes displacy return the markup as a string instead of
# rendering it itself; the string is then shown explicitly through IPython's
# rich display (HTML and display are imported in the notebook's import cell).
html = displacy.render(spacy_doc, style='ent', jupyter=False)
display(HTML(html))