In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import spacy
import re
import matplotlib.pyplot as plt

<h4> Load Data </h4>

In [10]:
# Read the BBC news export from a CSV file in the working directory.
csv_path = "bbc_news.csv"
data = pd.read_csv(csv_path)

In [11]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [12]:
# Summarise columns, non-null counts and dtypes of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42115 entries, 0 to 42114
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        42115 non-null  object
 1   pubDate      42115 non-null  object
 2   guid         42115 non-null  object
 3   link         42115 non-null  object
 4   description  42115 non-null  object
dtypes: object(5)
memory usage: 1.6+ MB


In [13]:
# Work on a one-column frame holding only the headline text.
titles = data['title'].to_frame()

In [15]:
# Preview the first five titles.
titles.head(n=5)

Unnamed: 0,title
0,Ukraine: Angry Zelensky vows to punish Russian...
1,War in Ukraine: Taking cover in a town under a...
2,Ukraine war 'catastrophic for global food'
3,Manchester Arena bombing: Saffie Roussos's par...
4,Ukraine conflict: Oil price soars to highest l...


<h4> Clean data </h4>

In [30]:
# Add a lowercased copy of each title; the following cleaning steps work on this column.
raw_titles = titles['title']
titles['lowercase'] = raw_titles.str.lower()

In [19]:
# Check the new 'lowercase' column next to the original titles.
titles.head(n=5)

Unnamed: 0,title,lowercase
0,Ukraine: Angry Zelensky vows to punish Russian...,ukraine: angry zelensky vows to punish russian...
1,War in Ukraine: Taking cover in a town under a...,war in ukraine: taking cover in a town under a...
2,Ukraine war 'catastrophic for global food',ukraine war 'catastrophic for global food'
3,Manchester Arena bombing: Saffie Roussos's par...,manchester arena bombing: saffie roussos's par...
4,Ukraine conflict: Oil price soars to highest l...,ukraine conflict: oil price soars to highest l...


In [29]:
# STOPWORD REMOVAL
# stopwords.words() raises LookupError when the NLTK corpus is not installed;
# fetch it once and retry so the cell also runs in a fresh environment.
try:
    en_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    en_stopwords = stopwords.words('english')

# Membership is tested once per word of every title, so use a set (O(1) lookup)
# instead of scanning the list each time. `en_stopwords` itself stays a list.
en_stopwords_set = set(en_stopwords)

# Treat missing titles as empty strings so .split() below never sees NaN.
titles['lowercase'] = titles['lowercase'].fillna('')

# Keep only the whitespace-separated words that are not stopwords.
titles['no_stopwords'] = titles['lowercase'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in en_stopwords_set)
)

In [33]:
# PUNCTUATION REMOVAL
# Drop every character that is neither a word character nor whitespace.
# The vectorised string accessor replaces the row-wise DataFrame.apply, which
# built a Series per row only to read a single column.
titles['no_punct'] = titles['no_stopwords'].str.replace(r'[^\w\s]', '', regex=True)

In [35]:
# TOKENIZE
# Split each cleaned title into a list of word tokens. Applying on the column
# passes the string straight to word_tokenize instead of materialising every
# row as a Series first (axis=1).
titles['tokenized'] = titles['no_punct'].apply(word_tokenize)

In [36]:
# LEMMATIZING
lemmatizer = WordNetLemmatizer()


def _lemmatize_all(tokens):
    """Return the lemma of every token in `tokens`, preserving order."""
    return list(map(lemmatizer.lemmatize, tokens))


# One list of lemmas per title, aligned with the 'tokenized' column.
titles['lemmatized'] = titles['tokenized'].map(_lemmatize_all)

In [37]:
# Inspect every intermediate cleaning column side by side.
titles.head(n=5)

Unnamed: 0,title,lowercase,no_stopwords,no_punct,tokenized,lemmatized
0,Ukraine: Angry Zelensky vows to punish Russian...,ukraine: angry zelensky vows to punish russian...,ukraine: angry zelensky vows punish russian at...,ukraine angry zelensky vows punish russian atr...,"[ukraine, angry, zelensky, vows, punish, russi...","[ukraine, angry, zelensky, vow, punish, russia..."
1,War in Ukraine: Taking cover in a town under a...,war in ukraine: taking cover in a town under a...,war ukraine: taking cover town attack,war ukraine taking cover town attack,"[war, ukraine, taking, cover, town, attack]","[war, ukraine, taking, cover, town, attack]"
2,Ukraine war 'catastrophic for global food',ukraine war 'catastrophic for global food',ukraine war 'catastrophic global food',ukraine war catastrophic global food,"[ukraine, war, catastrophic, global, food]","[ukraine, war, catastrophic, global, food]"
3,Manchester Arena bombing: Saffie Roussos's par...,manchester arena bombing: saffie roussos's par...,manchester arena bombing: saffie roussos's par...,manchester arena bombing saffie roussoss paren...,"[manchester, arena, bombing, saffie, roussoss,...","[manchester, arena, bombing, saffie, roussoss,..."
4,Ukraine conflict: Oil price soars to highest l...,ukraine conflict: oil price soars to highest l...,ukraine conflict: oil price soars highest leve...,ukraine conflict oil price soars highest level...,"[ukraine, conflict, oil, price, soars, highest...","[ukraine, conflict, oil, price, soar, highest,..."


In [39]:
# CREATE LISTS FOR THE TOKENS
# Flatten the per-title lemma lists into one list, in title order.
# A single comprehension is linear; sum(series, []) re-copies the growing
# accumulator for every title, which is quadratic in the number of tokens.
tokens_list = [token for tokens in titles['lemmatized'] for token in tokens]

<h4> POS Tagging </h4>

In [49]:
# Load the small English spaCy pipeline and raise its per-document character
# limit; the next cell feeds all titles to it as one joined string.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)
nlp.max_length = 3_000_000

In [50]:
# Run the spaCy pipeline over the whole corpus as one space-joined document.
corpus_text = ' '.join(tokens_list)

# Make sure the pipeline's character limit covers this text, so a larger
# dataset does not fail against a fixed limit set elsewhere.
nlp.max_length = max(nlp.max_length, len(corpus_text) + 1)

# NOTE(review): because all titles are joined into one document, the tagger and
# entity recogniser see no title boundaries, so entities can span neighbouring
# titles. Processing titles separately would avoid that but changes the type of
# `spacy_doc`, which later cells iterate over directly.
spacy_doc = nlp(corpus_text)

In [51]:
# Empty token/POS table; filled from the spaCy document below.
pos_columns = ['token', 'POS']
pos_df = pd.DataFrame(columns=pos_columns)

In [55]:
# Collect one (token, POS) record per spaCy token and build the frame once.
# Growing the frame with pd.concat per token copies it every iteration
# (quadratic), and re-running that cell appended duplicates onto the existing
# pos_df. Rebuilding from scratch keeps the cell idempotent.
pos_records = [{'token': token.text, 'POS': token.pos_} for token in spacy_doc]
pos_df = pd.DataFrame(pos_records, columns=['token', 'POS'])

In [56]:
# Count how often each (token, POS) pair occurs, most frequent first.
pos_group_sizes = pos_df.groupby(['token', 'POS']).size()
pos_df_counts = (
    pos_group_sizes
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
pos_df_counts.head(10)

Unnamed: 0,token,POS,counts
27379,say,VERB,2277
10799,england,PROPN,1559
32411,uk,PROPN,1530
8142,cup,PROPN,1146
33681,war,NOUN,1061
21593,new,ADJ,944
34503,world,NOUN,896
19409,man,NOUN,878
34504,world,PROPN,878
463,2023,NUM,830


In [58]:
# Keep only the rows tagged as common nouns (already sorted by count).
is_noun = pos_df_counts['POS'].eq('NOUN')
nouns = pos_df_counts.loc[is_noun]
nouns

Unnamed: 0,token,POS,counts
33681,war,NOUN,1061
34503,world,NOUN,896
19409,man,NOUN,878
34758,year,NOUN,802
34416,woman,NOUN,737
...,...,...,...
18168,leaderboard,NOUN,1
18173,leadmill,NOUN,1
18213,learner,NOUN,1
18217,learns,NOUN,1


In [60]:
# Keep only the rows tagged as adjectives (already sorted by count).
is_adjective = pos_df_counts['POS'].eq('ADJ')
adj = pos_df_counts.loc[is_adjective]
adj

Unnamed: 0,token,POS,counts
21593,new,ADJ,944
12213,final,ADJ,535
26991,russian,ADJ,509
12309,first,ADJ,496
8525,dead,ADJ,342
...,...,...,...
18181,leafy,ADJ,1
18186,leah,ADJ,1
18212,learner,ADJ,1
18197,lean,ADJ,1


<h4> Named Entity Recognition (NER) </h4>

In [62]:
# Collect one (entity text, entity label) record per recognised entity and
# build the frame in a single step instead of concatenating a one-row frame per
# entity, which copies the whole frame each iteration (quadratic).
# The former `pd.isna(label_) is False` guard is dropped: the label is a string,
# so that check was always true.
ner_records = [{'token': ent.text, 'NER_tag': ent.label_} for ent in spacy_doc.ents]
ner_df = pd.DataFrame(ner_records, columns=['token', 'NER_tag'])

In [63]:
# Preview the first five recognised entities.
ner_df.head(n=5)

Unnamed: 0,token,NER_tag
0,ukraine,GPE
1,zelensky,PERSON
2,russian,NORP
3,arena bombing saffie roussoss parent,ORG
4,2008,DATE


In [67]:
# Count how often each (entity text, label) pair occurs, most frequent first.
ner_group_sizes = ner_df.groupby(['token', 'NER_tag']).size()
ner_df_counts = (
    ner_group_sizes
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
ner_df_counts.head(10)

Unnamed: 0,token,NER_tag,counts
4390,first,ORDINAL,707
8873,russia,GPE,588
8883,russian,NORP,545
5342,israel,GPE,502
4608,gaza,GPE,487
2268,bbc,ORG,460
591,2022,CARDINAL,457
10056,uk,GPE,394
606,2023,CARDINAL,354
4041,england,GPE,352


In [69]:
# Keep only the entities labelled PERSON (already sorted by count).
is_person = ner_df_counts['NER_tag'].eq('PERSON')
people = ner_df_counts.loc[is_person]
people

Unnamed: 0,token,NER_tag,counts
8446,putin,PERSON,192
2616,boris johnson,PERSON,144
3161,chris mason,PERSON,78
9655,taylor,PERSON,72
11004,zelensky,PERSON,62
...,...,...,...
11019,zelensky nato,PERSON,1
11020,zelensky peter,PERSON,1
11010,zelensky charles bronson,PERSON,1
11011,zelensky chris,PERSON,1
