In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
from IPython.display import HTML, display
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spacy import displacy


In [None]:
# Load data: read the BBC news CSV from the local datasets folder into a frame.
bbc = pd.read_csv(filepath_or_buffer='./datasets/bbc_news.csv')

In [None]:
# Print the frame's columns, dtypes and non-null counts.
bbc.info()

In [None]:
# Keep only the `title` column, as a one-column frame that the cleaning
# steps below extend with derived columns.
title = bbc['title'].to_frame()
title.head()

**Clean Data**

In [None]:
# Lowercase every title; the result is stored in a new `lower` column so the
# original text stays available.
title['lower'] = title['title'].str.lower()
title.head()

In [None]:
# Stopword removal: drop English stopwords from the lowercased titles.
# Requires the NLTK "stopwords" corpus to be available locally
# (e.g. installed once with nltk.download('stopwords')).
en_stopwords = stopwords.words('english')

# Membership is tested once per word of every title, so use a set (constant-time
# lookup) instead of scanning the list each time. `en_stopwords` itself is left
# as the original list for any other cell that uses it.
_en_stopword_set = set(en_stopwords)

title['no_stopwords'] = title['lower'].apply(
    lambda text: " ".join(word for word in text.split() if word not in _en_stopword_set)
)
title.head(3)

In [None]:
# Punctuation removal: delete every character that is neither a word character
# nor whitespace.
# Only one column is read, so the vectorised string accessor on that column
# replaces the row-wise DataFrame.apply(axis=1). The pattern is compiled with
# Python's `re` so it is matched with the same regex semantics as the re.sub
# call it replaces.
_punctuation_re = re.compile(r"[^\w\s]")
title['no_stopwords_no_punctuations'] = title['no_stopwords'].str.replace(_punctuation_re, '', regex=True)
title.head()

In [None]:
# Tokenize: split each cleaned title into a list of word tokens.
# `axis=1` (row-wise DataFrame.apply) is not needed here: the tokenizer reads a
# single column, so applying it directly to that Series produces the same lists
# without constructing a row object for every title.
title['tokenize'] = title['no_stopwords_no_punctuations'].apply(word_tokenize)

In [None]:
# Lemmatization: map every token of every title through the WordNet
# lemmatizer, keeping one list of lemmas per title.
lemmatizer = WordNetLemmatizer()
title['lemmatised'] = title['tokenize'].apply(
    lambda words: list(map(lemmatizer.lemmatize, words))
)
title.head()

In [None]:
# Flatten the per-title token lists into one corpus-wide list each.
# `sum(series, [])` does this by repeated list concatenation, which re-copies
# the accumulated list for every title (quadratic in corpus size). A nested
# comprehension builds the same list, in the same order, in a single pass.
token_new_list = [token for tokens in title['tokenize'] for token in tokens]
token_clean_list = [token for tokens in title['lemmatised'] for token in tokens]

In [None]:
# Load spaCy's `en_core_web_sm` pipeline (the model package must already be installed).
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the spaCy pipeline once over all (non-lemmatised) tokens joined by spaces.
# NOTE(review): every title is merged into a single document here, so spaCy
# sees no boundary between headlines — confirm this is intended.
spacy_doc = nlp(" ".join(token_new_list))

In [None]:
# Part-of-speech table: one row per spaCy token with its text and POS tag.
# The records are collected first and the frame is built once; calling
# pd.concat inside the loop re-copies every previously added row on each
# iteration. `columns=` keeps both columns present even for an empty document.
pos_df = pd.DataFrame(
    [{'token': token.text, 'pos_tag': token.pos_} for token in spacy_doc],
    columns=['token', 'pos_tag'],
)

In [None]:
# Count how often each (token, POS tag) pair occurs, most frequent first.
pos_df_counts = (
    pos_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
pos_df_counts.head(10)

In [None]:
# First ten rows of the count table whose tag is NOUN.
nouns = pos_df_counts.loc[pos_df_counts['pos_tag'].eq('NOUN')].head(10)
nouns

In [None]:
# First ten rows of the count table whose tag is VERB.
verb = pos_df_counts.loc[pos_df_counts['pos_tag'].eq('VERB')].head(10)
verb

In [None]:
# NER table: one row per named-entity span with its text and entity label.
# Built once from a list of records instead of growing the frame with
# pd.concat on every iteration, which re-copies all earlier rows each time.
# The missing-label guard from the original loop is kept as a defensive check.
ner_df = pd.DataFrame(
    [
        {'token': ent.text, 'ner_tag': ent.label_}
        for ent in spacy_doc.ents
        if not pd.isna(ent.label_)
    ],
    columns=['token', 'ner_tag'],
)

In [None]:
# Visualise the recognised entities inline.
# `jupyter=False` makes displacy return the markup as a string rather than
# rendering it itself; the string is then shown explicitly through IPython's
# HTML wrapper (`HTML` and `display` are imported in the notebook's import cell).
html = displacy.render(spacy_doc, style='ent', jupyter=False)
display(HTML(html))