In [None]:
#Jeff - Practical Task 2
# This notebook tags parts of speech and recognises named entities in the headline titles of a BBC News (UK) dataset.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib.pyplot as plt

### Load Data

In [None]:
# Read the BBC news CSV into a DataFrame; the path is relative, so it resolves
# against the directory the notebook is run from.
bbc_data = pd.read_csv("z-files/94 - bbc-news.csv")

In [None]:
# Preview the first rows of the raw data.
bbc_data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw data.
bbc_data.info()

In [None]:
# Working frame for the analysis: a one-column DataFrame holding only the headline text.
titles = bbc_data["title"].to_frame()

In [None]:
# Preview the first rows of the title-only frame.
titles.head()

# Text Preprocessing

### Clean Data

In [None]:
#lowercase
# Lower-case every title via the vectorised string accessor and store it as a new column.
lowercased_titles = titles["title"].str.lower()
titles["lowercase"] = lowercased_titles

In [None]:
#stopwords removal
# English stop-word list from NLTK (the "stopwords" corpus must be available locally).
en_stopwords = stopwords.words("english")
# Set copy for constant-time membership tests; `en_stopwords` itself stays a list.
en_stopword_set = set(en_stopwords)
# Split each lower-cased title on whitespace, drop the stop words and
# re-join the remaining words with single spaces.
titles["no_stopwords"] = titles["lowercase"].apply(
    lambda text: " ".join(word for word in text.split() if word not in en_stopword_set)
)

In [None]:
#punctuation removal
# Every character that is neither a word character nor whitespace is replaced
# by a single space. The pattern is compiled once and applied column-wise,
# which avoids building a row object per title as DataFrame.apply(axis=1) does.
non_word_char = re.compile(r"([^\w\s])")
titles["no_stopwords_no_punct"] = titles["no_stopwords"].apply(
    lambda text: non_word_char.sub(" ", text)
)

In [None]:
#tokenizing
# word_tokenize is applied directly to each text column (one call per title)
# instead of going through a row-wise DataFrame.apply(axis=1).
# tokens_raw   -> tokens of the untouched headline
# tokens_clean -> tokens of the lower-cased, stop-word- and punctuation-free headline
titles["tokens_raw"] = titles["title"].apply(word_tokenize)
titles["tokens_clean"] = titles["no_stopwords_no_punct"].apply(word_tokenize)

In [None]:
#lemmatizing
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token in ``tokens``, keeping their order.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default part of speech is used for every token.
    """
    return [lemmatizer.lemmatize(token) for token in tokens]


titles["tokens_clean_lemmatized"] = titles["tokens_clean"].apply(lemmatize_tokens)

In [None]:
# Inspect the frame with all preprocessing columns added.
titles.head()

In [None]:
#create new lists for tokens only
# Flatten the per-title token lists into one flat list each. A nested
# comprehension walks every token exactly once, whereas sum(lists, [])
# copies the growing accumulator on every addition (quadratic in the
# total number of tokens). The resulting lists are identical.
tokens_raw_list = [
    token for title_tokens in titles["tokens_raw"] for token in title_tokens
]
tokens_clean_list = [
    token for title_tokens in titles["tokens_clean_lemmatized"] for token in title_tokens
]


## POS Tagging


In [None]:
# Load spaCy's "en_core_web_sm" pipeline; the model package must already be installed.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Join every raw token of every title with single spaces and run the spaCy
# pipeline once over the resulting string, producing a single Doc.
# NOTE(review): headline boundaries are not preserved in the joined text, so
# tags/entities near a boundary may be influenced by the neighbouring headline,
# and spaCy limits input length via nlp.max_length — confirm the corpus fits.
spacy_doc = nlp(" ".join(tokens_raw_list))

In [None]:
# Empty frame with the two columns used for the POS results.
pos_df = pd.DataFrame(columns = ["token", "pos_tag"])

In [None]:
# Collect one {"token", "pos_tag"} record per spaCy token and build the frame
# in a single constructor call. Appending row by row copies the whole frame on
# every iteration and relied on the private DataFrame._append method.
pos_records = [
    {"token": token.text, "pos_tag": token.pos_}  # pos_ is the coarse-grained tag
    for token in spacy_doc
]
# Passing `columns` keeps the expected schema even if the doc has no tokens.
pos_df = pd.DataFrame(pos_records, columns=["token", "pos_tag"])

In [None]:
# Count how often each (token, pos_tag) pair occurs, then order the pairs
# from most to least frequent.
pos_pair_sizes = pos_df.groupby(["token", "pos_tag"]).size()
pos_df_counts = pos_pair_sizes.reset_index(name="counts")
pos_df_counts = pos_df_counts.sort_values(by="counts", ascending=False)
pos_df_counts.head(10)

In [None]:
# First ten rows of the count table whose tag is NOUN.
is_noun = pos_df_counts["pos_tag"] == "NOUN"
nouns = pos_df_counts.loc[is_noun].head(10)
nouns

In [None]:
# First ten rows of the count table whose tag is VERB.
is_verb = pos_df_counts["pos_tag"] == "VERB"
verbs = pos_df_counts.loc[is_verb].head(10)
verbs

In [None]:
# First ten rows of the count table whose tag is ADJ.
is_adjective = pos_df_counts["pos_tag"] == "ADJ"
adj = pos_df_counts.loc[is_adjective].head(10)
adj

## NER

In [22]:
# Empty frame with the two columns used for the NER results.
ner_df = pd.DataFrame(columns = ["token", "ner_tag"])

In [23]:
# NOTE(review): this list is not read by any active code in the visible cells —
# confirm whether it is still needed.
data = []

In [24]:
# Collect one {"token", "ner_tag"} record per entity span and build the frame
# in a single constructor call. Appending row by row copies the whole frame on
# every iteration and relied on the private DataFrame._append method.
# No missing-label check is needed: `label_` is always a string, so a
# pd.isna() guard on it never filters anything out.
ner_records = [
    {"token": ent.text, "ner_tag": ent.label_}
    for ent in spacy_doc.ents
]
# Passing `columns` keeps the expected schema even if no entity was found.
ner_df = pd.DataFrame(ner_records, columns=["token", "ner_tag"])
        

In [25]:
# Preview the first recognised entities and their labels.
ner_df.head()

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,Rationing,PRODUCT
3,superyachts,CARDINAL
4,Russian,NORP


In [26]:
# Count how often each (token, ner_tag) pair occurs, then order the pairs
# from most to least frequent.
ner_pair_sizes = ner_df.groupby(["token", "ner_tag"]).size()
ner_df_counts = ner_pair_sizes.reset_index(name="counts")
ner_df_counts = ner_df_counts.sort_values(by="counts", ascending=False)

In [27]:
# Ten most frequent (entity text, label) pairs.
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,counts
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
819,Russian,NORP,20
957,US,GPE,19
1031,World Cup 2022,EVENT,18
1058,first,ORDINAL,13
918,The Papers,WORK_OF_ART,13
378,France,GPE,12
226,China,GPE,11


In [28]:
# Rows labelled PERSON, taken from position 8 up to (not including) 120 of the
# frequency-sorted table.
# NOTE(review): the first eight PERSON rows are skipped here, unlike the
# first-ten slices used for the POS tables — confirm the offset is intentional.
person_rows = ner_df_counts.loc[ner_df_counts["ner_tag"] == "PERSON"]
people = person_rows.iloc[8:120]
people

Unnamed: 0,token,ner_tag,counts
515,Jurgen Klopp,PERSON,4
325,Emma Raducanu,PERSON,4
807,Rory McIlroy,PERSON,3
1034,Wrexham,PERSON,3
220,Chelsea,PERSON,3
...,...,...,...
884,Steve Barclay,PERSON,1
1020,Wizz Air,PERSON,1
887,Stuart Bingham,PERSON,1
889,Summer McIntosh,PERSON,1
