# Named Entity Recognition on news articles

### I downloaded the dataset from https://www.kaggle.com/snapcrack/all-the-news
The first file contains 50000 articles

In [1]:
# Make every top-level expression in a cell display its value, not only the
# last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # IPython's default is 'last_expr'

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
# Source data: https://www.kaggle.com/snapcrack/all-the-news
# Path to the first CSV of the dataset, relative to the notebook's working
# directory. Change it here if the data lives somewhere else.
ARTICLES_CSV = "all-the-news/articles1.csv"

# Load the articles and preview the first rows.
df = pd.read_csv(ARTICLES_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


#### Import spacy and define a dictionary to convert entity type to a more human-readable description

In [3]:
import spacy
from spacy import displacy
# Load the English pipeline by its full package name. The bare 'en' shortcut
# link only exists in spaCy 2.x; spaCy 3 removed shortcut links and raises
# OSError for it. Fall back to the legacy shortcut so an older setup that
# only has the 'en' link keeps working.
# NOTE(review): assumes 'en' pointed at en_core_web_sm (the default download)
# -- confirm if a larger model was linked.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')
# Maps each entity label code to the human-readable description that is
# printed next to the keywords later in the notebook.
dict_entity_type = {
    # People, groups and organizations
    'PERSON': 'Person',
    'NORP': 'Group',
    'FAC': 'Building',
    'ORG': 'Organization',
    # Places
    'GPE': 'Place',
    'LOC': 'Location',
    # Things, events and works
    'PRODUCT': 'Product',
    'EVENT': 'Event',
    'WORK_OF_ART': 'Title',
    'LAW': 'Legal',
    'LANGUAGE': 'Language',
    # Dates and times
    'DATE': 'Date',
    'TIME': 'Time',
    # Numeric expressions
    'PERCENT': 'Percentage',
    'MONEY': 'Value',
    'QUANTITY': 'Quantity',
    'ORDINAL': 'Ordinal',
    'CARDINAL': 'Number',
}


## Define the corpus. If you have a fast computer, increase the number of articles

In [4]:
# Number of articles to process; raise it if you have a fast machine.
N_ARTICLES = 2000

# Take the text of the first N_ARTICLES articles and collapse every run of
# spaces into a single space. The pattern is compiled once up front instead
# of being passed as a string for every row.
_multi_space = re.compile(' +')
corpus = df['content'].iloc[:N_ARTICLES].apply(lambda text: _multi_space.sub(' ', text))

#### Build a data frame of all the named entities and their types


In [5]:
# Run the spaCy pipeline over the whole corpus and collect one
# (entity text, entity label, document number) record per named entity.
# nlp.pipe streams the texts through the pipeline in batches, which spaCy
# documents as more efficient than calling nlp() on one text at a time.
# The document number is the article's position within `corpus`.
named_entities = [
    (entity.text, entity.label_, doc_number)
    for doc_number, doc in enumerate(nlp.pipe(corpus))
    for entity in doc.ents
]

entity_frame = pd.DataFrame(named_entities, columns=['Name', 'Type', 'Document number'])

#### Get the top named entity types

In [6]:
# Count how many entities were found for each type, most frequent first.
type_counts = entity_frame.groupby(by=['Type']).size()
type_counts = type_counts.sort_values(ascending=False)
top_entities = type_counts.reset_index(name='Frequency')

# Transpose so the 15 most frequent types are laid out as columns.
top_entities.T.iloc[:, :15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Type,PERSON,ORG,GPE,DATE,CARDINAL,NORP,ORDINAL,TIME,LOC,WORK_OF_ART,MONEY,FAC,PERCENT,EVENT,QUANTITY
Frequency,58912,32462,31304,30182,17812,16399,3491,3222,3147,3000,2654,1932,1551,1145,987


#### Get the top named entities

In [7]:
# Count the occurrences of every (name, type) pair, most frequent first.
name_counts = entity_frame.groupby(by=['Name', 'Type']).size()
name_counts = name_counts.sort_values(ascending=False)
top_entities = name_counts.reset_index(name='Frequency')

# Transpose so the 15 most frequent entities are laid out as columns.
top_entities.T.iloc[:, :15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Name,Trump,one,first,the United States,American,two,Obama,Republican,Russia,China,000,Republicans,Washington,Russian,Democrats
Type,PERSON,CARDINAL,ORDINAL,GPE,NORP,CARDINAL,PERSON,NORP,GPE,GPE,CARDINAL,NORP,GPE,NORP,NORP
Frequency,7116,2507,2310,2066,1912,1831,1368,1115,1062,1020,977,975,864,784,748


## Example: categorize the first article
Compute the named entities, then print the most important keywords

In [8]:
from IPython.display import HTML, display

# Parse the first article. `article` stays bound to the parsed document
# because the sentence-level cell further down reads `article.sents`.
article = nlp(corpus.iloc[0])

# Tabulate the article's named entities under a dedicated name: reusing `df`
# here would replace the articles DataFrame and break re-running the cell
# that builds `corpus`.
article_entities = pd.DataFrame(
    [(entity.text, entity.label_) for entity in article.ents],
    columns=['Name', 'Type'],
)

# Frequency of every (name, type) pair, most frequent first.
top = (article_entities.groupby(by=['Name', 'Type'])
                       .size()
                       .sort_values(ascending=False)
                       .reset_index(name='Frequency'))

# Keep the entities whose frequency reaches the 0.9 quantile, at most 6.
quant09 = np.quantile(top.Frequency, 0.9)
top_ent = top[top['Frequency'] >= quant09][0:6]

# Render the heading explicitly so it appears regardless of the
# ast_node_interactivity setting.
display(HTML("<b>Keywords:</b>"))

# Print the selected names grouped by entity type, one line per type.
for entity_type, group in top_ent.groupby(by='Type'):
    # Plain assignment: the former `=+` applied a unary plus to a Series of
    # strings, which raises TypeError on current numpy/pandas.
    keywords = group.Name
    # Fall back to the raw label if it has no human-readable description.
    label = dict_entity_type.get(entity_type, entity_type)
    print(label + ': ', ', '.join(keywords))

Group:  Republicans, Trump
Organization:  House, Congress
Person:  Obama


#### Display named entities in one sentence of the first article

In [9]:
sentences = list(article.sents)
n = 12  # zero-based index, i.e. the 13th sentence

# Parse the chosen sentence once as a standalone document and reuse it for
# the entity listing and both visualizations.
sentence_doc = nlp(str(sentences[n]))

for element in sentence_doc.ents:
    print('%s, Type: %s' % (element, element.label_))

# Highlighted entities, then the dependency parse.
displacy.render(sentence_doc, style='ent', jupyter=True)
displacy.render(sentence_doc, style='dep', jupyter=True, options={'distance': 100})

2015, Type: DATE
Rosemary M. Collyer, Type: PERSON
House, Type: ORG
Republicans, Type: NORP
Obama, Type: PERSON
Constitution, Type: LAW
Congress, Type: ORG


## What to do next
* Explore summarization of content  
* It would be fun to create a recommendation system using named entities