# Named Entity Recognition
- using Spacy

# 1)- Importing key modules

In [0]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# Silence all warnings so the notebook output stays readable.
# Note: this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [0]:
# For data processing and maths

import pandas as pd
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default styling to every matplotlib figure drawn below.
sns.set()
from matplotlib import rcParams
# Default figure size as (width, height), in inches.
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [0]:
# For text we shall use Spacy

import spacy 
from spacy import displacy # for spacy visuals
nlp = spacy.load("en_core_web_sm")

In [0]:
# for web scraping

from bs4 import BeautifulSoup
import requests
import re

In [5]:
! pip install version_information



In [6]:
# first install: pip install version_information
%reload_ext version_information
%version_information pandas,spacy,bs4,seaborn, matplotlib

Software,Version
Python,3.6.8 64bit [GCC 8.3.0]
IPython,5.5.0
OS,Linux 4.14.137+ x86_64 with Ubuntu 18.04 bionic
pandas,0.25.3
spacy,2.1.9
bs4,4.6.3
seaborn,0.9.0
matplotlib,3.1.1
Mon Nov 25 00:28:50 2019 UTC,Mon Nov 25 00:28:50 2019 UTC


# 2)- Getting Data

In [0]:
def url_to_string(url, timeout=30):
    """Download a web page and return its visible text as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    str
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with a
        4xx/5xx status code.
    """
    # Without a timeout, requests can block forever on an unresponsive server.
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently running NER over an HTTP error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose content is not part of the readable article.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [0]:
# Fetch the article over the network and reduce it to plain text.
newyork_data = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')

In [9]:
len(newyork_data)

10315

In [10]:
type(newyork_data)

str

# 3)- Using Spacy for NER

### 3.1)-converting to a spaCy Doc

In [0]:
# Run the loaded spaCy pipeline over the whole article text.
article = nlp(newyork_data)

In [12]:
type(article)

spacy.tokens.doc.Doc

In [13]:
len(article)

1685

### 3.2)-what unique labels do we have

In [14]:
from collections import Counter
# Tally how many entities were assigned to each label.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'CARDINAL': 5,
         'DATE': 22,
         'GPE': 15,
         'NORP': 2,
         'ORDINAL': 1,
         'ORG': 40,
         'PERSON': 79,
         'WORK_OF_ART': 1})

We have 8 unique labels

### 3.3)-most frequent entities

In [15]:
# The three entity strings that occur most often in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Strzok', 28), ('F.B.I.', 13), ('Trump', 12)]

### 3.4) Sampling a sentence

Let's pick one sentence from the article as a sample (index 20 — a fixed choice, not a random draw).

In [16]:
# Materialise the sentence spans so they can be indexed by position.
sentences = list(article.sents)
print(sentences[20])

Aitan Goelman, Mr. Strzok’s lawyer, denounced his client’s dismissal.


# 4)- Visualization

- using displacy

In [17]:
# Re-parse the sample sentence as a standalone document and highlight its named entities inline.
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [18]:
# Draw the dependency parse of the same sentence; 'distance' controls the spacing between words.
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

# 5)- Using POS

In [19]:
# (text, coarse POS tag) for every token that is neither a stop word nor punctuation.
[(token.text, token.pos_)
 for token in nlp(str(sentences[20]))
 if not token.is_stop and token.pos_ != 'PUNCT']

[('Aitan', 'PROPN'),
 ('Goelman', 'PROPN'),
 ('Mr.', 'PROPN'),
 ('Strzok', 'PROPN'),
 ('’s', 'PROPN'),
 ('lawyer', 'NOUN'),
 ('denounced', 'VERB'),
 ('client', 'NOUN'),
 ('’s', 'PROPN'),
 ('dismissal', 'NOUN')]

In [20]:
# Map each entity's text in the sample sentence to its label.
{str(entity): entity.label_ for entity in nlp(str(sentences[20])).ents}

{'Aitan Goelman': 'PERSON', 'Strzok': 'PERSON'}

In [21]:
sentences[20]

Aitan Goelman, Mr. Strzok’s lawyer, denounced his client’s dismissal.

In [22]:
# Per-token IOB flag and entity type for the sample sentence.
print([
    (token, token.ent_iob_, token.ent_type_)
    for token in sentences[20]
])

[(Aitan, 'B', 'PERSON'), (Goelman, 'I', 'PERSON'), (,, 'O', ''), (Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (’s, 'O', ''), (lawyer, 'O', ''), (,, 'O', ''), (denounced, 'O', ''), (his, 'O', ''), (client, 'O', ''), (’s, 'O', ''), (dismissal, 'O', ''), (., 'O', '')]


# 6)- NER for Full data

In [23]:
# Highlight every named entity found across the full article.
displacy.render(article, jupyter=True, style='ent')