### Named Entity Recognition

In [31]:
##############
# Import libs
##############

import re
import numpy as np
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import RegexpTagger
from collections import Counter


import spacy
from spacy import displacy
import en_core_web_md

In [15]:
# NLTK resources used by the cells below:
#   punkt                       -> nltk.word_tokenize
#   averaged_perceptron_tagger  -> nltk.pos_tag
#   maxent_ne_chunker + words   -> nltk.ne_chunk
# 'punkt' was missing from the original list, so tokenization failed on a
# machine that did not already have it installed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nickel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Nickel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Nickel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [None]:
# !python -m spacy download en_core_web_md

In [102]:
###########
# Settings
###########

# Number of most frequent entities reported in the frequency tables below
TOP_LIMIT = 20

# Data load
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this pickle if it comes from a trusted source.
with open('data/df_processed.pkl', 'rb') as f:
    df = pickle.load(f)

In [4]:
df.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, caus, offer, wheelc..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesti]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguid, societi, motiv]"


In [53]:
def extract_text(df, target='clean_tweet'):
    """Concatenate all values of one column into a single string.

    Args:
        df: Object indexable by column name (e.g. a pandas DataFrame) whose
            ``target`` entry is an iterable of strings.
        target: Name of the column to concatenate.

    Returns:
        The column's values joined by single spaces, in their original order.
    """
    separator = " "
    column_values = df[target]
    return separator.join(column_values)
    

### NER via NLTK

In [54]:
# Join every cleaned tweet into one long string for the NLTK pipeline.
# The helper defined in this notebook is `extract_text`; the previous call to
# `make_sent` raised NameError on a fresh kernel.
text = extract_text(df)

In [58]:
# Tokenize the combined text and attach a part-of-speech tag to every token;
# the result is a list of (token, tag) pairs.
nltk_tags = nltk.pos_tag(nltk.word_tokenize(text))

In [63]:
nltk_tags[:20]

[('when', 'WRB'),
 ('father', 'NN'),
 ('is', 'VBZ'),
 ('dysfunctional', 'JJ'),
 ('and', 'CC'),
 ('is', 'VBZ'),
 ('so', 'RB'),
 ('selfish', 'JJ'),
 ('he', 'PRP'),
 ('drags', 'VBZ'),
 ('his', 'PRP$'),
 ('kids', 'NNS'),
 ('into', 'IN'),
 ('his', 'PRP$'),
 ('dysfunction', 'NN'),
 ('run', 'VB'),
 ('thanks', 'NNS'),
 ('for', 'IN'),
 ('lyft', 'JJ'),
 ('credit', 'NN')]

In [59]:
# Collect named entities produced by nltk.ne_chunk().
# Only chunk elements that expose a `label` attribute are kept; each one
# contributes a (joined token text, chunk label) pair to the set.

named_entities = set(
    (' '.join(leaf[0] for leaf in subtree), subtree.label())
    for subtree in nltk.ne_chunk(nltk_tags)
    if hasattr(subtree, 'label')
)

In [60]:
# Named entities not found
named_entities

set()

### NER via SpaCy

In [70]:
# Load the en_core_web_md spaCy pipeline; the cells below call it as `nlp`.
nlp = en_core_web_md.load()

In [76]:
# Run spaCy on a short excerpt (first 1000 characters) and highlight the
# recognised entities inline.
test_txt = text[:1000]
# The original passed an undefined name `t` here; the excerpt is `test_txt`.
article = nlp(test_txt)
displacy.render(article, jupyter=True, style='ent')

In [85]:
# Render the dependency parse for the first five tokens of the excerpt
displacy.render(article[:5], style='dep', jupyter=True)

In [99]:
def parse_string_entities(text_string, cntr, ents_dict):
    """Tally the entities spaCy finds in a single string.

    Runs the module-level ``nlp`` pipeline on ``text_string`` and updates the
    two accumulators in place.

    Args:
        text_string: Text to analyse.
        cntr: Counter-like mapping; the count stored under each entity's text
            is incremented once per occurrence.
        ents_dict: Mapping from entity text to entity label. Only the label of
            the first occurrence is stored; later ones never overwrite it.

    Returns:
        None.
    """
    for span in nlp(text_string).ents:
        surface, label = span.text, span.label_
        cntr[surface] += 1
        # setdefault keeps the label recorded the first time this text was seen
        ents_dict.setdefault(surface, label)
    

In [100]:
%%time

cntr = Counter()   # entity text -> number of occurrences over all tweets
ents_dict = {}     # entity text -> label recorded at its first occurrence

# Count all entities.
# Iterate the column directly instead of `df.iloc[i]['clean_tweet']`, which
# builds a full row object for every tweet just to read a single field.
for clean_tweet in df['clean_tweet']:
    parse_string_entities(clean_tweet, cntr, ents_dict)

Wall time: 14min 21s


In [104]:
# The TOP_LIMIT most frequent entities, each printed as (text, count) followed
# by the label stored for it
for entity_text, occurrences in cntr.most_common(TOP_LIMIT):
    print((entity_text, occurrences), ents_dict[entity_text])

('today', 1350) DATE
('friday', 590) DATE
('tomorrow', 586) DATE
('one', 526) CARDINAL
('first', 514) ORDINAL
('orlando', 482) GPE
('sunday', 472) DATE
('morning', 436) TIME
('bihday', 420) DATE
('tonight', 407) TIME
('summer', 400) DATE
('saturday', 324) DATE
('monday', 250) DATE
('america', 212) GPE
('night', 192) TIME
('two', 187) CARDINAL
('days', 182) DATE
('london', 172) GPE
('weekend', 169) DATE
('thursday', 162) DATE


In [105]:
%%time

# Collect the TOP_LIMIT most frequent entities labelled PERSON, ORG or GPE

TARGET_LABELS = {"PERSON", "ORG", "GPE"}

# Each item is (entity text, label, count), ordered by descending count
popular_orgs_and_persons = []
# most_common() yields entities from most to least frequent; only the 10000
# most frequent ones are scanned.
for word, count in cntr.most_common(10000):
    ent_label = ents_dict[word]
    if ent_label in TARGET_LABELS:
        popular_orgs_and_persons.append((word, ent_label, count))
        # The list only grows here, so this is the only place the limit can be hit
        if len(popular_orgs_and_persons) >= TOP_LIMIT:
            break

Wall time: 18.6 ms


In [106]:
popular_orgs_and_persons

[('orlando', 'GPE', 482),
 ('america', 'GPE', 212),
 ('london', 'GPE', 172),
 ('us', 'GPE', 126),
 ('bing bong bing bong', 'PERSON', 114),
 ('sjw', 'ORG', 106),
 ('uk', 'GPE', 97),
 ('obama', 'PERSON', 94),
 ('allahsoil', 'ORG', 88),
 ('florida', 'GPE', 82),
 ('gop', 'ORG', 81),
 ('miami', 'GPE', 74),
 ('suppo', 'PERSON', 68),
 ('nba', 'ORG', 63),
 ('india', 'GPE', 63),
 ('japan', 'GPE', 57),
 ('trump', 'ORG', 56),
 ('usa', 'GPE', 54),
 ('hillary', 'PERSON', 53),
 ('nyc', 'ORG', 52)]

### Summary

Модуль spaCy распознаёт именованные сущности, однако ошибается на омонимах, если регистр не учитывается (us != US); требуются дополнительная подгонка под документ и оценка опечаток.

NLTK скверно справляется с именованными сущностями: не видит сущности в нижнем регистре либо присваивает их любым незнакомым словам, имеющим заглавные буквы.