## Named Entity Recognition

### Dataset: tweeter messages

In [18]:
##############
# Import libs
##############

import re
import numpy as np
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import RegexpTagger
from collections import Counter
import html
from utils.dicts import apostrophe_dict, emoticon_dict, short_word_dict


import spacy
from spacy import displacy
import en_core_web_md

In [4]:
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nickel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Nickel\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Nickel\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [5]:
# !python -m spacy download en

In [9]:
###########
# Settings
###########

TOP_LIMIT = 20

# Data load
with open('data/df_processed.pkl', 'rb') as f:
    df = pickle.load(f)
    
train_df = pd.read_csv('data/train_tweets.csv')
test_df = pd.read_csv('data/test_tweets.csv')

In [7]:
df.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, caus, offer, wheelc..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesti]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguid, societi, motiv]"


In [35]:
######################
# Preprocess Routine
######################
###################


def extract_text(df, target='clean_tweet'):
    
    """Extract raw text from tweets Data Frame"""
    
    return " ".join(df[target])
    

def replace_from_dict(text, source_dict):
    
    """Search through text, map tokens via dict, place default if not in dict"""
    
    return " ".join([source_dict.get(word, word) for word in text.split()])


def remove_onechar_tokens(text):
    
    """Search through text, remove one-caracter tokens"""
    
    return ' '.join([w for w in text.split() if len(w)>1])


def filter_stop_words(tokens, stop_words=None):
    
    """Remove stop words from tokens"""
    
    return [token for token in tokens if token not in stop_words]
    

def stem_tokens(tokens, stemmer=None):
    
    """Stemming preprocessing"""
    
    return [stemmer.stem(token) for token in tokens]


def lemmatize_tokens(tokens, lemmatizer=None):
    
    """Lemmatize preprocessing"""
    
    return [lemmatizer.lemmatize(token) for token in tokens]
    

def preprocess(df,
               src_col='tweet',
               clean_col='clean_tweet',
               token_col=None,
               filter_col=None,
               stemmed_col=None,
               lem_col=None
              ):
     
    """
    Modified preprocessor for NER testing.
    
    Changes: 
    - case processing str.lower() changed for str.upper() 
    - tokenization, stemming, lemmatization
    """
        
    # 1. Clean html context
    df[clean_col] = df[src_col].apply(lambda x: html.unescape(x))
    
    # 2. Remove @user references
    df[clean_col] = df[clean_col].apply(lambda x: re.sub(r'@[\w]*','', x))
    
    # 3. Correct register to lowercase
    df[clean_col] = df[clean_col].str.upper()
    
    # 4. Change apostrophes
    vfunc = np.vectorize(replace_from_dict)
    df[clean_col] = vfunc(df[clean_col], apostrophe_dict)
    
    # 5. Prolong short words
    df[clean_col] = vfunc(df[clean_col], short_word_dict)
    
    # 6. Replace emoticons
    df[clean_col] = vfunc(df[clean_col], emoticon_dict)
    
    # 7. Replace punctuation to spaces
    df[clean_col] = df[clean_col].apply(lambda x: re.sub(r'[^\w\s]','', x))
    
    # 8. Replace special characters to spaces
    df[clean_col] = df[clean_col].apply(lambda x: re.sub(r'[^a-zA-Z0-9]', ' ', x))
    
    # 9. Replace nums for spaces
    df[clean_col] = df[clean_col].apply(lambda x: re.sub(r'[^a-zA-Z]', ' ', x))
    
    # 10. Drop one char words
    vfunc = np.vectorize(remove_onechar_tokens)
    df[clean_col] = vfunc(df[clean_col])
    
    # 11. Tokenize text
    # df[token_col] = df[clean_col].apply(lambda x:  nltk.tokenize.word_tokenize(x))
    
    # 12. Filter stop words
    # stop_words = set(nltk.corpus.stopwords.words("english"))
    # df[filter_col] = df[token_col].apply(lambda x: filter_stop_words(x))
    
    # 13. Apply stemming
    # df[stemmed_col] = df[filter_col].apply(lambda x: stem_tokens(x))
    
    # 14. Lemmatize
    # df[lem_col] = df[stemmed_col].apply(lambda x: lemmatize_tokens(x))
    
    return df

### NER via LNTK

In [15]:
text = extract_text(df)

In [58]:
nltk_tags = nltk.pos_tag(nltk.word_tokenize(text))

In [63]:
nltk_tags[:20]

[('when', 'WRB'),
 ('father', 'NN'),
 ('is', 'VBZ'),
 ('dysfunctional', 'JJ'),
 ('and', 'CC'),
 ('is', 'VBZ'),
 ('so', 'RB'),
 ('selfish', 'JJ'),
 ('he', 'PRP'),
 ('drags', 'VBZ'),
 ('his', 'PRP$'),
 ('kids', 'NNS'),
 ('into', 'IN'),
 ('his', 'PRP$'),
 ('dysfunction', 'NN'),
 ('run', 'VB'),
 ('thanks', 'NNS'),
 ('for', 'IN'),
 ('lyft', 'JJ'),
 ('credit', 'NN')]

In [59]:
# Search for named entities via  nltk.ne_chunk()

named_entities = {(' '.join(c[0] for c in chunk), chunk.label() ) 
                  for chunk in nltk.ne_chunk(nltk_tags) 
                  if hasattr(chunk, 'label') }

In [60]:
# Named entities not found
named_entities

set()

##### Try preprocessor with upper case correction

In [41]:
# Recalculate new data
new_target = 'clean_tweet_uppercase'
df = preprocess(df, clean_col=new_target)

In [42]:
df[['clean_tweet', new_target]]

Unnamed: 0,clean_tweet,clean_tweet_uppercase
0,when father is dysfunctional and is so selfish...,WHEN FATHER IS DYSFUNCTIONAL AND IS SO SELFISH...
1,thanks for lyft credit cannot use cause they d...,THANKS FOR LYFT CREDIT CANT USE CAUSE THEY DON...
2,bihday your majesty,BIHDAY YOUR MAJESTY
3,model love you take with you all the time in ur,MODEL LOVE TAKE WITH ALL THE TIME IN UR
4,factsguide society now motivation,FACTSGUIDE SOCIETY NOW MOTIVATION
...,...,...
49154,thought factory leftright polarisation trump u...,THOUGHT FACTORY LEFTRIGHT POLARISATION TRUMP U...
49155,feeling like mermaid hairflip neverready forma...,FEELING LIKE MERMAID HAIRFLIP NEVERREADY FORMA...
49156,hillary campaigned today in ohioomg used words...,HILLARY CAMPAIGNED TODAY IN OHIOOMG USED WORDS...
49157,happy at work conference right mindset leads t...,HAPPY AT WORK CONFERENCE RIGHT MINDSET LEADS T...


In [44]:
# Extract raw text 
new_text = extract_text(df, target=new_target)

# Assign tags for tokens
new_nltk_tags = nltk.pos_tag(nltk.word_tokenize(new_text))


# Search for named entities via  nltk.ne_chunk()
new_named_entities = [(' '.join(c[0] for c in chunk), chunk.label() ) 
                  for chunk in nltk.ne_chunk(new_nltk_tags) 
                  if hasattr(chunk, 'label') ]

In [57]:
nltk_cntr = Counter()
nltk_ents_dict = {}

# Counts all entities

for entity in new_named_entities:
    nltk_cntr[entity[0]] += 1
    if entity[0] not in nltk_ents_dict:
        nltk_ents_dict[entity[0]] = entity[1]

In [63]:
# Most common from all entities
nltk_cntr.most_common(10)

[('THE', 4378),
 ('FATHERS', 983),
 ('FAMILY', 710),
 ('TIME', 582),
 ('HAPPINESS', 556),
 ('GOLD', 447),
 ('WELL', 405),
 ('TO', 380),
 ('WORLD', 307),
 ('WHITE', 302)]

In [61]:
%%time

# Counts top-20 Named entities  

tmp_cntr = 0


nltk_popular_orgs_and_persons = []
for i, common in enumerate(nltk_cntr.most_common(10000)):
    word = common[0]
    count = common[1]
    ent_label = nltk_ents_dict[word]
    if ent_label == "PERSON" or ent_label == "ORG" or ent_label == "GPE":
        nltk_popular_orgs_and_persons.append((word, ent_label, count))
        
    if len(nltk_popular_orgs_and_persons) == TOP_LIMIT:
        break

Wall time: 25 ms


In [62]:
# Resulted top-20 named entities
nltk_popular_orgs_and_persons

[('CLINTON', 'PERSON', 21),
 ('CLICK', 'PERSON', 9),
 ('CLICK TO WATCH', 'PERSON', 8),
 ('ARABIC', 'PERSON', 8),
 ('RUSSIA', 'GPE', 7),
 ('CLICK RECIPE', 'PERSON', 6),
 ('AMERICANS', 'GPE', 5),
 ('JOHNSON', 'PERSON', 5),
 ('BELGIAN', 'GPE', 4),
 ('JOHN', 'PERSON', 4),
 ('CLINTONS', 'PERSON', 4),
 ('INDIAN', 'GPE', 4),
 ('CLICK TO', 'PERSON', 3),
 ('JOHN WOODEN', 'PERSON', 3),
 ('ELECT', 'PERSON', 3),
 ('JOHN BURR', 'PERSON', 2),
 ('ALBUM ON', 'PERSON', 2),
 ('STEIN', 'PERSON', 2),
 ('SOROS', 'PERSON', 2),
 ('JOHN MCCAIN TO', 'PERSON', 2)]

### NER via SpaCy

In [70]:
nlp = en_core_web_md.load()

In [76]:
test_txt = text[:1000]
article = nlp(t)
displacy.render(article, jupyter=True, style='ent')

In [85]:
# display text dependences
displacy.render(article[:5], style='dep', jupyter=True)

In [99]:
def parse_string_entities(text_string, cntr, ents_dict):
    doc = nlp(text_string)
    ents = [(e.text, e.label_) for e in doc.ents]
    for entity in ents:

        cntr[entity[0]] += 1
        if entity[0] not in ents_dict:
            ents_dict[entity[0]] = entity[1]
    

In [100]:
%%time

cntr = Counter()
ents_dict = {}

# Counts all entities

for i in range(len(df)):
    parse_string_entities(df.iloc[i]['clean_tweet'], cntr, ents_dict)

Wall time: 14min 21s


In [104]:
# Top-20 cited entities
for word in cntr.most_common(TOP_LIMIT):
    ent_label = ents_dict[word[0]]
    print(word, ent_label)

('today', 1350) DATE
('friday', 590) DATE
('tomorrow', 586) DATE
('one', 526) CARDINAL
('first', 514) ORDINAL
('orlando', 482) GPE
('sunday', 472) DATE
('morning', 436) TIME
('bihday', 420) DATE
('tonight', 407) TIME
('summer', 400) DATE
('saturday', 324) DATE
('monday', 250) DATE
('america', 212) GPE
('night', 192) TIME
('two', 187) CARDINAL
('days', 182) DATE
('london', 172) GPE
('weekend', 169) DATE
('thursday', 162) DATE


In [105]:
%%time

# Counts top-20 Named entities  

tmp_cntr = 0


popular_orgs_and_persons = []
for i, common in enumerate(cntr.most_common(10000)):
    word = common[0]
    count = common[1]
    ent_label = ents_dict[word]
    if ent_label == "PERSON" or ent_label == "ORG" or ent_label == "GPE":
        popular_orgs_and_persons.append((word, ent_label, count))
        
    if len(popular_orgs_and_persons) == TOP_LIMIT:
        break

Wall time: 18.6 ms


In [106]:
popular_orgs_and_persons

[('orlando', 'GPE', 482),
 ('america', 'GPE', 212),
 ('london', 'GPE', 172),
 ('us', 'GPE', 126),
 ('bing bong bing bong', 'PERSON', 114),
 ('sjw', 'ORG', 106),
 ('uk', 'GPE', 97),
 ('obama', 'PERSON', 94),
 ('allahsoil', 'ORG', 88),
 ('florida', 'GPE', 82),
 ('gop', 'ORG', 81),
 ('miami', 'GPE', 74),
 ('suppo', 'PERSON', 68),
 ('nba', 'ORG', 63),
 ('india', 'GPE', 63),
 ('japan', 'GPE', 57),
 ('trump', 'ORG', 56),
 ('usa', 'GPE', 54),
 ('hillary', 'PERSON', 53),
 ('nyc', 'ORG', 52)]

### Summary

Распознавание именованных сущеностей в **Nltk** сильно зависит от регистра написания, капризный к тексту. Слова в нижнем регистре не обрабатываются. В топе поисков - имена, фамилии людей, крупные географические имена собственные. Есть ошибочное (мусорное) детектирование

Модуль **Spacy** обрабатывает именованные сущности в любом регистре, по этой причине чаще может ошибаться на оммонимах (us != US). В топе детекций, в основном имена географических объектов. Присутствуют ошибочные (мусорные) детекции. В целом, Spacy более гибок к входным данным, однако, требуется дополнительная подгонка под документ и оценка опечаток 

В целом, обе бибилиотеки пригодны для использования только в связке с постпроцессорами