# NLP on tweets - basics

Set up the required imports, and load the data:

In [79]:
import pandas as pd
import spacy as spc
from spacy import displacy as dsp
# Read the training CSV into a DataFrame; the path is relative to the
# directory the notebook is run from.
train_df = pd.read_csv("data/train.csv")

## Tailor the NLP pipeline to our purposes

Useful for reference:
* [rule-based matching](https://spacy.io/usage/rule-based-matching)
* [pipelines](https://spacy.io/usage/processing-pipelines)

Load up the base pipeline:

In [80]:
# Load the `en_core_web_sm` spaCy pipeline; the cells below customise this
# `nlp` object in place.
nlp = spc.load("en_core_web_sm")

The first step is to make sure that '@' and '#' get the same treatment. By default, '@' is considered a part of a token, and '#' is considered its own token. So, make sure that they are considered individual tokens, to make processing easier in later parts.

In [81]:
# Extend the model's default prefix patterns with '@' and '#', then install
# the recompiled regex on the tokenizer so both symbols are split off as
# separate leading tokens (see the tokenizer demo in the next cell).
prefixes = nlp.Defaults.prefixes + (r'@',r'#')
prefix_regex = spc.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

So, this is what the tokenizer does now:

In [82]:
# Show how the customised tokenizer splits a mention and a hashtag.
demo_doc = nlp("@username text #hashtag")
for tok in demo_doc:
    print(tok.text)

@
username
text
#
hashtag


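Next, define the pipeline component that merges an '@' or '#' symbol with the word that immediately follows it into a single token (e.g. '#' + 'hashtag' becomes '#hashtag'). It also glues the pieces of an HTML entity such as '&amp;' back together, so that the following step can recognise it: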

In [83]:
def _is_word_like(text):
    """Return True when ``text`` starts with a letter, digit or underscore."""
    return bool(text) and (text[0].isalnum() or text[0] == '_')


def _html_entity_end(doc, start):
    """Return the exclusive end index of an HTML entity starting at ``start``.

    ``doc[start]`` is expected to be the '&' token.  An entity is '&', an
    optional '#' (numeric references such as '&#39;'), one alphanumeric
    token and a closing ';', with no whitespace in between.  Returns None
    when the tokens after '&' do not have that shape.
    """
    n_tokens = len(doc)
    pos = start + 1
    if pos < n_tokens and doc[pos].text == '#' and not doc[pos].whitespace_:
        pos += 1
    if pos + 1 >= n_tokens:
        return None
    name, terminator = doc[pos], doc[pos + 1]
    if name.text.isalnum() and not name.whitespace_ and terminator.text == ';':
        return pos + 2
    return None


def retokenize_pipe(doc):
    """Merge '#tag', '@user' and '&entity;' pieces back into single tokens.

    The tokenizer splits '@', '#' and '&' off as their own tokens; this
    component re-attaches them to what follows so later components see
    '#hashtag', '@username' and '&amp;' as one token each.

    A symbol is only merged when it is directly followed (no whitespace) by
    a matching continuation: a word-like token for '#'/'@', or a complete
    ``&name;`` / ``&#digits;`` sequence for '&'.  Anything else is left
    untouched, which also guarantees that the merged spans never overlap
    (overlapping spans make the retokenizer raise).

    Args:
        doc: the spaCy ``Doc`` to retokenize (modified in place).

    Returns:
        The same ``doc`` object.
    """
    n_tokens = len(doc)
    with doc.retokenize() as retokenizer:
        merged_until = 0  # first token index not yet claimed by a merge
        for tok in doc:
            start = tok.i
            # Skip tokens already inside a merged span, and symbols followed
            # by whitespace (they stand alone, e.g. "salt & pepper").
            if start < merged_until or tok.whitespace_:
                continue
            end = None
            if tok.text == '#' or tok.text == '@':
                if start + 1 < n_tokens and _is_word_like(doc[start + 1].text):
                    end = start + 2
            elif tok.text == '&':
                end = _html_entity_end(doc, start)
            if end is not None:
                retokenizer.merge(doc[start:end])
                merged_until = end
    return doc

In some cases, the tweets contain HTML entities. Some should be replaced with their equivalent words (e.g. '&amp;' becomes 'and'); others should be removed.

In [84]:
def handle_htmlents_pipe(doc):
    """Replace known HTML entity tokens with words and drop the rest.

    A token counts as an HTML entity when its text starts with '&' and ends
    with ';'.  Entities listed in ``replacements`` are swapped for their
    word; every other entity is removed.  When a removed entity was followed
    by whitespace, that space is handed to the previous kept token so the
    neighbouring words are not glued together ("fire&gt; smoke" becomes
    "fire smoke", not "firesmoke").

    Args:
        doc: the spaCy ``Doc`` produced by the preceding components.

    Returns:
        A new ``Doc`` built on ``doc.vocab`` from the resulting words; it is
        a fresh object, not ``doc`` modified in place.
    """
    replacements = {
        '&amp;': 'and',
        '&deg;': 'degrees'
    }
    words = []
    has_space = []
    for t in doc:
        is_entity = t.text.startswith('&') and t.text.endswith(';')
        if is_entity and t.text not in replacements:
            # Token is dropped: keep its trailing space on the previous word.
            if t.whitespace_ and has_space:
                has_space[-1] = True
            continue
        words.append(replacements[t.text] if is_entity else t.text)
        # `spaces` is documented as a list of booleans, not whitespace strings.
        has_space.append(bool(t.whitespace_))

    return spc.tokens.Doc(doc.vocab, words=words, spaces=has_space)

Certain mentions should be treated as special, i.e. if they mention an account belonging to a news/disaster relief organisation. The presence of such a handle would be a strong indicator of the tweet being about a disaster.

In [85]:
from json import load
with open('twitter_handles.json', encoding='utf-8') as f:
    twitter_handles = load(f)

# Twitter treats handles case-insensitively, so compare lower-cased names and
# build the lookup sets once instead of scanning the raw lists per token.
# Assumes each category holds handle strings without the leading '@'
# (TODO confirm against twitter_handles.json).
_news_handles = frozenset(handle.lower() for handle in twitter_handles['news'])
_relief_handles = frozenset(handle.lower() for handle in twitter_handles['relief'])


def is_news_mention(t):
    """Return True when token ``t`` is an '@' mention of a listed news handle."""
    return t.text.startswith('@') and t.text[1:].lower() in _news_handles


def is_relief_mention(t):
    """Return True when token ``t`` is an '@' mention of a listed relief handle."""
    return t.text.startswith('@') and t.text[1:].lower() in _relief_handles


# force=True lets this cell be re-run without an "extension already exists" error.
spc.tokens.Token.set_extension("is_news_mention", getter=is_news_mention, force=True)
spc.tokens.Token.set_extension("is_relief_mention", getter=is_relief_mention, force=True)

The extensions registered above are read as attributes on a token, e.g. `token._.is_news_mention`; they are not pipeline components. `force=True` is set so that re-running this cell overwrites the existing extensions instead of raising an error.

Next, define the pipeline component that labels each '@xxxx' mention or '#xxx' hashtag token with the appropriate entity type. It also marks links as LINK entities.

In [86]:
def entity_pipe(nlp):
    """Create an entity ruler for tweet-specific, single-token entities.

    The ruler labels hashtags (HASHTAG), URLs (LINK), mentions flagged by the
    ``is_news_mention`` / ``is_relief_mention`` token extensions (NEWS-ORG /
    RELIEF-ORG) and every other '@' mention (MENTION).

    Args:
        nlp: the language object used to create the ruler.

    Returns:
        The configured entity ruler; the caller adds it to the pipeline.
    """
    def rule(label, token_spec):
        # Every rule here matches exactly one token.
        return {"label": label, "pattern": [token_spec]}

    not_an_org = {"is_news_mention": False, "is_relief_mention": False}
    tweet_rules = [
        rule("HASHTAG", {"TEXT": {"REGEX": r'^#\w+'}}),
        rule("LINK", {"TEXT": {"REGEX": r'https?://.*'}}),
        rule("NEWS-ORG", {"_": {"is_news_mention": True}}),
        rule("RELIEF-ORG", {"_": {"is_relief_mention": True}}),
        rule("MENTION", {"TEXT": {"REGEX": r'^@\w+'}, "_": not_an_org}),
    ]

    tweet_ruler = nlp.create_pipe("entity_ruler")
    tweet_ruler.add_patterns(tweet_rules)
    return tweet_ruler

Some tweets include abbreviations, and these may get incorrectly classified by the default tools. This part of the pipeline should fix that.

In [87]:
def abbr_handler(nlp):
    """Create an entity ruler that labels emergency-room style terms as ORG.

    Covers the abbreviation in its spellings 'er', 'e.r', 'er.' and 'e.r.'
    (case-insensitive via LOWER) plus the phrases "emergency room" and
    "emergency relief".

    The abbreviation regex is anchored at both ends: the matcher applies it
    with search semantics, so an unanchored pattern would also fire on any
    token that merely contains "er" ("water", "never", "emergency").

    NOTE(review): the ruler is created with default settings, so it
    presumably does not replace entities already set by earlier components.
    Confirm whether that matches the intent of fixing misclassifications.

    Args:
        nlp: the language object used to create the ruler.

    Returns:
        The configured entity ruler; the caller adds it to the pipeline.
    """
    ruler = nlp.create_pipe("entity_ruler")
    patterns = [
        {"label": "ORG", "pattern": [{"LOWER": {"REGEX": r'^e\.?r\.?$'}}]},
        {"label": "ORG", "pattern": [{"LOWER": "emergency"}, {"LOWER": "room"}]},
        {"label": "ORG", "pattern": [{"LOWER": "emergency"}, {"LOWER": "relief"}]}
    ]
    ruler.add_patterns(patterns)
    return ruler

Insert the functions into the NLP pipeline. The tokenizer is the first thing that runs (implicitly, it's not visible in the pipeline), so the token combiner function should be the first thing in the pipeline. The entity ruler should go before the named entity recogniser, as we want the NER to recognise anything that our custom ruler doesn't recognise, not the other way around.

In [88]:
# Order matters: merge '#', '@' and '&' pieces first, then rewrite HTML
# entities, so the tagger and parser see the cleaned-up tokens.
# NOTE: re-running this cell on the same `nlp` object will presumably fail,
# because the component names are already registered; reload `nlp` first.
nlp.add_pipe(retokenize_pipe, name="retokenizer", first=True)
nlp.add_pipe(handle_htmlents_pipe, name="html_ent_handler", after='retokenizer')
# The tweet entity ruler runs before the statistical NER ...
nlp.add_pipe(entity_pipe(nlp), name="entruler", before='ner')
# ... and the abbreviation ruler runs after it.
nlp.add_pipe(abbr_handler(nlp), name="abbr_handler", after='ner')

The current pipeline looks like this:

In [89]:
# Display the (name, component) pairs to confirm where the custom
# components were inserted.
nlp.pipeline

[('retokenizer', <function __main__.retokenize_pipe(doc)>),
 ('html_ent_handler', <function __main__.handle_htmlents_pipe(doc)>),
 ('tagger', <spacy.pipeline.pipes.Tagger at 0x120f90dd0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1217d6b40>),
 ('entruler', <spacy.pipeline.entityruler.EntityRuler at 0x13fa0b2d0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1217d6e50>),
 ('abbr_handler', <spacy.pipeline.entityruler.EntityRuler at 0x13fa0b3d0>)]

## Execute the pipeline

Now, run the pipeline on a random tweet:

In [90]:
# Draw one row at random (unseeded, so each run picks a different tweet)
# and push its text through the customised pipeline.
random_tweet = train_df.sample(n=1)["text"].iloc[0]
doc = nlp(random_tweet)

The tweet has been tokenized, but not all tokens are useful. In particular, stop words and punctuation are useless for us, so `is_token_allowed` will filter those out:

In [91]:
def is_token_allowed(token):
    """Return True when ``token`` is worth keeping for analysis.

    A token is kept when it is present, has non-whitespace text, and is
    neither a stop word nor punctuation.

    Uses ``token.text`` instead of the deprecated ``token.string``; the
    stripped value is the same because the two differ only by the token's
    trailing whitespace.  Always returns a real ``bool`` rather than
    whatever operand ended the ``and`` chain.

    Args:
        token: a spaCy ``Token`` (or None).

    Returns:
        bool: True if the token should be kept, False otherwise.
    """
    if not token:
        return False
    return bool(token.text.strip()) and not token.is_stop and not token.is_punct

We also only want some entities:

In [92]:
def is_entity_allowed(entity):
    """Return True when the entity's label is one of the labels we keep.

    Kept labels: NEWS-ORG, RELIEF-ORG, HASHTAG, ORG, GPE and FAC.
    """
    label = entity.label_
    kept_labels = ('NEWS-ORG', 'RELIEF-ORG', 'HASHTAG', 'ORG', 'GPE', 'FAC')
    return label in kept_labels

Also, all tokens should be converted to their lowercase, lemmatized form.
So, define two lists of dictionaries containing the results from the processed doc:

In [93]:
# One record per kept token: lower-cased lemma plus the annotations used later.
useful_tokens = [
    {
        'token': token.lemma_.strip().lower(),
        'pos': token.pos_,
        'dep': token.dep_,
        'ent': token.ent_type_,
    }
    for token in doc
    if is_token_allowed(token)
]
# Entities restricted to the labels accepted by `is_entity_allowed`.
useful_entities = [
    {'text': ent.text, 'label': ent.label_}
    for ent in filter(is_entity_allowed, doc.ents)
]

Extract tokens with specific properties to get the general idea of the sentence:

In [94]:
def extract_sov(toks):
    """Pick the subject/object/verb-like words that summarise a tweet.

    A token record is kept when its dependency is a subject or object
    relation, or its part of speech is VERB, NOUN or PROPN, and it is not
    tagged as a LINK, MENTION, HASHTAG, RELIEF-ORG or NEWS-ORG entity.

    Duplicates are removed while keeping first-occurrence order, so the
    result is the same on every run and follows the tweet's word order
    (building the list from a set gave an arbitrary order).

    Args:
        toks: iterable of dicts with the keys 'token', 'pos', 'dep', 'ent'.

    Returns:
        list of unique token strings in order of first appearance.
    """
    core_deps = {'nsubj', 'nsubjpass', 'dobj', 'obj', 'pobj'}
    core_pos = {'VERB', 'NOUN', 'PROPN'}
    skipped_ents = {'LINK', 'MENTION', 'HASHTAG', 'RELIEF-ORG', 'NEWS-ORG'}
    kept = (
        t['token'] for t in toks
        if (t['dep'] in core_deps or t['pos'] in core_pos)
        and t['ent'] not in skipped_ents
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(kept))

# Reduce the filtered tokens of the sample tweet to its key words.
main_toks = extract_sov(useful_tokens)

## See the results

Finally, print out the results:

In [78]:
# Text summary of the processed tweet.
print("Full tweet:")
print(f'\t{doc.text}\n')

print("Useful tokens:")
for tok in useful_tokens:
    print(f'\t{tok}')

print("\nUseful entities:")

for ent in useful_entities:
    print(f'\t{ent}')
    
print("\nGist of the tweet:")
print(f'\t{main_toks}')

if doc.ents:
    # Colours are looked up by entity label.  The entity ruler labels plain
    # mentions 'MENTION'; no component emits a 'USER' label, so the mention
    # gradient has to be keyed on 'MENTION' to take effect.
    entity_colors = {
        'MENTION': 'linear-gradient(90deg, #fc4a1a, #f7b733)',
        'HASHTAG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)',
        'LINK': 'linear-gradient(90deg, #B2FEFA, #0ED2F7)',
    }
    dsp.render(doc, style='ent', options={'colors': entity_colors})
else:
    print("No entities present.")

Full tweet:
	#hot  Reddit's new content policy goes into effect many horrible subreddits banned or quarantined http://t.co/algtcN8baf #prebreak #best

Useful tokens:
	{'token': '#hot', 'pos': 'ADJ', 'dep': 'ROOT', 'ent': 'HASHTAG'}
	{'token': 'reddit', 'pos': 'PROPN', 'dep': 'poss', 'ent': ''}
	{'token': 'new', 'pos': 'ADJ', 'dep': 'amod', 'ent': ''}
	{'token': 'content', 'pos': 'NOUN', 'dep': 'compound', 'ent': ''}
	{'token': 'policy', 'pos': 'NOUN', 'dep': 'nsubj', 'ent': ''}
	{'token': 'go', 'pos': 'VERB', 'dep': 'ROOT', 'ent': ''}
	{'token': 'effect', 'pos': 'NOUN', 'dep': 'pobj', 'ent': ''}
	{'token': 'horrible', 'pos': 'ADJ', 'dep': 'amod', 'ent': ''}
	{'token': 'subreddit', 'pos': 'NOUN', 'dep': 'nsubj', 'ent': ''}
	{'token': 'ban', 'pos': 'VERB', 'dep': 'advcl', 'ent': ''}
	{'token': 'quarantine', 'pos': 'VERB', 'dep': 'conj', 'ent': ''}
	{'token': 'http://t.co/algtcn8baf', 'pos': 'NOUN', 'dep': 'dobj', 'ent': 'LINK'}
	{'token': '#prebreak', 'pos': 'NOUN', 'dep': 'punct', 'ent'