In [1]:
import os
import re
import pprint

from nltk.tag import StanfordNERTagger
from nltk import word_tokenize

from json_io import list_from_json

## Download and setup instructions

1) Download the zip archive of [Stanford's Java NER](http://nlp.stanford.edu/software/CRF-NER.shtml#Download) and extract it

2) Add the path to *stanford-ner.jar* to the $CLASSPATH environment variable

3) For project convenience, set an environment variable *$STANFORD_NER* to the path of the classifier model, *stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz*

In [2]:
# Sanity check: show the environment variables the tagger setup relies on.
# After changing them, restart the shell running 'jupyter notebook' so the new values are picked up.
for env_var_name in ('CLASSPATH', 'STANFORD_NER'):
    print(os.environ.get(env_var_name))

/Users/James/School/Spring_2017/Senior-Design-CSC-59867/senior-design/stanford-ner-2016-10-31/stanford-ner.jar
/Users/James/School/Spring_2017/Senior-Design-CSC-59867/senior-design/stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz


In [3]:
# Build the tagger from the classifier model whose path is stored in $STANFORD_NER
ner_model_path = os.environ.get('STANFORD_NER')
st = StanfordNERTagger(ner_model_path, encoding='UTF-8')

# Smoke test on a whitespace-tokenized sample sentence; the cell displays the (word, entity) pairs
sample_tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
st.tag(sample_tokens)

[('Rami', 'PERSON'),
 ('Eid', 'PERSON'),
 ('is', 'O'),
 ('studying', 'O'),
 ('at', 'O'),
 ('Stony', 'ORGANIZATION'),
 ('Brook', 'ORGANIZATION'),
 ('University', 'ORGANIZATION'),
 ('in', 'O'),
 ('NY', 'O')]

In [4]:
# Load the sarcastic tweets from the project's JSON file (list_from_json is imported from the local json_io module)
sarcastic_tweets = list_from_json("../json/sarcastic/unique.json")

In [5]:
def ner_tag_tweet(tweet, ner_tagger):
    """
    Take a tweet object and a Stanford NER tagger object, and return the tweet object with a new field
    "ner_text" containing the tweet text with words replaced by the entities found by the tagger.

    Parameters:
        tweet: dict-like with a "text" string and "media" / "urls" flags (only tested for truthiness).
            It is modified in place: the "ner_text" key is added or overwritten.
        ner_tagger: object whose tag(tokens) method returns (word, entity) pairs, where the entity "O"
            marks a word that is not an entity (e.g. nltk's StanfordNERTagger).

    Returns:
        The same tweet object that was passed in.

    Note: the mention step re-joins the text on single spaces, so runs of whitespace
    (including newlines) in the original text are collapsed in "ner_text".
    """
    # tokenize tweet using nltk
    word_tokenized_tweet = word_tokenize(tweet["text"])

    # list of tuples (word, entity) where an entity was found.
    # Compare by value: `is not "O"` tests object identity, which only works by accident of string caching.
    ner_tuples = [
        (word, entity)
        for word, entity in ner_tagger.tag(word_tokenized_tweet)
        if entity != "O" and word
    ]

    # construct new string by replacing each tagged word in tweet with its corresponding entity tag
    ner_text = tweet["text"]
    for word, entity in ner_tuples:
        entity_tag = "[{}]".format(entity)
        # Replace whole words only, so an entity such as "Brook" does not corrupt "Brooklyn".
        # Words glued to "@" are left alone: they are handles, rewritten by the mention step below.
        whole_word = r"(?<![\w@])" + re.escape(word) + r"(?!\w)"
        # a function replacement keeps the tag literal (no backslash/group processing)
        ner_text = re.sub(whole_word, lambda _match, tag=entity_tag: tag, ner_text)

    # replace embedded urls/media with [URL], [MEDIA], or [URL_MEDIA]
    if tweet["media"] or tweet["urls"]:
        if tweet['media'] and tweet['urls']:
            replacement_word = "[URL_MEDIA]"
        elif tweet['media']:
            replacement_word = "[MEDIA]"
        else:
            replacement_word = "[URL]"

        ner_text = " ".join(
            replacement_word if word.startswith("https://") else word
            for word in ner_text.split()
        )

    # replace mentions (@[handle]) with PERSON tag; a lone "@" is not a mention
    ner_text = " ".join(
        "[PERSON]" if (word.startswith("@") and len(word) > 1) else word
        for word in ner_text.split()
    )

    tweet["ner_text"] = ner_text
    return tweet

In [6]:
# Preview the tagger on the first 25 sarcastic tweets, one pretty-printed tweet per block
pp = pprint.PrettyPrinter(indent=4)
separator = "*********************************"
for sarcastic_tweet in sarcastic_tweets[:25]:
    tagged_tweet = ner_tag_tweet(sarcastic_tweet, st)
    pp.pprint(tagged_tweet)
    print(separator)

{   'id': 827590313191444481,
    'media': False,
    'ner_text': '[PERSON] Come on [LOCATION]! What about the '
                '#bowlinggreenmassacre ? Damn those Muslims! #alternatefacts '
                '#Sarcasm #WorstPresidentEver',
    'text': '@MetalBlonde Come on Karina! What about the #bowlinggreenmassacre '
            '? Damn those Muslims!  #alternatefacts #Sarcasm '
            '#WorstPresidentEver',
    'urls': False}
*********************************
{   'id': 827590233298436101,
    'media': False,
    'ner_text': 'Take that Black History month.Make America White again, with '
                'fatty tissue around the heart #sarcasm #trump #first100 [URL]',
    'text': 'Take that Black History month.Make America White again, with '
            'fatty tissue around the heart #sarcasm #trump #first100\n'
            'https://t.co/SXnqNmRgqr',
    'urls': True}
*********************************
{   'id': 827589626944770049,
    'media': False,
    'ner_text': 'Ears lowere

In [9]:
def ner_tag_tweet(tweet, ner_tagger):
    """
    Take a tweet object and a Stanford NER tagger object, and return the tweet object with a new field
    "ner_text" containing the tweet text with words replaced by the entities found by the tagger,
    mentions (@[handle]) replaced by [PERSON], and embedded links/media replaced by [URL] or [MEDIA]
    or [URL_MEDIA].

    Parameters:
        tweet: dict-like with a "text" string and "media" / "urls" flags (only tested for truthiness).
            It is modified in place: the "ner_text" key is added or overwritten.
        ner_tagger: object whose tag(tokens) method returns (word, entity) pairs, where the entity "O"
            marks a word that is not an entity (e.g. nltk's StanfordNERTagger).

    Returns:
        The same tweet object that was passed in.

    The Stanford NER pre-trained taggers emit three, four, or seven entity types, depending on the
    classifier used:
    PERSON (3 class, 4 class, 7 class)
    ORGANIZATION (3 class, 4 class, 7 class)
    LOCATION (3 class, 4 class, 7 class)
    MISC (4 class)
    TIME (7 class)
    MONEY (7 class)
    PERCENT (7 class)
    DATE (7 class)
    """
    # tokenize tweet using nltk
    word_tokenized_tweet = word_tokenize(tweet["text"])

    # list of tuples (word, entity) where an entity was found.
    # Compare by value: `is not "O"` tests object identity, which only works by accident of string caching.
    ner_tuples = [
        (word, entity)
        for word, entity in ner_tagger.tag(word_tokenized_tweet)
        if entity != "O" and word
    ]

    # construct new string by replacing each tagged word in tweet with its corresponding entity tag
    ner_text = tweet["text"]
    for word, entity in ner_tuples:
        entity_tag = "[{}]".format(entity)
        # Replace whole words only, so an entity such as "Brook" does not corrupt "Brooklyn".
        # Words glued to "@" are left alone: otherwise a tagged handle would become "@[PERSON]",
        # which the mention pattern below can no longer match.
        whole_word = r"(?<![\w@])" + re.escape(word) + r"(?!\w)"
        # a function replacement keeps the tag literal (no backslash/group processing)
        ner_text = re.sub(whole_word, lambda _match, tag=entity_tag: tag, ner_text)

    # replace embedded urls/media with [URL], [MEDIA], or [URL_MEDIA]
    if tweet["media"] or tweet["urls"]:
        if tweet['media'] and tweet['urls']:
            replacement_word = "[URL_MEDIA]"
        elif tweet['media']:
            replacement_word = "[MEDIA]"
        else:
            replacement_word = "[URL]"

        # replace twitter links with appropriate tag (raw string; the dot in "t.co" is literal)
        ner_text = re.sub(r"https://t\.co/\w+", replacement_word, ner_text)

    # replace mentions (@[handle]) with PERSON tag
    ner_text = re.sub(r"@\w+", "[PERSON]", ner_text)

    tweet["ner_text"] = ner_text
    return tweet

In [10]:
# Preview the regex-based tagger on the first 25 sarcastic tweets, one pretty-printed tweet per block
pp = pprint.PrettyPrinter(indent=4)
separator = "*********************************"
for sarcastic_tweet in sarcastic_tweets[:25]:
    tagged_tweet = ner_tag_tweet(sarcastic_tweet, st)
    pp.pprint(tagged_tweet)
    print(separator)

{   'id': 827590313191444481,
    'media': False,
    'ner_text': '[PERSON] Come on [LOCATION]! What about the '
                '#bowlinggreenmassacre ? Damn those Muslims!  #alternatefacts '
                '#Sarcasm #WorstPresidentEver',
    'text': '@MetalBlonde Come on Karina! What about the #bowlinggreenmassacre '
            '? Damn those Muslims!  #alternatefacts #Sarcasm '
            '#WorstPresidentEver',
    'urls': False}
*********************************
{   'id': 827590233298436101,
    'media': False,
    'ner_text': 'Take that Black History month.Make America White again, with '
                'fatty tissue around the heart #sarcasm #trump #first100\n'
                '[URL]',
    'text': 'Take that Black History month.Make America White again, with '
            'fatty tissue around the heart #sarcasm #trump #first100\n'
            'https://t.co/SXnqNmRgqr',
    'urls': True}
*********************************
{   'id': 827589626944770049,
    'media': False,
    'ne