### Imports

In [6]:
%matplotlib inline

import numpy as np
import NLPlib as nlp

import csv
import itertools

import re
import HTMLParser

from matplotlib import pyplot as plt

### Part 1: Pre-process, tokenize and tag

CSV Format:

1. the polarity of the tweet (0 = negative emotion, 4 = positive emotion)
2. the id of the tweet (e.g., 2087)
3. the date of the tweet (e.g., Sat May 16 23:58:44 UTC 2009)
4. the query (e.g., lyx). If there is no query, then this value is NO QUERY. 
5. the user that tweeted (e.g., robotickilldozr)
6. the text of the tweet (e.g., Lyx is cool)

In [26]:
GID = 4
class_zero_data = [GID * 5500, GID * 5500 + 10] # (GID + 1) * 5500 - 1]
class_four_data = [GID * 5500 + 800000, GID * 5500 + 800000 + 10] # (GID + 1) * 5500 - 1 + 800000]

In [28]:
with open('training.1600000.processed.noemoticon.csv', 'rb') as train_file:
    reader = csv.reader(train_file)
    
    for row in itertools.islice(reader, *class_zero_data):
        print row
    for row in itertools.islice(reader, *class_four_data):
        print row

['0', '1557393927', 'Sun Apr 19 03:50:43 PDT 2009', 'NO_QUERY', 'davidgarlick', '@Natalia_Bella not much to buy now Woolworth closed down ']
['0', '1557394133', 'Sun Apr 19 03:50:48 PDT 2009', 'NO_QUERY', 'sarah_etf', 'Kill me please -.- ...Oh crap school tommorow ']
['0', '1557394170', 'Sun Apr 19 03:50:48 PDT 2009', 'NO_QUERY', 'hypnotic', "@chriskeating re the labour general secretary meeting with Labour PM's aide - I posted the very same on facebook. BBC gone downhill "]
['0', '1557394563', 'Sun Apr 19 03:50:57 PDT 2009', 'NO_QUERY', 'Robbertt', 'Whole day of homework ahead of name ']
['0', '1557394776', 'Sun Apr 19 03:51:02 PDT 2009', 'NO_QUERY', 'desii____8', "hamlet...romeo n juliet...radio:ACTIVE live at Wembley...McFly tour DVD's too money to me "]
['0', '1557394848', 'Sun Apr 19 03:51:03 PDT 2009', 'NO_QUERY', 'loubeejones', '@charleypearson haha, lucky you. i just got told one!  loubee is not happy!']
['0', '1557395142', 'Sun Apr 19 03:51:09 PDT 2009', 'NO_QUERY', 'musicjunk

In [30]:
row[5]

'working on different politic media plans for the upcoming EU elections... '

##### 1. All html tags and attributes (i.e., /<[^>]+>/) are removed.

In [4]:
def strip_html_tags(tweet):
    regex = re.compile('<[^>]+>')
    return regex.sub('', tweet)

In [5]:
strip_html_tags('<a href="foo.com" class="bar">I Want This <b>text!</b></a>')

'I Want This text!'

##### 2. Html character codes (i.e., &...;) are replaced with an ASCII equivalent.

In [7]:
def replace_html_codes(tweet):
    parser = HTMLParser.HTMLParser()
    return parser.unescape(tweet)

In [12]:
print replace_html_codes('&quot;You win &pound;100 &iexcl; &quot;')

"You win £100 ¡ "


##### 3. All URLs (i.e., tokens beginning with http or www) are removed.

In [26]:
def remove_urls(tweet):
    # Note that this will modify the whitespace when words are separated by
    # more than one space, but that shouldn't matter as we are tokenizing
    # the tweets anyways
    
    return ' '.join(filter(lambda x : not x.startswith(('www', 'http')), tweet.split()))

In [29]:
remove_urls("brad is the best www.youtube.com dancer but not the worst http://www.google.ca singer")

'brad is the best dancer but not the worst singer'

##### 4. The first character in Twitter user names (@) and hash tags (#) are removed.

In [55]:
def remove_hashtags(tweet):
    return ' '.join([ x[1:] if  x.startswith(('@', '#')) else x for x in tweet.split()])

In [57]:
remove_hashtags('brad #donkey @kick face #@other @#test')

'brad donkey kick face @other #test'

##### 5. Each sentence within a tweet is on its own line.

In [107]:
def create_abbrev_set(file_path='abbrev.english'):
    abbrev_set = set()
    
    with open(file_path, 'rb') as abbrevs:
        for line in abbrevs:
            abbrev_set.union(line.strip())
            #print line.strip()
    
    return abbrev_set

In [108]:
abbrev_set = create_abbrev_set()

In [72]:
print abbrev_set.issubset('Mr.')
print abbrev_set.issubset('vs.')

True
True


In [114]:
def split_by_sentence(tweet):
    '''
        # 1. Anything ending in .?! declared a sentence
        # 2. Sentence boundary moved after quotation mark, if any ex. He said, "I am coming."
        # 3. Period boundary is disqualified if it preceded by an element in abbrev_set
        #    <We could look for capitals after an EOS, but nobody uses capitals on twitter>
        #    <Both sides of :;- could also be thought of as sentence>
    '''
    
    abbrev_set = create_abbrev_set()
    split_by_space = tweet.split(' ')
    eos_indices = [i + 1 for i, x in enumerate(split_by_space) if x[-1] in {'.', '?', '!'} or x[-2:-1] in {'."', '?"', '!"'} and not abbrev_set.issubset(x)]
    return [' '.join(x) for x in [split_by_space[i:j] for i, j in zip([0] + eos_indices, eos_indices + [None])]]

In [115]:
sample_tweet = '4km technique swim set done. Meeting with a creative director at 10am. Photographs to the printers. A million phone calls to make'

In [116]:
split_by_sentence(sample_tweet)

['4km technique swim set done.',
 'Meeting with a creative director at 10am.',
 'Photographs to the printers.',
 'A million phone calls to make']

##### 8. Each token is tagged with its part-of-speech.

In [119]:
tagger = nlp.NLPlib()


unpickle the dictionary
Initialized lexHash from pickled data.


In [120]:
sent = ["tag", "me"]
tags = tagger.tag(sent)
tags

['NN', 'PRP']

In [198]:
tagger.tag('/')

['NN']

In [291]:
tagger.tag('"')

['"']

### Part 2: Feature Extraction

In [202]:
sample_parser_output = """<A=4>
Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.
Wear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./."""

In [203]:
sample_parser_output

"<A=4>\nMeet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.\nWear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./."

In [195]:
def compute_feature_vector(sentences, label):
    if len(sentences) == 0:
        return

    feature_vector = []
    function_set = [
                    first_person_pronouns, 
                    second_person_pronouns, 
                    third_person_pronouns,
                    coordinating_conjunctions,
                    past_tense_verbs,
                    future_tense_verbs,
                    commas,
                    colons,
                    dashes,
                    parantheses,
                    ellipses,
                    common_nouns,
                    proper_nouns,
                    adverbs,
                    wh_words,
                    slang_acronyms,
                    upper_case_words,
                    sentence_length,
                    token_length,
                    number_sentences
                   ]
    
    for function in function_set:
        feature_vector.append(function(sentences))
        
    feature_vector.append(label)
    
    print feature_vector

In [196]:
sentence_container = []

for line in sample_parser_output.split('\n'):

    if line.startswith('<A='):
        class_label = int(line[3])
        compute_feature_vector(sentence_container, class_label)
        sentence_container = []
    else:
        sentence_container.append(line)
    
compute_feature_vector(sentence_container, class_label)
    

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4]


##### Per-feature functions

In [207]:
test_sentences = ["Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.", "Wear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./."]
test_sentences

['Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.',
 "Wear/VB a/DT carnation/NN so/RB I/PRP know/VB it/PRP 's/POS you/PRP ./."]

In [217]:
def split_sentences(sentences):
    tokens = [x.split(' ') for x in sentences]
    return [y.split('/') for x in tokens for y in x]

In [218]:
split_sentences(test_sentences)

[['Meet', 'VB'],
 ['me', 'PRP'],
 ['today', 'NN'],
 ['at', 'IN'],
 ['the', 'DT'],
 ['FEC', 'NN'],
 ['in', 'IN'],
 ['DC', 'NN'],
 ['at', 'IN'],
 ['4', 'NN'],
 ['.', '.'],
 ['Wear', 'VB'],
 ['a', 'DT'],
 ['carnation', 'NN'],
 ['so', 'RB'],
 ['I', 'PRP'],
 ['know', 'VB'],
 ['it', 'PRP'],
 ["'s", 'POS'],
 ['you', 'PRP'],
 ['.', '.']]

In [226]:
def first_person_pronouns(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['I', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']
    return [x[0] in candidate_words for x in token_split].count(True)

In [227]:
first_person_pronouns(test_sentences)

2

In [228]:
def second_person_pronouns(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['you', 'your', 'yours', 'u', 'ur', 'urs']
    return [x[0] in candidate_words for x in token_split].count(True)

In [229]:
second_person_pronouns(test_sentences)

1

In [232]:
def third_person_pronouns(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['he', 'him', 'his', 'she', 'her', 'hers', 'it', 'its', 'they', 'them', 'their', 'theirs']
    return [x[0] in candidate_words for x in token_split].count(True)

In [233]:
third_person_pronouns(test_sentences)

1

In [235]:
def coordinating_conjunctions(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['CC']
    return [x[1] in candidate_words for x in token_split].count(True)

In [236]:
coordinating_conjunctions(test_sentences)

0

In [237]:
def past_tense_verbs(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['VBD']
    return [x[1] in candidate_words for x in token_split].count(True)

In [238]:
past_tense_verbs(test_sentences)

0

In [165]:
def future_tense_verbs(sentences):
    token_split = split_sentences(sentences)
    candidate_words = [',']
    return [x[1] in candidate_words for x in token_split].count(True)

In [239]:
def commas(sentences):
    token_split = split_sentences(sentences)
    candidate_words = [',']
    return [x[1] in candidate_words for x in token_split].count(True)

In [240]:
commas(test_sentences)

0

In [242]:
def colons(sentences):
    token_split = split_sentences(sentences)
    candidate_words = [':', ';']
    return [x[0] in candidate_words for x in token_split].count(True)

In [243]:
colons(test_sentences)

0

In [244]:
def dashes(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['-']
    return [x[0] in candidate_words for x in token_split].count(True)

In [245]:
def parantheses(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['(', ')']
    return [x[0] in candidate_words for x in token_split].count(True)

In [246]:
def ellipses(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['...']
    return [x[0] in candidate_words for x in token_split].count(True)

In [263]:
def common_nouns(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['NN', 'NNS']
    return [x[1] in candidate_words for x in token_split].count(True)

In [264]:
def proper_nouns(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['NNP', 'NNPS']
    return [x[1] in candidate_words for x in token_split].count(True)

In [265]:
def adverbs(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['RB', 'RBR', 'RBS']
    return [x[1] in candidate_words for x in token_split].count(True)

In [266]:
def wh_words(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['WDT', 'WP', 'WP$', 'WRB']
    return [x[1] in candidate_words for x in token_split].count(True)

In [267]:
def slang_acronyms(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['smh', 'fwb',  'lmfao', 'lmao', 'lms', 'tbh',  'rofl', 'wtf',
                       'bff', 'wyd',  'lylc',  'brb',  'atm', 'imao', 'sml',  'btw',
                       'bw',  'imho', 'fyi',   'ppl',  'sob', 'ttyl', 'imo',  'ltr',
                       'thx', 'kk',   'omg',   'ttys', 'afn', 'bbs',  'cya',  'ez',
                       'f2f', 'gtr',  'ic',    'jk',   'k',   'ly',   'ya',   'nm',  'np',
                       'plz', 'ru',   'so',    'tc',   'tmi', 'ym',   'ur',   'u',   'sol']
    return [x[0] in candidate_words for x in token_split].count(True)

In [268]:
slang_acronyms(test_sentences)

1

In [269]:
def upper_case_words(sentences):
    token_split = split_sentences(sentences)
    return [x[0].isupper() and len(x[0]) > 1 for x in token_split].count(True)

In [270]:
upper_case_words(test_sentences)

2

In [259]:
def sentence_length(sentences):
    token_split = split_sentences(sentences)
    return len(token_split) / float(len(sentences))

In [260]:
sentence_length(test_sentences)

10.5

In [292]:
def token_length(sentences):
    token_split = split_sentences(sentences)
    candidate_words = ['#', '$', '.', ',', ':', '(', ')', '"', 'POS']
    token_lengths = [len(x[0]) for x in token_split if x[1] not in candidate_words]
    return sum(token_lengths) / float(len(token_lengths))

In [293]:
token_length(test_sentences)

2.888888888888889

In [261]:
def number_sentences(sentences):
    return len(sentences)

In [262]:
number_sentences(test_sentences)

2

### IBM Watson NLP Classifier

{
  "credentials": {
    "url": "https://gateway.watsonplatform.net/natural-language-classifier/api",
    "username": "2bd0e6c7-5784-4967-860c-a9778754fdee",
    "password": "rFs4Solusscl"
  }
}