In [3]:
import re
from collections import Counter

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
training_set = "../Data/train_data.csv"
train_data = pd.read_csv(training_set, engine='python')

#dropna drops missing values(not available)
train_data = train_data.dropna(axis=0)
#print train_data.sentiment.unique()

X = train_data.content

In [5]:
#open the lexicon
lexicon = "../Data/NRC-Emotion-Lexicon-v0.92-English.csv"
lex_data = pd.read_csv(lexicon, engine='python')
emos = ['Positive','Negative','Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']

#prepare the tweets
print "Cleaning tweets and POS Tagging..."
document = [re.sub(r'[^\x00-\x7f]',r' ',s) for s in X]          #remove non-ascii characters
#POS tag hashtags, @-mentions, remove whitespace and unnecessary symbols(;:.'")
document = [re.sub(r'https?:\/\/[^ ]*',r'[URL]',s) for s in document]
document = [re.sub(r'#[^ ]*',r'[HASHTAG]', s) for s in document]
document = [re.sub(r'@[^ ]*',r'[AT_MENTION]', s) for s in document]
document = [re.sub("[^A-Za-z_' ]+",r' ', s) for s in document]

#tokenize them and for each word create a tuple
#with the word and the number of times it exists in the tweet
print "Tokenizing..."
tokens = []
for tweet in document:
    tokens.append([w.lower() for w in word_tokenize(tweet)])

Cleaning tweets and POS Tagging...
Tokenizing...


In [55]:
print "Removing Stopwords..."
#remove stopwords
filtered = []
for i,lst in enumerate(tokens):
    if i == len(tokens)/4:
        print "25%"
    if i == len(tokens)/2:
        print "50%"
    if i == (3*len(tokens))/4:
        print "75%"
    filtered.append([w for w in lst if not w in stopwords.words('english')])
print "100%"

print "Lemmatizing..."
#Lemmatize with POS Tags
#it may take some minutes !!
from nltk.corpus import wordnet
from nltk.stem import  WordNetLemmatizer
import nltk

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()
lems = []
for lst in filtered:
    lems.append([ lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in lst ])

print lems[:10]

print "Creating the tuple..."
tuples = []
for lst in lems:
    count = Counter(lst)
    tuples.append(count.most_common(len(count)))

Removing Stopwords...
25%
50%
75%
100%
Lemmatizing...
[['at_mention', 'know', 'listenin', 'bad', 'habit', 'earlier', u'start', 'freakin', 'part'], ['layin', 'n', 'bed', 'headache', 'ughhhh', 'waitin', 'call'], ['funeral', 'ceremony', 'gloomy', 'friday'], [u'want', 'hang', u'friend', 'soon'], ['at_mention', 'want', 'trade', 'someone', 'houston', u'ticket', 'one'], [u'ping', 'at_mention', "n't", 'go', 'prom', 'bc', 'bf', "n't", 'like', u'friend'], ['sleep', 'im', u'think', 'old', 'friend', 'want', "'s", 'married', 'damn', 'amp', u'want', 'scandalous'], ['hmmm', 'url'], ['at_mention', 'charlene', 'love', 'miss'], ['at_mention', "'m", 'sorry', 'least', "'s", 'friday']]
Creating the tuple...


In [79]:
print tuples[29]

[('hate', 4), ('cancer', 1)]


In [77]:
#for each word in the tweet
results = []
for lst in tuples[:100]:
    vals = [0,0,0,0,0,0,0,0,0,0]
    for word, count in lst:
        tmp = count * lex_data[emos].where(lex_data['English'] == word).dropna(axis=0).values
        if len(tmp) != 0:
            vals += tmp;
    results.append(vals)

In [80]:
print results[29]

[[0. 5. 5. 0. 5. 5. 0. 5. 0. 0.]]
