In [8]:
from __future__ import division
from collections import Counter # Counter() is a dict for counting
from collections import defaultdict

from nltk.lm.util import log_base2
from numpy import mean
import numpy as np
import pandas as pd


In [12]:
# Sentiment values
sentiments = ["positive","neutral","negative"]
# List of positive words:
pos_words = ["love", "great", "like"]
# List of negative words:
neg_words = ["hate", "bad", "annoy"]
# List of target companies:
companies = ["@virginamerica", "@united", "@southwestair", "@jetblue", "@usairways", "@americanair"]
sentiment_words = pos_words+neg_words

def s2id(sentiment):
    if sentiment == "positive":
        return 0
    elif sentiment == "neutral":
        return 1
    elif sentiment == "negative":
        return 2
    else:
        print("ERROR: bad value!!")
        return -1
 

In [11]:
#remove any counts from dictionary if it's below min_threshold or above max_treshold
#max_threshold is ignored if unset
def filter_occ_counts(counts, min_threshold, max_threshold=0):
    if (max_threshold > 0):
        return Counter({w : counts[w] for w in counts.keys() if counts[w] > min_threshold and counts[w] < max_threshold})
    else:
        return Counter({w : counts[w] for w in counts.keys() if counts[w] > min_threshold})

#remove any co-occurence counts if below threshold 
def filter_cooc_counts(co_counts, min_threshold):
     return {w: filter_occ_counts(co_counts[w], min_threshold) for w in co_counts.keys()}


In [15]:
def PMI(c_xy, c_x, c_y, N):
    # Computes PMI(x, y) where
    # c_xy is the number of times x co-occurs with y
    # c_x is the number of times x occurs.
    # c_y is the number of times y occurs.
    # N is the number of observations.
    pmi = 0
    #TODO: code the PMI here
    pmi = log_base2((c_xy/N) / ((c_x/N) * (c_y/N)))
    return pmi


#Do a simple error check using value computed by hand
if(PMI(2,4,3,12) != 1): # these numbers are from our y,z example
    print("Warning: PMI is incorrectly defined")
else:
    print("PMI check passed")


PMI check passed


In [29]:
# Define the data structures used to store the counts:
occ_counts = Counter() # Occurrence counts
cooc_counts = defaultdict(Counter) # Co-occurrence counts:

#This will be indexed by target words. cooc_counts[companies] will contain
#a dictionary of co-occurrence counts of companies with each sentiment word.
# read the file Tweets_short.csv
df = pd.read_csv("Tweets_short.csv", index_col=0)

# TODO: apply preprocessing (e.g. lowercase)
# TODO: update N so that it contains the total number of tweets
df.text = df.text.apply(lambda x: x.lower())

# Combine all tweets into one string
all_words = ' '.join(df['text']).lower().split()
# Count occurrences of each word
word_counts = Counter(all_words)
#print(word_counts)

N = sum(word_counts.values())
print("Total number of tweets: {}".format(N))

#iterate over the tweets and count the words
for sentiment, tweet in df.itertuples(index=False):
    words = set(tweet.strip().split()) #remove duplicate words
    for word in words:
        occ_counts[word] += 1 # Store occurence counts for all words
        # but only get co-occurrence counts for companies/sentiment word pairs
        if word in companies:
            for word2 in words:
                if word2 in sentiment_words:
                    cooc_counts[word][word2] += 1 # Store co-occurence counts




Total number of tweets: 258446


In [30]:
#For a Counter c, c.most_common(n) returns a sorted list of the n most common 
#items in c. If no n is given, it returns all items, sorted by decreasing frequency
print("Counts of positive words:")
print(Counter({w : occ_counts[w] for w in pos_words}).most_common())
print("Counts of negative words:")
print(Counter({w : occ_counts[w] for w in neg_words}).most_common())
print("Counts of company:")
print(Counter({w : occ_counts[w] for w in companies}).most_common())
print()
for company in companies:
    print("{:14s} cooc counts: {}".format(company, cooc_counts[company].most_common()))
    

Counts of positive words:
[('like', 411), ('great', 252), ('love', 213)]
Counts of negative words:
[('bad', 148), ('hate', 37), ('annoy', 0)]
Counts of company:
[('@united', 3738), ('@usairways', 2923), ('@americanair', 2876), ('@southwestair', 2384), ('@jetblue', 2009), ('@virginamerica', 493)]

@virginamerica cooc counts: [('love', 22), ('great', 14), ('like', 13), ('bad', 4)]
@united        cooc counts: [('like', 111), ('great', 48), ('bad', 34), ('love', 26), ('hate', 13)]
@southwestair  cooc counts: [('love', 68), ('like', 67), ('great', 52), ('bad', 22), ('hate', 4)]
@jetblue       cooc counts: [('like', 55), ('great', 53), ('love', 50), ('bad', 14), ('hate', 4)]
@usairways     cooc counts: [('like', 70), ('great', 46), ('bad', 34), ('love', 18), ('hate', 11)]
@americanair   cooc counts: [('like', 87), ('bad', 40), ('great', 34), ('love', 29), ('hate', 3)]


In [44]:
# TODO: filter out co-occurrences with too few counts **if you want**
cooc_counts = filter_cooc_counts(cooc_counts, 2)

for company in companies:
    #print (company)
    company_count = occ_counts[company]
    posPMIs = []
    negPMIs = []
    # TODO: compute PMI between company and each positive word, and
    # Computes PMI(x, y) where
    
    # c_xy is the number of times x co-occurs with y
    # c_x is the number of times x occurs.
    # c_y is the number of times y occurs.
    # N is the number of observations.
    # add it to the list of positive sentiment PMI values
    
    print("{:14s} cooc counts: {}".format(company, cooc_counts[company].most_common()))
    print("{:14s} cooc counts: {}".format(company, Counter({w : occ_counts[w] for w in pos_words}).most_common()))
    for w in pos_words:
        pos_pmi = PMI(, word_counts[w], company_count, N)
        posPMIs.append(pos_pmi)

    # TODO: same for negative sentiment words

    # TODO: uncomment the following line when posPMIs and negPMIs are no longer empty.
    #print("{:14s}: {:5.2f} (pos), {:5.2f} (neg)".format((company).ljust(12), mean(posPMIs), mean(negPMIs)))


@virginamerica cooc counts: [('love', 22), ('great', 14), ('like', 13), ('bad', 4)]
@virginamerica cooc counts: [('like', 411), ('great', 252), ('love', 213)]
@united        cooc counts: [('like', 111), ('great', 48), ('bad', 34), ('love', 26), ('hate', 13)]
@united        cooc counts: [('like', 411), ('great', 252), ('love', 213)]
@southwestair  cooc counts: [('love', 68), ('like', 67), ('great', 52), ('bad', 22), ('hate', 4)]
@southwestair  cooc counts: [('like', 411), ('great', 252), ('love', 213)]
@jetblue       cooc counts: [('like', 55), ('great', 53), ('love', 50), ('bad', 14), ('hate', 4)]
@jetblue       cooc counts: [('like', 411), ('great', 252), ('love', 213)]
@usairways     cooc counts: [('like', 70), ('great', 46), ('bad', 34), ('love', 18), ('hate', 11)]
@usairways     cooc counts: [('like', 411), ('great', 252), ('love', 213)]
@americanair   cooc counts: [('like', 87), ('bad', 40), ('great', 34), ('love', 29), ('hate', 3)]
@americanair   cooc counts: [('like', 411), ('gr

# Gradable method

In [None]:
valence = pd.read_csv("valence_lexicon_small.tsv", sep='\t', index_col=0)['Valence'].to_dict()

with open("negation_words.txt", 'rt') as fd:
    negation_words = [line.rstrip() for line in fd]
#print(negation_words)

#See https://en.wiktionary.org/wiki/Category:English_degree_adverbs
strengthen_words = pd.read_csv("strengthen_words.tsv", sep='\t', index_col=0)['score'].to_dict()
#print(strengthen_words)

weaken_words = pd.read_csv("weaken_words.tsv", sep='\t', index_col=0)['score'].to_dict()
#print(weaken_words)

exclamation_words = {'!':0.1, "!!":0.2, "!!!":0.3}


In [None]:
def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=None, normalize=True):
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if normalize:
        cm = cm.astype('float') / cm.sum()

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.grid(False)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

In [None]:
# Gradable method

# This threshold is used to distinguish between negative / neutral / positive tweets
# negative < -threshold <= neutral <= +threshold < positive
threshold = 0.05
cm = np.zeros((3,3))

# TODO implement gradable method and compute confusion matrix and accuracy


# TODO print results    
print("######### TH = {}".format(threshold))
plot_confusion_matrix(cm           = cm, 
                      normalize    = False,
                      target_names = sentiments,
                      title        = "Confusion Matrix")

