In [1]:
import nltk
# Fetch the NLTK resources this notebook needs (the log below shows the call is
# a no-op when the packages are already up to date).
# 'punkt' provides the 'tokenizers/punkt/english.pickle' model loaded by Splitter.
nltk.download('punkt')
# Tagger model; presumably what nltk.pos_tag (called in POSTagger) relies on —
# TODO confirm for the installed NLTK version.
nltk.download('averaged_perceptron_tagger')

class Splitter(object):
    """Break raw text into sentences, and each sentence into word tokens."""

    def __init__(self):
        # Sentence segmenter: the English Punkt model shipped as an NLTK resource.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        # Word-level tokenizer applied to every sentence found by the segmenter.
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        Tokenize ``text`` one sentence at a time.

        input format: a paragraph of text
        output format: a list of lists of words, one inner list per sentence.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        tokenized_sentences = []
        for sent in self.nltk_splitter.tokenize(text):
            tokenized_sentences.append(self.nltk_tokenizer.tokenize(sent))
        return tokenized_sentences


class POSTagger(object):
    """Part-of-speech tagger that emits tokens as (form, lemma, [tags]) triples."""

    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """
        Tag every word of every sentence with its part of speech.

        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token is a
        tuple ``(form, lemma, tags)`` where ``tags`` is a one-element list
        holding the tag returned by NLTK.
            e.g. (tags are illustrative):
                 [[('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                  [('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('another', 'another', ['DT']), ('one', 'one', ['CD'])]]

        NOTE: no lemmatization is performed. The lemma slot is filled with the
        word form unchanged, so it only keeps the triple shape that
        DictionaryTagger indexes into (word[0], word[1], word[2]).
        """
        # Tag the whole batch with one call rather than one nltk.pos_tag call
        # per sentence; the batch API exists so the tagger is set up only once.
        tagged_sentences = nltk.pos_tag_sents(sentences)
        # Adapt NLTK's (word, tag) pairs to the (form, lemma, [tags]) format.
        return [
            [(word, word, [postag]) for (word, postag) in sentence]
            for sentence in tagged_sentences
        ]

[nltk_data] Downloading package punkt to /Users/HercHja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/HercHja/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
import yaml
class DictionaryTagger(object):
    """
    Tag tokens and multi-word expressions that appear in YAML dictionaries.

    Attributes:
        dictionary: merged mapping of expression -> tags. Assumes each value
            is a list of tag strings — TODO confirm against the .yml files.
        max_key_size: largest number of whitespace-separated words in any
            key; used as the upper bound of the matching window in tokens.
    """

    def __init__(self, dictionary_paths):
        """
        Load the YAML files at ``dictionary_paths`` and merge them.

        When the same key appears in several files, their tag lists are
        concatenated in file order.

        Raises:
            TypeError: if a key is not a string (YAML reads unquoted words
                such as yes/no/on/off as booleans, so they must be quoted).
        """
        self.dictionary = {}
        self.max_key_size = 0
        for path in dictionary_paths:
            # The context manager guarantees the handle is closed; a lazy
            # map(...close...) is never consumed in Python 3, so it closes nothing.
            with open(path, 'r') as dict_file:
                # safe_load builds only plain Python objects, and unlike a
                # bare yaml.load it needs no Loader argument. An empty file
                # parses to None, which is treated as an empty dictionary.
                curr_dict = yaml.safe_load(dict_file) or {}
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                    continue
                if not isinstance(key, str):
                    raise TypeError(
                        "dictionary key %r in %s is not a string" % (key, path))
                self.dictionary[key] = curr_dict[key]
                # The window is measured in tokens, so count words rather
                # than characters to avoid probing spans that cannot match.
                self.max_key_size = max(self.max_key_size, len(key.split()))

    def tag(self, postagged_sentences, tag_with_lemmas=False):
        """
        Tag every sentence of ``postagged_sentences``.

        Each sentence is a list of (form, lemma, [tags]) tokens;
        ``tag_with_lemmas`` is forwarded to tag_sentence.
        """
        return [self.tag_sentence(sentence, tag_with_lemmas)
                for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        Matching uses the lower-cased, space-joined forms of the tokens (or
        their lemmas when ``tag_with_lemmas`` is true). A matched span is
        replaced by one token carrying the dictionary tags; a single-token
        match also keeps the token's previous tags after the new ones.
        Unmatched tokens are copied through unchanged.
        """
        tag_sentence = []
        N = len(sentence)
        # With no usable key length fall back to the sentence length, but keep
        # it local so one sentence cannot change the window used for the next.
        max_span = self.max_key_size if self.max_key_size > 0 else N
        i = 0
        while i < N:
            j = min(i + max_span, N)  # avoid overflow
            tagged = False
            # Shrink the candidate span from the right until something matches.
            while j > i:
                span = sentence[i:j]
                expression_form = ' '.join(word[0] for word in span).lower()
                expression_lemma = ' '.join(word[1] for word in span).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    # Copy the list so the stored dictionary entry is never mutated.
                    taggings = list(self.dictionary[literal])
                    if j - i == 1:
                        # Single-token match: conserve its previous taggings.
                        taggings.extend(sentence[i][2])
                    tag_sentence.append((expression_form, expression_lemma, taggings))
                    i = j  # resume right after the matched span
                    tagged = True
                    break
                j -= 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence

In [3]:
# Build one tagger from both sentiment dictionaries. getSentenceSentiment reads
# this module-level `dicttagger`, so this cell must run before it is called.
# The file names are relative paths.
dicttagger = DictionaryTagger(['positive.yml', 'negative.yml'])


In [6]:
def value_of(sentiment):
    """Map a tag to its score: 'positive' -> 1, 'negative' -> -1, anything else -> 0."""
    for label, score in (('positive', 1), ('negative', -1)):
        if sentiment == label:
            return score
    return 0

def getSentenceSentiment(sentence, verbose=True):
    """
    Compute the dictionary-based sentiment score of ``sentence``.

    The text is split into tokenized sentences (Splitter), POS-tagged
    (POSTagger) and then tagged by the module-level ``dicttagger``. The score
    is the sum of value_of(tag) over every tag of every token.

    Args:
        sentence: the text to score; it may contain several sentences.
        verbose: when true (the default, matching the earlier behaviour) the
            score is also printed.

    Returns:
        The integer score. Previously the function returned the result of
        print(), i.e. always None, so callers could not use the value.
    """
    # NOTE(review): a new Splitter and POSTagger are built on every call;
    # consider creating them once if this is applied to many rows.
    splitted_sentences = Splitter().split(sentence)
    pos_tagged_sentences = POSTagger().pos_tag(splitted_sentences)
    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

    score = sum(
        value_of(tag)
        for tagged_sentence in dict_tagged_sentences
        for token in tagged_sentence
        for tag in token[2]
    )
    if verbose:
        print(score)
    return score

In [9]:
import pandas as pd
# Load the training data; ISO-8859-1 is passed explicitly as the file encoding.
# NOTE(review): the column labels in the printed output below ("0", "1467811592",
# "Mon Apr 06 22:20:03 PDT 2009", "NO_QUERY", ...) look like a data record, which
# suggests the file has no header row and its first record is being consumed as
# the header. Passing header=None (plus explicit names=[...]) would keep that
# row — confirm against the CSV and any later cells that index columns first.
df = pd.read_csv('NLP-classification-training-data.csv', encoding = "ISO-8859-1")
# NOTE(review): this prints the entire frame as plain text; ending the cell with
# df.head() would show a short, richly rendered preview instead.
print(df)

        0  1467811592  Mon Apr 06 22:20:03 PDT 2009  NO_QUERY  \
0       0  1467812771  Mon Apr 06 22:20:19 PDT 2009  NO_QUERY   
1       0  1467813782  Mon Apr 06 22:20:34 PDT 2009  NO_QUERY   
2       0  1467814783  Mon Apr 06 22:20:50 PDT 2009  NO_QUERY   
3       0  1467815988  Mon Apr 06 22:21:09 PDT 2009  NO_QUERY   
4       0  1467817502  Mon Apr 06 22:21:32 PDT 2009  NO_QUERY   
5       0  1467819650  Mon Apr 06 22:22:05 PDT 2009  NO_QUERY   
6       0  1467821085  Mon Apr 06 22:22:26 PDT 2009  NO_QUERY   
7       0  1467822522  Mon Apr 06 22:22:49 PDT 2009  NO_QUERY   
8       0  1467824199  Mon Apr 06 22:23:15 PDT 2009  NO_QUERY   
9       0  1467825642  Mon Apr 06 22:23:39 PDT 2009  NO_QUERY   
10      0  1467833736  Mon Apr 06 22:25:45 PDT 2009  NO_QUERY   
11      0  1467834265  Mon Apr 06 22:25:54 PDT 2009  NO_QUERY   
12      0  1467835305  Mon Apr 06 22:26:10 PDT 2009  NO_QUERY   
13      0  1467836500  Mon Apr 06 22:26:28 PDT 2009  NO_QUERY   
14      0  1467837470  Mo