In [99]:
import nltk
import yaml
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
import random
from stemming.porter2 import stem

In [100]:
# Strip HTML markup, keep only ASCII letters, lowercase the text and stem each word
def preprocess(raw_review):
    """Normalise a raw review into a string of lowercase, stemmed words.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lowercase, split on whitespace and stem each word.

    Args:
        raw_review: review text, possibly containing HTML markup or entities.

    Returns:
        The stemmed words joined by single spaces, as a text string.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the extracted
    # text could differ from one machine to another.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Apostrophes, like all other punctuation and digits, become spaces here,
    # so "don't" yields the two tokens "don" and "t".
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Keep the words as text. Encoding them to UTF-8 bytes breaks the pipeline
    # on Python 3, where bytes cannot be stemmed or joined as text.
    words = letters_only.lower().split()
    return " ".join(stem(w) for w in words)

# Tokenizer
class Splitter(object):
    """Break text into sentences and each sentence into word tokens."""

    def __init__(self):
        # Sentence model loaded from the English Punkt resource; words are
        # split with NLTK's Treebank tokenizer.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """Tokenize `text`.

        Args:
            text: the string to split.

        Returns:
            A list with one entry per sentence; each entry is the list of
            tokens produced by the word tokenizer for that sentence.
        """
        tokenized_sentences = []
        for sent in self.nltk_splitter.tokenize(text):
            tokenized_sentences.append(self.nltk_tokenizer.tokenize(sent))
        return tokenized_sentences

# Part-Of-Speech Tagger
class POSTagger(object):
    """Attach part-of-speech tags to sentences that are already tokenized."""

    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """Tag every token of every sentence.

        Args:
            sentences: list of sentences, each a list of token strings.

        Returns:
            A list of sentences where each token is a
            (word, word, [postag]) triple. The word is repeated in the second
            slot; no lemmatization is performed here.
        """
        tagged_sentences = []
        for sentence in sentences:
            # Adapt NLTK's (word, tag) pairs to the triple format.
            tagged_sentences.append(
                [(word, word, [postag]) for (word, postag) in nltk.pos_tag(sentence)]
            )
        return tagged_sentences

# Tag tokens with the dictionary labels (positive, negative, inc, dec, inv)
class DictionaryTagger(object):
    """Tag tokens with the labels found in one or more YAML dictionaries.

    Each dictionary maps an expression (one or more space-separated words) to
    a list of tags. Keys are lowercased and stemmed when loaded, and entries
    from all dictionaries are merged into a single lookup table.
    """

    def __init__(self, dictionary_paths):
        """Load and merge the dictionaries.

        Args:
            dictionary_paths: iterable of paths to YAML files, each holding a
                mapping from an expression to a list of tags.
        """
        self.dictionary = {}
        # Upper bound on the number of tokens a dictionary key can span; it
        # limits the lookup window used by tag_sentence().
        self.max_key_size = 0
        for path in dictionary_paths:
            # `with` closes each file deterministically. The previous
            # map(close, files) is lazy on Python 3 and never ran.
            with open(path, 'r') as dict_file:
                # NOTE(review): yaml.load() without an explicit Loader can
                # construct arbitrary Python objects and is rejected by
                # PyYAML 6+. safe_load() assumes the dictionaries are plain
                # "expression -> list of tags" mappings -- confirm.
                # An empty file loads as None, hence the `or {}`.
                curr_dict = yaml.safe_load(dict_file) or {}
            # Stem Dictionary Words
            self._add_entries({stem(k.lower()): v for k, v in curr_dict.items()})

    def _add_entries(self, entries):
        """Merge a mapping of already-normalised keys to tag lists into the table."""
        for key, tags in entries.items():
            if key in self.dictionary:
                self.dictionary[key].extend(tags)
            else:
                # Copy so that later extend() calls never mutate the caller's list.
                self.dictionary[key] = list(tags)
                # Lookups join tokens with single spaces, so an expression of
                # k tokens contains at least k - 1 spaces. Counting spaces
                # (not characters) therefore gives a safe, tight window size.
                self.max_key_size = max(self.max_key_size, key.count(' ') + 1)

    def tag(self, postagged_sentences):
        """Tag every sentence; see tag_sentence() for the token format."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """Tag one sentence, preferring the longest matching expression.

        Args:
            sentence: list of (form, lemma, tags) triples.
            tag_with_lemmas: look expressions up by their lemmas (second
                slot) instead of their surface forms (first slot).

        Returns:
            A new list of triples. A matched expression is replaced by one
            (form, lemma, tags) triple carrying the dictionary tags; a matched
            single token additionally keeps its previous tags. Unmatched
            tokens are passed through unchanged.
        """
        tagged_sentence = []
        N = len(sentence)
        # With no known key size, fall back to the whole sentence. This stays
        # local so that one sentence's length cannot cap later sentences.
        window = self.max_key_size if self.max_key_size > 0 else N
        i = 0
        while i < N:
            j = min(i + window, N)  # avoid overflow
            tagged = False
            # Shrink the candidate span from the right until it matches.
            while j > i:
                span = sentence[i:j]
                expression_form = ' '.join([word[0] for word in span]).lower()
                expression_lemma = ' '.join([word[1] for word in span]).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    # Copy the tags so the dictionary itself is never modified.
                    taggings = list(self.dictionary[literal])
                    if j - i == 1:
                        # A single token conserves its previous taggings.
                        taggings.extend(sentence[i][2])
                    tagged_sentence.append((expression_form, expression_lemma, taggings))
                    i = j
                    tagged = True
                    break
                j -= 1
            if not tagged:
                tagged_sentence.append(sentence[i])
                i += 1
        return tagged_sentence

In [101]:
def value_of(sentiment):
    """Map a tag to its polarity: 'positive' -> 1, 'negative' -> -1, anything else -> 0."""
    if sentiment == 'positive':
        polarity = 1
    elif sentiment == 'negative':
        polarity = -1
    else:
        polarity = 0
    return polarity

def sentence_score(sentence_tokens, previous_token, acum_score):
    """Score one tagged sentence, letting each token modify the next one.

    A token's base score is the sum of value_of() over its tags. If the token
    before it carries 'inc' the score is doubled, else if it carries 'dec' it
    is halved, else if it carries 'inv' its sign is flipped.

    Implemented as a loop rather than recursion: one frame per token exceeded
    Python's recursion limit on long inputs, and re-slicing the token list at
    every step was quadratic.

    Args:
        sentence_tokens: sequence of (form, lemma, tags) triples.
        previous_token: the token preceding the first one, or None.
        acum_score: score accumulated so far; returned unchanged when
            `sentence_tokens` is empty.

    Returns:
        `acum_score` plus the (modified) score of every token.
    """
    for current_token in sentence_tokens:
        token_score = sum([value_of(tag) for tag in current_token[2]])
        if previous_token is not None:
            previous_tags = previous_token[2]
            # Only the first matching modifier applies: inc, then dec, then inv.
            if 'inc' in previous_tags:
                token_score *= 2.0
            elif 'dec' in previous_tags:
                token_score /= 2.0
            elif 'inv' in previous_tags:
                token_score *= -1.0
        acum_score = acum_score + token_score
        previous_token = current_token
    return acum_score

def sentiment_score(review):
    """Add up the scores of all sentences in a tagged review.

    Each sentence is scored independently, starting from 0.0 with no
    preceding token.
    """
    total = 0
    for sentence in review:
        total = total + sentence_score(sentence, None, 0.0)
    return total

def sentiment_classify(text,senti_dictionary):
    """Compute the dictionary-based sentiment score of a piece of text.

    Args:
        text: review text; it is preprocessed, sentence-split and POS-tagged
            by split_tag().
        senti_dictionary: either a list of YAML dictionary paths (a new
            DictionaryTagger is then built on every call) or an already
            constructed DictionaryTagger. Pass a tagger when scoring many
            texts so the dictionary files are read only once.

    Returns:
        The review score as computed by sentiment_score().
    """
    if isinstance(senti_dictionary, DictionaryTagger):
        dicttagger = senti_dictionary
    else:
        dicttagger = DictionaryTagger(senti_dictionary)
    dict_tagged_sentences = dicttagger.tag(split_tag(text))
    return sentiment_score(dict_tagged_sentences)

In [102]:
# Call Splitter and POSTagger
def split_tag(text):
    """Preprocess `text`, split it into tokenized sentences and POS-tag them.

    Returns:
        The output of POSTagger.pos_tag() for the preprocessed text.
    """
    sentence_splitter = Splitter()
    tagger = POSTagger()
    cleaned_text = preprocess(text)
    return tagger.pos_tag(sentence_splitter.split(cleaned_text))

In [103]:
path="newd.csv"
sentiment_dictionary=[ 'positive.yml', 'negative.yml', 'inc.yml', 'dec.yml', 'inv.yml']
data=pd.read_csv(path)
# Build the tagger once; constructing it per row re-reads every YAML
# dictionary for each review.
dictionary_tagger = DictionaryTagger(sentiment_dictionary)
# split_tag() already runs preprocess() on its input, so hand it the raw text.
# Preprocessing here as well would clean and stem every word twice.
processed_data = data["text"].apply(
    lambda x: sentiment_score(dictionary_tagger.tag(split_tag(x.strip())))
)
data['review']=processed_data
# Display a sample instead of dumping the whole frame.
data.head(10)

Unnamed: 0,text,review
0,currently staying at the hilton rialto &amp; a...,-2
1,please update the hilton honors android app to...,0
2,would not it be great if the homewood suites i...,1
3,| | | my dad is visiting frm new york staying ...,1
4,"in cleveland ohio for a baseball game, enjoy t...",0
5,"hey, hilton, i had a nightmare of a stay at hi...",-1
6,apparently asking for a clean room is too much...,0
7,"to be a guest at the hilton, we are suppose to...",0
8,i was once hhonors diamond. i now stay anywher...,0
9,my family and i had to travel to jacksonville ...,3
