In [33]:
import nltk
import re
import pandas as pd
import glob
from normalization import normalize_corpus, parse_document
from nltk.corpus import sentiwordnet as swn

def analyze_sentiment_sentiwordnet_lexicon(review, verbose=False):
    """Classify a text as 'positive' or 'negative' using SentiWordNet scores.

    The text is tokenized and POS-tagged with NLTK.  For every token whose
    tag contains 'NN', 'VB', 'JJ' or 'RB', the first SentiWordNet senti-synset
    for the matching WordNet POS ('n', 'v', 'a', 'r') is looked up and its
    positive and negative scores are accumulated.  The final score is
    (positive - negative) divided by the number of scored tokens, rounded to
    two decimals; a score >= 0 is labelled 'positive'.

    Parameters
    ----------
    review : str
        The text to score.
    verbose : bool, optional
        When True, also print a one-row table with the predicted sentiment
        and the normalized positive, negative and overall scores.

    Returns
    -------
    str
        'positive' or 'negative'.  The label is returned regardless of
        ``verbose``.  A text with no scorable tokens gets an overall score
        of 0.0 and is therefore labelled 'positive'.
    """
    # (substring searched for in the POS tag, WordNet POS used for the lookup)
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    # tokenize and POS tag text tokens
    tagged_text = nltk.pos_tag(nltk.word_tokenize(review))
    pos_score = neg_score = token_count = 0

    for word, tag in tagged_text:
        ss_set = None
        for tag_marker, wordnet_pos in tag_to_wordnet_pos:
            if tag_marker not in tag:
                continue
            # look the word up once and keep only the first senti-synset
            ss_sets = list(swn.senti_synsets(word, wordnet_pos))
            if ss_sets:
                ss_set = ss_sets[0]
                break
        # if senti-synset is found, add its scores to the running totals
        if ss_set:
            pos_score += ss_set.pos_score()
            neg_score += ss_set.neg_score()
            token_count += 1

    def _normalize(total):
        # Average per scored token; 0.0 when nothing could be scored so an
        # empty or unscorable text does not raise ZeroDivisionError.
        if token_count == 0:
            return 0.0
        return round(float(total) / token_count, 2)

    # aggregate final scores
    norm_final_score = _normalize(pos_score - neg_score)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'

    if verbose:
        # to display results in a nice table; from_product builds the same
        # two-level column header without the version-dependent
        # labels=/codes= keyword of the MultiIndex constructor
        columns = pd.MultiIndex.from_product(
            [['Sentiment Statistics:'],
             ['Predicted Sentiment', 'Positive', 'Negative', 'Overall']])
        sentiment_frame = pd.DataFrame([[final_sentiment,
                                         _normalize(pos_score),
                                         _normalize(neg_score),
                                         norm_final_score]],
                                       columns=columns)
        print(sentiment_frame)

    return final_sentiment

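# The driver code below reads each file with readlines(), so the next function
# receives a list of lines rather than a single string.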

def positive_and_negative_statistics(review, verbose=False):
    """Count how many entries of ``review`` score positive and negative.

    Each entry is tokenized and POS-tagged with NLTK.  For every token whose
    tag contains 'NN', 'VB', 'JJ' or 'RB', the first SentiWordNet senti-synset
    for the matching WordNet POS ('n', 'v', 'a', 'r') contributes its positive
    and negative scores.  An entry is positive when (positive - negative)
    divided by the number of scored tokens, rounded to two decimals, is >= 0,
    and negative otherwise.

    Entries in which no token has a SentiWordNet entry (for example blank
    lines) carry no sentiment evidence and are not counted in either total.

    Parameters
    ----------
    review : iterable of str
        The texts to classify one by one (the caller in this file passes the
        lines of a data file).
    verbose : bool, optional
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    tuple of (str, str)
        ('Number of Positive Reviews:<n>', 'Number of Negative Reviews:<m>').
    """
    # (substring searched for in the POS tag, WordNet POS used for the lookup)
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    number_of_positive_reviews = 0
    number_of_negative_reviews = 0
    for sentence in review:
        # tokenize and POS tag text tokens
        tagged_text = nltk.pos_tag(nltk.word_tokenize(sentence))
        pos_score = neg_score = token_count = 0

        for word, tag in tagged_text:
            ss_set = None
            for tag_marker, wordnet_pos in tag_to_wordnet_pos:
                if tag_marker not in tag:
                    continue
                # look the word up once and keep only the first senti-synset
                ss_sets = list(swn.senti_synsets(word, wordnet_pos))
                if ss_sets:
                    ss_set = ss_sets[0]
                    break
            # if senti-synset is found, add its scores to the running totals
            if ss_set:
                pos_score += ss_set.pos_score()
                neg_score += ss_set.neg_score()
                token_count += 1

        if token_count == 0:
            # Nothing could be scored: skip instead of reusing the label of
            # the previous entry (or failing on an unassigned label).
            continue

        # Keep the rounding: it decides the label for scores close to zero.
        norm_final_score = round(float(pos_score - neg_score) / token_count, 2)
        if norm_final_score >= 0:
            number_of_positive_reviews += 1
        else:
            number_of_negative_reviews += 1

    return ('Number of Positive Reviews:' + str(number_of_positive_reviews),
            'Number of Negative Reviews:' + str(number_of_negative_reviews))
      
# Locate every topic file of the data set and print a sentiment summary for it
path = r'../data/raw/OpinosisDataset1.0_0/topics/'
allFiles = glob.glob(path + "/*.data")
for file_ in allFiles:
    # Load all lines first; the file is not needed once they are in memory.
    with open(file_, "r") as f:
        reviews = list(f)

    # The last path component is the run of characters at the end of the
    # path that contains no path separator or other reserved character.
    filename_search = re.search(r'[^\\/:*?"<>|\r\n]+$', file_)
    filename_parts = filename_search.group()
    # Keep only the text before the first dot as the topic name.
    filename = filename_parts.partition('.')[0]
    print('Sentiment summary of ' + str(filename))

    # Per-line counts first, then the verdict for the whole file as one text.
    line_statistics = positive_and_negative_statistics(reviews, verbose=True)
    print(line_statistics)
    overall_sentiment = analyze_sentiment_sentiwordnet_lexicon(
        ' '.join(reviews), verbose=True)
    print(overall_sentiment)

Sentiment summary of accuracy_garmin_nuvi_255W_gps
('Number of Positive Reviews:58', 'Number of Negative Reviews:9')
  Sentiment Statistics:                          
    Predicted Sentiment Positive Negative Overall
0              positive     0.13     0.07    0.07
positive
Sentiment summary of bathroom_bestwestern_hotel_sfo
('Number of Positive Reviews:62', 'Number of Negative Reviews:26')
  Sentiment Statistics:                          
    Predicted Sentiment Positive Negative Overall
0              positive     0.12     0.09    0.03
positive
Sentiment summary of battery-life_amazon_kindle
('Number of Positive Reviews:61', 'Number of Negative Reviews:29')
  Sentiment Statistics:                          
    Predicted Sentiment Positive Negative Overall
0              positive     0.07     0.06    0.01
positive
Sentiment summary of battery-life_ipod_nano_8gb
('Number of Positive Reviews:52', 'Number of Negative Reviews:17')
  Sentiment Statistics:                          
    Pre

('Number of Positive Reviews:182', 'Number of Negative Reviews:84')
  Sentiment Statistics:                          
    Predicted Sentiment Positive Negative Overall
0              positive     0.11     0.09    0.03
positive
Sentiment summary of rooms_swissotel_chicago
('Number of Positive Reviews:124', 'Number of Negative Reviews:32')
  Sentiment Statistics:                          
    Predicted Sentiment Positive Negative Overall
0              positive     0.12     0.07    0.05
positive
Sentiment summary of room_holiday_inn_london
('Number of Positive Reviews:391', 'Number of Negative Reviews:184')
  Sentiment Statistics:                          
    Predicted Sentiment Positive Negative Overall
0              positive      0.1     0.08    0.02
positive
Sentiment summary of satellite_garmin_nuvi_255W_gps
('Number of Positive Reviews:48', 'Number of Negative Reviews:15')
  Sentiment Statistics:                          
    Predicted Sentiment Positive Negative Overall
0        