In [299]:
# imports and setup
import pandas as pd
import nltk
import stanza
import string
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from operator import itemgetter

pattern = r'[^A-Za-z0-9]+'

In [None]:
# import corpora
stanza.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

# initialise nlp pipeline
nlp = stanza.Pipeline()
sid = SentimentIntensityAnalyzer()

In [5]:
reviews = pd.read_csv('data/Books_rating.csv')

In [87]:
sample = 'The book is too long, but the characters were good. Who knew Darth Vader was such a caring father?'
# sample = 'The book is too long, but the characters were good.'

In [250]:
# check first n rows of review text
reviews[['review/text']].iloc[:500]

Unnamed: 0,review/text
0,This is only for Julie Strain fans. It's a col...
1,I don't care much for Dr. Seuss but after read...
2,"If people become the books they read and if ""t..."
3,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Philip Nel - Dr. Seuss: American IconThis is b...
...,...
495,I'm writng again to say that this book has so ...
496,This book is exciting It's a turn pager you ca...
497,"Ash. Ahh. Such a great guy. Mary-Lynette, she'..."
498,We've fallen head over heels in love with Ash!...


In [269]:
# lowercase and tokenise
data = reviews[['review/text']].iloc[:500].apply(lambda x: x.astype(str).str.lower())
data = data['review/text'].apply(nltk.sent_tokenize)
data.head()
# text = reviews
# text = text.lower()
# text = nltk.sent_tokenize(text)
# text

0    [this is only for julie strain fans., it's a c...
1    [i don't care much for dr. seuss but after rea...
2    [if people become the books they read and if "...
3    [theodore seuss geisel (1904-1991), aka &quot;...
4    [philip nel - dr. seuss: american iconthis is ...
Name: review/text, dtype: object

In [301]:
str(data[22]).replace('[^a-zA-Z0-9]', '')

re.sub(pattern, ' ', str(data[22]))

' i loved this book for a few reasons the first was that i felt as if i was not reading the story but i was part of the story i loved the way it twisted and turned at every corner i loved that i simply did not know what to expect the choice of words was spectacular and just when i thought that i was in control of my own emotions something would jump up and surprise me i am anxiously waiting for the next book this was one of the best romance suspense books that i have read in years i highly recomend it i give this one five stars '

In [179]:
totalfeatureList = []

In [303]:
# score sentiment 
def sentiment_score(finalcluster):
    scores = []
    for pair in finalcluster:
        # only look at valid pairs
        if len(pair[1]) != 0:
            score = sid.polarity_scores(''.join(pair[1]))
            # print(score['compound'], score['compound'] != 0.0)
            if score['compound'] != 0.0:
                pair_score = [pair, score['compound']]
                scores.append(pair_score)
    return(scores)

In [314]:
totalfeatureList = []
cnt = 0
for review in data:
    sentence_scores = []
    for sent in review:
        sent = re.sub(pattern, ' ', sent)
        # sent = sent.replace('.', ' ').replace('-', '').replace("'", '').replace("/", '').replace('[^a-zA-Z0-9]', '')
        # word_list = nltk.word_tokenize(sent)
        # pos_list = nltk.pos_tag(word_list)

        # # noun combiner
        # newwordList = []
        # flag = 0
        # for i in range(0,len(pos_list)-1):
        #     if(pos_list[i][1]=="NN" and pos_list[i+1][1]=="NN"): # If two consecutive words are Nouns then they are joined together
        #         newwordList.append(pos_list[i][0]+pos_list[i+1][0])
        #         flag=1
        #     else:
        #         if(flag==1):
        #             flag=0
        #             continue
        #         newwordList.append(pos_list[i][0])
        #         if(i==len(pos_list)-2):
        #             newwordList.append(pos_list[i+1][0])
        # finaltxt = ' '.join(word for word in newwordList)


        # remove stopwords
        stop = set(stopwords.words('english'))
        new_txt_list = nltk.word_tokenize(sent)
        # wordsList = [w for w in new_txt_list if not w in stop]
        taggedList = nltk.pos_tag(new_txt_list)

        # print(wordsList)

        # relationship parser
        doc = nlp(sent)
        dep_node = []

        try:
            for dep in doc.sentences[0].dependencies:
                dep_node.append([dep[2].text, dep[0].id, dep[1]])
            for i in range(0, len(dep_node)):
                if (int(dep_node[i][1]) != 0):
                    dep_node[i][1] = new_txt_list[(int(dep_node[i][1]) - 1)]



            # possible features
            featureList = []
            categories = []
            for i in taggedList:
                if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                    featureList.append(list(i))
                    totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                    categories.append(i[0])

            # cluster together features and descriptors
            fcluster = []
            for i in featureList:
                filist = []
                for j in dep_node:
                    if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                        if(j[0]==i[0]):
                            filist.append(j[1])
                        else:
                            filist.append(j[0])
                fcluster.append([i[0], filist])

            # select only nouns
            finalcluster = []
            dic = {}
            for i in featureList:
                dic[i[0]] = i[1]
            for i in fcluster:
                if(dic[i[0]]=="NN"):
                    finalcluster.append(i)

            # get sentence scores  
            sentence_sentiment = sentiment_score(finalcluster) 
            
            for score in sentence_sentiment:
                sentence_scores.append(score)

        except Exception as e:
            print('review #:', cnt, e) 
            print(sent)
            cnt += 1
            raise(NotImplementedError)

    
    sentence_scores = sorted(sentence_scores, key=itemgetter(1), reverse = True)
    # print('review #:', cnt, sentence_scores)
    cnt += 1




review #: 0 sequence item 0: expected str instance, int found
this is only for julie strain fans 


NotImplementedError: 