In [152]:
# imports and setup
import pandas as pd
import nltk
import stanza
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from operator import itemgetter

In [None]:
# import corpora
stanza.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

# initialise nlp pipeline
nlp = stanza.Pipeline()
sid = SentimentIntensityAnalyzer()

In [5]:
reviews = pd.read_csv('data/Books_rating.csv')

In [87]:
sample = 'The book is too long, but the characters were good. Who knew Darth Vader was such a caring father?'
# sample = 'The book is too long, but the characters were good.'

In [100]:
# check first n rows of review text
reviews[['review/text']].iloc[:5]

Unnamed: 0,review/text
0,This is only for Julie Strain fans. It's a col...
1,I don't care much for Dr. Seuss but after read...
2,"If people become the books they read and if ""t..."
3,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Philip Nel - Dr. Seuss: American IconThis is b...


In [108]:
# lowercase and tokenise
data = reviews[['review/text']].iloc[:1].apply(lambda x: x.astype(str).str.lower())
data = data['review/text'].apply(nltk.sent_tokenize)
data.head()
# text = reviews
# text = text.lower()
# text = nltk.sent_tokenize(text)
# text

0    [this is only for julie strain fans., it's a c...
Name: review/text, dtype: object

In [110]:
data[0]

['this is only for julie strain fans.',
 "it's a collection of her photos -- about 80 pages worth with a nice section of paintings by olivia.if you're looking for heavy literary content, this isn't the place to find it -- there's only about 2 pages with text and everything else is photos.bottom line: if you only want one book, the six foot one ... is probably a better choice, however, if you like julie like i like julie, you won't go wrong on this one either."]

In [89]:
totalfeatureList = []

In [167]:
# score sentiment 
def sentiment_score(finalcluster):
    scores = []
    for pair in finalcluster:
        # only look at valid pairs
        if len(pair[1]) != 0:
            score = sid.polarity_scores(''.join(pair[1]))
            pair_score = [pair[0], score['compound']]
            scores.append(pair_score)
    return(scores)

In [168]:
totalfeatureList = []
for review in data:
    sentence_scores = []
    for sent in review:
        word_list = nltk.word_tokenize(sent)
        pos_list = nltk.pos_tag(word_list)

        # word combiner
        newwordList = []
        flag = 0
        for i in range(0,len(pos_list)-1):
            if(pos_list[i][1]=="NN" and pos_list[i+1][1]=="NN"): # If two consecutive words are Nouns then they are joined together
                newwordList.append(pos_list[i][0]+pos_list[i+1][0])
                flag=1
            else:
                if(flag==1):
                    flag=0
                    continue
                newwordList.append(pos_list[i][0])
                if(i==len(pos_list)-2):
                    newwordList.append(pos_list[i+1][0])
        finaltxt = ' '.join(word for word in newwordList)


        # 
        stop = set(stopwords.words('english'))
        new_txt_list = nltk.word_tokenize(finaltxt)
        wordsList = [w for w in new_txt_list if not w in stop]
        taggedList = nltk.pos_tag(wordsList)


        # relationship parser
        doc = nlp(finaltxt)
        dep_node = []
        for dep_edge in doc.sentences[0].dependencies:
            dep_node.append([dep_edge[2].text, dep_edge[0].id, dep_edge[1]])
        for i in range(0, len(dep_node)):
            if (int(dep_node[i][1]) != 0):
                dep_node[i][1] = newwordList[(int(dep_node[i][1]) - 1)]


        # possible features
        featureList = []
        categories = []
        for i in taggedList:
            if(i[1]=='JJ' or i[1]=='NN' or i[1]=='JJR' or i[1]=='NNS' or i[1]=='RB'):
                featureList.append(list(i))
                totalfeatureList.append(list(i)) # This list will store all the features for every sentence
                categories.append(i[0])

        # cluster together features and descriptors
        fcluster = []
        for i in featureList:
            filist = []
            for j in dep_node:
                if((j[0]==i[0] or j[1]==i[0]) and (j[2] in ["nsubj", "acl:relcl", "obj", "dobj", "agent", "advmod", "amod", "neg", "prep_of", "acomp", "xcomp", "compound"])):
                    if(j[0]==i[0]):
                        filist.append(j[1])
                    else:
                        filist.append(j[0])
            fcluster.append([i[0], filist])

        # select only nouns
        finalcluster = []
        dic = {}
        for i in featureList:
            dic[i[0]] = i[1]
        for i in fcluster:
            if(dic[i[0]]=="NN"):
                finalcluster.append(i)

        # get sentence scores  
        sentence_sentiment = sentiment_score(finalcluster) 
        
        for score in sentence_sentiment:
            sentence_scores.append(score)
    
    sentence_scores = sorted(sentence_scores, key=itemgetter(1), reverse = True)
    print(sentence_scores)


[['section', 0.4215], ['juliestrain', 0.0], ['collection', 0.0]]
