In [770]:
import os
import pandas as pd
import numpy as np
import pickle
from collections import Counter, defaultdict
import re
import matplotlib
from math import pi
import matplotlib.pyplot as plt
import collections

# import sklearn models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset

# nlp libraries/api
import spacy
from spacy import displacy
import gensim
nlp = spacy.load('en_coref_lg')

In [None]:
# Load opinion lexicon
neg_file = open("neg_words.txt",encoding = "ISO-8859-1")
pos_file = open("pos_words.txt",encoding = "ISO-8859-1")
neg = [line.strip() for line in neg_file.readlines()]
pos = [line.strip() for line in pos_file.readlines()]

#create list of postive + negative words
opinion_words = neg + pos

#Uncomment below if running for first time. 
#Setup nltk corpora path and Google Word2Vec location
#google_vec_file = '/Users/gielderks/neuralcoref/neuralcoref/GoogleNews-vectors-negative300.bin'
#word2vec = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)
#word2vec.save("word2vec_google2.pkl")

# # If above script has been run, load saved word embedding
word2vec = pickle.load(open("/Users/gielderks/neuralcoref/neuralcoref/word2vec_google.pkl", 'rb'))

In [893]:
# load the Multi-label binarizer from previous notebook
mlb = pickle.load(open("mlb.pkl", 'rb'))

# load the fitted naive bayes model from previous notebook
naive_model1 = pickle.load(open("naive_model1.pkl", 'rb'))

In [894]:
mlb_term = ['SPEED', 'CPU', 'MEMORY', 'BATTERY', 'PRICE', 'QUALITY', 'SOFTWARE', 'DISPLAY', 'MOUSE', 'COMPANY', 'GENERAL', 'LAPTOP', 'KEYBOARD', 'HARDDISK']

In [914]:
def check_similarity(aspects, word):
    
    #print('word = ', word)
    
    similarity = []
    
    for aspect in aspects:
        #print('aspect= ', aspect, 'word=', word)
        similarity.append(word2vec.n_similarity([aspect.lower()], [word]))
        
    # set threshold for max value
    #print('Similarity', similarity)
   # print('\n')

    if max(similarity) > 0.25: # how close should the aspect be to the word in the vector space?
        #print('aspect = ', aspects[np.argmax(similarity)])
        #print('\n')

        return aspects[np.argmax(similarity)]
    
    else:
       # print('aspect = ', 'GENERAL')
       # print('\n')
        return None

def assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred):
    '''
    function: takes in a sentiment dictionary and appends the aspect dictionary
    inputs: sent_dict is a Counter in the form Counter(term:sentiment value)
            aspect_sent is total sentiment tally
            terms_dict is dict with individual aspect words associated with sentiment
    output: return two types of aspect dictionaries: 
            updated terms_dict and aspect_sent
    '''
    aspects = mlb_term
    
    aspects = mlb_term
    
   # print('sent_dict', sent_dict)
    
    counter_positive = sent_dict[0]
    counter_negative = sent_dict[1]
    
    # First, check word2vec
    # Note: the .split() is used for the term because word2vec can't pass compound nouns
        
    for term in counter_positive:
#         print('\n')  
#         print('TERM', term)
#         print('\n')
        
        try:
            # The conditions for when to use the NB classifier as default vs word2vec
            # Checks if check_similarity is not None. IF none --> naive bayes is used
            if check_similarity(aspects, term.split()[-1]):
                terms_dict[check_similarity(aspects, term.split()[-1])][term] += sent_dict[0][term]
                aspect_sent[check_similarity(aspects, term.split()[-1])]["pos"] += sent_dict[0][term]
            elif (pred[0] == "MISCELLANEOUS"):
                continue
            elif (len(pred) == 1):
                terms_dict['GENERAL'][term] += sent_dict[0][term]
                aspect_sent['GENERAL']["pos"] += sent_dict[0][term]
                
            # if unable to classify via NB or word2vec, then put them in misc. bucket
            else:
                terms_dict['GENERAL'][term] += sent_dict[0][term]
                aspect_sent['GENERAL']["pos"] += sent_dict[0][term]

        except:
            #raise()
           # print(term, "not in vocab")
            continue
            
    for term in counter_negative:

#         print('TERM', term)
#         print('\n')

        try:
            # The conditions for when to use the NB classifier as default vs word2vec
            # Checks if check_similarity is not None. IF none --> naive bayes is used
            if check_similarity(aspects, term.split()[-1]):
                terms_dict[check_similarity(aspects, term.split()[-1])][term] += sent_dict[1][term]
                aspect_sent[check_similarity(aspects, term.split()[-1])]["neg"] += abs(sent_dict[1][term])
            elif (pred[0] == 'GENERAL'):
                continue
            elif (len(pred) == 1):
                terms_dict['GENERAL'][term] += sent_dict[1][term]
                aspect_sent['GENERAL']["neg"] += abs(sent_dict[1][term])
            # if unable to classify via NB or word2vec, then put them in misc. bucket
            else:
                terms_dict['GENERAL'][term] += sent_dict[1][term]
                aspect_sent['GENERAL']["neg"] += abs(sent_dict[1][term])
                
        except:
            #raise()
           # print(term, "not in vocab")
            continue


    return aspect_sent, terms_dict
    
def feature_sentiment(sentence):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature did not exist previously,
              then updates sentiment to each of the new or existing features
    output: updated dictionary
    '''

    counter_positive = collections.Counter()

    counter_negative = collections.Counter()

    sentence = nlp(sentence)

    for token in sentence:
        #print(token)

        #    print(token.text,token.dep_, token.head, token.head.dep_)
        # check if the word is an opinion word, then assign sentiment
        if token.text in opinion_words:  # Words such as worked / crashed / well / useless / enjoyed

            # print(token.text, 'main_token')
            sentiment = 1 if token.text in pos else -1  # if word is in postive opinion words then add 1 - if in neg opinion words --> substract 1
           # print(sentiment)
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            if (token.dep_ == "advmod"):
                # print(token, 'advmod')
                continue
            elif (
                token.dep_ == "amod"):  # adjectical modifier --> amazing lightless of the laptop "amazing is the adjectical mod"

                 # important --> amazing is the amod here. Therefore amazing.head = ligthless is added to the dict
                if sentiment > 0:
                    counter_positive[token.head.text] += sentiment
                elif sentiment < 0:
                    counter_negative[token.head.text] += sentiment

            # for opinion words that are adjectives, adverbs, verbs...
            else:

                for child in token.children:  # for example: issues has child many, which is an adjectival modifier
                    # print(child, 'child', child.dep_)
                    # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                    # This could be better updated for modifiers that either positively or negatively emphasize
                    #  if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words): #does this have to be in opinion words
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")):  # does this have to be in opinion words
                        sentiment *= 1.5
                        # print(sentiment, token, 'token sentiment')
                    # check for negation words and flip the sign of sentiment  --> double negative e.g. not amazing
                    if child.dep_ == "neg":
                        sentiment *= -1
                        continue

                for child in token.children:

                    # if verb, check if there's a direct object --> lijdend voorwerp in dutch (enjoyed(verb) the keyboard light(direct object))
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):

                        #sent_dict[child.text] += sentiment
                        if sentiment > 0:
                            counter_positive[token.head.text] += sentiment
                        elif sentiment < 0:
                            counter_negative[token.head.text] += sentiment

                        # check for conjugates (a AND b), then add both to dictionary
                        # Example: Enjoyed both the screen and the keyboard light
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj = 1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            #sent_dict[subchild] += sentiment
                            if sentiment > 0:
                                counter_positive[token.head.text] += sentiment
                            elif sentiment < 0:
                                counter_negative[token.head.text] += sentiment
                # check for negation
                for child in token.head.children:
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"):
                        sentiment *= -1
                        
                # check for nouns
                for child in token.head.children:
                    noun = ""
                    
                    if sentiment > 0:
                        if (child.pos_ == "NOUN"):
                            noun = child.text
                            # Check for compound nouns
                            for subchild in child.children:
                                if subchild.dep_ == "compound":
                                    noun = subchild.text + " " + noun

                            counter_positive[noun] += sentiment
                        
                    else:
                        if (child.pos_ == "NOUN"):
                            noun = child.text
                            # Check for compound nouns
                            for subchild in child.children:
                                if subchild.dep_ == "compound":
                                    noun = subchild.text + " " + noun

                            counter_negative[noun] += sentiment

    # return sent_dict
    return counter_positive, counter_negative

def classify_and_sent(sentence, aspect_sent, terms_dict):
    '''
    function: classify the sentence into a category, and assign sentiment
    note: aspect_dict is a parent dictionary with all the aspects
    input: sentence & aspect dictionary, which is going to be updated
    output: updated aspect dictionary
    '''
    # classify sentence with NB classifier
    predicted = naive_model1.predict([sentence])
    pred = mlb.inverse_transform(predicted)
        
    #print('pred2', pred, sentence)
#     print('pred2', predicted)
#     print('\n')
    
   # print('predict proba =', max(naive_model1.predict_proba([sentence]).toarray().tolist()[0]))
        
   # print('\n')
   # print(naive_model1.predict_proba([sentence]).toarray().tolist())
    
    df = pd.DataFrame({'aspect' : mlb.classes_, 'proba' : naive_model1.predict_proba([sentence]).toarray().tolist()[0]}).sort_values('proba', ascending=False)

    #print(df)
   # print('\n')

    
    # get aspect names and their sentiment in a dictionary form
    sent_dict = feature_sentiment(sentence)
   # print('sent_dict_feature = ', sent_dict)
    
    # try to categorize the aspect names into the 4 aspects in aspect_dict
    aspect_sent, terms_dict = assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred[0])
  #  print('\n')
   # print('aspect_sent', aspect_sent)
   # print('\n')

   # print('terms_dict', terms_dict)
    
    return aspect_sent, terms_dict

def replace_pronouns(text):
    input_actual = text
    text = nlp(text)
    text_resolved = text._.coref_resolved
    if text_resolved:
        text = text_resolved
        return text
    else:
        return input_actual

def split_sentence(text):
    '''
    splits review into a list of sentences using spacy's sentence parser
    '''
    review = nlp(text)
    bag_sentence = []
    start = 0
    for token in review:
        if token.sent_start:
            bag_sentence.append(review[start:(token.i-1)])
            start = token.i
        if token.i == len(review)-1:
            bag_sentence.append(review[start:(token.i+1)])
    return bag_sentence

# Remove special characters using regex
def remove_special_char(sentence):
    return re.sub(r"[^a-zA-Z0-9.',:;?]+", ' ', sentence)

def review_pipe(review, aspect_sent, terms_dict):

    review = replace_pronouns(review)
    sentences = split_sentence(review)
    for sentence in sentences:
        #print(sentence)
        #print('\n')

        sentence = remove_special_char(str(sentence))
        aspect_sent, terms_dict = classify_and_sent(sentence.lower(), aspect_sent, terms_dict)
       # print('\n')
    return dict(aspect_sent), dict(terms_dict)

In [915]:
list_classes= mlb.classes_.tolist()
list_classes.append("SPEED") 
list_classes.append("DISPLAY")

In [916]:
terms_dict = {x : Counter() for x in list_classes}
aspect_sent = {x : Counter() for x in mlb_term}

In [917]:
# modify for each restaurant
count = 0
for review in air['reviewText'][0:5]:
    if count % 10 == 0:
        print(count)
    aspect_sent, terms_dict = review_pipe(review, aspect_sent, terms_dict)
    count+=1

0


In [918]:
import json

In [961]:
outfile = open('model_output.json', 'w')
outfile.write("[\n")
count2 = 0

process_batch = air[0:20]

length_rtp = len(process_batch) -1

aspect_filtered = pd.read_csv('aspect_data.csv', usecols=['aspect_list'])
filtered_aspects = list(aspect_filtered)

# modify for each restaurant
count = 0
for index, review in process_batch.iterrows():
    
    terms_dict = {x : Counter() for x in list_classes}
    aspect_sent = {x : Counter() for x in mlb_term}
    
    if count % 10 == 0:
        print(count)
        
    aspect_sent, terms_dict = review_pipe(review['reviewText'], aspect_sent, terms_dict)
    
    positive, negative = feature_sentiment(review['reviewText'])
    
    print(positive)
    print('\n')

    for k in filtered_aspects:
        positive.pop(k, None)
        
    for k in filtered_aspects:
        negative.pop(k, None)
        
    aspect_sent = {k: dict(v) for k, v in aspect_sent.items() if bool(dict(v))==True}
    terms_dict = {k: dict(v) for k, v in terms_dict.items() if bool(dict(v))==True}

    
#     for k,v in aspect_sent.items():
#         if bool(dict(v)):
#             print(dict(v))
#             print (k, v)

    print(aspect_sent)
    print('\n')
    print(terms_dict)
    print('\n')

        
#     terms_dict = [item for item in terms_dict if item]
#     aspect_sent = [item for item in aspect_sent if item]
    adder = {'asin' : review['asin'], 'ReviewText' : review['reviewText'], 'Overall' : review['overall'], 'postive' : dict(positive), 'negative' : dict(negative)}
    
    outfile.write(json.dumps(adder))
    
    if count2 == length_rtp:

        outfile.write("\n")
    
    elif count2 == len(process_batch):
        continue
    else:

        outfile.write(",\n")

    count2 += 1
                
    count+=1
outfile.write("]")
outfile.close()

0
Counter({'issues': 1.5, 'windows': 1, 'enjoyed': 1, 'background light': 1, 'lightless': 1})


{'CPU': {'neg': 2.25, 'pos': 1}, 'GENERAL': {'pos': 4.5, 'neg': 2}}


{'CPU': {'os': -2.25, 'windows': 1}, 'GENERAL': {'issues': 0.5, 'death': -1, 'enjoyed': 1, 'background light': 1, 'lightless': 1}}


Counter({'brand': 1})


{'CPU': {'pos': 1}, 'DISPLAY': {'neg': 1.5}, 'COMPANY': {'pos': 1}, 'GENERAL': {'neg': 1.5}}


{'COMPANY': {'brand': 1}, 'CPU': {'cpu': 1}, 'GENERAL': {'mess': -1.5}, 'DISPLAY': {'screen resolution': -1.5}}


Counter({'machine': 1, 'features': 1, 'works': 1, 'hardware superiority': 1})


{'QUALITY': {'neg': 1.5}, 'SOFTWARE': {'neg': 1.5}, 'DISPLAY': {'pos': 1}, 'GENERAL': {'pos': 2}, 'HARDDISK': {'pos': 1}}


{'GENERAL': {'works': 1, 'hardware superiority': 1}, 'HARDDISK': {'machine': 1}, 'QUALITY': {'productivity': -1.5}, 'SOFTWARE': {'virus': -1.5}, 'DISPLAY': {'features': 1}}


Counter({'speed': 1, 'work': 1})


{'SPEED': {'pos': 1}}


{'SPEED': {'speed': 1}}


Coun

KeyboardInterrupt: 

In [939]:
df_test = pd.DataFrame(row, columns=['asin', 'ReviewText', 'Overall'])
df_test

Unnamed: 0,asin,ReviewText,Overall
0,B005CWJB5G,I had never own a Mac before I had bought this...,2.0
1,B005CWJB5G,Too slow for a $1400 Computer. It should come ...,1.0
2,B005CWJB5G,I bought this thinking it is a good machine wi...,1.0
3,B005CWJB5G,Very please with this product. The SSD really ...,5.0
4,B005CWJB5G,"I will buy it, but i can't tell for the moment...",3.0
5,B005CWJB5G,I had never own a Mac before I had bought this...,2.0
6,B005CWJB5G,I had never own a Mac before I had bought this...,2.0
7,B005CWJB5G,Too slow for a $1400 Computer. It should come ...,1.0
8,B005CWJB5G,I bought this thinking it is a good machine wi...,1.0
9,B005CWJB5G,Very please with this product. The SSD really ...,5.0


In [921]:
aspect_sent, terms_dict

({'SPEED': Counter({'pos': 4.5, 'neg': 4.5}),
  'CPU': Counter({'neg': 11.0, 'pos': 9.0}),
  'MEMORY': Counter(),
  'BATTERY': Counter({'pos': 2}),
  'PRICE': Counter({'pos': 2.5, 'neg': 1.5}),
  'QUALITY': Counter({'neg': 4.0, 'pos': 6.0}),
  'SOFTWARE': Counter({'neg': 8.0, 'pos': 2}),
  'DISPLAY': Counter({'neg': 5.0, 'pos': 4}),
  'MOUSE': Counter(),
  'COMPANY': Counter({'pos': 4}),
  'GENERAL': Counter({'pos': 69.0, 'neg': 22.25}),
  'LAPTOP': Counter({'pos': 12.0, 'neg': 3.25}),
  'KEYBOARD': Counter({'neg': 10.5, 'pos': 11.5}),
  'HARDDISK': Counter({'pos': 13.5, 'neg': 3})},
 {'BATTERY': Counter({'battyer life': 1, 'battery life': 1}),
  'COMPANY': Counter({'brand': 2, 'investment': 1, 'company': 1}),
  'CONNECTIVITY': Counter(),
  'CPU': Counter({'os': -4.5,
           'windows': -2.0,
           'cpu': 2,
           'ssd': 1.5,
           'processor': 1}),
  'DESIGN_FEATURES': Counter(),
  'GENERAL': Counter({'issues': 0.0,
           'death': -2,
           'enjoyed': 2,
  

In [903]:
aspect_filtered = pd.read_csv('aspect_data.csv', usecols=['aspect_list'])

In [904]:
aspect_filterd

Unnamed: 0,aspect_list
0,cord
1,battery life
2,service center
3,"""sales"" team"
4,tech guy
5,quality
6,GUI
7,applications
8,use
9,start up


In [765]:
word2vec.n_similarity(['ssd'], ["harddisk"])

0.5703332414490843

In [801]:
word2vec.n_similarity(['resolution'], ["display"])

0.25501903058970077

## Testing functions

In [251]:
# test code for feature sentiment
sentence= "The laptop is responsive and snappy for an under $400 laptop. I’m a bit perplexed by some of the reviews claiming this to be slow. It is not. This value laptop has some chops. I loaded my desired suite of applications with Ninite dot com, that went quickly. In testing performance, I ran a Netflix movie and a show on YouTube at the same time with no performance degradation. I ran the 2+ hour movie twice while doing other stuff on the laptop and the battery still showed over 7 hours left. This laptop has a lot of connectors: USB, HDMI, and even a VGA connection (which I would suggest dropping, if you need that just put a HDMI/VGA adapter in your computer bag). Also included is the R/W DVD drive... something becoming rare in laptops. A nice option to watch movies if desired.The laptop display is good but has a poor viewing angle. I wasn’t surprised, my guess is that this is the display was where trade-offs were made to get the laptop to this price. The display is hard to see from the side (a witty salesperson would claim this is great for privacy) and required tilting the display forward/back more than I liked to obtain the best viewing angle from the front.."
#sentence= 'This laptop is not good'
feature_sentiment(sentence)
print('')
# check similarity
print(check_similarity(['electronics', 'computers', 'Dell', 'Apple',"Orange"], 'laptop'))
print(check_similarity(["Speed"], 'laptop'))




computers
None
