In [None]:
"""
Justifying recommendations using distantly-labeled reviews and fine-grained aspects
Jianmo Ni, Jiacheng Li, Julian McAuley
Empirical Methods in Natural Language Processing (EMNLP), 2019
"""

### Input? 
- JSON DATASET, PositiveWordList, and NegativeWordList
- The dataset should use the same field names as the Amazon review data described at https://nijianmo.github.io/amazon/index.html:
- reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin - ID of the product, e.g. 0000013714
- reviewerName - name of the reviewer
- vote - helpful votes of the review
- style - a dictionary of the product metadata, e.g., "Format" is "Hardcover"
- reviewText - text of the review
- overall - rating of the product
- summary - summary of the review
- unixReviewTime - time of the review (unix time)
- reviewTime - time of the review (raw)
- image - images that users post after they have received the product


### Output?
- DataFrame of top 10 products in the dataSet


### How to define top 10? 
- The products with top 10 overall scores


### How is the overall score calculated? 
- Positive Review(4,5 stars)
- Negative Review(0,1,2,3 stars)
- PositiveSum = sum over all positive reviews, where each review contributes +1 × (3 if verified, otherwise 1) × (1 + numberOfVotes/10)
- NegativeSum = sum over all negative reviews, where each review contributes -1 × (3 if verified, otherwise 1) × (1 + numberOfVotes/10)
- PositiveSum + NegativeSum is the overall score

In [1]:
import numpy as np
import pandas as pd
import gzip
import nltk
import spacy
import csv
# Load the spaCy 'en_core_web_trf' English pipeline once; the word-list and review
# functions below call `nlp(...)` for tokenization, stop-word flags and lemmas.
nlp = spacy.load('en_core_web_trf')

## General Functions

In [2]:
# read a JSON-lines file into a pandas dataframe and return a random sample of it
def readCreateDataFrame(path, n = 10000, random_state = 113):
    """Load a JSON-lines review file and return a reproducible random sample.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON-lines file (one JSON object per line).
    n : int, default 10000
        Maximum number of rows to keep. If the file holds fewer rows,
        every row is returned (in shuffled order).
    random_state : int, default 113
        Seed passed to ``DataFrame.sample`` so the sample is reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..k-1 integer index.
    """
    df = pd.read_json(path, lines = True)
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the frame size.
    sample_size = min(n, len(df))
    df = df.sample(n = sample_size, random_state = random_state)
    return df.reset_index(drop = True)

In [3]:
# read the neg word list and lemmatize it, return a list of lemmatized neg words
def getLemNegWords(path):
    """Read negative words from a CSV file and return their unique lemmas.

    Only the first column of each row is used. Each word is run through the
    module-level spaCy pipeline ``nlp`` and the lowercase lemma of its first
    token is kept.

    Parameters
    ----------
    path : str or path-like
        CSV file with one negative word in the first column of each row.

    Returns
    -------
    list of str
        Lowercased lemmas, de-duplicated, in order of first appearance.
    """
    negative_words_lemma = []
    seen = set()  # O(1) duplicate check; the list preserves first-seen order
    # newline = "" is the csv-module recommended way to open files for csv.reader
    with open(path, "r", newline = "") as file:
        for row in csv.reader(file):
            # csv.reader yields an empty list for blank lines; row[0] would raise
            if not row:
                continue
            # strip so surrounding whitespace does not become the first token
            word = row[0].strip()
            if not word:
                continue
            doc = nlp(word)
            if len(doc) == 0:
                continue
            token = doc[0].lemma_.lower()
            if token not in seen:
                seen.add(token)
                negative_words_lemma.append(token)
    return negative_words_lemma

In [4]:
# read the pos word list and lemmatize it, return a list of lemmatized pos words
def getLemPosWords(path):
    """Read positive words from a CSV file and return their unique lemmas.

    Only the first column of each row is used. Each word is run through the
    module-level spaCy pipeline ``nlp`` and the lowercase lemma of its first
    token is kept.

    Parameters
    ----------
    path : str or path-like
        CSV file with one positive word in the first column of each row.

    Returns
    -------
    list of str
        Lowercased lemmas, de-duplicated, in order of first appearance.
    """
    positive_words_lemma = []
    seen = set()  # O(1) duplicate check; the list preserves first-seen order
    # newline = "" is the csv-module recommended way to open files for csv.reader
    with open(path, "r", newline = "") as file:
        for row in csv.reader(file):
            # csv.reader yields an empty list for blank lines; row[0] would raise
            if not row:
                continue
            # strip so surrounding whitespace does not become the first token
            word = row[0].strip()
            if not word:
                continue
            doc = nlp(word)
            if len(doc) == 0:
                continue
            token = doc[0].lemma_.lower()
            if token not in seen:
                seen.add(token)
                positive_words_lemma.append(token)
    return positive_words_lemma

### read & create dataframe

In [5]:
# NOTE: machine-specific absolute path; point this at your local copy of Software.json
reviews_path = "C:\\Users\\zdszy\\Desktop\\NLP\\CapstoneProject\\AmazonSW\\Software.json"
df = readCreateDataFrame(reviews_path)
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4,True,"11 7, 2013",A2NL42GF3KGW9,B00A8IZMUM,,Ute J. Collins,Good product and very easy to use\nWill come i...,A great product when you are making your own g...,1383782400,,
1,5,True,"12 29, 2014",A2WQWMUX8HLSGO,B00597EEIS,{'Platform:': ' PC Download'},Ralph Lane,I have used Corel Fusion for years. This purch...,It is easy to use and does what I Want to do,1419811200,,
2,3,False,"04 19, 2015",ATNSS2AKOKDA2,B00NG7K2RA,{'Platform:': ' PC Download'},jwd,earlier versions were more user friendly.produ...,Three Stars,1429401600,,
3,5,True,"04 6, 2017",A1ETHPQDGZRIF7,B0153V62P2,{'Platform:': ' Mac Download'},Carissa Laine Coulson,great product,Five Stars,1491436800,,
4,5,True,"12 27, 2014",A1CB86K53NG0P2,B00975BPC6,{'Platform:': ' PC/Mac Disc'},Kelly B,"Learned so much already, so easy to use. Works...",solid software,1419638400,,


### process dataframe

In [6]:
# Drop the columns this analysis does not use:
# image: not used
# reviewerName: can be just a first name or an abbreviation
# reviewerID: not used for any cross referencing
# unixReviewTime: not used (could be converted with pd.to_datetime(..., unit='s') if needed)
# reviewTime, style, summary: not used for now
unused_columns = ['image', 'reviewerName', 'reviewerID', 'unixReviewTime', 'reviewTime', 'style', 'summary']

df = (
    df
    # errors = 'ignore' keeps the cell re-runnable after the columns are already gone
    .drop(columns = unused_columns, errors = 'ignore')
    # drop rows with a NaN review
    .dropna(subset = ['reviewText'])
    # Votes may arrive as numbers, NaN, or strings with thousands separators
    # (e.g. "1,024"); strip the commas, treat anything unparseable or missing as 0,
    # and store the result as int64.
    .assign(
        vote = lambda frame: pd.to_numeric(
            frame['vote'].astype(str).str.replace(',', '', regex = False),
            errors = 'coerce',
        ).fillna(0).astype('int64')
    )
)
df.head()

In [7]:
# sanity check: the vote value in the row labelled 7 should be an integer type after cleaning
type(df.loc[7,'vote'])

numpy.int64

# Using Own Predictor

In [8]:
# ratio of positive-word hits to negative-word hits in a single review
def getPNRatio(text):
    """Return (positive hits + 1) / (negative hits + 1) for one review text.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    Stop words are skipped, only purely alphabetic lemmas are kept, and each
    lowercased lemma is looked up first in ``positive_words_lemma`` and,
    only if absent there, in ``negative_words_lemma``. Both counters start
    at 1 so the division can never be by zero.
    """
    lemmas = [
        tok.lemma_.lower()
        for tok in nlp(text)
        if not tok.is_stop and tok.lemma_.isalpha()
    ]

    # both tallies start at one to avoid dividing by zero
    hits = {'pos': 1, 'neg': 1}
    for lemma in lemmas:
        if lemma in positive_words_lemma:
            hits['pos'] += 1
        elif lemma in negative_words_lemma:
            hits['neg'] += 1
    return hits['pos'] / hits['neg']

In [9]:
# label a review 'pos' when its positive/negative word ratio exceeds the threshold, else 'neg'
def getPrediction(text, threshold = 1.94):
    """Classify a review as ``'pos'`` or ``'neg'`` from its word ratio.

    Parameters
    ----------
    text : str
        Review text, passed on to ``getPNRatio``.
    threshold : float, default 1.94
        Cutoff for the ratio; a ratio strictly greater than this is 'pos'.
        The default keeps the original hard-coded value.

    Returns
    -------
    str
        ``'pos'`` if ``getPNRatio(text) > threshold``, otherwise ``'neg'``.
    """
    pnRatio = getPNRatio(text)
    return 'pos' if pnRatio > threshold else 'neg'

In [10]:
# https://stackoverflow.com/questions/38026984/how-do-i-calculate-a-pandas-column-with-multiple-columns-as-arguments
def calculateWeightedScore(verified, votes, prediction):
    """Return the signed, weighted score of a single review.

    The sign is -1 when ``prediction`` equals 'neg' and +1 for anything else.
    The magnitude is tripled when ``verified`` is truthy, then scaled by
    ``1 + votes / 10``.
    """
    direction = -1 if prediction == 'neg' else 1
    verified_weight = 3 if verified else 1
    vote_weight = 1 + (votes / 10)
    return direction * verified_weight * vote_weight

### get pos neg wordlist and lemmatize them

In [11]:
# build both lemmatized sentiment lexicons (negative first, then positive)
negative_words_lemma, positive_words_lemma = (
    getLemNegWords("words_negative.csv"),
    getLemPosWords("words_positive.csv"),
)

### apply predictor to the dataframe

In [12]:
# label every review with the word-ratio predictor
df['prediction_myOwn'] = df['reviewText'].apply(getPrediction)

In [17]:
# preview the first 10 rows, now including the prediction_myOwn column
df.head(10)

Unnamed: 0,overall,verified,asin,reviewText,vote,prediction_myOwn
0,4,True,B00A8IZMUM,Good product and very easy to use\nWill come i...,0,pos
1,5,True,B00597EEIS,I have used Corel Fusion for years. This purch...,0,pos
2,3,False,B00NG7K2RA,earlier versions were more user friendly.produ...,0,neg
3,5,True,B0153V62P2,great product,0,pos
4,5,True,B00975BPC6,"Learned so much already, so easy to use. Works...",0,pos
5,2,True,B0007P8H8U,For what this program does it is very over pri...,0,neg
6,4,False,B000W3RSGE,If you are a State Farm customer you can do yo...,0,pos
7,4,False,B0042X78SS,I really had fun with this! Starting from scra...,4,pos
8,5,True,B002SR4S1K,I have used H&R Block tax software for the pas...,2,pos
9,5,False,B000FK88JK,I have it installed and its running fine on my...,10,pos


In [18]:
# weight each review's own-predictor label by purchase verification and helpful votes
df['myOwnScore'] = [
    calculateWeightedScore(is_verified, vote_count, label)
    for is_verified, vote_count, label in zip(df['verified'], df['vote'], df['prediction_myOwn'])
]

Unnamed: 0,overall,verified,asin,reviewText,vote,prediction_myOwn,myOwnScore
0,4,True,B00A8IZMUM,Good product and very easy to use\nWill come i...,0,pos,3.0
1,5,True,B00597EEIS,I have used Corel Fusion for years. This purch...,0,pos,3.0
2,3,False,B00NG7K2RA,earlier versions were more user friendly.produ...,0,neg,-1.0
3,5,True,B0153V62P2,great product,0,pos,3.0
4,5,True,B00975BPC6,"Learned so much already, so easy to use. Works...",0,pos,3.0


In [22]:
# preview the frame with the myOwnScore column added
df.head()

Unnamed: 0,overall,verified,asin,reviewText,vote,prediction_myOwn,myOwnScore
0,4,True,B00A8IZMUM,Good product and very easy to use\nWill come i...,0,pos,3.0
1,5,True,B00597EEIS,I have used Corel Fusion for years. This purch...,0,pos,3.0
2,3,False,B00NG7K2RA,earlier versions were more user friendly.produ...,0,neg,-1.0
3,5,True,B0153V62P2,great product,0,pos,3.0
4,5,True,B00975BPC6,"Learned so much already, so easy to use. Works...",0,pos,3.0


In [25]:
# total own-predictor score per product, then the 10 highest totals (ties at the cutoff are kept)
result1 = df.groupby('asin')[['myOwnScore']].sum()
result1.nlargest(n = 10, columns = 'myOwnScore', keep = 'all')

Unnamed: 0_level_0,myOwnScore
asin,Unnamed: 1_level_1
B00UB76290,131.7
B00CTTEKJW,113.6
B00MYXTCGY,96.7
B009HBCU9W,70.4
B007QYZ08A,65.6
B000VPNSJY,55.8
B00FZ0FK0U,53.7
B00F8K9MZQ,52.3
B008XAXAC4,47.2
B0026PEPT4,45.9


# Using NLTK VADER

In [28]:
# lifted from inclass example
from nltk.sentiment.vader import SentimentIntensityAnalyzer

try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    # The VADER lexicon ships as a separate NLTK data package; on a fresh
    # environment it is missing and the constructor raises LookupError.
    # Download it once, then retry.
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

In [29]:
def _vaderLabel(review):
    """Return 'pos' when VADER's compound score for the review is >= 0, else 'neg'."""
    compound = sid.polarity_scores(review)['compound']
    if compound >= 0:
        return 'pos'
    return 'neg'

df['vader_prediction'] = df['reviewText'].apply(_vaderLabel)
df.head()

Unnamed: 0,overall,verified,asin,reviewText,vote,prediction_myOwn,myOwnScore,vader_prediction
0,4,True,B00A8IZMUM,Good product and very easy to use\nWill come i...,0,pos,3.0,pos
1,5,True,B00597EEIS,I have used Corel Fusion for years. This purch...,0,pos,3.0,pos
2,3,False,B00NG7K2RA,earlier versions were more user friendly.produ...,0,neg,-1.0,neg
3,5,True,B0153V62P2,great product,0,pos,3.0,pos
4,5,True,B00975BPC6,"Learned so much already, so easy to use. Works...",0,pos,3.0,pos


In [30]:
# weight each review's VADER label by purchase verification and helpful votes
df['vadarScore'] = [
    calculateWeightedScore(is_verified, vote_count, label)
    for is_verified, vote_count, label in zip(df['verified'], df['vote'], df['vader_prediction'])
]

In [31]:
# total VADER-based score per product, then the 10 highest totals (ties at the cutoff are kept)
result2 = df.groupby('asin')[['vadarScore']].sum()
result2.nlargest(n = 10, columns = 'vadarScore', keep = 'all')

Unnamed: 0_level_0,vadarScore
asin,Unnamed: 1_level_1
B00UB76290,493.9
B00CTTEKJW,350.2
B00EZPXYP4,181.6
B00MYXTCGY,129.3
B015724OVG,126.7
B00NG7JVSQ,124.0
B00F8K9MZQ,119.5
B00H9A60O4,118.6
B01617VPUY,94.3
B009HBCU9W,90.8


# Using Results From DataSet

In [32]:
# ground-truth label from the star rating: 4 stars and above is 'pos', everything else 'neg'
df['review_tag'] = ['pos' if rating >= 4 else 'neg' for rating in df['overall']]
df.head()

Unnamed: 0,overall,verified,asin,reviewText,vote,prediction_myOwn,myOwnScore,vader_prediction,vadarScore,review_tag
0,4,True,B00A8IZMUM,Good product and very easy to use\nWill come i...,0,pos,3.0,pos,3.0,pos
1,5,True,B00597EEIS,I have used Corel Fusion for years. This purch...,0,pos,3.0,pos,3.0,pos
2,3,False,B00NG7K2RA,earlier versions were more user friendly.produ...,0,neg,-1.0,neg,-1.0,neg
3,5,True,B0153V62P2,great product,0,pos,3.0,pos,3.0,pos
4,5,True,B00975BPC6,"Learned so much already, so easy to use. Works...",0,pos,3.0,pos,3.0,pos


In [33]:
# weight each review's star-rating label by purchase verification and helpful votes
df['actualScore'] = [
    calculateWeightedScore(is_verified, vote_count, label)
    for is_verified, vote_count, label in zip(df['verified'], df['vote'], df['review_tag'])
]

In [36]:
# preview the frame with the review_tag and actualScore columns added
df.head()

Unnamed: 0,overall,verified,asin,reviewText,vote,prediction_myOwn,myOwnScore,vader_prediction,vadarScore,review_tag,actualScore
0,4,True,B00A8IZMUM,Good product and very easy to use\nWill come i...,0,pos,3.0,pos,3.0,pos,3.0
1,5,True,B00597EEIS,I have used Corel Fusion for years. This purch...,0,pos,3.0,pos,3.0,pos,3.0
2,3,False,B00NG7K2RA,earlier versions were more user friendly.produ...,0,neg,-1.0,neg,-1.0,neg,-1.0
3,5,True,B0153V62P2,great product,0,pos,3.0,pos,3.0,pos,3.0
4,5,True,B00975BPC6,"Learned so much already, so easy to use. Works...",0,pos,3.0,pos,3.0,pos,3.0


In [34]:
# total rating-based score per product, then the 10 highest totals (ties at the cutoff are kept)
result3 = df.groupby('asin')[['actualScore']].sum()
result3.nlargest(n = 10, columns = 'actualScore', keep = 'all')

Unnamed: 0_level_0,actualScore
asin,Unnamed: 1_level_1
B00UB76290,410.5
B00EZPXYP4,189.6
B00CTTEKJW,174.4
B00F8K9MZQ,98.5
B015724OVG,87.5
B0064PFB9U,74.4
B01617VPUY,71.3
B00FZ0FK0U,69.1
B01F7RJHIQ,68.8
B00EZQYC8G,67.0
