In [1]:
import numpy as np
import pandas as pd
import json
import ast

In [2]:
microwave = pd.read_csv('microwave_alter.csv')
hair_dryer = pd.read_csv('hair_dryer_alter.csv')
pacifier = pd.read_csv('pacifier_alter.csv')

In [3]:
microwave = microwave.fillna(' ')
hair_dryer = hair_dryer.fillna(' ')
pacifier = pacifier.fillna(' ')

In [4]:
def calculate_vote_weight(helpful_votes, total_votes):
    
    support = helpful_votes / total_votes
    
    if support < 0.3:
        weight = 2 * support + 0.2
        type = 0 #customer
    elif support < 0.7:
        weight = 0.5 * support + 0.65
        type = 1 #review
    elif support < 0.9:
        weight = support + 0.3
        type = 1 #review
    else:
        weight = 3 * support - 1.5
        type = 0 #customer
    
    amp = ((float(total_votes) - 5) * 0.02 + 1)
    if amp > 2:
        amp = 2
    
    weight = weight ** amp
    
    return type, weight

In [5]:
def calculate_length_weight(content_length, average_length):
    ratio = float(content_length) / average_length
    if ratio < 0.8:
        weight = 0.125 * ratio + 0.9
    else:
        weight = 0.5 * ratio + 0.6
    if weight > 1.5:
        weight = 1.5
    return weight

In [6]:
def calculate_final_score(star_rating, content_score):
    star_score = float(star_rating) / 5
    if content_score == -1:
        return star_score
    if abs(star_score - content_score) > 0.3:
        return star_score
    else:
        return (star_score + content_score) / 2

In [7]:
def customer_dict_write(dict, customer_id, customer_weight):
    if customer_id in dict:
        dict[customer_id] = (dict[customer_id] * customer_weight) ** 0.5
    else:
        dict[customer_id] = customer_weight    

In [8]:
def customer_dict_read(dict, customer_id):
    if customer_id in dict:
        return dict[customer_id]
    else:
        return 1.0

In [9]:
def judge_weight(row, dict, average_length, average_keywords):
    # The vine customers have one and only advantage of boosting their weight to 3.0
    # The vine customers shall share no other weight multipliers
    if row['vine'] == 'y' or row['vine'] == 'Y':
        customer_dict_write(dict, row['customer_id'], 3.0)
        return 1.0
    
    # Initialize the weight
    customer_weight = 1.0
    review_weight = 1.0
    
    # If there are more than 5 votes, apply the vote weight multiplier
    if row['total_votes'] >= 5:
        [vote_type, vote_weight] = calculate_vote_weight(row['helpful_votes'], row['total_votes'])
        if vote_type == 0:
            customer_weight = customer_weight * vote_weight
        else:
            review_weight = review_weight * vote_weight
        
    # Customer weight will be punished if 'customer' is suspected talking nonsense
    if len(row['review_body']) > average_length and len(ast.literal_eval(row['keyword'])) < average_keywords / 2:
        customer_weight = customer_weight / 2
        
    # All customer weight multipliers have been applied, if customer weight is no longer 1.0, write the weight into the dictionary
    if customer_weight != 1.0:
        customer_dict_write(dict, row['customer_id'], customer_weight)
    
    # Then we go on to judge the review-only weight
    # If the purchase is not verified, cut the weight to half
    if row['verified_purchase'] == 'n' or row['verified_purchase'] == 'N':
        review_weight = review_weight / 2
    
    # Use the length weight multiplier
    review_weight = review_weight * calculate_length_weight(len(row['review_body']), average_length)
    
    return review_weight

In [10]:
def preprocess(df):
    df['final_score'] = 0.0
    df['weight_review_only'] = 1.0
    df['final_weight'] = 1.0

In [11]:
def cal_score_weight(df):
    preprocess(df)
    customer_weight = {}
    entry_count = len(df['review_body'])
    length_sum = 0
    keyword_sum = 0
    for index, row in df.iterrows():
        df['final_score'][index] = calculate_final_score(row['star_rating'], row['content_score'])
        length_sum = length_sum + len(row['review_body'])
        keyword_sum = keyword_sum + len(ast.literal_eval(row['keyword']))
    average_length = length_sum / entry_count
    average_keywords = keyword_sum / entry_count
    for index, row in df.iterrows():
        df['weight_review_only'][index] = judge_weight(row, customer_weight, average_length, average_keywords)
    for index, row in df.iterrows():
        df['final_weight'][index] = row['weight_review_only'] * customer_dict_read(customer_weight, row['customer_id'])

In [12]:
cal_score_weight(microwave)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [13]:
microwave.to_csv('microwave_weight.csv')

In [14]:
cal_score_weight(hair_dryer)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [15]:
hair_dryer.to_csv('hair_dryer_weight.csv')

In [16]:
cal_score_weight(pacifier)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [17]:
pacifier.to_csv('pacifier_weight.csv')

In [20]:
data = microwave.set_index(pd.to_datetime(microwave['review_date']))

In [33]:
data

Unnamed: 0_level_0,Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,content_score,keyword,final_score,weight_review_only,final_weight
review_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-08-31,0,21879631,RY52KZABZK8QF,B0052G14E8,423421857,danby 0.7 cu.ft. countertop microwave,1,0,0,N,Y,Do not recommend it. go use your money for som...,3 of the buttons stopped working after a month...,8/31/2015,0.234928,"['buttons', 'stopped', 'working', 'month', 'us...",0.217464,0.951809,0.951809
2015-08-31,1,14964566,R3GCOEV4HYZG2I,B0055UBB4O,423421857,danby 0.7 cu.ft. countertop microwave,5,0,0,N,Y,Looks as good as the picture,Didn't use it long so I can't attest to that b...,8/31/2015,0.121529,"['use', 'long', 'ca', 'attest', 'price', 'extr...",1.000000,0.990126,0.990126
2015-08-31,2,13230389,R1V2OPPNL0QGCE,B0052G14E8,423421857,danby 0.7 cu.ft. countertop microwave,4,0,0,N,Y,Four Stars,"Very nice microwave, great price",8/31/2015,0.998171,"['nice', 'microwave', 'great', 'price']",0.899085,0.908635,0.908635
2015-08-31,3,43655888,R9Q0QDTLKV567,B004ZU09QQ,423421857,danby 0.7 cu.ft. countertop microwave,3,0,1,N,Y,quiet,"Quiet, but does not seem like 1000 watt power.",8/31/2015,0.500000,"['Quiet', 'seem', 'like', 'watt', 'power']",0.550000,0.912413,0.912413
2015-08-31,4,117794,R3DL7HYC3QTWNI,B005GSZB7I,827502283,"whirlpool stainless look countertop microwave,...",4,0,0,N,Y,Four Stars,It's bigger than I thought.,8/31/2015,0.500000,"['bigger', 'thought']",0.800000,0.907286,0.907286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004-09-13,1610,49010899,R2OXHMMI830KJ3,B00009V3X6,459626087,sharp 1.1-cubic-foot 850-watt over-the-range c...,2,12,14,N,N,"A great, sleek oven... if you can get an undam...",We ordered one of these and were shipped an op...,9/13/2004,0.115423,"['ordered', 'one', 'shipped', 'open', 'return'...",0.257711,0.887863,0.887863
2004-07-20,1611,40308862,RY9INWIK8MAL3,B00012ORT2,305608994,sharp 950-watt 1-2/5-cubic-foot over-the-range...,3,20,53,N,N,What Happened to Sharp Quality?,I do not have this particular microwave; thoug...,7/20/2004,0.082553,"['particular', 'microwave', 'though', 'Sharp',...",0.600000,0.427347,0.427347
2004-06-25,1612,36386173,RXOAWGI9Z98QY,B00009V3WZ,379992322,sharp 1-1/2-cubic-foot 1000-watt over-the-rang...,4,19,19,N,Y,"Great microwave, little tough to mount",Its a pretty big and heavy unit so having some...,6/25/2004,0.075752,"['pretty', 'big', 'heavy', 'unit', 'someone', ...",0.800000,1.500000,2.520508
2004-06-21,1613,18471248,RVNWLGVDWE20J,B00009V3X8,379992322,sharp 1-1/2-cubic-foot 1000-watt over-the-rang...,3,6,6,N,Y,Sharp could be Sharper,UPDATE: Yes i too had the door latch button me...,6/21/2004,0.215294,"['UPDATE', 'Yes', 'door', 'latch', 'button', '...",0.600000,1.500000,2.268320


In [35]:
pd.to_datetime(microwave['review_date'])

0      2015-08-31
1      2015-08-31
2      2015-08-31
3      2015-08-31
4      2015-08-31
          ...    
1610   2004-09-13
1611   2004-07-20
1612   2004-06-25
1613   2004-06-21
1614   2004-06-19
Name: review_date, Length: 1615, dtype: datetime64[ns]