In [1]:
import numpy as np
import pandas as pd
import json
import ast

In [2]:
microwave = pd.read_csv('microwave_alter.csv')
hair_dryer = pd.read_csv('hair_dryer_alter.csv')
pacifier = pd.read_csv('pacifier_alter.csv')

In [3]:
microwave = microwave.fillna(' ')
hair_dryer = hair_dryer.fillna(' ')
pacifier = pacifier.fillna(' ')

In [4]:
def calculate_vote_weight(helpful_votes, total_votes):
    
    support = helpful_votes / total_votes
    
    if support < 0.3:
        weight = 2 * support + 0.2
        type = 0 #customer
    elif support < 0.7:
        weight = 0.5 * support + 0.65
        type = 1 #review
    elif support < 0.9:
        weight = support + 0.3
        type = 1 #review
    else:
        weight = 3 * support - 1.5
        type = 0 #customer
    
    amp = ((float(total_votes) - 5) * 0.02 + 1)
    if amp > 2:
        amp = 2
    
    weight = weight ** amp
    
    return type, weight

In [5]:
def calculate_length_weight(content_length, average_length):
    ratio = float(content_length) / average_length
    if ratio < 0.8:
        weight = 0.125 * ratio + 0.9
    else:
        weight = 0.5 * ratio + 0.6
    if weight > 1.5:
        weight = 1.5
    return weight

In [6]:
def calculate_final_score(star_rating, content_score):
    star_score = float(star_rating) / 5
    if content_score == -1:
        return star_score
    if abs(star_score - content_score) > 0.3:
        return star_score
    else:
        return (star_score + content_score) / 2

In [7]:
def customer_dict_write(dict, customer_id, customer_weight):
    if customer_id in dict:
        dict[customer_id] = (dict[customer_id] * customer_weight) ** 0.5
    else:
        dict[customer_id] = customer_weight    

In [8]:
def customer_dict_read(dict, customer_id):
    if customer_id in dict:
        return dict[customer_id]
    else:
        return 1.0

In [9]:
def judge_weight(row, dict, average_length, average_keywords):
    # The vine customers have one and only advantage of boosting their weight to 3.0
    # The vine customers shall share no other weight multipliers
    if row['vine'] == 'y' or row['vine'] == 'Y':
        customer_dict_write(dict, row['customer_id'], 3.0)
        return 1.0
    
    # Initialize the weight
    customer_weight = 1.0
    review_weight = 1.0
    
    # If there are more than 5 votes, apply the vote weight multiplier
    if row['total_votes'] >= 5:
        [vote_type, vote_weight] = calculate_vote_weight(row['helpful_votes'], row['total_votes'])
        if vote_type == 0:
            customer_weight = customer_weight * vote_weight
        else:
            review_weight = review_weight * vote_weight
        
    # Customer weight will be punished if 'customer' is suspected talking nonsense
    if len(row['review_body']) > average_length and len(ast.literal_eval(row['keyword'])) < average_keywords / 2:
        customer_weight = customer_weight / 2
        
    # All customer weight multipliers have been applied, if customer weight is no longer 1.0, write the weight into the dictionary
    if customer_weight != 1.0:
        customer_dict_write(dict, row['customer_id'], customer_weight)
    
    # Then we go on to judge the review-only weight
    # If the purchase is not verified, cut the weight to half
    if row['verified_purchase'] == 'n' or row['verified_purchase'] == 'N':
        review_weight = review_weight / 2
    
    # Use the length weight multiplier
    review_weight = review_weight * calculate_length_weight(len(row['review_body']), average_length)
    
    return review_weight

In [10]:
def preprocess(df):
    df['final_score'] = 0.0
    df['weight_review_only'] = 1.0
    df['final_weight'] = 1.0

In [11]:
def cal_score_weight(df):
    preprocess(df)
    customer_weight = {}
    entry_count = len(df['review_body'])
    length_sum = 0
    keyword_sum = 0
    for index, row in df.iterrows():
        df['final_score'][index] = calculate_final_score(row['star_rating'], row['content_score'])
        length_sum = length_sum + len(row['review_body'])
        keyword_sum = keyword_sum + len(ast.literal_eval(row['keyword']))
    average_length = length_sum / entry_count
    average_keywords = keyword_sum / entry_count
    for index, row in df.iterrows():
        df['weight_review_only'][index] = judge_weight(row, customer_weight, average_length, average_keywords)
    for index, row in df.iterrows():
        df['final_weight'][index] = row['weight_review_only'] * customer_dict_read(customer_weight, row['customer_id'])

In [12]:
cal_score_weight(microwave)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [13]:
microwave.to_csv('microwave_weight.csv')

In [14]:
cal_score_weight(hair_dryer)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [15]:
hair_dryer.to_csv('hair_dryer_weight.csv')

In [16]:
cal_score_weight(pacifier)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [17]:
pacifier.to_csv('pacifier_weight.csv')