In [1]:
import pandas as pd
import os
import numpy as np

## Load json data
* `yelp_academic_dataset_user`
* `yelp_academic_dataset_business`
* `yelp_academic_dataset_review`
* `yelp_academic_dataset_tip`

In [2]:
# Directory (relative to this notebook) that holds the Yelp JSON files.
DATA_ROOT_DIRECTORY = '../rev_Yelp/'

In [188]:
# Read the four Yelp tables in one pass; each frame gets a fresh 0..n-1 index.
active_user_df, restaurant_df, review_df, tip_df = (
    pd.read_json(os.path.join(DATA_ROOT_DIRECTORY, json_name)).reset_index(drop=True)
    for json_name in (
        "yelp_academic_dataset_user.json",
        "yelp_academic_dataset_business.json",
        "yelp_academic_dataset_review.json",
        "yelp_academic_dataset_tip.json",
    )
)

In [179]:
# Replace the comma-separated `elite` string by the number of entries it holds.
# An empty string must count as 0: ''.split(',') yields [''], whose length is 1,
# which would make users without any entry look the same as users with one.
# NOTE: this overwrites the raw column, so reload the user table before re-running this cell.
elite_entry_count = active_user_df['elite'].str.split(',').str.len()
active_user_df['elite'] = elite_entry_count.where(active_user_df['elite'].str.strip().ne(''), 0)

## Joining based on `business`

In [184]:
def gen_prange(n):
    '''
    Build the n+1 evenly spaced fractions 0, 1/n, 2/n, ..., 1.

    Args
        n: int
            number of equal-width steps between 0 and 1
    Returns
        numpy array of length n+1
    '''
    steps = np.arange(n + 1)
    return steps / n

def convert_column_by_percentage(column_data, percentage, score=None):
    '''
    Replace the values of a column by scores, using cut points taken at given
    fractions of the sorted column.

    For each fraction p the cut point is the element at position
    int((n-1)*p) of the sorted column. A value receives the score of the first
    cut point it does not exceed. Values above the last cut point keep their
    original value.

    Args
        column_data: pandas Series
            input feature
        percentage: list
            list of percentage for assigning different scores.
            note that the list should be in ascending order and have only values ranging from 0.0 to 1.0.
        score: list (Optional)
            list of scores corresponding to the percentage list,
            if not given, will automatically use [0 ... len(percentage)-1]
    Returns
        numpy array with one entry per row of column_data
    '''
    if score is None:
        score = np.arange(len(percentage))
    assert len(percentage) == len(score), "The length of the defined range is not equal to that of the scores"
    score = np.asarray(score)
    raw_values = column_data.values
    # Widen the output dtype so that e.g. float scores are not truncated when
    # they are written into a copy of an integer column.
    out_dtype = np.result_type(raw_values.dtype, score.dtype) if score.size else raw_values.dtype
    output = raw_values.astype(out_dtype, copy=True)
    if raw_values.size == 0:
        # Nothing to score; indexing the sorted column below would fail.
        return output
    column_sorted = np.sort(raw_values)
    # Start just below the minimum so the first bucket includes the smallest values.
    last_value = column_data.min() - 1
    for p, s in zip(percentage, score):
        indice = int((column_sorted.size - 1) * p)
        value = column_sorted[indice]
        # Masks are computed on the untouched input, never on `output`,
        # so scores already written cannot be re-bucketed.
        idx = (column_data <= value) & (column_data > last_value)
        output[idx.values] = s
        last_value = value
    return output

def convert_column_by_absolute(column_data, absolute_values, score=None):
    '''
    Replace the values of a column by scores, using fixed thresholds.

    A value receives the score of the first threshold it does not exceed.
    Values above the last threshold keep their original value.

    Args
        column_data: pandas Series
            input feature
        absolute_values: list
            absolute value range for assigning different scores.
            note that the list should be in ascending order.
        score: list (Optional)
            list of scores corresponding to the absolute_values list,
            if not given, will automatically use [0 ... len(absolute_values)-1]
    Returns
        numpy array with one entry per row of column_data
    '''
    if score is None:
        score = np.arange(len(absolute_values))
    assert len(absolute_values) == len(score), "The length of the defined range is not equal to that of the scores"
    score = np.asarray(score)
    raw_values = column_data.values
    # Widen the output dtype so that e.g. float scores are not truncated when
    # they are written into a copy of an integer column.
    out_dtype = np.result_type(raw_values.dtype, score.dtype) if score.size else raw_values.dtype
    output = raw_values.astype(out_dtype, copy=True)
    # Start just below the minimum so the first bucket includes the smallest values.
    last_value = column_data.min() - 1
    for av, s in zip(absolute_values, score):
        # Masks are computed on the untouched input, never on `output`.
        idx = (column_data <= av) & (column_data > last_value)
        output[idx.values] = s
        last_value = av
    return output

def assign_score(to_df, from_src, by_feature, split_list, with_type, score=None):
    '''
    Score one feature column of `from_src` and store the result in `to_df`.

    Args
        to_df: pandas DataFrame
            frame that receives the scores, written in place to column `by_feature`.
            rows are matched to `from_src` by position, not by index label.
        from_src: pandas DataFrame
            frame holding the raw feature column `by_feature`
        by_feature: str
            name of the column to score
        split_list: list
            cut points handed to the conversion function
        with_type: str
            'percentage' or 'absolute', selects the conversion function
        score: list (Optional)
            scores handed to the conversion function
    Raises
        ValueError: if `with_type` is neither 'percentage' nor 'absolute'
    '''
    # Validate first, so a bad type fails before anything is computed or written.
    if with_type not in ('absolute', 'percentage'):
        raise ValueError(f"with_type must be 'absolute' or 'percentage', got {with_type!r}")
    if with_type == 'percentage':
        output = convert_column_by_percentage(from_src[by_feature], split_list, score)
    else:
        output = convert_column_by_absolute(from_src[by_feature], split_list, score)
    to_df[by_feature] = output.tolist()

### 1. Calculate the importance of each `user`

In [54]:
# Inspect the user table's columns before choosing which ones to score.
active_user_df.columns

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')

In [64]:
# Start the score table from the user ids only, with a fresh 0..n-1 index.
user_score_df = active_user_df[['user_id']].reset_index(drop=True)

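* `review_count` - percentage split, cut points from 0 to 1 in steps of 1/5
* `fans` - percentage split, cut points from 0 to 1 in steps of 1/10
* `elite` - absolute split on fixed thresholds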

In [181]:
# Parallel lists: entry i of each list describes how feature i is scored.
trg_features = ['review_count', 'fans', 'elite']
# Cut points per feature: evenly spaced fractions for the percentage splits,
# explicit thresholds for the absolute split.
trg_split_range = [gen_prange(5), gen_prange(10), [0, 1, 2, 4, 6, 8, 17]]
trg_split_type = ['percentage', 'percentage', 'absolute']

In [185]:
# Score every configured feature and add it as a column of user_score_df.
for i, feature_name in enumerate(trg_features):
    assign_score(
        to_df=user_score_df,
        from_src=active_user_df,
        by_feature=feature_name,
        split_list=trg_split_range[i],
        with_type=trg_split_type[i],
    )

In [186]:
# Preview the first five rows of the per-user score table.
user_score_df.head(5)

Unnamed: 0,user_id,review_count,fans,elite
0,qVc8ODYU5SZjKXVBgXdI7w,5,10,1
1,j14WgRoU_-2ZE1aw1dXrJg,5,10,6
2,2WnXYQFK0hXEoTxPtV2zvg,5,9,4
3,SZDeASXq7o05mMNLshsdIA,3,7,3
4,q_QQ5kBBwlCcbL1s4NVK3g,5,10,6


### 2. Join the importance of `user` with `review`

**Potential issue**:
* the same user may have more than one review of the same restaurant

In [117]:
# Inspect the review table's columns.
review_df.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')

In [138]:
# Keep only the two keys and the star rating. Selecting by name (instead of by
# position) keeps the result independent of the column order in the JSON file.
review_score_df = review_df[['user_id', 'business_id', 'stars']]
# Left join on user_id: every review keeps its row, and a review whose user_id
# is missing from user_score_df gets NaN in the score columns.
review_score_df = review_score_df.join(user_score_df.set_index('user_id'), on='user_id')
review_score_df.head(5)

Unnamed: 0,user_id,business_id,stars,review_count,fans
0,j2wlzrntrbKwyOcOiB3l3w,rBdG_23USc7DletfZ11xGA,4,1,2
1,4hBhtCSgoxkrFgHa4YAD-w,bbEXAEFr4RYHLlZ-HFssTA,5,5,9
2,ZGjgfSvjQK886kiTzLwfLQ,EtKSTHV5Qx_Q7Aur9o4kQQ,5,5,9
3,IKbjLnfBQtEyVzEu8CuOLg,VJEzpfLs_Jnzgqh5A_FVTg,4,5,9
4,DBYhpb5hrAYgQjQaMhNYyQ,oJ4ik-4PZe6gexxW-tSmsw,4,3,3


### 3. Obtain the weighted score of `business` based on `review`
Code reference: https://sparkbyexamples.com/pandas/pandas-groupby-sum-examples/

In [163]:
# Average the star rating and the user scores over all reviews of each business.
# An alternative aggregation that was tried here: .agg(['sum', 'count']).reset_index()
review_restaurant_df = (
    review_score_df
    .drop(columns="user_id")
    .groupby('business_id', as_index=False)
    .mean()
)
review_restaurant_df.head(3)

Unnamed: 0,business_id,stars,review_count,fans
0,---kPU91CF4Lq2-WlRu9Lw,4.0,3.0,7.0
1,--epgcb7xHGuJ-4PUeSLAw,4.0,3.2,6.0
2,--hF_3v1JmU9nlu4zfXJ8Q,4.5,1.75,2.25


In [157]:
# Inspect the business table's columns.
restaurant_df.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

In [167]:
# Attach the per-business mean scores to the list of business ids (left join on business_id).
restaurant_score_df = restaurant_df[['business_id']].join(
    review_restaurant_df.set_index('business_id'),
    on='business_id',
)
restaurant_score_df.head(3)

Unnamed: 0,business_id,stars,review_count,fans
0,MTSW4McQd7CbVtyjqoe9mw,3.914286,3.485714,6.828571
1,CF33F8-E6oudUQ46HnavjQ,4.0,4.0,5.0
2,bBDDEgkFA1Otx9Lfe7BZUQ,3.0,3.0,9.0


In [189]:
# Preview the business table, leaving out its last four columns.
restaurant_df.iloc[:, :-4].head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6
2,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10


### 4. Sum all/parts of the scores for clustering

**Examples**
* sum - `restaurant_score_df[['stars', 'review_count', 'fans']].sum(1)`
* define your own function, as below

In [195]:
def my_combine_function(arr):
    '''
    Multiply all elements of `arr` together.

    Args
        arr: iterable of numbers (e.g. one row passed by DataFrame.apply(axis=1))
    Returns
        the product of the elements, or 1 when `arr` is empty
    '''
    product = 1
    for factor in arr:
        product *= factor
    return product

# Apply the custom combiner row by row (axis=1) over the three score columns.
restaurant_score_df[['stars', 'review_count', 'fans']].apply(my_combine_function, axis=1)

0         93.169586
1         80.000000
2         81.000000
3        250.000000
4         70.000000
            ...    
31352     76.901249
31353    170.625000
31354     90.037037
31355     98.222222
31356     85.750000
Length: 31357, dtype: float64

* combination (e.g., `sum` and self-defined functions)

In [198]:
def weighted_score(arr, weights=(0.4, 0.6)):
    '''
    Weighted sum of the first two elements of `arr`.

    Args
        arr: sequence or pandas Series
            at least two numbers; elements are read by position
        weights: pair of numbers (Optional)
            weights of the first and second element, defaults to (0.4, 0.6)
    Returns
        arr[0]*weights[0] + arr[1]*weights[1]
    '''
    # Read a Series by position explicitly. Rows passed by DataFrame.apply(axis=1)
    # are indexed by column name, and integer keys on such a Series only work
    # through a deprecated positional fallback.
    values = arr.to_numpy() if isinstance(arr, pd.Series) else arr
    return values[0] * weights[0] + values[1] * weights[1]
# Combine three partial scores per business: a product, a weighted sum and a plain sum.
final_score = (
    restaurant_score_df[['stars', 'fans']].apply(my_combine_function, axis=1)
    + restaurant_score_df[['fans', 'review_count']].apply(weighted_score, axis=1)
    + restaurant_score_df[['stars', 'review_count']].sum(axis=1)
)
# Displayed only: the average of the three partial scores is not stored.
final_score / 3

0        12.983946
1        10.800000
2        12.800000
3        22.333333
4        15.333333
           ...    
31352    11.391003
31353    16.508333
31354    12.340741
31355    12.518519
31356    12.133333
Length: 31357, dtype: float64