In [4]:
import json
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Input/output locations, relative to the notebook's working directory.
r_path = "../restaurants_encoded.json"
modeling_path = "../data_modeling.json"
training_path = "../data_training.json"
user_model_path = "user_model_new.json"

##############################################
#
# STEP 1: store restaurants as dict
#
##############################################
# Every line of the file is parsed as a standalone JSON document and the last
# parsed line wins; a file with no lines leaves `restaurants` as an empty dict.
restaurants = dict()
with open(r_path, "r") as data:
    for full in data:
        restaurants = json.loads(full)

# Text vectors: every column from position 1 onward forms the vector, keyed by
# the stringified "restaurant_id" column.
# NOTE(review): the file carries a .tsv extension but is parsed as
# comma-separated -- confirm the separator against the actual file.
restaurant_vectors_df = pd.read_csv("../tdidf_vectors.tsv", sep=",")
restaurant_vectors_dict = dict()
for position in range(len(restaurant_vectors_df)):
    vector = list(restaurant_vectors_df.iloc[position, 1:])
    restaurant_id = str(restaurant_vectors_df["restaurant_id"].iloc[position])
    restaurant_vectors_dict[restaurant_id] = vector

# Topic distributions. The parsed JSON has to be bound to
# `restaurant_topics_dict`: that is the name the membership checks in the
# following cells look up, and leaving it empty makes every restaurant fall
# back to an all-zero topic vector.
# Presumably the JSON is a mapping restaurant_id -> list of topic
# probabilities -- TODO confirm against the file.
restaurant_topics_dict = dict()
with open("../restaurant.topic.prob.distribution.json", "r") as data:
    for full in data:
        restaurant_topics_dict = json.loads(full)

# Alias kept so anything still referring to the previous name keeps working.
restaurant_topics_df = restaurant_topics_dict

In [7]:
##############################################
# STEP 2: build one profile per user. For every feature the profile holds a
# list of [restaurant_value, user_rating] pairs, one pair per rated restaurant
# that is present in `restaurants`.
##############################################
user_restaurant = dict()
user_model = dict()

# Restaurant attributes copied verbatim into each profile. The order is
# significant: it fixes the key order of every profile dict, and match()
# emits one output column per key in that order.
attribute_features = [
    'rCount', 'stars', 'alcohol', 'attire', 'wifi', 'noise', 'smoke', 'age',
    'price', 'music', 'gfm', 'ab', 'park', 'best', 'diet', 'service',
    'policy', 'style',
]

# Lengths of the all-zero fallbacks used when a restaurant has no topic
# distribution / text vector.
n_topics = 50
text_vector_dim = 4660

with open(modeling_path, 'r') as data:
    for full in data:
        # Each line is parsed as its own JSON document mapping
        # user -> sequence of entries whose first two items are read as
        # (restaurant id, rating).
        user_restaurant = json.loads(full)

        users = user_restaurant.keys()
        print(len(users))

        for user in users:
            info = {name: list() for name in attribute_features}
            info['topic_prob'] = list()
            info['text_vector'] = list()

            for restaurant in user_restaurant[user]:
                res_id = restaurant[0]
                res_rate = restaurant[1]

                # Ratings for restaurants we have no record of are skipped.
                if res_id not in restaurants:
                    continue

                attributes = restaurants[res_id]
                # Each feature reads the restaurant key of the same name, so
                # 'alcohol' gets the alcohol value and 'attire' is populated
                # as well (a missing key raises KeyError).
                for name in attribute_features:
                    info[name].append([attributes[name], res_rate])

                if res_id in restaurant_topics_dict.keys():
                    info['topic_prob'].append([restaurant_topics_dict[res_id], res_rate])
                else:
                    info['topic_prob'].append([[0] * n_topics, res_rate])

                if res_id in restaurant_vectors_dict.keys():
                    info['text_vector'].append([restaurant_vectors_dict[res_id], res_rate])
                else:
                    info['text_vector'].append([[0] * text_vector_dim, res_rate])

            user_model[user] = info

print(len(user_model))

280588
280588


In [8]:
def compute_topic_sim(list1,list2):

    '''
    Total Variation distance between two topic distributions:
    half of the summed element-wise absolute difference.
    Identical inputs give 0; the value grows as the inputs diverge.
    Args: two equal-length sequences of numbers
    Returns: a numeric value
    '''

    gap = np.abs(np.asarray(list1) - np.asarray(list2))

    # Built-in sum keeps the plain left-to-right accumulation order.
    return sum(gap) / 2

def compute_vector_sim(list1,list2):

    '''
    Compute the cosine similarity between two restaurants' text vectors
    Args: two equal-length sequences of numbers
    Returns: a numeric value; 0.0 when either vector has zero norm or the
             result is not a number
    '''
    array1 = np.asarray(list1)
    array2 = np.asarray(list2)

    denominator = np.linalg.norm(array1) * np.linalg.norm(array2)

    # An all-zero vector has no direction; report "no similarity" explicitly
    # rather than dividing 0 by 0 and relying on the resulting NaN.
    if denominator == 0:
        return 0.0

    result = np.dot(array1, array2) / denominator

    # NaN can still come from NaN entries in the inputs.
    if np.isnan(result):
        return 0.0
    return result

In [9]:
# NOTE: the names say "training", but every path assigned here points at a
# testing file; `training_path` is rebound to the testing split.
restaurants_path = "../restaurants_encoded.json"
training_path = "../data_testing.json"
training_X = "../testing_X_with_text_features.json"
training_Y = "../testing_Y_with_text_features.json"

# Each line is parsed as a standalone JSON document; the last line wins.
training_list = []
with open(training_path, "r") as handle:
    for line in handle:
        training_list = json.loads(line)

# Fresh copy of the restaurant records, loaded the same way.
restaurants = {}
with open(restaurants_path, "r") as handle:
    for line in handle:
        restaurants = json.loads(line)

In [10]:
# Reload the restaurant records so this cell always starts from the file
# contents (each line parsed as JSON, last line wins).
restaurants = {}
with open(r_path, "r") as handle:
    for line in handle:
        restaurants = json.loads(line)

# Give every restaurant a "text_vector" and a "topic_prob" entry, falling back
# to an all-zero list when no vector / distribution is known for its id.
for rid, record in restaurants.items():
    record["text_vector"] = (
        restaurant_vectors_dict[rid] if rid in restaurant_vectors_dict else [0] * 4660
    )
    record["topic_prob"] = (
        restaurant_topics_dict[rid] if rid in restaurant_topics_dict else [0] * 50
    )

In [20]:
def match(uid, bid):

    '''
    Build the feature vector describing how well restaurant `bid` fits user `uid`.

    For every feature in the user's profile (in the profile's key order) the
    restaurant's value is compared with each [value, rating] pair the user has
    for that feature; the rating-weighted scores are averaged over the number
    of pairs (an empty profile feature yields 0).

    Scoring per pair:
      - "topic_prob":  compute_topic_sim(pair value, restaurant value) * rating
      - "text_vector": compute_vector_sim(pair value, restaurant value) * rating
      - numeric value: rating / |difference|, where a difference of 0 is
                       replaced by 0.5
      - any other (sequence) value: rating / (number of differing positions),
                       where 0 differing positions is replaced by 0.5

    Args: uid - key into the global `user_model`
          bid - key into the global `restaurants`
    Returns: a list of 20 numbers, one per profile feature
    Raises: KeyError if `uid`, `bid` or a profile feature is missing from the
            lookups; AssertionError if the profile does not have 20 features
    '''

    u_info = user_model[uid]
    b_info = restaurants[bid]

    vec = list()

    # The profile's key order defines the column order of the output vector.
    for feature in u_info.keys():

        b_feature = b_info[feature]
        u_feature = u_info[feature]
        value = 0

        # A single if/elif chain: each feature is scored by exactly one rule,
        # so "topic_prob" is not additionally scored by the generic
        # sequence rule at the bottom.
        if feature == "topic_prob":
            for item in u_feature:
                value = value + compute_topic_sim(item[0], b_feature) * item[1]

        elif feature == "text_vector":
            for item in u_feature:
                value = value + compute_vector_sim(item[0], b_feature) * item[1]

        elif isinstance(b_feature, (int, float)):
            for item in u_feature:
                diff = abs(item[0] - b_feature)
                if diff == 0:
                    diff = 0.5  # avoids division by zero; exact match scores 2 * rating
                value = value + 1/diff * item[1]

        else:
            for item in u_feature:
                # Number of positions where the two sequences disagree.
                diff = sum(
                    1 for i in range(len(b_feature)) if b_feature[i] != item[0][i]
                )
                if diff == 0:
                    diff = 0.5  # avoids division by zero; exact match scores 2 * rating
                value = value + 1/diff * item[1]

        # Average over the user's rated restaurants; guard the empty case.
        length = len(u_feature) or 1
        vec.append(value / length)

    assert len(vec) == 20
    return vec

In [21]:
# Feature rows (X) and ratings (Y) for every entry whose user has a profile
# and whose restaurant is known. Each entry is indexed as
# item[0] -> user id, item[1][0] -> restaurant id, item[1][1] -> rating.
X = list()
Y = list()

count = 0  # stays defined even when training_list is empty
for count, item in enumerate(training_list, start=1):
    if item[0] in user_model and item[1][0] in restaurants:
        X.append(match(item[0], item[1][0]))
        Y.append(item[1][1])

    # Progress report: `count` is the number of entries handled so far.
    if count % 5000 == 0:
        print(count)

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000


In [23]:
# Persist the feature rows first, then the ratings, each as one JSON document.
for out_path, payload in ((training_X, X), (training_Y, Y)):
    with open(out_path, 'w') as sink:
        json.dump(payload, sink)