In [1]:
import math
import numpy as np
from   numpy import linalg as LA
import scipy
import scipy.io
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
    
from sklearn import datasets     # Machine Learning in Python
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.manifold import MDS 

import torch
import torch.nn as nn            # a neural networks library 
import torch.nn.functional as F  
import torch.optim as optim      # an optimization package to be used with torch.nn

import pandas as pd
import json

np.set_printoptions(precision=4)

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def read_json(filepath):
    with open(filepath) as json_file:      
        data = json_file.readlines()
        data = list(map(json.loads, data)) 
    return data

def build_graph(X, knn=True):
    if knn:
        A = kneighbors_graph(X, n_neighbors=5, include_self=False)
        A = np.array(A.todense())
        A = np.clip(A + A.T, 0, 1)
    else:
        A = pairwise_kernels(X, metric='rbf', gamma=1)
    return A

## Create graphs from yelp dataset 

In [5]:
# Read data 

data_business = read_json('dataset/business.json')
business      = pd.DataFrame(data_business)

df1           = business[['business_id', 'categories', 'latitude', 'longitude', 'review_count', 'stars']]
dfna1         = df1.dropna() #[pd.notnull(df)]
dfna1.isnull().any()


'''data_review   = read_json('dataset/review.json')
review        = pd.DataFrame(data_review)

df2           = review[['business_id', 'user_id']]
dfna2         = df2.dropna() #[pd.notnull(df)]
dfna2.isnull().any()'''


data_checkin  = read_json('dataset/checkin.json')
checkin       = pd.DataFrame(data_checkin)

df3           = checkin
dfna3         = df3.dropna() #[pd.notnull(df)]
dfna3.isnull().any()

business_id    False
time           False
dtype: bool

In [7]:
# selection of the restaurants only
restaurant         = dfna1[dfna1['categories'].astype(str).str.contains("Restaurants")]
business_id_full   = restaurant[['business_id']]
stars              = restaurant[['stars']]

# Evaluation -- Stars
Xeval              = stars.values

In [8]:
# Take a subset of the restaurant 

testN = 1000
business_id = business_id_full[0:testN]

order = {}
for (id, index) in zip(business_id.business_id, range(len(business_id))):
    order[id] = index
    
restaurant_testN = restaurant.iloc[0:testN].copy(deep=True)
restaurant_testN = [restaurant_testN[restaurant_testN['business_id'].astype(str).str.contains(m)] for m in business_id.business_id]
restaurant_testN = pd.concat(restaurant_testN)

stars_textN      = restaurant_testN[['stars']]
Xeval_textN      = np.round(stars_textN.values) - 1 

In [9]:
# -- First Graph --

# Graph link: geographical positions
restaurant_geo_pos   = restaurant_testN[['latitude', 'longitude']]
X1                   = restaurant_geo_pos.values # (54618, 2)
graph1               = build_graph(X1)

# Graph Signal -- Review count
restaurant_rev_count = restaurant_testN[['review_count']]
sig1                 = restaurant_rev_count.values # (54618, 2)

In [None]:
# -- Second Graph -- 

# Graph link: if the same user visited two different restaurants, these resto are linked
N                     = len(business_id)
X2                    = np.zeros((N,N))

dfna2_testN = dfna2.iloc[0:testN].copy(deep=True)
dfna2_testN = [dfna2_testN[dfna2_testN['business_id'].astype(str).str.contains(m)] for m in business_id.business_id]
dfna2_testN = pd.concat(dfna2_testN)

business_groupby_user = dfna2_testN.groupby('user_id')
list_groupby_user     = list(business_groupby_user)

for id_list in range(len(list_groupby_user)):
    sublist0 = list_groupby_user[id_list]
    sublist  = sublist0[1]
    sublist  = sublist['business_id']
        
    for id_busi1 in range(len(sublist)):
        id1 = sublist.iloc[id_busi1]
        idx1 = order[id1]
        
        for id_busi in range(id_busi1+1, len(sublist)):
            id2 = sublist.iloc[id_busi]
            idx2 = order[id2]
            X2[idx1,idx2] += 1
            X2[idx2,idx1] += 1

# Normalization 
graph2 = X2 / np.max(X2)

In [10]:
# Second Graph Signal --  number of Checkin in the restaurant
testN = len(dfna3)
dfna3_testN = dfna3.iloc[0:testN].copy(deep=True)

dfna3_testN = [dfna3_testN[dfna3_testN['business_id'].astype(str).str.contains(m)] for m in business_id.business_id]
dfna3_testN = pd.concat(dfna3_testN)

days = ('Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday')

vect = np.zeros((len(business_id),7))

for bus_id in range(len(dfna3_testN)):
    id1 = dfna3_testN.iloc[bus_id]['business_id']
    idx1 = order[id1]
    
    bus_time = dfna3_testN.iloc[bus_id]['time']
    
    for i, d in enumerate(days):
        v = bus_time.get(d)
        if v is not None:
            vect[idx1,i] = np.sum(v.values())
sig2 = vect

TypeError: float() argument must be a string or a number, not 'dict_values'

In [12]:
graph1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])