# Project task 04:  Restaurant ranking

In [40]:
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize #additional package imported for normalizing the sparse matrix

The goal of this task is to rank restaurants using the **PageRank** algorithm. You are given a directed weighted graph where each node represents one restaurant. The edges in this graph are based on users reviews.

Additionally for each restaurant you are given the categories it belongs to, i.e. 'Mexican', 'Italian', etc. Note that each restaurant can belong to multiple categories.

Considering these categories as topics you will perform **Topic-Specific PageRank**, enabling you to e.g. find the top 10 'Mexican' restaurants.

## 1. Load data

* The graph is stored as a sparse adjacency matrix $A$
* The categories are stored in a binary sparse matrix $C$, with $C_{ij}=1$ indicating that restaurant $i$ belongs to category $j$
* We also provide you with a dictionary mapping each category to its corresponding column index in $C$
* The name of each restaurant is provided as a list, with the i-th element in the list corresponding to the i-th node in the graph

In [41]:
A = sp.load_npz('restaurant_graph.npz')
A

<7073x7073 sparse matrix of type '<class 'numpy.float64'>'
	with 1682844 stored elements in Compressed Sparse Row format>

In [42]:
C = sp.load_npz('restaurant_categories.npz')
C

<7073x138 sparse matrix of type '<class 'numpy.float64'>'
	with 19047 stored elements in Compressed Sparse Row format>

In [43]:
categories = np.load('categories.npy').tolist()
categories['Mexican'], categories['Chinese']

(3, 14)

In [44]:
names = np.load('restaurant_names.npy')
names[:3]

array(['Alize Catering', 'Chula Taberna Mexicana', 'Sunnyside Grill'],
      dtype='<U50')

In [45]:
assert A.shape[0] == len(names) == C.shape[0]
assert C.shape[1] == len(categories)

 ## 2. Determine the teleport set
 

Given a list of topics of intereset, i.e. `['Mexican', 'Italian', ...]`, implement a helper function to return all the restaurants that belong to **at least one** of these topics. These restaurants will become part of the teleport set in Topic-Specific PageRank.

In [46]:
def teleport_set(C, topics, categories):
    """
    Finds the teleport set consisting of restaurants that belong to at least one of the specified topics.
    
    Parameters
    ----------
    C             : sp.spmatrix, shape [num_restaurants, num_categories]
                    Binary matrix encoding which restaurants belongs to which categories.
    topics        : List[string]
                    List of topics of interest.
    categories    : dict(string, int)
                    Dictionary mapping each category to its corresponding column index in C.
        
    Returns
    -------
    teleport_idx : np.array, shape [S]
                   The indicies of the nodes in the teleport set.
    """
    teleport_idx=[] #initialize teleport set
    for z in range (len(topics)): #loop that goes through the given topics 
        current_topic=topics[z] #get the current topic
        current_indice=categories.get(current_topic)#get the corresponding indice for the current topic
        nonzero_rows,nonzero_columns=C.nonzero()#get the rows and columns indices of the nonzero elements of the sparse matrix
        mask=(nonzero_columns==current_indice)#define a mask that is true for all nonzero columns that are equal to the topic indice
        teleport_inter=nonzero_rows[mask] #apply the mask to the nonzero_rows to get the restaurants that corresponds to the wanted topic 
        for t in range (len(teleport_inter)):#write the relevant indices into the new array
            current_value = teleport_inter[t]
            teleport_idx.append(current_value)
        
    
    return teleport_idx

 ## 2. Implement Topic-Specific PageRank

In [47]:
def page_rank(A, beta, teleport_idx=None, eps=1e-12):
    """
    Implements Topic-Specific PageRank using power iteration and sparse matrix operations.
    
    Parameters
    ----------
    A           : sp.spmatrix, shape [num_restaurants, num_restaurants]
                  The adjacency matrix representing the graph of restaurants.
    beta        : float, 
                  0 < beta < 1, (1-beta) is the probabilty of teleporting to the nodes in the teleport set
    teleport_idx: np.array, shape [S]
                  The indicies of the nodes in the teleport set. If it equals to None
                  it means runs standard PageRank, i.e. all nodes are in the teleport set.
    
    Returns
    -------
    r          : np.array, shape [num_restaurants]
                 The page rank vector containing the page rank scores for each restaurant.
    """
    num_restaurants,_ = A.shape #get the number of restaurants from the shape of the adjacency matrix
    vector_add_full = np.ones(num_restaurants)*(1/num_restaurants) #vector to be add during the eigenvalue computation for the full teleport set
    r=np.random.rand((num_restaurants)) #initialize vector r randomly
    flag=True #initialize the flag
    if teleport_idx == None: #check if page ranking should be topic specific or not
            while flag == True:
                r_old = r #define the previous computed r as r_old to compute later the difference between r and r_old
                A = normalize(A, norm='l1', axis=0) #normalize A since A isn't a stochastic matrix yet
                r = beta*A.dot(r)+(1-beta)*vector_add_full#compute the new r following the given equation
                r = r/(np.sum(r))#normalize the new r 
                #check if convergence is reached
                if np.mean((r-r_old)**2)<1e-12: #power iteration stops if the mean of the Euclidean distance between the current and the previous vector is under the given threshold epslion
                    flag=False
    else:
        while flag == True: #perform a topic specific page ranking
            r_old = r #define the previous computed r as r_old to compute later the difference between r and r_old
            vector_add = np.zeros((num_restaurants))# vector to be add is initialized with zeros since only position should be unequal zero
            vector_add[teleport_idx] += 1 #add a one for restaurants that corresponds to the given topics
            A = normalize(A, norm='l1', axis=0) #normalize A since A isn't a stochastic matrix yet
            r = beta * A.dot(r) + ((1-beta)*vector_add)/len(teleport_idx)#compute the new r with adding a vector that is only unequal to zero if the restaurant is part of the teleport_idx set and corresponds to one of the given topics
            r = r/(np.sum(r))#normalize r 
            if np.mean((r-r_old)**2) < 1e-12: #same stopping criteria like above
                flag=False
    
    return r

### 3.1 Calculate the standard PageRank scores and print the names of the top 5 restaurants overall

In [48]:
idx_to_category = {v:k for k, v in categories.items()}

In [49]:
r = page_rank(A=A, beta=0.6, teleport_idx=None)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Congee Me 
  Categories:  ['Korean']
2 Go Go China 
  Categories:  ['Chinese']
3 Sushi Making For the Soul 
  Categories:  ['Japanese']
4 Spring Rolls 
  Categories:  ['African']
5 Happy Tummy Filipino Cuisine 
  Categories:  ['Chinese']


### 3.2 Calculate the standard PageRank scores and print the names of top 5 Mexican restaurants

In [50]:
teleport_idx = teleport_set(C, ['Mexican'], categories)
r = page_rank(A=A, beta=0.6, teleport_idx=teleport_idx)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Chill 
  Categories:  ['Mexican']
2 El Taquito 
  Categories:  ['Mexican']
3 The Atlantic 
  Categories:  ['Fast Food', 'Mexican']
4 Burrito Loco 
  Categories:  ['Mexican']
5 El Takito 
  Categories:  ['Mexican']


### 3.3 Calculate the standard PageRank scores and print the names of top 5 Italian or French restaurants


In [51]:
teleport_idx = teleport_set(C, ['Italian', 'French'], categories)
r = page_rank(A=A, beta=0.6, teleport_idx=teleport_idx)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Ali Baba's Middle Eastern Cuisine 
  Categories:  ['Sandwiches', 'Pizza', 'Italian']
2 New May Hong Yuen BBQ 
  Categories:  ['Italian']
3 Sunnyside Café 
  Categories:  ['French']
4 IPho Vietnamese Cuisine 
  Categories:  ['Italian']
5 McDonald's 
  Categories:  ['Italian']
