In [None]:
import pandas as pd
import networkx as nx
import scipy as sc
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
matplotlib.rcParams.update({'font.size': 22})
import operator
import json
import seaborn as sns
import scipy.stats as sc
import numpy as np

In [None]:
def _read_csv_in_chunks(path, chunksize=1000):
    """Read the CSV at `path` in `chunksize`-row pieces and return them as one DataFrame.

    The pieces are concatenated with a fresh 0..n-1 index (ignore_index=True).
    """
    chunk_reader = pd.read_csv(path, iterator=True, chunksize=chunksize)
    return pd.concat(chunk_reader, ignore_index=True)


# Load the four Yelp academic dataset tables used throughout the notebook.
df_users = _read_csv_in_chunks('resources/yelp_academic_dataset_user.csv')
df_business = _read_csv_in_chunks('resources/yelp_academic_dataset_business.csv')
df_reviews = _read_csv_in_chunks('resources/yelp_academic_dataset_review.csv')
df_checkin = _read_csv_in_chunks('resources/yelp_academic_dataset_checkin.csv')

In [None]:
# Join every review with its author (on user_id) and then with the reviewed
# business (on business_id). Inner joins keep only reviews for which both
# keys are found in the user and business tables.
df_merge = (
    df_reviews
    .merge(df_users, on='user_id', how='inner')
    .merge(df_business, on='business_id', how='inner')
)

# Question 1: Degree Distribution

In [None]:
# Review graph: the nodes are the businesses and users, and there is an edge
# between a business and a user if that user has reviewed that business.
# Every node is a tuple (name, kind, id) with kind "user" or "business";
# name_x / name_y are the user / business name columns of the merged table.
# The four needed columns are zipped directly instead of using
# DataFrame.iterrows(), which materialises a Series per row and is very slow
# on a table with one row per review. The resulting graph is the same.
G = nx.Graph()
review_edges = zip(df_merge['name_x'], df_merge['user_id'],
                   df_merge['name_y'], df_merge['business_id'])
for user_name, user_id, business_name, business_id in review_edges:
    G.add_edge((user_name, "user", user_id),
               (business_name, "business", business_id))

In [None]:
# Degree histograms: for each node kind, map a degree value to the number of
# nodes of that kind having exactly that degree.
degsUsers = {}
degsBusiness = {}
histogram_for_kind = {"user": degsUsers, "business": degsBusiness}

for node in G.nodes():
    # node[1] is the kind tag stored in the node tuple.
    histogram = histogram_for_kind.get(node[1])
    if histogram is None:
        continue
    degree = G.degree(node)
    histogram[degree] = histogram.get(degree, 0) + 1

# (degree, node count) pairs in ascending degree order, used by the plots below.
itemsU = sorted(degsUsers.items())
itemsB = sorted(degsBusiness.items())

In [None]:
# Business degree distribution with both axes on a linear scale.
# x: degree, y: number of business nodes with that degree (y-axis capped at 2000).
business_degrees = [degree for degree, _ in itemsB]
business_counts = [count for _, count in itemsB]

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(business_degrees, business_counts)
ax.set_ylim(0, 2000)
ax.set_title(" Business Degree Distribution ")
plt.show()

In [None]:
# User degree distribution with both axes on a linear scale.
# x: degree, y: number of user nodes with that degree (y-axis capped at 200).
user_degrees = [degree for degree, _ in itemsU]
user_counts = [count for _, count in itemsU]

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(user_degrees, user_counts)
ax.set_ylim(0, 200)
ax.set_title(" User Degree Distribution ")
plt.show()

In [None]:
# Business degree distribution with both axes on a log scale.
# x: degree, y: number of business nodes with that degree.
business_degrees = [degree for degree, _ in itemsB]
business_counts = [count for _, count in itemsB]

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(business_degrees, business_counts)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title(" Business Degree Distribution in Log Scale")
plt.show()

In [None]:
# User degree distribution with both axes on a log scale.
# x: degree, y: number of user nodes with that degree.
user_degrees = [degree for degree, _ in itemsU]
user_counts = [count for _, count in itemsU]

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(user_degrees, user_counts)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title(" Users Degree Distribution in Log Scale")
plt.show()

1. Both the user and the business degree distributions appear to follow a power law when plotted on a linear scale. However, on plotting both on a log scale we can see that the business degree distribution first increases and then decreases, which shows that it is not a power-law distribution. The user distribution almost fits a straight line on the log scale, hence it can be modelled as a power-law distribution.

2. The distribution shows that most of the nodes have a low degree, and as the degree increases the number of nodes having that degree decreases exponentially.



# Question 2: PageRank

In [None]:
# Using the networkx provided implementation of the pagerank algorithm
# (damping factor alpha=0.85, default uniform prior).
# Networkx converts an undirected graph to a directed graph
# with two directed edges for each undirected edge.
pRank = nx.pagerank(G, alpha=0.85)

# Rank business nodes only. Users are filtered out before sorting rather than
# being given a sort key of 0, so a user node can never appear in the list,
# even when the graph holds fewer than 100 businesses.
# Each entry of sorted_x is (node, pagerank) with node = (name, kind, id).
business_ranks = [(node, score) for node, score in pRank.items()
                  if node[1] == "business"]
sorted_x = sorted(business_ranks, key=lambda item: item[1], reverse=True)[:100]

# Collect the business rows in rank order and concatenate once. Growing a
# frame with pd.concat inside the loop is quadratic, and starting from an
# empty frame coerces the column dtypes to object.
top_business_rows = [df_business[df_business['business_id'] == node[2]]
                     for node, _ in sorted_x]
if top_business_rows:
    df_top_100 = pd.concat(top_business_rows)
else:
    # No business nodes at all: keep the columns, with zero rows.
    df_top_100 = df_business.iloc[0:0]

df_top_100[['name','stars','review_count','attributes_Price Range']]

It can be noted that the top 100 businesses have good ratings and also have a large number of reviews. An interesting observation is that most businesses in this list are on the lower side of the price range, probably because the lower the price, the more people can afford to try it.

In [None]:
# write to json
# Output shape: {"Top1": {"pagerank": <score>, "businessId": <id>}, "Top2": ...}
# Each sorted_x entry is (node, pagerank) and node[2] is the business id.
# Iterating over the entries (at most the first 100) instead of indexing
# 0..99 avoids an IndexError when fewer than 100 businesses were ranked.
data = {
    "Top" + str(rank): {'pagerank': score, 'businessId': node[2]}
    for rank, (node, score) in enumerate(sorted_x[:100], start=1)
}

with open('HW3.json', 'w') as outfile:
    json.dump(data, outfile)

# Question 3 : Ranking Features
Factors which are used to rank businesses 
1. review count
2. average rating
3. number of checkins
4. completeness of profile as measured by the count of Nan values for attributes
5. frequency of reviews i.e number of reviews per year

Price range could also be a good ranking factor, but it is not used because its values are discrete, so many businesses fall into the same price range.

In [None]:
# The business table's own stars column is not continuous, so the average
# rating of each business is computed from the per-review ratings instead
# (the stars_x column of the merged table).
b_rating = df_merge.groupby('business_id')['stars_x'].mean()
df_rating = b_rating.rename('average_rating').reset_index()
# Attach the new feature; the inner join keeps only businesses that have a rating.
df_business = df_business.merge(df_rating, on='business_id', how='inner')

In [None]:
# Profile incompleteness: the number of missing (NaN) cells in each business row,
# counted across every column currently in df_business.
missing_cells = df_business.isna()
df_business['fields_incomplete'] = missing_cells.sum(axis='columns')

In [None]:
# find the checkin count
# The total is the row-wise sum of the numeric columns of the checkin table,
# with missing values skipped.
# numeric_only=True is required: the frame also holds non-numeric columns
# (business_id at least), and recent pandas versions no longer silently drop
# such columns from a row-wise sum.
df_checkin['checkin_count'] = df_checkin.sum(axis=1, skipna=True, numeric_only=True)
df_checkin = df_checkin[['business_id','checkin_count']]
# Inner join: businesses without a checkin row are dropped from df_business.
df_business = pd.merge(df_business,df_checkin,on='business_id',how='inner')

In [None]:
# Rating frequency: for each business, the mean number of reviews per calendar
# year, averaged over the years in which it received at least one review.
df_merge['date'] = pd.to_datetime(df_merge['date'])
review_years = df_merge['date'].dt.year
b_rating_freq = review_years.groupby(df_merge['business_id']).agg(
    lambda years: years.value_counts().mean())
df_rating_freq = b_rating_freq.rename('rating_frequency').reset_index()
# Attach the new feature to the business table.
df_business = df_business.merge(df_rating_freq, on='business_id', how='inner')

In [None]:
# Spearman rank correlation between every pair of ranking features,
# drawn as a heat map on a fixed [-1, 1] colour scale.
cols = ['average_rating','review_count','checkin_count','fields_incomplete','rating_frequency']
correlations = df_business[cols].corr(method='spearman')

fig, ax = plt.subplots(figsize=(20, 10))
heatmap = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(heatmap)

# One tick per feature, labelled with the feature name on both axes.
tick_positions = np.arange(0, 5, 1)
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(cols)
ax.set_yticklabels(cols)
plt.show()

In [None]:
# Table showing the Spearman correlation coefficient for all pairs of ranking features.
# `correlations` is the DataFrame produced by .corr(method='spearman'); leaving it as
# the last expression makes the notebook render it as the cell output.
correlations

In [None]:
# permutation test
# For every pair of ranking features, the second feature is shuffled
# N_PERMUTATIONS times and the Spearman coefficient is recomputed each time.
# The one-sided p-value is the fraction of shuffles whose coefficient is at
# least as extreme as the observed one, in the direction of its sign.
# `sc` is scipy.stats here.
N_PERMUTATIONS = 10000
# Fixed seed so the p-values are the same on every run of the notebook.
permutation_rng = np.random.RandomState(0)

for i in range(0,len(cols)):
    for j in range(i+1,len(cols)):
        result = correlations.loc[cols[j], cols[i]]
        print("spearman coeff. b/w",cols[i],"&",cols[j],":",result)

        # Loop-invariant: extract both columns once, not once per permutation.
        x_values = df_business[cols[i]].values
        y_values = df_business[cols[j]].values
        r_list = [sc.spearmanr(x_values, permutation_rng.permutation(y_values))[0]
                  for _ in range(N_PERMUTATIONS)]

        if result > 0:
            n_extreme = sum(1 for r in r_list if r >= float(result))
        else:
            n_extreme = sum(1 for r in r_list if r <= float(result))
        pvalue = n_extreme / N_PERMUTATIONS

        # Null distribution of the coefficient, with the observed value marked in green.
        # NOTE(review): distplot is deprecated in recent seaborn releases; switch to
        # histplot/displot when the seaborn version in use is upgraded.
        sns.distplot(r_list)
        plt.axvline(x=result,ymax=0.7,color='g',linewidth=4)
        plt.show()
        print("\n p-value calculated from permutation test: " + str(pvalue) + "\n")

The Spearman correlation coefficient calculated for each pair of features and the p-value from the permutation test are shown above.
Since the calculated p-values for all pairs of variables are lower than alpha (the significance level of 0.05), the correlations between all pairs of variables are significant and not likely to have occurred by chance; hence we can reject the null hypothesis.

# Question 4: Pagerank with aggregate ranking features

In [None]:
# Normalize all the ranking features for calculating an aggregate score
def _min_max_normalize(values):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    Computes (values - min) / (max - min) for the whole column at once.
    A constant column (max == min) has no spread to rescale, so it maps to
    zeros instead of dividing by zero.
    """
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return values * 0.0
    return (values - lowest) / spread


# Local names are used for the extremes so the builtins `max` and `min` are
# not rebound for the rest of the notebook.
for feature in ['average_rating', 'review_count', 'checkin_count',
                'fields_incomplete', 'rating_frequency']:
    df_business[feature + '_N'] = _min_max_normalize(df_business[feature])

# Aggregate score: every normalized feature contributes a value in [0, 1].
# fields_incomplete counts missing profile fields, so a higher value means a
# less complete profile; it enters as (1 - value) so that complete profiles
# raise the score instead of lowering it.
df_business['business_score'] = (
    df_business['average_rating_N']
    + df_business['review_count_N']
    + df_business['checkin_count_N']
    + (1 - df_business['fields_incomplete_N'])
    + df_business['rating_frequency_N']
)

In [None]:
# calculate user score
# the user score is based on the usefulness of votes, no. of fans
# and no. of times the person was an elite user

# `elite` comes from the CSV as text, so len() on a cell counts characters,
# not elite years. Count the 4-digit years in the text instead.
# NOTE(review): assumes each cell lists the elite years, e.g. "[2012, 2013]"
# - confirm against the CSV.
elite_year_count = df_users['elite'].astype(str).str.count(r'\d{4}')
raw_user_score = df_users['votes.useful'] + elite_year_count + df_users['fans']

# normalize the score to [0, 1] (min-max), without rebinding the builtins
# `max` and `min`. A constant score has no spread and maps to zeros.
lowest_score = raw_user_score.min()
score_spread = raw_user_score.max() - lowest_score
if score_spread == 0:
    df_users['user_score'] = raw_user_score * 0.0
else:
    df_users['user_score'] = (raw_user_score - lowest_score) / score_spread

# Keep only the columns needed to build the scored graph.
df_users = df_users[['name','user_score','user_id']]

In [None]:
# Rebuild the merged review table so that it carries the user_score and
# business_score columns added to df_users and df_business.
df_merge = (
    df_reviews
    .merge(df_users, on='user_id', how='inner')
    .merge(df_business, on='business_id', how='inner')
)

In [None]:
# Rebuild the user-business review graph with each node's score stored in the
# node tuple: (name, kind, id, score), where kind is "user" or "business".
# The needed columns are zipped directly instead of using DataFrame.iterrows(),
# which materialises a Series per row and is very slow on a table with one
# row per review. The resulting graph is the same.
G = nx.Graph()
scored_edges = zip(df_merge['name_x'], df_merge['user_id'], df_merge['user_score'],
                   df_merge['name_y'], df_merge['business_id'], df_merge['business_score'])
for user_name, user_id, user_score, business_name, business_id, business_score in scored_edges:
    G.add_edge((user_name, "user", user_id, user_score),
               (business_name, "business", business_id, business_score))

In [None]:
# convert the score to a probability
# node[3] is the score stored in each node tuple. The normalising constant is
# the sum of the scores over the graph's nodes, each node counted once, so the
# personalization values add up to 1. Summing the score columns of df_merge
# instead would count a node's score once per review it takes part in.
# NOTE(review): networkx appears to rescale the personalization vector
# itself, so the PageRank output should be unchanged - confirm against the
# installed version.
div = sum(node[3] for node in G)
personalize = {node: node[3] / div for node in G}

In [None]:
# Personalized PageRank: the restart distribution is given by the node scores.
# NOTE(review): alpha is 0.9 here but 0.85 in the uniform-prior run, so the
# two rankings differ in damping factor as well as in prior - confirm that
# this is intended.
pRank_new = nx.pagerank(G, alpha=0.9,personalization=personalize)

# Rank business nodes only. Users are filtered out before sorting rather than
# being given a sort key of 0, so a user node can never appear in the list,
# even when the graph holds fewer than 100 businesses.
# Each entry of sorted_x_new is (node, pagerank).
business_ranks_new = [(node, score) for node, score in pRank_new.items()
                      if node[1] == "business"]
sorted_x_new = sorted(business_ranks_new, key=lambda item: item[1],
                      reverse=True)[:100]

In [None]:
# Business rows for the top-ranked businesses of the personalized PageRank,
# in rank order. Each sorted_x_new entry is (node, pagerank) and node[2] is
# the business id.
# The rows are collected first and concatenated once. Growing a frame with
# pd.concat inside the loop is quadratic, starting from an empty frame coerces
# the column dtypes to object, and indexing 0..99 fails when fewer than 100
# businesses were ranked.
top_business_rows_new = [df_business[df_business['business_id'] == node[2]]
                         for node, _ in sorted_x_new[:100]]
if top_business_rows_new:
    df_top_100_new = pd.concat(top_business_rows_new)
else:
    # Nothing ranked: keep the columns, with zero rows.
    df_top_100_new = df_business.iloc[0:0]

df_top_100_new[['name','average_rating','checkin_count','rating_frequency','review_count',
                'attributes_Price Range']]

Yes, the rankings do differ after incorporating the user and business scores (passed to PageRank as a personalization vector) in the graph.

When the pagerank was run with a uniform distribution prior, the top 100 businesses were the ones with the highest number of user ratings, without taking into account the quality of those ratings. With the second approach, the users were assigned a score based on their activity on Yelp, thereby assigning more weight to reviews from elite users. The businesses were also assigned a score which is an aggregation of different business properties, so the top 100 businesses performed well in a range of metrics and not just the count of reviews.
For example, the average rating for the top-1 business in the first case (Gangnam Asian BBQ Dining) is 4.69, whereas in the second case (Mon Ami Gabi) it is only 4.14; however, this business has a very high check-in count, rating frequency and review count, and hence is a more well-rounded business overall.