In [1]:
import pandas as pd 
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import json  
import networkx as nx
import csv
from multiprocessing import Pool

### Prepping user and review datasets for assortativity and node neighbor computations for our friend networks

In [2]:
# Load the Yelp user table and narrow it to users who list at least one friend.

# The file is read as line-delimited JSON (one record per line).
Yelp_Users = pd.read_json('YelpDataset/user.json', lines=True)

# Friend count = length of each 'friends' entry
# (assumes each entry is a list of friend ids -- TODO confirm for this dataset version).
Yelp_Users['number of friends'] = Yelp_Users['friends'].apply(len)

# Keep rows with a positive friend count and only the columns used downstream.
df_usersNarrow = Yelp_Users.loc[
    Yelp_Users['number of friends'] > 0,
    ['user_id', 'friends', 'number of friends'],
]

In [3]:
# Read the review table that already carries a latent business category per review,
# keep only the columns used below, and drop every row with a missing value.

Yelp_ReviewsWithLatCats = (
    pd.read_csv("Yelp_ReviewsWITHLATENTCATEGORIES.csv")
    .loc[:, ['business_id', 'review_id', 'user_id', 'stars', 'businessLatentCategory']]
    .dropna()
)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Distinct user ids appearing in the reviews table (the group keys of a user_id groupby).
reviewsGroupBy = Yelp_ReviewsWithLatCats.groupby("user_id")
Reviews_setOfUserIds = set(reviewsGroupBy.groups)
print(len(Reviews_setOfUserIds))

# Distinct user ids in the users table, already restricted to users with >= 1 friend.
Users_setOfUserIds = set(df_usersNarrow['user_id'])
print(len(Users_setOfUserIds))

1325415
760008


In [7]:
# Restrict the reviews to those written by users who have at least one friend.
Yelp_ReviewsWithLatCats = Yelp_ReviewsWithLatCats.loc[
    Yelp_ReviewsWithLatCats['user_id'].isin(Users_setOfUserIds)
]

# Every distinct (user_id, businessLatentCategory) pair present in the filtered reviews.
groupbyUSERandCAT = Yelp_ReviewsWithLatCats.groupby(['user_id', 'businessLatentCategory'])
userLatentCats_List = list(groupbyUSERandCAT.groups.keys())

In [11]:
# Group the reviews by author. The resulting {user_id: that user's reviews DataFrame}
# dictionary is meant to be attached to graph nodes as an attribute; the list holds
# every user id that wrote at least one review.
YelpReviews_Users = Yelp_ReviewsWithLatCats.groupby('user_id')
Yelp_ReviewsUserDict = dict(iter(YelpReviews_Users))
ListOfUSERS = list(YelpReviews_Users.groups.keys())

In [7]:
# Reading in edge lists for larger and smaller friend networks
BigYelp = nx.read_edgelist("BigYelpEdgeList.txt")
yelpinho = nx.read_edgelist("YelpinhoEdgeList.txt")

# Attach each user's reviews DataFrame to its yelpinho node under "Reviews".
# The neighbor-rating functions further down read this attribute; without this
# step a fresh kernel would take the "no rating data" branch for every friend.
# Ids in the dictionary that are not nodes of the graph are skipped by networkx.
nx.set_node_attributes(yelpinho, Yelp_ReviewsUserDict, "Reviews")

In [77]:
# ASSORTATIVITY BASED ON STAR AVERAGE

# Mean star rating per user. Only 'stars' is averaged: the remaining columns are
# ids / category labels (presumably strings), and asking pandas >= 2.0 for their
# mean raises a TypeError instead of silently dropping them.
ratingMean = Yelp_ReviewsWithLatCats.groupby("user_id")[["stars"]].mean()
ratingMeanDict = dict(zip(ratingMean.index, ratingMean.stars))
nx.set_node_attributes(yelpinho, ratingMeanDict, "AverageStar")
# NOTE(review): attribute_assortativity_coefficient compares attribute values for
# equality, so every distinct mean acts as its own category -- confirm that this is
# the intended measure for a continuous quantity.
nx.attribute_assortativity_coefficient(yelpinho, "AverageStar")

0.027079046453203907

In [96]:
# ASSORTATIVITY BASED ON REVIEW COUNT

# Number of reviews per user, as {user_id: count}.
revCountSeries = Yelp_ReviewsWithLatCats.groupby("user_id").size()
revCountSeriesDict = revCountSeries.to_dict()
# Use the dictionary built on the previous line; the name passed here before
# (reviewsRevCountSeriesDict) is not defined anywhere and raised a NameError.
nx.set_node_attributes(yelpinho, revCountSeriesDict, "ReviewCount")
nx.attribute_assortativity_coefficient(yelpinho, "ReviewCount")

0.026268943031893734

In [115]:
# ASSORTATIVITY BASED ON CATEGORY AVERAGE RATINGS

# Every latent business category that occurs in the reviews.
busLatCatsList = list(Yelp_ReviewsWithLatCats['businessLatentCategory'].unique())

siftedReviewDfsList = []  # (category, reviews restricted to that category)
siftedListDicts = []      # (category, {user_id: mean stars within that category})
for latentCat in busLatCatsList:
    siftedDf = Yelp_ReviewsWithLatCats[Yelp_ReviewsWithLatCats['businessLatentCategory'] == latentCat]
    siftedReviewDfsList.append((latentCat, siftedDf))
    print(latentCat)

    # Average only the 'stars' column: a blanket agg("mean") over the id / label
    # columns raises a TypeError on pandas >= 2.0.
    SiftedAggStarMean = siftedDf.groupby("user_id")[["stars"]].mean()
    SiftedAggStarMeanDict = dict(zip(SiftedAggStarMean.index, SiftedAggStarMean.stars))
    siftedListDicts.append((latentCat, SiftedAggStarMeanDict))

    # NOTE(review): per-user review counts within the category. Nothing in the
    # visible cells reads these, and each iteration overwrites the previous
    # values -- kept in case a later cell relies on them.
    SiftedRevCountSeries = siftedDf.groupby("user_id").size()
    SiftedRevCountSeriesDict = SiftedRevCountSeries.to_dict()

# Store each category's per-user mean rating as a node attribute named after the category.
for latentCategory, localDict in siftedListDicts:
    nx.set_node_attributes(yelpinho, localDict, latentCategory)

# Report the attribute assortativity of the network for each category attribute.
for latCat in busLatCatsList:
    print(latCat + " assortativity: " + str(nx.attribute_assortativity_coefficient(yelpinho, latCat)))

Bars assortativity: 0.02708162228366847
Restaurants assortativity: 0.0403651435345962
Fashion assortativity: 0.07108086867204101
Beauty & Spas assortativity: 0.08342029331960416
Pubs assortativity: 0.06724732150377787
Active Life assortativity: 0.09773029468230386
Cosmetics & Beauty Supply assortativity: 0.05674868615639566
Home & Garden assortativity: 0.09314802793194313
Sports Clubs assortativity: 0.07849327460664403
Used assortativity: 0.0690736806092073
Preschools assortativity: 0.0457145961510417
Financial Services assortativity: 0.0531528655192846
Hair Removal assortativity: 0.09022263489582312
Oil Change Stations assortativity: 0.04593194437626429
Home Cleaning assortativity: 0.06490892747692766


## Now getting node degrees from yelpinho and BigYelp to compute correlation coefficients

In [47]:
# Degree of every node in each network, as {node_id: degree}.
yelpinhoDegreeDict = dict(yelpinho.degree())
BigYelpDegreeDict = dict(BigYelp.degree())

# One row per yelpinho node: its degree in yelpinho, its own id, and its degree in
# BigYelp (a node missing from BigYelp raises KeyError here).
dfDegrees = pd.DataFrame.from_dict(yelpinhoDegreeDict, orient='index', columns=['yelpinho'])
dfDegrees['shared_nodes'] = dfDegrees.index
dfDegrees['BigYelp'] = [BigYelpDegreeDict[nodeId] for nodeId in dfDegrees['shared_nodes']]

In [59]:
# Pearson correlation between each node's degree in yelpinho and its degree in BigYelp.
dfDegrees['yelpinho'].corr(dfDegrees['BigYelp'], method = 'pearson')

0.8685414501447632

In [58]:
# Spearman (rank) correlation between each node's degree in yelpinho and in BigYelp.
dfDegrees['yelpinho'].corr(dfDegrees['BigYelp'],method = 'spearman')

0.772002464623897

## Neighbor (friend) computations
- Goal: test whether a user's friends influence, or at least correlate with, how that user rates items (businesses).
- For each user I compute the plain average of their friends' ratings (each friend has equal weight) and a weighted average (friends who have written more reviews are weighted more heavily than friends who have written fewer), both over all reviews and over reviews restricted to a specific business latent category.

In [28]:
# Bookkeeping for the four neighbor-rating functions (suffix N belongs to function N):
#   countN          -- number of calls made to function N
#   noNeighborInfoN -- number of friends skipped because no rating data was found
count1 = count2 = count3 = count4 = 0
noNeighborInfo1 = noNeighborInfo2 = noNeighborInfo3 = noNeighborInfo4 = 0

def neighborsAverageRating(userIdNode):
    """Return the unweighted average rating of a user's friends.

    Each friend contributes their own mean star rating once, regardless of how
    many reviews they wrote.

    Parameters
    ----------
    userIdNode : node id looked up in the global ``yelpinho`` graph.

    Returns
    -------
    tuple
        ``(average, numNeighbors)``. ``average`` is ``np.nan`` when the user is
        not in the graph or when no friend has usable rating data;
        ``numNeighbors`` is 0 when the user is not in the graph.

    Side effects
    ------------
    Increments the globals ``count1`` (per call) and ``noNeighborInfo1`` (per
    friend without rating data) and prints a message for each such friend.
    """
    global count1
    global noNeighborInfo1
    count1 += 1

    # A user outside the graph has no neighbors. (This branch used to return a
    # variable that was never assigned, which raised UnboundLocalError.)
    if userIdNode not in yelpinho:
        return (np.nan, 0)

    listNeighbors = list(nx.all_neighbors(yelpinho, userIdNode))
    numNeighbors = len(listNeighbors)
    totalAvStarSum = 0
    totalPeople = 0
    for neighborId in listNeighbors:
        try:  # computing neighbor averages
            # Direct per-node lookup; raises KeyError when the node carries no
            # "Reviews" attribute. Avoids rebuilding the attribute dictionary of
            # the whole graph for every single neighbor.
            neighborReviews = yelpinho.nodes[neighborId]["Reviews"]
            if len(neighborReviews) > 0:
                neighborStars = neighborReviews['stars']
                totalAvStarSum += neighborStars.sum() / len(neighborStars)
                totalPeople += 1
        except KeyError:
            # if we do not have rating data on friend
            print("function 1, count = "+str(count1)+": user: "+userIdNode +", friendId: "+ neighborId)
            noNeighborInfo1 += 1
            print("No Neighbor count: "+str(noNeighborInfo1))

    # No friend with rating data: report NaN instead of dividing by zero.
    if totalPeople == 0:
        return (np.nan, numNeighbors)
    return (totalAvStarSum / totalPeople, numNeighbors)
    
def neighborsWeightedAverageRating(userIdNode):
    """Return the review-weighted average rating of a user's friends.

    All stars given by all friends are pooled and divided by the total number
    of reviews, so friends with more reviews weigh more.

    Parameters
    ----------
    userIdNode : node id looked up in the global ``yelpinho`` graph.

    Returns
    -------
    tuple
        ``(average, numNeighbors)``. ``average`` is ``np.nan`` when the user is
        not in the graph or when the friends have no reviews at all;
        ``numNeighbors`` is 0 when the user is not in the graph.

    Side effects
    ------------
    Increments the globals ``count2`` (per call) and ``noNeighborInfo2`` (per
    friend without rating data) and prints a message for each such friend.
    """
    global count2
    global noNeighborInfo2
    count2 += 1

    # A user outside the graph has no neighbors. (This branch used to return a
    # variable that was never assigned, which raised UnboundLocalError.)
    if userIdNode not in yelpinho:
        return (np.nan, 0)

    listNeighbors = list(nx.all_neighbors(yelpinho, userIdNode))
    numNeighbors = len(listNeighbors)
    totalStarSum = 0
    totalReviews = 0
    for neighborId in listNeighbors:
        try:  # computing neighbor weighted averages
            # Direct per-node lookup; raises KeyError when the node carries no
            # "Reviews" attribute. Avoids rebuilding the attribute dictionary of
            # the whole graph for every single neighbor.
            neighborReviews = yelpinho.nodes[neighborId]["Reviews"]
            if len(neighborReviews) > 0:
                neighborStars = neighborReviews['stars']
                totalStarSum += neighborStars.sum()
                totalReviews += len(neighborStars)
        except KeyError:
            # if we do not have rating data on friend
            print("function 2, count = "+str(count2)+": user: "+userIdNode +", friendId: "+ neighborId)
            noNeighborInfo2 += 1
            print("No Neighbor count: "+str(noNeighborInfo2))

    # Always hand back a tuple: the function used to fall through and return
    # None when no friend had any review.
    if totalReviews == 0:
        return (np.nan, numNeighbors)
    return (totalStarSum / totalReviews, numNeighbors)
    

def neighborsAverageRatingByBusinessCategory(tupleUserIdCat):
    """Return the unweighted average rating of a user's friends within one category.

    Each friend who reviewed at least one business of the category contributes
    their mean star rating in that category once.

    Parameters
    ----------
    tupleUserIdCat : sequence
        ``(user_id, business_latent_category)``; the user id is looked up in the
        global ``yelpinho`` graph.

    Returns
    -------
    tuple
        ``(average, numNeighbors)``. ``average`` is ``np.nan`` when the user is
        not in the graph or when no friend reviewed that category;
        ``numNeighbors`` is 0 when the user is not in the graph.

    Side effects
    ------------
    Increments the globals ``count3`` (per call) and ``noNeighborInfo3`` (per
    friend without rating data) and prints a message for each such friend.
    """
    global count3
    global noNeighborInfo3
    count3 += 1
    userIdNode = tupleUserIdCat[0]
    busLatCat = tupleUserIdCat[1]

    # A user outside the graph has no neighbors. (This branch used to return a
    # variable that was never assigned, which raised UnboundLocalError.)
    if userIdNode not in yelpinho:
        return (np.nan, 0)

    listNeighbors = list(nx.all_neighbors(yelpinho, userIdNode))
    numNeighbors = len(listNeighbors)
    totalAvStarSum = 0
    totalPeople = 0
    for neighborId in listNeighbors:
        try:  # computing neighbor averages by business category
            # Direct per-node lookup; raises KeyError when the node carries no
            # "Reviews" attribute. Avoids rebuilding the attribute dictionary of
            # the whole graph for every single neighbor.
            neighborReviews = yelpinho.nodes[neighborId]["Reviews"]
            # Stars this friend gave within the requested category, selected once
            # with a boolean mask instead of two separate groupby passes.
            inCategory = neighborReviews['businessLatentCategory'] == busLatCat
            categoryStars = neighborReviews.loc[inCategory, 'stars']
            if len(categoryStars) > 0:
                totalAvStarSum += categoryStars.sum() / len(categoryStars)
                totalPeople += 1
        except KeyError:
            # if we do not have rating data on friend
            print("function 3, count = "+str(count3)+": user: "+userIdNode +", friendId: "+ neighborId)
            noNeighborInfo3 += 1
            print("No Neighbor count: "+str(noNeighborInfo3))

    # Always hand back a tuple: the function used to fall through and return
    # None when no friend had reviewed the category.
    if totalPeople == 0:
        return (np.nan, numNeighbors)
    return (totalAvStarSum / totalPeople, numNeighbors)
    
def neighborsWeightedAverageRatingByBusinessCategory(tupleUserIdCat):
    """Return the review-weighted average rating of a user's friends within one category.

    All stars the friends gave to businesses of the category are pooled and
    divided by the number of such reviews, so friends with more reviews in the
    category weigh more.

    Parameters
    ----------
    tupleUserIdCat : sequence
        ``(user_id, business_latent_category)``; the user id is looked up in the
        global ``yelpinho`` graph.

    Returns
    -------
    tuple
        ``(average, numNeighbors)``. ``average`` is ``np.nan`` when the user is
        not in the graph or when no friend reviewed that category;
        ``numNeighbors`` is 0 when the user is not in the graph.

    Side effects
    ------------
    Increments the globals ``count4`` (per call) and ``noNeighborInfo4`` (per
    friend without rating data) and prints a message for each such friend.
    """
    global count4
    global noNeighborInfo4
    count4 += 1
    userIdNode = tupleUserIdCat[0]
    busLatCat = tupleUserIdCat[1]

    # A user outside the graph has no neighbors. (This branch used to return a
    # variable that was never assigned, which raised UnboundLocalError.)
    if userIdNode not in yelpinho:
        return (np.nan, 0)

    listNeighbors = list(nx.all_neighbors(yelpinho, userIdNode))
    numNeighbors = len(listNeighbors)
    totalStarSum = 0
    totalReviews = 0
    for neighborId in listNeighbors:
        try:  # computing neighbor weighted averages by business category
            # Direct per-node lookup; raises KeyError when the node carries no
            # "Reviews" attribute. Avoids rebuilding the attribute dictionary of
            # the whole graph for every single neighbor.
            neighborReviews = yelpinho.nodes[neighborId]["Reviews"]
            # Stars this friend gave within the requested category, selected once
            # with a boolean mask instead of two separate groupby passes.
            inCategory = neighborReviews['businessLatentCategory'] == busLatCat
            categoryStars = neighborReviews.loc[inCategory, 'stars']
            if len(categoryStars) > 0:
                totalStarSum += categoryStars.sum()
                totalReviews += len(categoryStars)
        except KeyError:
            # if we do not have rating data on friend
            print("function 4, count = "+str(count4)+": user: "+userIdNode +", friendId: "+ neighborId)
            noNeighborInfo4 += 1
            print("No Neighbor count: "+str(noNeighborInfo4))

    # Always hand back a tuple: the function used to fall through and return
    # None when no friend had reviewed the category.
    if totalReviews == 0:
        return (np.nan, numNeighbors)
    return (totalStarSum / totalReviews, numNeighbors)

In [29]:
# need to improve parallelization!!

def ratingAverages_UserFriends(LIST):
    """Compute one kind of friend-rating average for a batch of items and append it to a CSV.

    Parameters
    ----------
    LIST : sequence of three elements
        ``LIST[0]`` -- the items to process: user ids for modes 1 and 2,
        ``(user_id, category)`` tuples for modes 3 and 4.
        ``LIST[1]`` -- the mode, 1 to 4, selecting the function applied:
        1 neighborsAverageRating, 2 neighborsWeightedAverageRating,
        3 neighborsAverageRatingByBusinessCategory,
        4 neighborsWeightedAverageRatingByBusinessCategory.
        ``LIST[2]`` -- ``'first'`` or ``'second'``; together with the mode it
        names the output file ``'<mode>_<label>.csv'``.

    Returns
    -------
    dict
        Always an empty dict; the results are only written to the CSV file
        (one ``[item, result]`` row per item, appended) and printed.

    Raises
    ------
    ValueError
        If the mode is not 1-4 or the label is not ``'first'``/``'second'``.
        Previously an unknown mode returned None silently and an unknown label
        ended in ``open('')``.
    """
    List = LIST[0]
    mode = LIST[1]
    whichCSV = LIST[2]

    # Resolved at call time so the four functions only need to exist when the
    # batch is actually run.
    computeByMode = {
        1: neighborsAverageRating,
        2: neighborsWeightedAverageRating,
        3: neighborsAverageRatingByBusinessCategory,
        4: neighborsWeightedAverageRatingByBusinessCategory,
    }
    if mode not in computeByMode:
        raise ValueError("unknown mode (expected 1-4): " + repr(mode))
    if whichCSV not in ('first', 'second'):
        raise ValueError("unknown csv label (expected 'first' or 'second'): " + repr(whichCSV))

    compute = computeByMode[mode]
    csvfile = '%d_%s.csv' % (mode, whichCSV)

    # newline='' is what the csv module asks for; without it the writer's own
    # line endings get translated again on platforms that rewrite '\n'.
    with open(csvfile, 'a', newline='') as newFile:
        newFileWriter = csv.writer(newFile)
        for item in List:
            result = compute(item)
            print({item: result})
            newFileWriter.writerow([item, result])
    return {}

In [None]:
# NEED TO IMPROVE PARALLELIZATION OF TASKS (first attempt)
def f(arg):
    """Pool worker entry point: forward one (items, mode, label) task."""
    return ratingAverages_UserFriends(arg)

# Split each work list in two halves. The midpoints are computed from the current
# list lengths instead of being hard-coded, so the split stays balanced if the
# input data changes; every item is still processed exactly once per mode.
usersMid = len(ListOfUSERS) // 2
userCatsMid = len(userLatentCats_List) // 2

# One task per (mode, half): modes 1-2 take user ids, modes 3-4 take (user, category) pairs.
tasks = [(ListOfUSERS[:usersMid], 1, "first"),
         (ListOfUSERS[usersMid:], 1, "second"),
         (ListOfUSERS[:usersMid], 2, "first"),
         (ListOfUSERS[usersMid:], 2, "second"),
         (userLatentCats_List[:userCatsMid], 3, "first"),
         (userLatentCats_List[userCatsMid:], 3, "second"),
         (userLatentCats_List[:userCatsMid], 4, "first"),
         (userLatentCats_List[userCatsMid:], 4, "second")]

# The context manager shuts the worker processes down once map() has returned;
# the pool was previously left open.
with Pool(processes=8) as pool:
    listOfDicts = pool.map(f, tasks)

