In [1]:
import pandas as pd 
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import json  
import networkx as nx
import csv
from multiprocessing import Pool

### Prepping user and review datasets for assortativity and node neighbor computations for our friend networks

In [2]:
# Read the Yelp user dump and keep only the users who list at least one friend.

Yelp_Users = pd.read_json('YelpDataset/user.json', lines=True)  # one JSON record per line
# assumes every 'friends' entry is list-like, so len() is the friend count -- TODO confirm
Yelp_Users['number of friends'] = Yelp_Users['friends'].map(len)
hasFriends = Yelp_Users['number of friends'] > 0
df_usersNarrow = Yelp_Users.loc[hasFriends, ['user_id', 'friends', 'number of friends']]

In [3]:
# Load the reviews that already carry a latent business category, keep only the
# columns used further down, and drop every row with a missing value.

reviewColumns = ['business_id', 'review_id', 'user_id', 'stars', 'businessLatentCategory']
Yelp_ReviewsWithLatCats = (
    pd.read_csv("Yelp_ReviewsWITHLATENTCATEGORIES.csv")
    .loc[:, reviewColumns]
    .dropna()
)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Distinct user ids that appear in the reviews dataset.
reviewsGroupBy = Yelp_ReviewsWithLatCats.groupby("user_id")
Reviews_setOfUserIds = set(reviewsGroupBy.groups)
print(len(Reviews_setOfUserIds))

# Distinct user ids in the users dataset (already limited to users with at least one friend).
Users_setOfUserIds = set(df_usersNarrow['user_id'].unique())
print(len(Users_setOfUserIds))

1325415
760008


In [7]:
# Keep only the reviews written by users who have at least one friend.
writtenByUserWithFriends = Yelp_ReviewsWithLatCats['user_id'].isin(Users_setOfUserIds)
Yelp_ReviewsWithLatCats = Yelp_ReviewsWithLatCats.loc[writtenByUserWithFriends]

# Every (user_id, businessLatentCategory) pair that occurs in the remaining reviews.
groupbyUSERandCAT = Yelp_ReviewsWithLatCats.groupby(['user_id', 'businessLatentCategory'])
userLatentCats_List = list(groupbyUSERandCAT.groups.keys())

In [11]:
# Split the reviews by author: a dict of per-user review frames (meant to be
# used as node attributes) and the list of users who wrote reviews.
YelpReviews_Users = Yelp_ReviewsWithLatCats.groupby('user_id')
Yelp_ReviewsUserDict = dict(iter(YelpReviews_Users))
ListOfUSERS = list(YelpReviews_Users.groups.keys())

In [7]:
# Reading in edge lists for larger and smaller friend networks
BigYelp = nx.read_edgelist("BigYelpEdgeList.txt")
yelpino = nx.read_edgelist("YelpinoEdgeList.txt")

# Attach each user's review DataFrame to the smaller network as the "Reviews"
# node attribute. The neighbor-rating functions defined later in this notebook
# read that attribute, and no other visible cell sets it, so without this line
# a fresh "Restart & Run All" finds no reviews for any neighbor.
# Keys of the dict that are not nodes of yelpino are ignored by networkx.
nx.set_node_attributes(yelpino, Yelp_ReviewsUserDict, "Reviews")

In [None]:
# ASSORTATIVITY BASED ON STAR AVERAGE

In [47]:
# Mean star rating per user. 'stars' is selected explicitly so the id/category
# string columns never reach the aggregation: recent pandas raises on them
# instead of silently dropping them. The result is still a DataFrame indexed by
# user_id with a single 'stars' column.
ratingMean = Yelp_ReviewsWithLatCats.groupby("user_id")[['stars']].mean()

In [73]:
# user_id -> mean star rating, in the form networkx expects for node attributes.
ratingMeanDict = ratingMean['stars'].to_dict()

In [76]:
# Store every user's mean rating on the smaller network as "AverageStar".
nx.set_node_attributes(yelpino, values=ratingMeanDict, name="AverageStar")

In [77]:
# Assortativity of the smaller friend network with respect to "AverageStar".
# NOTE(review): "AverageStar" is a per-user mean, i.e. a continuous value;
# confirm that the categorical attribute assortativity is what is intended
# here rather than nx.numeric_assortativity_coefficient.
nx.attribute_assortativity_coefficient(yelpino,"AverageStar")

0.027079046453203907

In [None]:
# ASSORTATIVITY BASED ON REVIEW COUNT

In [90]:
# Number of reviews written by each user.
revCountSeries = Yelp_ReviewsWithLatCats.groupby(by="user_id").size()

In [91]:
# user_id -> review count.
revCountSeriesDict = dict(revCountSeries.items())

In [95]:
# Store every user's review count on the smaller network as "ReviewCount".
# The dict built from revCountSeries is named revCountSeriesDict; the name
# previously used here was never defined and raised NameError.
nx.set_node_attributes(yelpino, revCountSeriesDict, "ReviewCount")

In [96]:
# Assortativity of the smaller friend network with respect to "ReviewCount".
nx.attribute_assortativity_coefficient(yelpino, attribute="ReviewCount")

0.026268943031893734

In [None]:
## PREPPING ASSORTATIVITY BASED ON CATEGORY AVERAGE RATINGS

In [99]:
# Distinct latent business categories, in order of first appearance.
busLatCatsList = Yelp_ReviewsWithLatCats['businessLatentCategory'].unique().tolist()

In [120]:
# For every latent business category, build per-user dictionaries of
#   * the mean star rating inside that category      -> siftedListDicts
#   * "enthusiasm" = review count * mean star rating  -> siftedListEnthusDicts
# The per-category review frames themselves are kept in siftedReviewDfsList.
siftedReviewDfsList = []
siftedListDicts = []
siftedListEnthusDicts = []
for latentCat in busLatCatsList:
    siftedDf = Yelp_ReviewsWithLatCats[Yelp_ReviewsWithLatCats['businessLatentCategory'] == latentCat]
    siftedReviewDfsList.append((latentCat, siftedDf))
    print(latentCat)

    # Group once and reuse the grouping for both aggregates.
    reviewsByUser = siftedDf.groupby("user_id")

    # Only 'stars' is averaged: recent pandas raises when asked for the mean of
    # the string id/category columns instead of silently dropping them.
    SiftedAggStarMean = reviewsByUser[['stars']].mean()
    SiftedAggStarMeanDict = dict(zip(SiftedAggStarMean.index, SiftedAggStarMean.stars))
    siftedListDicts.append((latentCat, SiftedAggStarMeanDict))

    SiftedRevCountSeries = reviewsByUser.size()
    SiftedRevCountSeriesDict = SiftedRevCountSeries.to_dict()

    enthusiasmByCatDict = {k: SiftedRevCountSeriesDict[k]*SiftedAggStarMeanDict[k] for k in SiftedAggStarMeanDict}
    siftedListEnthusDicts.append((latentCat, enthusiasmByCatDict))

    print("Done "+latentCat)

Bars
Done Bars
Restaurants
Done Restaurants
Fashion
Done Fashion
Beauty & Spas
Done Beauty & Spas
Pubs
Done Pubs
Active Life
Done Active Life
Cosmetics & Beauty Supply
Done Cosmetics & Beauty Supply
Home & Garden
Done Home & Garden
Sports Clubs
Done Sports Clubs
Used
Done Used
Preschools
Done Preschools
Financial Services
Done Financial Services
Hair Removal
Done Hair Removal
Oil Change Stations
Done Oil Change Stations
Home Cleaning
Done Home Cleaning


In [122]:
# Store each category's per-user mean rating as a node attribute named after the category.
for latentCategory, localDict in siftedListDicts:
    nx.set_node_attributes(yelpino, values=localDict, name=latentCategory)

In [115]:
# Report the attribute assortativity of the smaller network for every category attribute.
for latCat in busLatCatsList:
    coefficient = nx.attribute_assortativity_coefficient(yelpino, latCat)
    print(f"{latCat} assortativity: {coefficient}")

Bars assortativity: 0.02708162228366847
Restaurants assortativity: 0.0403651435345962
Fashion assortativity: 0.07108086867204101
Beauty & Spas assortativity: 0.08342029331960416
Pubs assortativity: 0.06724732150377787
Active Life assortativity: 0.09773029468230386
Cosmetics & Beauty Supply assortativity: 0.05674868615639566
Home & Garden assortativity: 0.09314802793194313
Sports Clubs assortativity: 0.07849327460664403
Used assortativity: 0.0690736806092073
Preschools assortativity: 0.0457145961510417
Financial Services assortativity: 0.0531528655192846
Hair Removal assortativity: 0.09022263489582312
Oil Change Stations assortativity: 0.04593194437626429
Home Cleaning assortativity: 0.06490892747692766


In [123]:
# Store each category's per-user enthusiasm score as the node attribute "<category>_enthusiasm".
for categoryName, localDict in siftedListEnthusDicts:
    latentCategory = categoryName + "_enthusiasm"
    print(latentCategory)
    nx.set_node_attributes(yelpino, values=localDict, name=latentCategory)

Bars_enthusiasm
Restaurants_enthusiasm
Fashion_enthusiasm
Beauty & Spas_enthusiasm
Pubs_enthusiasm
Active Life_enthusiasm
Cosmetics & Beauty Supply_enthusiasm
Home & Garden_enthusiasm
Sports Clubs_enthusiasm
Used_enthusiasm
Preschools_enthusiasm
Financial Services_enthusiasm
Hair Removal_enthusiasm
Oil Change Stations_enthusiasm
Home Cleaning_enthusiasm


In [124]:
# Report the attribute assortativity for every "<category>_enthusiasm" attribute.
for baseCategory in busLatCatsList:
    latCat = baseCategory + "_enthusiasm"
    coefficient = nx.attribute_assortativity_coefficient(yelpino, latCat)
    print(latCat + " enthusiasm assortativity: " + str(coefficient))

Bars_enthusiasm enthusiasm assortativity: 0.02192287124656674
Restaurants_enthusiasm enthusiasm assortativity: 0.03627511539814643
Fashion_enthusiasm enthusiasm assortativity: 0.06788735870720426
Beauty & Spas_enthusiasm enthusiasm assortativity: 0.07471112694138975
Pubs_enthusiasm enthusiasm assortativity: 0.06441924516203301
Active Life_enthusiasm enthusiasm assortativity: 0.09242651536644597
Cosmetics & Beauty Supply_enthusiasm enthusiasm assortativity: 0.053342333502640206
Home & Garden_enthusiasm enthusiasm assortativity: 0.08746606430984903
Sports Clubs_enthusiasm enthusiasm assortativity: 0.07440785056301338
Used_enthusiasm enthusiasm assortativity: 0.06531539563115533
Preschools_enthusiasm enthusiasm assortativity: 0.04274641169593429
Financial Services_enthusiasm enthusiasm assortativity: 0.04954848219294601
Hair Removal_enthusiasm enthusiasm assortativity: 0.08188676823559288
Oil Change Stations_enthusiasm enthusiasm assortativity: 0.04361531503715075
Home Cleaning_enthusiasm

In [8]:
# node -> degree in the smaller network.
smallDegreeDict = {node: degree for node, degree in yelpino.degree()}

In [16]:
# Spot-check: degree of one sample user id in the smaller network.
smallDegreeDict['--2HUmLkcNHZp0xw6AMBPg']

3

In [17]:
# node -> degree in the larger network.
largeDegreeDict = {node: degree for node, degree in BigYelp.degree()}

In [19]:
# Spot-check: degree of the same sample user id in the larger network.
largeDegreeDict['--2HUmLkcNHZp0xw6AMBPg']

58

In [31]:
# One row per node of the smaller network, with its degree in column 'SmallYelp'.
dfDegrees = pd.DataFrame.from_dict(
    data=smallDegreeDict,
    orient='index',
    columns=['SmallYelp'],
)

In [47]:
# Copy the node ids from the index into a regular column so they can be mapped below.
dfDegrees = dfDegrees.assign(index1=dfDegrees.index)

In [54]:
# Preview the first rows of the degree table.
dfDegrees.head()

Unnamed: 0,SmallYelp,index1
oMy_rEb0UBEmMlu-zcxnoQ,2,oMy_rEb0UBEmMlu-zcxnoQ
cvVMmlU1ouS3I5fhutaryQ,18,cvVMmlU1ouS3I5fhutaryQ
nj6UZ8tdGo8YJ9lUMTVWNw,7,nj6UZ8tdGo8YJ9lUMTVWNw
JJ-aSuM4pCFPdkfoZ34q0Q,27,JJ-aSuM4pCFPdkfoZ34q0Q
HVUAmApa0fCbHHVJ0ALshw,29,HVUAmApa0fCbHHVJ0ALshw


In [55]:
# Degree of the same node in the larger network (KeyError if the node is absent from it).
dfDegrees['BigYelp'] = dfDegrees['index1'].map(lambda nodeId: largeDegreeDict[nodeId])

In [59]:
# Pearson correlation between a node's degree in the smaller and in the larger network.
dfDegrees.SmallYelp.corr(other=dfDegrees.BigYelp, method='pearson')

0.8685414501447632

In [58]:
# Spearman rank correlation between the two degree columns.
dfDegrees.SmallYelp.corr(other=dfDegrees.BigYelp, method='spearman')

0.772002464623897

In [28]:
# Call counters, one per neighbor-statistic function (1-4).
count1 = count2 = count3 = count4 = 0
# Per-function tallies of neighbors for which no "Reviews" attribute was found.
noNeighborInfo1 = noNeighborInfo2 = noNeighborInfo3 = noNeighborInfo4 = 0

def neighborsAverageRating(userIdNode):
    """Average, over a user's neighbors in ``yelpino``, of each neighbor's mean star rating.

    Every neighbor that has a non-empty "Reviews" node attribute contributes
    its own mean rating with equal weight.

    Parameters
    ----------
    userIdNode : node id to look up in the module-level graph ``yelpino``.

    Returns
    -------
    tuple ``(average, numNeighbors)``
        ``average`` is NaN when no neighbor has any review.
        ``(nan, 0)`` is returned when the user is not a node of ``yelpino``.

    Side effects: increments the globals ``count1`` / ``noNeighborInfo1`` and
    prints progress and missing-neighbor messages.
    """
    global count1
    global noNeighborInfo1
    count1 += 1
    if count1 % 2 == 0:
        print("1CountPerc: "+str(count1/10))
        # OVER 379868
    if userIdNode not in yelpino:
        # No neighbor list exists for an unknown node, so report zero neighbors.
        return (np.nan, 0)

    listNeighbors = list(nx.all_neighbors(yelpino, userIdNode))
    numNeighbors = len(listNeighbors)
    totalAvStarSum = 0
    totalPeople = 0
    for neighborId in listNeighbors:
        try:
            # Direct per-node lookup; avoids rebuilding an attribute dict of the
            # whole graph for every single neighbor.
            neighborReviews = yelpino.nodes[neighborId]["Reviews"]
        except KeyError:
            print("function 1, count = "+str(count1)+": user: "+userIdNode +", friendId: "+ neighborId)
            noNeighborInfo1 += 1
            print("No Neighbor count: "+str(noNeighborInfo1))
            continue
        if len(neighborReviews) > 0:
            totalAvStarSum += neighborReviews['stars'].sum() / len(neighborReviews['stars'])
            totalPeople += 1

    if totalPeople == 0:
        # Nothing to average; avoid dividing by zero.
        return (np.nan, numNeighbors)
    return (totalAvStarSum/totalPeople, numNeighbors)
    
def neighborsWeightedAverageRating(userIdNode):
    """Review-weighted average star rating over all reviews written by a user's neighbors.

    The stars of every review of every neighbor in ``yelpino`` are pooled, so
    neighbors with more reviews weigh more.

    Parameters
    ----------
    userIdNode : node id to look up in the module-level graph ``yelpino``.

    Returns
    -------
    tuple ``(average, numNeighbors)``
        ``average`` is NaN when the neighbors have no reviews at all.
        ``(nan, 0)`` is returned when the user is not a node of ``yelpino``.

    Side effects: increments the globals ``count2`` / ``noNeighborInfo2`` and
    prints progress and missing-neighbor messages.
    """
    global count2
    global noNeighborInfo2
    count2 += 1
    if count2 % 2 == 0:
        print("2CountPerc: "+str(count2/10))
        # OVER 379868
    if userIdNode not in yelpino:
        # No neighbor list exists for an unknown node, so report zero neighbors.
        return (np.nan, 0)

    listNeighbors = list(nx.all_neighbors(yelpino, userIdNode))
    numNeighbors = len(listNeighbors)
    totalStarSum = 0
    totalReviews = 0
    for neighborId in listNeighbors:
        try:
            # Direct per-node lookup; avoids rebuilding an attribute dict of the
            # whole graph for every single neighbor.
            neighborReviews = yelpino.nodes[neighborId]["Reviews"]
        except KeyError:
            print("function 2, count = "+str(count2)+": user: "+userIdNode +", friendId: "+ neighborId)
            noNeighborInfo2 += 1
            print("No Neighbor count: "+str(noNeighborInfo2))
            continue
        if len(neighborReviews) > 0:
            totalStarSum += neighborReviews['stars'].sum()
            totalReviews += len(neighborReviews['stars'])

    if totalReviews == 0:
        # Always hand back a tuple instead of falling through to None.
        return (np.nan, numNeighbors)
    return (totalStarSum/totalReviews, numNeighbors)
    

def neighborsAverageRatingByBusinessCategory(tupleUserIdCat):
    """Average of the neighbors' mean star ratings inside one latent business category.

    Every neighbor in ``yelpino`` that reviewed at least one business of the
    category contributes its own mean rating for that category with equal weight.

    Parameters
    ----------
    tupleUserIdCat : ``(user_id, businessLatentCategory)`` pair.

    Returns
    -------
    tuple ``(average, numNeighbors)``
        ``average`` is NaN when no neighbor reviewed the category.
        ``(nan, 0)`` is returned when the user is not a node of ``yelpino``.

    Side effects: increments the globals ``count3`` / ``noNeighborInfo3`` and
    prints progress and missing-neighbor messages.
    """
    global count3
    global noNeighborInfo3
    count3 += 1
    userIdInQuestion = tupleUserIdCat[0]
    busLatCat = tupleUserIdCat[1]
    if count3 % 2 == 0:
        print("3CountPerc: "+str(count3/10))
        #770514
    if userIdInQuestion not in yelpino:
        # No neighbor list exists for an unknown node, so report zero neighbors.
        return (np.nan, 0)

    print("3: " + str(userIdInQuestion))
    listNeighbors = list(nx.all_neighbors(yelpino, userIdInQuestion))
    numNeighbors = len(listNeighbors)
    totalAvStarSum = 0
    totalPeople = 0
    for neighborId in listNeighbors:
        try:
            # Direct per-node lookup; avoids rebuilding an attribute dict of the
            # whole graph for every single neighbor.
            neighborReviews = yelpino.nodes[neighborId]["Reviews"]
        except KeyError:
            print("function 3, count = "+str(count3)+": user: "+userIdInQuestion +", friendId: "+ neighborId)
            noNeighborInfo3 += 1
            print("No Neighbor count: "+str(noNeighborInfo3))
            continue
        # Stars of this neighbor's reviews that belong to the requested category.
        inCategory = neighborReviews['businessLatentCategory'] == busLatCat
        categoryStars = neighborReviews.loc[inCategory, 'stars']
        if len(categoryStars) > 0:
            totalAvStarSum += categoryStars.sum() / len(categoryStars)
            totalPeople += 1

    if totalPeople == 0:
        # Always hand back a tuple instead of falling through to None.
        return (np.nan, numNeighbors)
    return (totalAvStarSum/totalPeople, numNeighbors)
    
def neighborsWeightedAverageRatingByBusinessCategory(tupleUserIdCat):
    """Review-weighted average star rating of the neighbors' reviews in one latent category.

    The stars of every review in the category written by any neighbor in
    ``yelpino`` are pooled, so neighbors with more reviews in the category
    weigh more.

    Parameters
    ----------
    tupleUserIdCat : ``(user_id, businessLatentCategory)`` pair.

    Returns
    -------
    tuple ``(average, numNeighbors)``
        ``average`` is NaN when no neighbor reviewed the category.
        ``(nan, 0)`` is returned when the user is not a node of ``yelpino``.

    Side effects: increments the globals ``count4`` / ``noNeighborInfo4`` and
    prints progress and missing-neighbor messages.
    """
    global count4
    global noNeighborInfo4
    count4 += 1
    userIdInQuestion = tupleUserIdCat[0]
    busLatCat = tupleUserIdCat[1]
    if count4 % 2 == 0:
        print("4CountPerc: "+str(count4/10))
        #OVER 770514
    if userIdInQuestion not in yelpino:
        # No neighbor list exists for an unknown node, so report zero neighbors.
        return (np.nan, 0)

    print("4: " + str(userIdInQuestion))
    listNeighbors = list(nx.all_neighbors(yelpino, userIdInQuestion))
    numNeighbors = len(listNeighbors)
    totalStarSum = 0
    totalReviews = 0
    for neighborId in listNeighbors:
        try:
            # Direct per-node lookup; avoids rebuilding an attribute dict of the
            # whole graph for every single neighbor.
            neighborReviews = yelpino.nodes[neighborId]["Reviews"]
        except KeyError:
            print("function 4, count = "+str(count4)+": user: "+userIdInQuestion +", friendId: "+ neighborId)
            noNeighborInfo4 += 1
            print("No Neighbor count: "+str(noNeighborInfo4))
            continue
        # Stars of this neighbor's reviews that belong to the requested category.
        inCategory = neighborReviews['businessLatentCategory'] == busLatCat
        categoryStars = neighborReviews.loc[inCategory, 'stars']
        totalStarSum += categoryStars.sum()
        totalReviews += len(categoryStars)

    if totalReviews == 0:
        # Always hand back a tuple instead of falling through to None.
        return (np.nan, numNeighbors)
    return (totalStarSum/totalReviews, numNeighbors)

    








In [29]:
def testFunction(LIST): 
    List = LIST[0]
    whichCSV = LIST[2]
    if LIST[1] == 1: 
        csvfile = ''
        if whichCSV == 'first':
            csvfile = '1_first.csv'
        elif whichCSV == 'second':
            csvfile = '1_second.csv'
        with open(csvfile, 'a') as newFile:
            newFileWriter = csv.writer(newFile)
            for i in range(len(List)):
                locDic1 = {}
                locDic1[List[i]] = (neighborsAverageRating(List[i]))
                print(locDic1)
                newFileWriter.writerow([List[i], locDic1[List[i]]]) 
        return {}
    
    if LIST[1] == 2: 
        csvfile = ''
        if whichCSV == 'first':
            csvfile = '2_first.csv'
        elif whichCSV == 'second':
            csvfile = '2_second.csv'
        with open(csvfile, 'a') as newFile:
            newFileWriter = csv.writer(newFile)
            for i in range(len(List)):
                locDic2 = {}
                locDic2[List[i]] = (neighborsWeightedAverageRating(List[i]))
                print(locDic2)
                newFileWriter.writerow([List[i], locDic2[List[i]]]) 
        return {}
    
    if LIST[1] == 3: 
        csvfile = ''
        if whichCSV == 'first':
            csvfile = '3_first.csv'
        elif whichCSV == 'second':
            csvfile = '3_second.csv'
        with open(csvfile, 'a') as newFile:
            newFileWriter = csv.writer(newFile)
            for i in range(len(List)):
                locDic3 = {}
                locDic3[List[i]] = (neighborsAverageRatingByBusinessCategory(List[i]))
                print(locDic3)
                newFileWriter.writerow([List[i], locDic3[List[i]]]) 
        return {}
    
    if LIST[1] == 4: 
        csvfile = ''
        if whichCSV == 'first':
            csvfile = '4_first.csv'
        elif whichCSV == 'second':
            csvfile = '4_second.csv'
        with open(csvfile, 'a') as newFile:
            newFileWriter = csv.writer(newFile)
            for i in range(len(List)):
                locDic4 = {}
                locDic4[List[i]] = (neighborsWeightedAverageRatingByBusinessCategory(List[i]))
                print(locDic4)
                newFileWriter.writerow([List[i], locDic4[List[i]]]) 
        return {}


In [None]:
# NEED TO IMPROVE PARALLELIZATION OF TASKS
def f(arg):
    """Run testFunction on one ``(items, mode, whichCSV)`` task; mapped over the pool below."""
    return testFunction(arg)

# Indices at which each input list is cut into its 'first' and 'second' chunk.
# NOTE(review): presumably close to the midpoint of the respective list -- confirm
# against the current list lengths.
USERS_SPLIT = 379868
USER_CATS_SPLIT = 770514

# One task per (statistic, chunk): statistics 1-2 take user ids,
# statistics 3-4 take (user_id, category) pairs.
poolTasks = [
    (ListOfUSERS[:USERS_SPLIT], 1, "first"),
    (ListOfUSERS[USERS_SPLIT:], 1, "second"),
    (ListOfUSERS[:USERS_SPLIT], 2, "first"),
    (ListOfUSERS[USERS_SPLIT:], 2, "second"),
    (userLatentCats_List[:USER_CATS_SPLIT], 3, "first"),
    (userLatentCats_List[USER_CATS_SPLIT:], 3, "second"),
    (userLatentCats_List[:USER_CATS_SPLIT], 4, "first"),
    (userLatentCats_List[USER_CATS_SPLIT:], 4, "second"),
]

# The context manager shuts the worker processes down once map() has returned,
# instead of leaving the pool open for the lifetime of the kernel.
with Pool(processes=8) as pool:
    listOfDicts = pool.map(f, poolTasks)

