# Business Category Construction
In the dataset, each business has a set of business categories. Upon registration to Yelp, a business indicates the business categories that it feels best represent its essence, style, or brand. Wide-reaching and varying in specificity, some example categories include “Comfort Food”, “Seafood”, “Venues and Event Spaces”, “Internet Service”, and “Ophthalmologists”. These categories can provide users with a preliminary, semi-abstract idea of the services a business offers. On the Yelp platform, a business can provide as many or as few categories as the owners or select registrants deem necessary; in the dataset, the number of categories per business ranges from a minimum of 0 to a maximum of 36. I suspect that these categories can provide our deep learning models with relevant perspective on relationships between businesses.

However, due to the overwhelming power of self-identification and all the issues that come with it (indicated in the BusinessAI paper), there was a problem with grouping together similar businesses and discriminating dissimilar ones. To tackle this issue, I devise a method that uses the Nested Stochastic Block Model to create a hierarchical community structure of business categories and thereby accomplish the aforementioned task.

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
from itertools import combinations
import networkx as nx
from glob import glob
from networkx.algorithms import community
import graph_tool.all as gt
import itertools
import csv
import ast
%matplotlib inline

In [None]:
# Read in business information
# lines=True makes pandas parse the file as line-delimited JSON (one record per line).
Yelp_Business = pd.read_json('YelpDataset/business.json',lines=True)

## Step 1: Create weighted undirected network of business categories 
- See p.6 of BusinessAI paper

In [None]:
# cleaning business category data
# original format -> business1: [cat 1, cat 2, ..., cat n]

# making column where the list of categories of business are 
# one string (categories separated by commas)
def listStringtoString(row):
    """Return the row's ``'categories'`` entries joined into one comma-separated string."""
    return ','.join(row['categories'])
    
# One comma-joined category string per business (same joining rule as
# listStringtoString, applied directly to the 'categories' column).
Yelp_Business['categoriesStr'] = Yelp_Business['categories'].map(','.join)

# Every per-business string chained into one long comma-separated string.
business_cats = ','.join(Yelp_Business['categoriesStr'].tolist())

In [None]:
# #UNCOMMENT FOR CHART OF MOST POPULAR CATEGORIES

# # getting dataframe off all the categories and then doing value counts
# catsDF=pd.DataFrame(business_cats.split(','),columns=['category'])
# x=catsDF.category.value_counts()
# #prep for chart
# categoryCounts=x.sort_values(ascending=False)
# x=categoryCounts.iloc[0:15] # top 15 categories
# #expand this for more categories

# #chart
# plt.figure(figsize=(25,12))
# ax = sns.barplot(x.index, x.values, alpha=0.8)#,color=color[5])
# plt.title("What are the top categories?",fontsize=25)
# locs, labels = plt.xticks()
# plt.setp(labels, rotation=80)
# plt.ylabel('# businesses', fontsize=12)
# plt.xlabel('Category', fontsize=12)

# #adding the text labels
# rects = ax.patches
# labels = x.values
# for rect, label in zip(rects, labels):
#     height = rect.get_height()
#     ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
# plt.savefig("TechReportFigures/BusCatCommunities/topCategories.svg")
# plt.show()

In [None]:
# Build the weighted co-occurrence edge list: one row per unordered pair of
# categories, weighted by the number of businesses that list both.

# Every 2-combination of each business's categories. Sorting first gives each
# unordered pair one canonical (source, target) orientation, so we don't get
# the a1_a2 / a2_a1 problem.
Nested2SubListsBus = [list(combinations(sorted(item), 2)) for item in Yelp_Business['categories']]

# Flatten the per-business lists into one list of category pairs.
categoryCombinations = list(itertools.chain.from_iterable(Nested2SubListsBus))

# One row per (business, pair) occurrence.
categoryBusinessDf = pd.DataFrame(categoryCombinations, columns = ['source','target'])

# Count how often each pair occurs. Grouping on both columns (instead of on the
# two names concatenated into a single string) keeps distinct pairs apart even
# when their concatenations coincide, e.g. ('ab', 'c') versus ('a', 'bc').
categoryBusinessDf = (
    categoryBusinessDf
    .groupby(['source','target'])
    .size()
    .reset_index(name='weight')
)


# Put the edge list into a graph-tool graph.
# NOTE(review): nothing is written to disk in this cell, although a later cell
# loads "categoryCommunities/categoryGraph.graphml". Also confirm that
# add_edge_list accepts rows of (name string, name string, weight) in this
# form; graph-tool documents `hashed=True` for non-integer vertex ids and
# `eprops` for extra per-edge columns.
g = gt.Graph(directed=False)
# Add Edges
g.add_edge_list(categoryBusinessDf.values)

In [2]:
#create graph-tool graph
# (this empty graph is replaced by the loaded one two lines below)
catGraph = gt.Graph(directed = False)        

#load in graphml file into catGraph
catGraph = gt.load_graph("categoryCommunities/categoryGraph.graphml")
# list_properties() prints the property maps itself; the trailing "None" in the
# cell output is its return value being printed.
print (catGraph.list_properties())

_graphml_vertex_id (vertex)  (type: string)
_graphml_edge_id (edge)    (type: string)
weight         (edge)    (type: int64_t)
None


## Step 2: Nested Stochastic Block Model
- See p.7

In [3]:
# get Nested BlockState containing hierarchical partition 
# determined by Stochastic Block Model
# NOTE(review): only the graph and deg_corr are passed here, so the 'weight'
# edge property is not handed to the fit -- confirm that is intended.
state=gt.minimize_nested_blockmodel_dl(catGraph,deg_corr=True) 

# print the results: shows the number of nodes and groups in all levels
state.print_summary()

# l = level
# N = number of nodes at level
# B = number of communities at level

l: 0, N: 1293, B: 82
l: 1, N: 82, B: 19
l: 2, N: 19, B: 5
l: 3, N: 5, B: 1


In [4]:
def nameOfNode(row, graph):
    """Look up the entry for vertex ``row`` in the graph's
    ``'_graphml_vertex_id'`` vertex property (the name it was read in with)."""
    vertexNames = graph.vertex_properties['_graphml_vertex_id']
    return vertexNames[row]

def degreeOfNode(row, graph):
    """Return ``out_degree()`` of the vertex that ``graph.vertex(row)`` yields."""
    return graph.vertex(row).out_degree()

def weightedDegreeOfNode(row, graph):
    """Return the weighted degree of vertex ``row``.

    This is the sum of the ``'weight'`` edge property over every edge returned
    by the vertex's ``out_edges()``; a vertex with no such edges yields 0.
    """
    return sum(
        graph.edge_properties['weight'][edge]
        for edge in graph.vertex(row).out_edges()
    )

def nameOfCommunities(communityFixedLevelGroupBy,stateNextLevelGetBlocks,overallDf):
    """Name each lowest-tier community after its highest weighted-degree member.

    communityFixedLevelGroupBy: groupby over the community label; groups are
        fetched by integer id 0 .. n-1.
    stateNextLevelGetBlocks: only its length is used, as the number n of
        communities to name.
    overallDf: frame with a 'Community_1' column; gains 'Community_1_Names'
        and is returned.

    When several members share the maximum weighted degree, the first one in
    the group is used.
    """
    numCommunities = len(list(stateNextLevelGetBlocks))
    labelNames = []
    for communityId in range(numCommunities):
        members = communityFixedLevelGroupBy.get_group(communityId)
        # idxmax returns the index label of the first row holding the maximum.
        topLabel = members['weightedDegreeOfNode'].idxmax()
        labelNames.append(members.at[topLabel, 'nameOfNode'])
    overallDf["Community_1_Names"] = overallDf["Community_1"].map(
        lambda communityId: labelNames[communityId]
    )
    return overallDf


def nameOfCommunitiesGeq2(communityFixedLevelGroupBy,stateNextLevelGetBlocks,overallDf,comInt):
    """Name each community at a level above the first after its top members.

    Every community is labelled with the list of names of its (up to) three
    members with the largest weighted degree, largest first.

    communityFixedLevelGroupBy: groupby over this level's community label;
        groups are fetched by integer id 0 .. n-1.
    stateNextLevelGetBlocks: only its length is used, as the number n of
        communities to name.
    overallDf: frame holding a "Community_<comInt>" column; gains a
        "Community_<comInt>_Names" column and is returned.
    comInt: the level number used to build those two column names.

    Communities with fewer than three members get a shorter list instead of
    raising, and ties in weighted degree no longer repeat names: tied rows are
    taken once each, earliest row first.
    """
    testList = []
    for i in range(len(list(stateNextLevelGetBlocks))):
        subgroup0 = communityFixedLevelGroupBy.get_group(i)
        # nlargest selects whole rows, so each member appears at most once and
        # small groups simply yield fewer rows.
        topMembers = subgroup0.nlargest(3, 'weightedDegreeOfNode')
        testList.append(topMembers['nameOfNode'].tolist())
    comStrNames = "Community_"+str(comInt)+"_Names"
    comStr= "Community_"+str(comInt)
    overallDf[comStrNames] = overallDf[comStr].apply(lambda row: testList[row])
    return overallDf

def _addScaledColumns(frame, sourceCol, stdCol, normCol):
    # Adds a z-score column and a min-max (0..1) column derived from sourceCol.
    values = frame[sourceCol]
    frame[stdCol] = (values - values.mean()) / values.std()
    lowest = values.min()
    frame[normCol] = (values - lowest) / (values.max() - lowest)


def graphToolBlockStateToDataFrame(state,graph,needNodeNames,needDegreesOfNodes,needWeightedDegreesOfNodes,needComNames):
    """Flatten a nested block state into a DataFrame with one row per node.

    state: object whose ``get_levels()`` returns the hierarchy levels; each
        level's ``get_blocks().get_array()`` gives, per node of that level,
        the block it belongs to.
    graph: the graph the state was fitted on; only used by the optional
        name / degree / weighted-degree columns.
    needNodeNames: add 'nameOfNode'.
    needDegreesOfNodes: add 'degreeOfNode' plus 'std_DegreeOfNode' and
        'normalized_DegreeOfNode'.
    needWeightedDegreesOfNodes: add 'weightedDegreeOfNode' plus
        'std_WeightedDegreeOfNode' and 'normalized_WeightedDegreeOfNode'.
    needComNames: add "Community_<k>_Names" columns. The naming helpers read
        'nameOfNode' and 'weightedDegreeOfNode', so those two flags must be
        on as well, and the state needs at least two levels.

    Always present: 'indexOfNode', 'Community_1', and one community column per
    further level, the top-most of which is named 'Overall_Community'.

    Degree statistics are only computed when the degree column was requested;
    previously they ran unconditionally and failed with a KeyError when
    needDegreesOfNodes was False.
    """
    levels = state.get_levels()
    numLevels = len(levels)

    # One property map per level: key = node at that level, value = its block.
    listOfStates = [level.get_blocks() for level in levels]

    # Array indexed by node, holding the first-level block of every node.
    nodesInCluster = pd.Series(listOfStates[0].get_array())
    indexOfNode = pd.Series(nodesInCluster.index) # identity series of node indices

    NodesDf = pd.DataFrame({"indexOfNode":indexOfNode,"Community_1":nodesInCluster})

    if needNodeNames:
        NodesDf['nameOfNode'] = NodesDf['indexOfNode'].apply(lambda row: nameOfNode(row,graph))

    if needDegreesOfNodes:
        NodesDf['degreeOfNode'] = NodesDf['indexOfNode'].apply(lambda row:degreeOfNode(row,graph))

    if needWeightedDegreesOfNodes:
        NodesDf['weightedDegreeOfNode'] = NodesDf['indexOfNode'].apply(lambda row:weightedDegreeOfNode(row,graph))
        _addScaledColumns(NodesDf, 'weightedDegreeOfNode',
                          'std_WeightedDegreeOfNode', 'normalized_WeightedDegreeOfNode')

    # Kept after the weighted block so the column order matches earlier output.
    if needDegreesOfNodes:
        _addScaledColumns(NodesDf, 'degreeOfNode',
                          'std_DegreeOfNode', 'normalized_DegreeOfNode')

    # Walk up the hierarchy: a node's community at level k+1 is the block that
    # its level-k community belongs to.
    for i in range(numLevels - 1):
        parentBlocks = pd.Series(listOfStates[i+1].get_array())
        prevStringCommunity = "Community_"+str(i+1)
        stringCommunity = "Community_"+str(i+2)
        NodesDf[stringCommunity] = NodesDf[prevStringCommunity].apply(parentBlocks.__getitem__)

    # The top-most community column gets a fixed, level-independent name.
    overallCommunityStrCol = "Community_"+ str(numLevels)
    NodesDf = NodesDf.rename(columns = {overallCommunityStrCol:"Overall_Community"})

    if needComNames:
        NodesDf = nameOfCommunities(NodesDf.groupby('Community_1'),listOfStates[1],NodesDf)
        # Levels 2 .. numLevels-1 get name lists; 'Overall_Community' is not named.
        for level in range(2, numLevels):
            levelGroupBy = NodesDf.groupby("Community_"+str(level))
            NodesDf = nameOfCommunitiesGeq2(levelGroupBy,listOfStates[level],NodesDf,level)
    return NodesDf



def informationAboutNodeHierarchy(graphDf, nodeIndexOrName):
    """Return the row(s) of ``graphDf`` describing one node.

    graphDf: hierarchy table with a 'nameOfNode' column.
    nodeIndexOrName: either a row position (0 ... numNodes-1) or a node name.

    Returns a DataFrame: the single row at that position, or every row whose
    'nameOfNode' equals the given name (empty when nothing matches).

    Raises TypeError when the argument is neither an integer nor a string.
    numpy integers and str subclasses are accepted; previously they fell
    through both branches and ended in an UnboundLocalError.
    """
    if isinstance(nodeIndexOrName, str): # given a node name
        return graphDf[graphDf['nameOfNode'] == nodeIndexOrName]
    if isinstance(nodeIndexOrName, (int, np.integer)): # given a node index
        position = int(nodeIndexOrName)
        return graphDf[position:position+1]
    raise TypeError(
        "nodeIndexOrName must be an int index or a str name, got "
        + type(nodeIndexOrName).__name__
    )



def businessAttributeToCommunity(businessCategorySelfIndicatedStr, graphDf, level, string_or_int_Rep):
    """Return the community of one business category at the given level.

    The category's row is looked up in ``graphDf`` via
    informationAboutNodeHierarchy and the first matching row is used.

    string_or_int_Rep == "string": value of "Community_<level>_Names".
    string_or_int_Rep == "int":    value of "Community_<level>".
    Any other value returns None.
    """
    matches = informationAboutNodeHierarchy(graphDf, businessCategorySelfIndicatedStr)
    firstLabel = matches.index[0]
    communityCol = "Community_" + str(level)
    if string_or_int_Rep == "string":
        return matches[communityCol + "_Names"].at[firstLabel]
    if string_or_int_Rep == "int":
        return matches[communityCol].at[firstLabel]
    return None

In [5]:
# Build the per-category hierarchy table with every optional column enabled
# (node names, degrees, weighted degrees, community names).
busCatDf = graphToolBlockStateToDataFrame(state,catGraph,True,True,True,True)

In [6]:
busCatDf.head(3)

Unnamed: 0,indexOfNode,Community_1,nameOfNode,degreeOfNode,weightedDegreeOfNode,std_WeightedDegreeOfNode,normalized_WeightedDegreeOfNode,std_DegreeOfNode,normalized_DegreeOfNode,Community_2,Community_3,Overall_Community,Community_1_Names,Community_2_Names,Community_3_Names
0,0,0,& Probates,44,427,-0.204427,0.002725,-0.328852,0.047461,0,0,0,Lawyers,"[Electronics, IT Services & Computer Repair, M...","[Electronics, IT Services & Computer Repair, M..."
1,1,0,Accountants,51,988,-0.129468,0.006313,-0.253697,0.055188,0,0,0,Lawyers,"[Electronics, IT Services & Computer Repair, M...","[Electronics, IT Services & Computer Repair, M..."
2,2,1,Antiques,174,2178,0.029534,0.013925,1.066883,0.190949,1,1,0,Used,"[Books, Mags, Music & Video, Books, Mags, Musi...","[Automotive, Fashion, Home & Garden]"


### Get capacities of the first level communities

In [7]:
# Capacity = number of categories (non-null 'nameOfNode' entries) in each
# first-level community, indexed by 'Community_1'.
busCatDf1 = busCatDf.groupby("Community_1")['nameOfNode'].count().to_frame("Capacity")

In [8]:
busCatDf1[:35]

Unnamed: 0_level_0,Capacity
Community_1,Unnamed: 1_level_1
0,39
1,10
2,5
3,6
4,9
5,8
6,18
7,19
8,8
9,11


In [9]:
busCatDf1[35:]

Unnamed: 0_level_0,Capacity
Community_1,Unnamed: 1_level_1
35,8
36,9
37,24
38,8
39,18
40,12
41,5
42,33
43,6
44,9


### Get business categories that are members of specified first level communities

In [10]:
# Distinct (Community_1, nameOfNode) pairs as a flat two-column table, ordered
# by community and then by name; the group sizes themselves are discarded.
busCatDf2 = (
    busCatDf
    .groupby(['Community_1','nameOfNode'])
    .size()
    .reset_index()[['Community_1','nameOfNode']]
)

In [75]:
busCatDf2[busCatDf2['Community_1'] == 50] # example

Unnamed: 0,Community_1,nameOfNode
819,50,Cosmetic Dentists
820,50,Dentists
821,50,Eyewear & Opticians
822,50,General Dentistry
823,50,Laser Eye Surgery/Lasik
824,50,Ophthalmologists
825,50,Optometrists
826,50,Orthodontists
827,50,Orthotics
828,50,Urologists


In [97]:
# save our Category graph to csv
# (with to_csv's defaults the row index is written as an unnamed first column)

busCatDf.to_csv("categoryCommunities/categoryHierarchyDf.csv")

In [98]:
# FROM HERE YOU CAN READ IN THE CSV.

#busCatDf = pd.read_csv("categoryCommunities/categoryHierarchyDf.csv")
#busCatDf.head()

## Iterative Naming Process
- We now have a hierarchical structure of business categories and a system for naming the communities. But each business has multiple categories, and these categories can belong to different communities. To navigate this, we will run the NSBM an arbitrary number of times, and for each iteration, we will keep track of which communities the categories belong to. The community that appears the most frequently will be the business' community.

In [12]:
# Reload the raw business table (line-delimited JSON).
Yelp_Business = pd.read_json('YelpDataset/business.json',lines=True)
businessCatSeries = Yelp_Business['categories']
# Give every business its own new, empty dict; the lambda runs once per row,
# so no dict object is shared between businesses.
Yelp_Business['categoryDictCounts'] = Yelp_Business['categories'].apply(lambda x: {})

In [52]:
# LEVEL 1 DICT COUNTS; function to get the level one communities (labels) of each category that a business indicates
# counts the number of occurences of each of the community labels

def categoriesToDictCounts1(row):
    """Add the level-1 community label of each of the row's categories to its counts.

    row: a business row providing 'categories' (iterable of category names)
        and 'categoryDictCounts' (dict mapping label -> count).

    Returns the same dict object that was stored in the row, updated in
    place, so repeated calls accumulate counts.

    All labels are looked up before any count is changed, so a failed lookup
    leaves the dict untouched.

    NOTE(review): this reads and updates 'categoryDictCounts', the same column
    the level-2 counter uses, while getKeyWithMaxValueLevel1 reads
    'catDictCountsLevel1' -- confirm which column level-1 counts belong in.
    """
    Row0Dict1 = row['categoryDictCounts']
    labels = [
        businessAttributeToCommunity(category,busCatDf,1,"string") # level 1
        for category in row['categories']
    ]
    for label in labels:
        # The original tested membership against an undefined name (Row0Dict)
        # and raised NameError; the counts dict itself is the right target.
        Row0Dict1[label] = Row0Dict1.get(label, 0) + 1
    return Row0Dict1


In [29]:
# LEVEL 2 DICT COUNTS; function to get the level two communities (labels) of each category that a business indicates
# counts the number of occurences of each of the community labels

def categoriesToDictCounts(row):
    """Add the level-2 community names of each of the row's categories to its counts.

    For every category in ``row['categories']`` the level-2 "string"
    representation is looked up (a sequence of names), and each name in it
    bumps its entry in ``row['categoryDictCounts']`` by one.

    Returns that same dict object, updated in place.
    """
    categoryNames = row['categories']
    tallies = row['categoryDictCounts']
    # Resolve every category first; counting only starts once all lookups succeeded.
    nameListPerCategory = [
        businessAttributeToCommunity(category,busCatDf,2,"string") #level 2
        for category in categoryNames
    ]
    for nameList in nameListPerCategory:
        for communityName in nameList:
            tallies[communityName] = tallies.get(communityName, 0) + 1
    return tallies


In [None]:
# Running 17 more times here (18 times total) where at each iteration, we run the NSBM, get the new state,
# rebuild the category -> community table from that state, and add the
# communities of the business categories to the frequency counts in the dictionary

for count in range(1, 18):
    # get Nested BlockState containing hierarchical partition determined by Stochastic Block Model
    state=gt.minimize_nested_blockmodel_dl(catGraph,deg_corr=True) 

    # print the results: shows the number of nodes and groups in all levels
    state.print_summary()

    # Refresh the lookup table used by categoriesToDictCounts. Without this the
    # new state is never consulted and every iteration would re-add exactly the
    # same labels from the previous table.
    busCatDf = graphToolBlockStateToDataFrame(state,catGraph,True,True,True,True)

    Yelp_Business['categoryDictCounts'] = Yelp_Business.apply(categoriesToDictCounts,axis = 1)

In [55]:
import operator

# get the key with the max value 

def getKeyWithMaxValue(row):
    """Return the key with the largest count in ``row['categoryDictCounts']``.

    Ties go to the key that comes first in the dict's iteration order.
    An empty dict yields None.
    """
    counts = row['categoryDictCounts']
    if counts == {}:
        return None
    return max(counts, key=counts.get)
    
def getKeyWithMaxValueLevel1(row):
    """Return the key with the largest count in ``row['catDictCountsLevel1']``.

    Ties go to the key that comes first in the dict's iteration order.
    An empty dict yields None.
    """
    levelOneCounts = row['catDictCountsLevel1']
    if levelOneCounts == {}:
        return None
    return max(levelOneCounts, key=levelOneCounts.get)
    

In [425]:
# Most frequent label in each business's 'categoryDictCounts' (None when the dict is empty).
Yelp_Business["businessLatentCategory"] = Yelp_Business.apply(getKeyWithMaxValue,axis=1)
# NOTE(review): getKeyWithMaxValueLevel1 reads a 'catDictCountsLevel1' column;
# no cell shown here creates it -- confirm it exists before running this line.
Yelp_Business['businessCatLevel1'] = Yelp_Business.apply(getKeyWithMaxValueLevel1,axis=1)
Yelp_Business.to_csv("BusinessesWithLatentCategories.csv")