In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
#import seaborn as sns
from itertools import combinations
import networkx as nx
from glob import glob
from networkx.algorithms import community
import graph_tool.all as gt
import itertools
import csv
import ast
%matplotlib inline

In [2]:
#create graph-tool graph
catGraph = gt.Graph(directed = False)        

#load in graphml file into catGraph
catGraph = gt.load_graph("categoryCommunities/categoryGraph.graphml")
print (catGraph.list_properties())

_graphml_vertex_id (vertex)  (type: string)
_graphml_edge_id (edge)    (type: string)
weight         (edge)    (type: int64_t)
None


In [3]:
# get Nested BlockState containing hierarchical partition determined by Stochastic Block Model
state=gt.minimize_nested_blockmodel_dl(catGraph,deg_corr=True) 

# print the results: shows the number of nodes and groups in all levels
state.print_summary()

# l = level
# N = number of nodes at level
# B = number of groups at level

l: 0, N: 1293, B: 82
l: 1, N: 82, B: 19
l: 2, N: 19, B: 5
l: 3, N: 5, B: 1


In [4]:
q = state.get_levels()[0]
q = q.get_blocks() #
w = q.get_array()
w = pd.Series(w)
len(list(w.value_counts())),len(set(list(w.value_counts())))

(97, 30)

In [62]:
#grab the position array of the plot. It contains a position property map, and a hierarchical structure of the graph
#display the nested hierarchical graph
pos_c = gt.draw_hierarchy(state,output="BusinessCategorySBM.png",vertex_size=5,
                          output_size=(4024, 4024))

ExpatError: no element found: line 1, column 0

In [9]:
w.value_counts()

28    129
59     70
62     61
64     51
0      41
38     40
63     37
65     36
53     35
19     35
32     33
30     32
27     31
20     29
34     26
12     25
22     23
54     22
10     22
55     21
16     21
9      21
29     21
15     20
25     20
23     20
56     19
36     19
43     18
42     18
     ... 
1      12
14     12
46     12
2      11
17     10
45      9
33      9
48      9
40      9
35      8
39      8
31      7
26      7
3       7
4       7
5       7
60      6
50      6
51      6
6       5
18      5
52      5
37      5
44      4
13      4
24      4
8       3
21      3
11      3
41      2
Length: 66, dtype: int64

In [4]:
def nameOfNode(row, graph):
    # returns the name of the node based on how it was read in
    return graph.vertex_properties['_graphml_vertex_id'][row]

def degreeOfNode(row, graph):
    # returns the degree of a node
    nodeIndex = graph.vertex(row)
    return nodeIndex.out_degree()

def weightedDegreeOfNode(row, graph):
    # This function returns the weighted degree of a node in an undirected graph
    v = graph.vertex(row)
    sumWeights = 0
    for e in v.out_edges():
        sumWeights+=graph.edge_properties['weight'][e]
    return sumWeights    

def nameOfCommunities(communityFixedLevelGroupBy,stateNextLevelGetBlocks,overallDf):
    testList = []
    for i in range(len(list(stateNextLevelGetBlocks))): # == 57, state 1 nodes
        subgroup0 = communityFixedLevelGroupBy.get_group(i)
        xx = subgroup0['weightedDegreeOfNode']
        maxcommunity = xx.max()
        listOfIndexMax = subgroup0.index[subgroup0['weightedDegreeOfNode'] == maxcommunity].tolist()
        indexOfMax = listOfIndexMax[0]
        nameMaxAppears = subgroup0['nameOfNode'][indexOfMax]
        testList.append(nameMaxAppears)
    overallDf["Community_1_Names"] = overallDf["Community_1"].apply(lambda row: testList[row])
    return overallDf


def nameOfCommunitiesGeq2(communityFixedLevelGroupBy,stateNextLevelGetBlocks,overallDf,comInt):
    testList = []
    for i in range(len(list(stateNextLevelGetBlocks))): # somthing wrong her
        subgroup0 = communityFixedLevelGroupBy.get_group(i)
        xx = subgroup0['weightedDegreeOfNode']
        max3communityList = list(xx.nlargest(3)) 
        localList = []
        for k in range(3):
            listOfIndexMax = subgroup0.index[subgroup0['weightedDegreeOfNode'] == max3communityList[k]].tolist()
            localList.append(listOfIndexMax)
        localList = [item for sublist in localList for item in sublist]
        indicesOfMaxs = [item for item in localList]
        namesMaxAppearsList = [subgroup0['nameOfNode'][index] for index in indicesOfMaxs]
        testList.append(namesMaxAppearsList)
    comStrNames = "Community_"+str(comInt)+"_Names"
    comStr= "Community_"+str(comInt)
    overallDf[comStrNames] = overallDf[comStr].apply(lambda row: testList[row])
    return overallDf

def graphToolBlockStateToDataFrame(state,graph,needNodeNames,needDegreesOfNodes,needWeightedDegreesOfNodes,needComNames):
    # receives a graph-tool graph and returns a dataframe with relevant information
    
    listOfStates = []
    for i in range(len(state.get_levels())):
        listOfStates.append(state.get_levels()[i]) # BlockState object at first level with blocks and vertices
        listOfStates[i] = listOfStates[i].get_blocks() # Property Map with key as vertex and value as group
        
    state0 = listOfStates[0]

    state1 = listOfStates[1]

    nodesInCluster = state0.get_array() 
    # Returns array of length = # of nodes where indices are nodes and values are groups B it belongs to

    nodesInCluster = pd.Series(nodesInCluster) #converting to series

    indexOfNode = pd.Series(nodesInCluster.index) #series of indices of nodes (identity series)

    data = {"indexOfNode":indexOfNode,"Community_1":nodesInCluster} 
    # data that is going to initially be fed to dataframe (index of Node and first super group/community)

    NodesDf = pd.DataFrame(data)
    if needNodeNames:
        NodesDf['nameOfNode'] = NodesDf['indexOfNode'].apply(lambda row: nameOfNode(row,graph)) 
       # create column of names of nodes
    if needDegreesOfNodes:
        NodesDf['degreeOfNode'] = NodesDf['indexOfNode'].apply(lambda row:degreeOfNode(row,graph))
    
    if needWeightedDegreesOfNodes:
        NodesDf['weightedDegreeOfNode'] = NodesDf['indexOfNode'].apply(lambda row:weightedDegreeOfNode(row,graph))
        
        weightedDegreeSeries = NodesDf['weightedDegreeOfNode']
        weightedDegreeMean = weightedDegreeSeries.mean()
        weightedDegreeMax = weightedDegreeSeries.max() 
        weightedDegreeMin = weightedDegreeSeries.min()
        weightedDegreeStd = weightedDegreeSeries.std()
        
        NodesDf['std_WeightedDegreeOfNode'] = NodesDf['weightedDegreeOfNode'].apply(lambda x: (x - weightedDegreeMean)/weightedDegreeStd)
        NodesDf['normalized_WeightedDegreeOfNode'] = NodesDf['weightedDegreeOfNode'].apply(lambda x:(x-weightedDegreeMin)/(weightedDegreeMax-weightedDegreeMin))   
    
            
    DegreeSeries = NodesDf['degreeOfNode']
    DegreeMean = DegreeSeries.mean()
    DegreeMax = DegreeSeries.max() 
    DegreeMin = DegreeSeries.min()
    DegreeStd = DegreeSeries.std()
        
    NodesDf['std_DegreeOfNode'] = NodesDf['degreeOfNode'].apply(lambda x: (x - DegreeMean)/DegreeStd)
    NodesDf['normalized_DegreeOfNode'] = NodesDf['degreeOfNode'].apply(lambda x:(x-DegreeMin)/(DegreeMax-DegreeMin))  
    
    for i in range(len(state.get_levels())-1):
        localState = state.get_levels()[i+1]
        localState = localState.get_blocks() # Property Map with key as vertex and value as group
        Cluster = localState.get_array()
        Cluster = pd.Series(Cluster)
        prevStringCommunity = "Community_"+str(i+1)
        stringCommunity = "Community_"+str(i+2)
        NodesDf[stringCommunity] = NodesDf[prevStringCommunity].apply(lambda row: Cluster[row])
        overallCommunityStrCol = "Community_"+ str(len(state.get_levels()))
    NodesDf = NodesDf.rename(columns = {overallCommunityStrCol:"Overall_Community"})
    
    if needComNames:
        communityGroupBys = []
        for i in range(len((state.get_levels()))-1): 
            w = i+2  #2, 3
            if w == len(state.get_levels()):
                communityGroupByStr = "Overall_Community"
            else:
                communityGroupByStr = "Community_"+str(w)
            communityGroupBys.append(NodesDf.groupby(communityGroupByStr))
        community1GroupBy = NodesDf.groupby('Community_1') # COMMUNITY 1 
        NodesDf = nameOfCommunities(community1GroupBy,state1,NodesDf)
        for i in range(len(communityGroupBys)-1):  ######   SOME PROBLEM HERE!!!  -2  #### 
            NodesDf = nameOfCommunitiesGeq2(communityGroupBys[i],listOfStates[i+2],NodesDf,i+2)
    return NodesDf



def informationAboutNodeHierarchy(graphDf, nodeIndexOrName):
    # Takes a graphDf and index of node (0 ... numNodes-1) and returns the row information
    if type(nodeIndexOrName) == int: # given a node index
        rowInQuestion = graphDf[nodeIndexOrName:nodeIndexOrName+1]
    elif type(nodeIndexOrName) == str:
        rowInQuestion = graphDf[graphDf['nameOfNode'] == nodeIndexOrName]
    return rowInQuestion



def businessAttributeToCommunity(businessCategorySelfIndicatedStr, graphDf, level, string_or_int_Rep):
    rowOfCat = informationAboutNodeHierarchy(graphDf,businessCategorySelfIndicatedStr)
    localIndex = rowOfCat.index
    localIndex = localIndex[0]
    intRepCommunityCol = "Community_"+str(level)
    strRepCommunityCol = "Community_"+str(level)+"_Names"
    if string_or_int_Rep == "string":
        temp = rowOfCat[strRepCommunityCol]
        return temp.at[localIndex]
    elif string_or_int_Rep == "int":
        temp = rowOfCat[intRepCommunityCol]
        return temp.at[localIndex]
        
        

In [5]:
busCatDf = graphToolBlockStateToDataFrame(state,catGraph,True,True,True,True)

In [6]:
busCatDf.head(3)

Unnamed: 0,indexOfNode,Community_1,nameOfNode,degreeOfNode,weightedDegreeOfNode,std_WeightedDegreeOfNode,normalized_WeightedDegreeOfNode,std_DegreeOfNode,normalized_DegreeOfNode,Community_2,Community_3,Overall_Community,Community_1_Names,Community_2_Names,Community_3_Names
0,0,0,& Probates,44,427,-0.204427,0.002725,-0.328852,0.047461,0,0,0,Lawyers,"[Electronics, IT Services & Computer Repair, M...","[Electronics, IT Services & Computer Repair, M..."
1,1,0,Accountants,51,988,-0.129468,0.006313,-0.253697,0.055188,0,0,0,Lawyers,"[Electronics, IT Services & Computer Repair, M...","[Electronics, IT Services & Computer Repair, M..."
2,2,1,Antiques,174,2178,0.029534,0.013925,1.066883,0.190949,1,1,0,Used,"[Books, Mags, Music & Video, Books, Mags, Musi...","[Automotive, Fashion, Home & Garden]"


### Get counts of the first level communities

In [7]:

busCatDf1 = busCatDf
busCatDf1 = busCatDf1[['Community_1','nameOfNode']]
busCatDf1.columns = ["Community_1","Capacity"]
busCatDf1 = busCatDf1.groupby("Community_1").agg('count')


In [8]:
busCatDf1[:35]

Unnamed: 0_level_0,Capacity
Community_1,Unnamed: 1_level_1
0,39
1,10
2,5
3,6
4,9
5,8
6,18
7,19
8,8
9,11


In [9]:
busCatDf1[35:]

Unnamed: 0_level_0,Capacity
Community_1,Unnamed: 1_level_1
35,8
36,9
37,24
38,8
39,18
40,12
41,5
42,33
43,6
44,9


### Get categories of the first level communities

In [10]:
busCatDf2 = busCatDf
busCatDf2 = busCatDf2[['Community_1','nameOfNode','degreeOfNode','weightedDegreeOfNode']]
busCatDf2 = busCatDf2.groupby(['Community_1','nameOfNode']).count()

In [11]:
busCatDf2 = busCatDf2.drop(['degreeOfNode','weightedDegreeOfNode'],axis = 1)

In [12]:
busCatDf2 = busCatDf2.reset_index()

In [75]:
busCatDf2[busCatDf2['Community_1'] == 50]

Unnamed: 0,Community_1,nameOfNode
819,50,Cosmetic Dentists
820,50,Dentists
821,50,Eyewear & Opticians
822,50,General Dentistry
823,50,Laser Eye Surgery/Lasik
824,50,Ophthalmologists
825,50,Optometrists
826,50,Orthodontists
827,50,Orthotics
828,50,Urologists


In [23]:
busCatDf2.to_csv("Making_NSBMVisual_Cats.csv")

In [22]:
busCatDf['Community_2'].unique()

array([ 0, 10,  2,  5,  6,  1,  7,  8,  9,  4,  3, 13, 11, 12])

In [59]:
for i in range(14):
    locList = []
    print(str(i)+": ")
    locUnique = busCatDf.groupby("Community_2").get_group(i)['Community_1'].unique()
    print("Community 2 children: " + str(locUnique))
    for unique in locUnique:
        locList.append(busCatDf1['Capacity'][unique])
    print("Community 2 children capacities: " +str(locList))


0: 
Community 2 children: [ 0  3  4  5 10 12 14]
Community 2 children capacities: [41, 7, 7, 7, 22, 25, 12]
1: 
Community 2 children: [13 25 52 58 62]
Community 2 children capacities: [4, 20, 5, 18, 61]
2: 
Community 2 children: [ 2  7 44 48]
Community 2 children capacities: [11, 15, 4, 9]
3: 
Community 2 children: [29 31]
Community 2 children capacities: [21, 7]
4: 
Community 2 children: [23 55]
Community 2 children capacities: [20, 21]
5: 
Community 2 children: [ 6  8 11 18]
Community 2 children capacities: [5, 3, 3, 5]
6: 
Community 2 children: [ 9 28 30 32]
Community 2 children capacities: [21, 129, 32, 33]
7: 
Community 2 children: [15 16 20 47 61]
Community 2 children capacities: [20, 21, 29, 15, 15]
8: 
Community 2 children: [17 19 22 26 27 36 53]
Community 2 children capacities: [10, 35, 23, 7, 31, 19, 35]
9: 
Community 2 children: [21 24 34 41 49 51]
Community 2 children capacities: [3, 4, 26, 2, 13, 6]
10: 
Community 2 children: [ 1 33 37 38 39 43]
Community 2 children capaci

In [97]:
# save our Category graph to csv

#busCatDf.to_csv("categoryCommunities/categoryHierarchyDf.csv")

In [98]:
# FROM HERE YOU CAN READ IN THE CSV.

#busCatDf = pd.read_csv("categoryCommunities/categoryHierarchyDf.csv")
#busCatDf.head()

In [85]:
#busCatDf['Community_1_Names'].unique() # Node Categories

In [84]:
#check the unique communities

# unique_datacom2 = [list(x) for x in set(tuple(x) for x in busCatDf['Community_2_Names'])]

In [12]:
busCatDf['degreeOfNode'].corr(busCatDf['weightedDegreeOfNode'], method="spearman") 
# spearman = rank correlation #pearson is normal
# high rank correlation 
# high degree correlated highly with high weighted degree

0.944293275168237

In [12]:
Yelp_Business = pd.read_json('YelpDataset/business.json',lines=True)

In [13]:
Yelp_Business.head(3)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,"4855 E Warner Rd, Ste B9","{'AcceptsInsurance': True, 'ByAppointmentOnly'...",FYWN1wneV18bWNgQjJ2GNg,"[Dentists, General Dentistry, Health & Medical...",Ahwatukee,"{'Friday': '7:30-17:00', 'Tuesday': '7:30-17:0...",1,33.33069,-111.978599,Dental by Design,,85044,22,4.0,AZ
1,3101 Washington Rd,"{'BusinessParking': {'garage': False, 'street'...",He-G7vWjzVUysIKrfNbPUQ,"[Hair Stylists, Hair Salons, Men's Hair Salons...",McMurray,"{'Monday': '9:00-20:00', 'Tuesday': '9:00-20:0...",1,40.291685,-80.1049,Stephen Szabo Salon,,15317,11,3.0,PA
2,"6025 N 27th Ave, Ste 1",{},KQPW8lFf1y5BT2MxiSZ3QA,"[Departments of Motor Vehicles, Public Service...",Phoenix,{},1,33.524903,-112.11531,Western Motor Vehicle,,85017,18,1.5,AZ


In [14]:
businessCatSeries = Yelp_Business['categories']

In [15]:
Yelp_Business['catCount'] = Yelp_Business['categories'].apply(len)

In [16]:
Yelp_Business['categoryDictCounts'] = Yelp_Business['categories'].apply(lambda x: {})

In [17]:
Yelp_Business.head(2)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,catCount,categoryDictCounts
0,"4855 E Warner Rd, Ste B9","{'AcceptsInsurance': True, 'ByAppointmentOnly'...",FYWN1wneV18bWNgQjJ2GNg,"[Dentists, General Dentistry, Health & Medical...",Ahwatukee,"{'Friday': '7:30-17:00', 'Tuesday': '7:30-17:0...",1,33.33069,-111.978599,Dental by Design,,85044,22,4.0,AZ,6,{}
1,3101 Washington Rd,"{'BusinessParking': {'garage': False, 'street'...",He-G7vWjzVUysIKrfNbPUQ,"[Hair Stylists, Hair Salons, Men's Hair Salons...",McMurray,"{'Monday': '9:00-20:00', 'Tuesday': '9:00-20:0...",1,40.291685,-80.1049,Stephen Szabo Salon,,15317,11,3.0,PA,6,{}


In [32]:
Row0Dict = {} 
Row0Dict2 = {}
Row0BusinessCats = Yelp_Business['categories'][0]
Row0BusinessCats

['Dentists',
 'General Dentistry',
 'Health & Medical',
 'Oral Surgeons',
 'Cosmetic Dentists',
 'Orthodontists']

In [33]:
locList = []
for i in range(len(Row0BusinessCats)):
    locList.append(businessAttributeToCommunity(Row0BusinessCats[i],busCatDf,2,"string"))

In [34]:
locList

[['Dentists', 'General Dentistry', 'Cosmetic Dentists'],
 ['Dentists', 'General Dentistry', 'Cosmetic Dentists'],
 ['Beauty & Spas', 'Health & Medical', 'Hair Salons'],
 ['Dentists', 'General Dentistry', 'Cosmetic Dentists'],
 ['Dentists', 'General Dentistry', 'Cosmetic Dentists'],
 ['Dentists', 'General Dentistry', 'Cosmetic Dentists']]

In [35]:
locList2 = []
for i in range(len(Row0BusinessCats)):
    locList2.append(businessAttributeToCommunity(Row0BusinessCats[i],busCatDf,1,"string"))
locList2

['Dentists',
 'Dentists',
 'Health & Medical',
 'Dentists',
 'Dentists',
 'Dentists']

In [37]:
for elem in locList2: 
    if elem not in Row0Dict2:
        Row0Dict2[elem] = 1
    elif elem in Row0Dict2:
        Row0Dict2[elem] += 1

In [38]:
Row0Dict2

{'Dentists': 5, 'Health & Medical': 1}

In [52]:
# LEVEL 1 DICT COUNTS; function to get the level two communities (labels) of each category that a business indicates
# counts the number of occurences of each of the community labels

def categoriesToDictCounts1(row):
    locList = []
    Row0BusinessCats = row['categories']
    Row0Dict1 = row['categoryDictCounts']
    for i in range(len(Row0BusinessCats)):
        locList.append(businessAttributeToCommunity(Row0BusinessCats[i],busCatDf,1,"string")) #level 2
    for elem in locList:
        if elem not in Row0Dict:
            Row0Dict1[elem] = 1
        elif elem in Row0Dict:
            Row0Dict1[elem] += 1
    return Row0Dict1


In [29]:
# LEVEL 2 DICT COUNTS; function to get the level two communities (labels) of each category that a business indicates
# counts the number of occurences of each of the community labels

def categoriesToDictCounts(row):
    locList = []
    Row0BusinessCats = row['categories']
    Row0Dict = row['categoryDictCounts']
    for i in range(len(Row0BusinessCats)):
        locList.append(businessAttributeToCommunity(Row0BusinessCats[i],busCatDf,2,"string")) #level 2
    for elem in locList: 
        for i in range(len(elem)):
            if elem[i] not in Row0Dict:
                Row0Dict[elem[i]] = 1
            elif elem[i] in Row0Dict:
                Row0Dict[elem[i]] += 1
    return Row0Dict


In [53]:
Yelp_Business['catDictCountsLevel1'] = Yelp_Business.apply(categoriesToDictCounts1,axis = 1)
# RUNNING THE FUNCTION ONCE TO CHECK IF WORKING

In [54]:
Yelp_Business['catDictCountsLevel1'][3]

{'Pets': 2,
 'Sporting Goods': 1,
 'Pet Services': 2,
 'Shopping': 1,
 'Home Services': 2,
 'Local Services': 2}

In [369]:
# Running a total of 17 times here (18 times total) where at each iteration, we run the NSBM, get the new state,
# get the communities of the business categories and add to frequency counts in dictionary 

count = 0
while count < 17:
    # get Nested BlockState containing hierarchical partition determined by Stochastic Block Model
    state=gt.minimize_nested_blockmodel_dl(catGraph,deg_corr=True) 

    # print the results: shows the number of nodes and groups in all levels
    state.print_summary()
    Yelp_Business['categoryDictCounts'] = Yelp_Business.apply(categoriesToDictCounts,axis = 1)
    count+=1
    

l: 0, N: 1293, B: 90
l: 1, N: 90, B: 24
l: 2, N: 24, B: 4
l: 3, N: 4, B: 1
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
l: 0, N: 1293, B: 62
l: 1, N: 62, B: 16
l: 2, N: 16, B: 2
l: 3, N: 2, B: 1
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
l: 0, N: 1293, B: 58
l: 1, N: 58, B: 12
l: 2, N: 12, B: 1
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
l: 0, N: 1293, B: 68
l: 1, N: 68, B: 20
l: 2, N: 20, B: 5
l: 3, N: 5, B: 1
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
l: 0, N: 1293, B: 80
l: 1, N: 80, B: 21
l: 2, N: 21, B: 5
l: 3, N: 5, B: 1
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
l: 0, N: 1293, B: 82
l: 1, N: 82, B: 24
l: 2, N: 24, B: 6
l: 3, N: 6, B: 1
0
10000

KeyboardInterrupt: 

In [55]:
import operator

# get the key with the max value 

def getKeyWithMaxValue(row):
    Row0Dict = row['categoryDictCounts']
    if row['categoryDictCounts'] != {}:
        return max(Row0Dict.items(), key=operator.itemgetter(1))[0]
    else:
        return
    
def getKeyWithMaxValueLevel1(row):
    Row0Dict = row['catDictCountsLevel1']
    if row['catDictCountsLevel1'] != {}:
        return max(Row0Dict.items(), key=operator.itemgetter(1))[0]
    else:
        return
    

In [425]:
Yelp_Business["businessLatentCategory"] = Yelp_Business.apply(getKeyWithMaxValue,axis=1)

In [56]:
Yelp_Business['businessCatLevel1'] = Yelp_Business.apply(getKeyWithMaxValueLevel1,axis=1)

In [436]:
Yelp_Business.to_csv("BusinessesWithLatentCategories.csv")

In [57]:
Yelp_Business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,...,postal_code,review_count,stars,state,catCount,categoryDictCounts,categoryDictCounts1,busCatLevel1,catDictCountsLevel1,businessCatLevel1
0,"4855 E Warner Rd, Ste B9","{'AcceptsInsurance': True, 'ByAppointmentOnly'...",FYWN1wneV18bWNgQjJ2GNg,"[Dentists, General Dentistry, Health & Medical...",Ahwatukee,"{'Friday': '7:30-17:00', 'Tuesday': '7:30-17:0...",1,33.33069,-111.978599,Dental by Design,...,85044,22,4.0,AZ,6,"{'Dentists': 1, 'General Dentistry': 10, 'Cosm...","{'Dentists': 1, 'General Dentistry': 10, 'Cosm...",Dentists,"{'Dentists': 1, 'General Dentistry': 10, 'Cosm...",General Dentistry
1,3101 Washington Rd,"{'BusinessParking': {'garage': False, 'street'...",He-G7vWjzVUysIKrfNbPUQ,"[Hair Stylists, Hair Salons, Men's Hair Salons...",McMurray,"{'Monday': '9:00-20:00', 'Tuesday': '9:00-20:0...",1,40.291685,-80.1049,Stephen Szabo Salon,...,15317,11,3.0,PA,6,"{'Beauty & Spas': 1, 'Health & Medical': 10, '...","{'Beauty & Spas': 1, 'Health & Medical': 10, '...",Beauty & Spas,"{'Beauty & Spas': 1, 'Health & Medical': 10, '...",Health & Medical
2,"6025 N 27th Ave, Ste 1",{},KQPW8lFf1y5BT2MxiSZ3QA,"[Departments of Motor Vehicles, Public Service...",Phoenix,{},1,33.524903,-112.11531,Western Motor Vehicle,...,85017,18,1.5,AZ,2,"{'Financial Services': 2, 'Lawyers': 2, 'Insur...","{'Financial Services': 2, 'Lawyers': 2, 'Insur...",Financial Services,"{'Financial Services': 2, 'Lawyers': 2, 'Insur...",Financial Services
3,"5000 Arizona Mills Cr, Ste 435","{'BusinessAcceptsCreditCards': True, 'Restaura...",8DShNS-LuFqpEWIp0HxijA,"[Sporting Goods, Shopping]",Tempe,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",0,33.383147,-111.964725,Sports Authority,...,85282,9,3.0,AZ,2,"{'Pets': 2, 'Sporting Goods': 1, 'Pet Services...","{'Pets': 2, 'Sporting Goods': 1, 'Pet Services...",Pets,"{'Pets': 2, 'Sporting Goods': 1, 'Pet Services...",Pets
4,581 Howe Ave,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",PfOCPjBrlQAnz__NXj9h_w,"[American (New), Nightlife, Bars, Sandwiches, ...",Cuyahoga Falls,"{'Monday': '11:00-1:00', 'Tuesday': '11:00-1:0...",1,41.119535,-81.47569,Brick House Tavern + Tap,...,44221,116,3.5,OH,7,"{'Bars': 1, 'American (Traditional)': 6, 'Coff...","{'Bars': 1, 'American (Traditional)': 6, 'Coff...",Bars,"{'Bars': 1, 'American (Traditional)': 6, 'Coff...",American (Traditional)


In [61]:
Yelp_Business['businessCatLevel1'].unique()

array(['General Dentistry', 'Health & Medical', 'Financial Services',
       'Pets', 'American (Traditional)', 'Breakfast & Brunch',
       'Automotive', 'Home Services', 'Auto Repair', 'Bars', 'Food',
       "Women's Clothing", 'Education', 'Carpet Cleaning', 'Home Decor',
       'Restaurants', 'Sandwiches', 'Eyelash Service',
       'Fitness & Instruction', 'Auto Glass Services', 'Japanese',
       'Beauty & Spas', 'Accessories', 'Formal Wear', 'Gyms',
       'Sporting Goods', 'Auto Detailing', 'Hair Salons', 'Dentists',
       'Electronics', 'Steakhouses', 'IT Services & Computer Repair',
       'Dance Studios', 'Landscaping', 'Parks', 'Body Shops', 'Lawyers',
       'Preschools', 'Swimming Pools', 'Insurance', 'Waxing', None,
       'Home & Garden', 'Child Care & Day Care', 'Sushi Bars',
       'Laser Hair Removal', 'Flooring', 'Party Supplies',
       'Cosmetic Surgeons', 'Watches', 'Family Practice', 'Shopping',
       'Pet Services', 'Jewelry Repair'], dtype=object)

In [65]:
Yelp_BusinessNarrow = Yelp_Business[['business_id','businessCatLevel1','latitude','longitude','postal_code']]

#ADD ZIP CODE ETC????

In [66]:
Yelp_BusinessNarrow.head()

Unnamed: 0,business_id,businessCatLevel1,latitude,longitude,postal_code
0,FYWN1wneV18bWNgQjJ2GNg,General Dentistry,33.33069,-111.978599,85044
1,He-G7vWjzVUysIKrfNbPUQ,Health & Medical,40.291685,-80.1049,15317
2,KQPW8lFf1y5BT2MxiSZ3QA,Financial Services,33.524903,-112.11531,85017
3,8DShNS-LuFqpEWIp0HxijA,Pets,33.383147,-111.964725,85282
4,PfOCPjBrlQAnz__NXj9h_w,American (Traditional),41.119535,-81.47569,44221


In [67]:
Yelp_BusinessNarrow.to_csv("Businesses_LatCats_loc_NARROW.csv")

In [64]:
Yelp_Business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,...,postal_code,review_count,stars,state,catCount,categoryDictCounts,categoryDictCounts1,busCatLevel1,catDictCountsLevel1,businessCatLevel1
0,"4855 E Warner Rd, Ste B9","{'AcceptsInsurance': True, 'ByAppointmentOnly'...",FYWN1wneV18bWNgQjJ2GNg,"[Dentists, General Dentistry, Health & Medical...",Ahwatukee,"{'Friday': '7:30-17:00', 'Tuesday': '7:30-17:0...",1,33.33069,-111.978599,Dental by Design,...,85044,22,4.0,AZ,6,"{'Dentists': 1, 'General Dentistry': 10, 'Cosm...","{'Dentists': 1, 'General Dentistry': 10, 'Cosm...",Dentists,"{'Dentists': 1, 'General Dentistry': 10, 'Cosm...",General Dentistry
1,3101 Washington Rd,"{'BusinessParking': {'garage': False, 'street'...",He-G7vWjzVUysIKrfNbPUQ,"[Hair Stylists, Hair Salons, Men's Hair Salons...",McMurray,"{'Monday': '9:00-20:00', 'Tuesday': '9:00-20:0...",1,40.291685,-80.1049,Stephen Szabo Salon,...,15317,11,3.0,PA,6,"{'Beauty & Spas': 1, 'Health & Medical': 10, '...","{'Beauty & Spas': 1, 'Health & Medical': 10, '...",Beauty & Spas,"{'Beauty & Spas': 1, 'Health & Medical': 10, '...",Health & Medical
2,"6025 N 27th Ave, Ste 1",{},KQPW8lFf1y5BT2MxiSZ3QA,"[Departments of Motor Vehicles, Public Service...",Phoenix,{},1,33.524903,-112.11531,Western Motor Vehicle,...,85017,18,1.5,AZ,2,"{'Financial Services': 2, 'Lawyers': 2, 'Insur...","{'Financial Services': 2, 'Lawyers': 2, 'Insur...",Financial Services,"{'Financial Services': 2, 'Lawyers': 2, 'Insur...",Financial Services
3,"5000 Arizona Mills Cr, Ste 435","{'BusinessAcceptsCreditCards': True, 'Restaura...",8DShNS-LuFqpEWIp0HxijA,"[Sporting Goods, Shopping]",Tempe,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",0,33.383147,-111.964725,Sports Authority,...,85282,9,3.0,AZ,2,"{'Pets': 2, 'Sporting Goods': 1, 'Pet Services...","{'Pets': 2, 'Sporting Goods': 1, 'Pet Services...",Pets,"{'Pets': 2, 'Sporting Goods': 1, 'Pet Services...",Pets
4,581 Howe Ave,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",PfOCPjBrlQAnz__NXj9h_w,"[American (New), Nightlife, Bars, Sandwiches, ...",Cuyahoga Falls,"{'Monday': '11:00-1:00', 'Tuesday': '11:00-1:0...",1,41.119535,-81.47569,Brick House Tavern + Tap,...,44221,116,3.5,OH,7,"{'Bars': 1, 'American (Traditional)': 6, 'Coff...","{'Bars': 1, 'American (Traditional)': 6, 'Coff...",Bars,"{'Bars': 1, 'American (Traditional)': 6, 'Coff...",American (Traditional)
