## Network of Businesses - App Backend 
### Goals:
- Network approach: Tracking what businesses users tend to visit after visiting business X.
   - i.e. if a user A has positively rated business X, what business would user A enjoy visiting next?
- Data used as proof-of-concept prototype for iOS application 
   - A further improvement could be implementing the deep learning model as the backend of the app, but due to data restrictions and time, the network-based backend was considered more feasible.
   
Charlotte is the city indicated in this script, but I also had this script do Las vegas, Phoenix, and Toronto as well.

In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
import collections
from itertools import combinations
import networkx as nx
from glob import glob
from networkx.algorithms import community
%matplotlib inline
from matplotlib import *
import graph_tool.all as gt
import math
import datetime

In [None]:
city = "Charlotte"

In [None]:
# Yelp_Businesses has business IDs, names, and location 

# Goal is to get IDs and names of businesses in specified city from
# Yelp_Businesses in order to determine which reviews from Yelp_Reviews 
# are about businesses based in specified city

Yelp_Businesses = pd.read_json('YelpDataset/business.json',lines=True)
Yelp_BusinessesCity = Yelp_Businesses[Yelp_Businesses['city'] == city]


In [2]:
Yelp_Reviews = pd.read_csv("HOPEFUL.csv") # our ironic dataset name


# filtering out business reviews with bad ratings

Yelp_Reviews = Yelp_Reviews[['business_id','date','user_id','business_catgrylv2'\
                             ,'business_catgrylv1','stars','business_num_reviews'\
                            , 'business_average_stars']]
Yelp_Reviews = Yelp_Reviews[Yelp_Reviews['stars']>3]


# filtering out business reviews of businesses not based in select city

busCitySet = set(Yelp_BusinessesCity['business_id'])
def businessInCity(business):
    if business in busCitySet: return city
    else: return "NOT " + city

Yelp_Reviews['city'] = Yelp_Reviews['business_id'].apply(businessInCity)
Yelp_ReviewsCity = Yelp_Reviews[Yelp_Reviews['city'] == city]


In [16]:
# ordering reviews in chronological order

Yelp_ReviewsCity = Yelp_ReviewsCity.sort_index(by = 'date')
Yelp_ReviewsCity = Yelp_ReviewsCity.reset_index()

  """Entry point for launching an IPython kernel.


In [18]:
user_BusinessesVisitedList = []
def generateListUserTracking():
    # creates a list where the indices are users reviewing businesses
    # in select city and values are lists of tuples of businesses that
    # the user visited and the date the user visited
    
    # initializing temp user
    temp_user = Yelp_ReviewsCity['user_id'][0]
    temp_list = []
    for x in range(0, len(Yelp_ReviewsCity) - 1):
        if (temp_user == Yelp_ReviewsCity['user_id'][x]):
            # Append business id this current temp list
            temp_tuple = (Yelp_ReviewsCity['business_id'][x], Yelp_ReviewsCity['date'][x])
            temp_list.append(temp_tuple)
        elif (temp_user != Yelp_ReviewsCity['user_id'][x]):
            # Append the current temp list to list of lists
            user_BusinessesVisitedList.append(temp_list)
            # Clear temp list
            temp_list = []
            # Add this value to the new temp list
            temp_tuple = (Yelp_ReviewsCity['business_id'][x], Yelp_ReviewsCity['date'][x])
            temp_list.append(temp_tuple)
            # Make the new user the temp user
            temp_user = Yelp_ReviewsCity['user_id'][x]

generateListUserTracking()

In [20]:
# verify it worked... first 3 users and businesses visited with dates
print(user_BusinessesVisitedList[:3])

[[('2Q2uE1dwhqMFOE5CYfvMEQ', '2004-10-25')], [('ydUqgWsF3F27TbauOyib0w', '2004-12-19')], [('nMo0ozHO7cqsPaxw_7N6ow', '2005-06-17'), ('osSwv6CJy5hDKQdOKeyTow', '2005-06-17'), ('ZCXjNG1EBFiKKMtR9DOTGg', '2005-06-17'), ('Oy2WnPyiOlPFvPKMIuOC8w', '2005-06-18'), ('8q_q76SAbCLeAbZsW7oAFQ', '2005-06-18'), ('KEGLWeFAWXvo0W2LnujhtQ', '2005-07-24')]]


In [21]:
# filtering out users who only visited one business

user_BusinessesVisitedNewList = []
for i in range(len(user_BusinessesVisitedList)):
    if len(user_BusinessesVisitedList[i]) != 1:
        user_BusinessesVisitedNewList.append(user_BusinessesVisitedList[i])
        
# Removing user ids and only adding business sequences to businessIDs_only
temp_list = []
businessIDs_only = []
for x in range(0, len(user_BusinessesVisitedNewList) - 1):
    temp_list = [item[0] for item in user_BusinessesVisitedNewList[x]]
    businessIDs_only.append(temp_list)       
    

In [23]:
# Take each pair and make it into a tuple
businesses_tuple = []

for y in range(0, len(businessIDs_only) - 1):
    for x in range(0, len(businessIDs_only[y]) - 1):
        businesses_tuple.append((tuple(businessIDs_only[y][x:(x+2)])))

In [24]:
# creating dataframe with these source/dest tuples and weight 
# corresponds to how many users visited dest business after source business

businessesEdgesDf = pd.Series(businesses_tuple).value_counts().to_frame('weight').reset_index()

In [25]:
# creating edge list with weights (highest weighted edge of node will
# be given priority)

def makeEdgeWithWeight(row):
    edge = row['index']
    weight = row['weight']
    edge = edge + ({'weight':weight},)
    return edge

businessesEdgesDf['newEdges'] = businessesEdgesDf.apply(makeEdgeWithWeight,axis = 1)

In [26]:
# example source/dest/edge weight
businessesEdgesDf['newEdges'][123]

('wiEQrYSNofc10q7Pm26b0w', 'MUf-xrAxrliNQ4IvOeQ9oQ', {'weight': 1})

In [27]:
businessEdges = list(businessesEdgesDf['newEdges'])

## Nodes:
- Businesses

## Directed Weighted Edges: 
- Number of people having visited the predecessor business and subsequently visiting the successor business

In [29]:
# Put business id's into a list to represent the vertices
Business_IDs = list(busCitySet)

In [30]:
# Ensure the correct information has been appended
print(Business_IDs[:5])

['_xYM9osYumECb42IzSk-tA', 'uyH2PBvordYWCZ8FyrJH8A', 'nUgd6F-xIFdGSyQbzRojWQ', 'uPY5oatcnkBa6MFnUQHFCw', 'qs-o8DFpkPNoUSfy5IPIFg']


In [31]:
# Check the number of nodes
print(len(Business_IDs))

8553


## Creating the Network

In [33]:
G=nx.DiGraph()

# Add Nodes
G.add_nodes_from(Business_IDs)

# Add Edges
G.add_edges_from(businessEdges)

## Setting up dataframe with business nodes and respective neighbors with highest edge weight

In [41]:
recommendedBusinesses_Series = pd.Series(index = list(G.nodes()))
recommendedBusinesses_Series = recommendedBusinesses_Series.reset_index()

In [43]:
def getMaxNeighbor(origin):
    #finds the neighbor of a node with the highest edge weight
    maxInDegNeighbor = 0
    maxNeighbor = ""
    for neighbor in G.successors(origin):
        currentNeighborIn = G.in_degree(neighbor)
        if currentNeighborIn >=  maxInDegNeighbor:
            maxInDegNeighbor = currentNeighborIn
            maxNeighbor = neighbor
    return maxNeighbor

recommendedBusinesses_Series['destination'] = recommendedBusinesses_Series['index'].apply(getMaxNeighbor)

In [46]:
# Dropping nodes that don't have a successor
len(recommendedBusinesses_Series[recommendedBusinesses_Series['destination']!=""])
recommendedBusinesses_Series = recommendedBusinesses_Series[recommendedBusinesses_Series['destination']!=""].drop([0],axis = 1)

In [47]:
recommendedBusinesses_Series = recommendedBusinesses_Series.rename(columns = {'index':'current_business'})

In [49]:
# getting business names to replace business IDs

Yelp_BusinessesCityNAMES = dict(zip(Yelp_BusinessesCity.business_id,Yelp_BusinessesCity.name))
recommendedBusinesses_Series['name_current_business'] = recommendedBusinesses_Series['current_business'].apply(lambda bus: Yelp_BusinessesCityNAMES[bus])
recommendedBusinesses_Series['name_destination'] = recommendedBusinesses_Series['destination'].apply(lambda bus: Yelp_BusinessesCityNAMES[bus])


## Saving recommended businesses to visit to file

In [52]:
recommendedBusinesses_Series = recommendedBusinesses_Series[['name_current_business','name_destination']]

In [53]:
file = "APP_source_dest_" + city + ".txt"

with open(file,"w") as f:
    for row in recommendedBusinesses_Series.iterrows(): #
        txtrow = str(row[1]['name_current_business']) +"|"+ str(row[1]['name_destination']) + "\n"
        f.writelines("%s" % txtrow)
f.close()

In [54]:
recommendedBusinesses_Series

Unnamed: 0,name_current_business,name_destination
0,Dillard's,LittleSpoon
4,Chick-fil-A,Cabo Fish Taco
5,The Fillmore Charlotte,VBGB Beer Hall and Garden
6,Kickstand Burgers & Bar,Lang Van Vietnamese
10,Mellow Mushroom,Soul Gastrolounge
15,Joyfully Curly,Bella Bridesmaids
20,Caliber Collision,Best Wok
24,Lorenzo's Pizzeria,Cast Iron Waffles
25,Apple Store,Amélie's French Bakery & Café
26,Coppa Coffee and Tea Cafe,Los Arcos Mexican Restaurant
