In [2]:
import json
import os

from pyspark.sql import DataFrame, Row, SparkSession

# Select the needed columns from the review JSON and save them as CSV

In [3]:
def get_spark(name="Recsys", cores=2, local_dir="/tmp/spark-temp") -> SparkSession:
    """Build (or reuse) a local SparkSession and point its checkpoints at local_dir.

    Args:
        name: Application name given to the session.
        cores: Value substituted into the ``local[...]`` master URL.
        local_dir: Directory used for ``spark.local.dir``; checkpoints go into
            its ``chkpts/`` subdirectory. Make sure it exists in your file
            system (the default works for Linux).

    Returns:
        The SparkSession obtained from ``getOrCreate()``.
    """
    master_url = "local[{}]".format(cores)
    settings = [
        ("spark.memory.offHeap.enabled", True),
        ("spark.memory.offHeap.size", "16g"),
        ("spark.local.dir", local_dir),
    ]

    builder = SparkSession.builder.appName(name).master(master_url)
    for option, value in settings:
        builder = builder.config(option, value)
    session = builder.getOrCreate()

    checkpoint_dir = os.path.join(local_dir, "chkpts/")
    session.sparkContext.setCheckpointDir(checkpoint_dir)
    return session

In [None]:
# Create the Spark session through get_spark; cores=4 makes the master URL "local[4]".
spark = get_spark(cores=4)

In [None]:
# Keep a short handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [94]:
# Read the Yelp review JSON into a Spark DataFrame.
# NOTE(review): this path is relative to the kernel's working directory, while the
# last cell writes to "../../yelp_dataset" — confirm which location is intended.
df = spark.read.json("./yelp_dataset/review.json")

In [None]:
# Print the DataFrame's schema to see which columns the review JSON provides.
df.printSchema()

In [None]:
# Narrow the DataFrame to the eight columns listed below; everything else is dropped.
df = df.select([
    "business_id",
    "cool",
    "date",
    "funny",
    "review_id",
    "stars",
    "useful",
    "user_id",
])

In [None]:
# Collapse to one partition and write the selected columns as CSV with a header row.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises an
# error as soon as the "ratings.csv" output path already exists.
df.coalesce(1).write.mode("overwrite").option("header", "true").csv("ratings.csv")

# Get business category info using the Yelp API

In [8]:
import requests
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.parse import urlencode

def request(host, path, api_key, url_params=None):
    """Given your API key, send a GET request to the API.

    Args:
        host (str): The domain host of the API.
        path (str): The path of the API after the domain.
        api_key (str): Your API key, sent as a Bearer token.
        url_params (dict): An optional set of query parameters in the request.
    Returns:
        dict: The JSON response from the request.
    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
    """
    url_params = url_params or {}
    # Percent-encode the path so spaces and non-ASCII characters are URL-safe.
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }

    print(u'Querying {0} ...'.format(url))

    response = requests.request('GET', url, headers=headers, params=url_params)
    # Fail loudly on HTTP errors instead of handing the error payload back as
    # if it were data (callers index straight into the returned dict).
    response.raise_for_status()

    return response.json()

In [9]:
# Read the Yelp API key from the environment instead of embedding the secret in
# the notebook; export YELP_API_KEY before starting the kernel.
# NOTE: a key that was ever committed in plain text should be treated as leaked
# and rotated.
key = os.environ["YELP_API_KEY"]
cat_dict = request(host="https://api.yelp.com/v3", path="/categories", api_key=key)["categories"]

Querying https://api.yelp.com/v3/categories ...


In [10]:
# Lookup table from each category's alias to its title.
alias_to_title = dict((entry["alias"], entry["title"]) for entry in cat_dict)

In [11]:
import networkx as nx

In [12]:
# Build a directed graph of category titles with edges pointing parent -> child.
G = nx.DiGraph()
for cat in cat_dict:
    title = cat["title"]
    # Register every category as a node, so one with neither parents nor
    # children is still present in the graph.
    G.add_node(title)
    # "parent_aliases" is a list: link the category to each listed parent
    # rather than only the first one.
    for parent_alias in cat["parent_aliases"] or []:
        G.add_edge(alias_to_title[parent_alias], title)

In [13]:
# G.in_degree() yields (node, in_degree) pairs; a node with no incoming edge has
# no parent category. (The `G.node` alias used before was removed in NetworkX 2.4.)
zero_in_degree_nodes = [(node, degree) for node, degree in G.in_degree() if degree == 0]
top_categories = [node for node, _degree in zero_in_degree_nodes]

In [14]:
# Display the titles of the categories that have no incoming edge in G.
top_categories

['Local Services',
 'Food',
 'Professional Services',
 'Beauty & Spas',
 'Health & Medical',
 'Shopping',
 'Education',
 'Nightlife',
 'Restaurants',
 'Religious Organizations',
 'Automotive',
 'Hotels & Travel',
 'Active Life',
 'Pets',
 'Arts & Entertainment',
 'Home Services',
 'Public Services & Government',
 'Event Planning & Services',
 'Financial Services',
 'Bicycles',
 'Local Flavor',
 'Mass Media']

In [131]:
# Save the top-level category titles as a JSON array.
# NOTE(review): this writes to "../../yelp_dataset" while the review data was read
# from "./yelp_dataset" — confirm which location is intended.
with open("../../yelp_dataset/top_categories.json", "w") as f:
    # `json` is imported in the notebook's first import cell.
    json.dump(top_categories, f)