In [2]:
import json
import os

from pyspark.sql import DataFrame, Row, SparkSession

In [3]:
def get_spark(name="Recsys", cores=2, local_dir="/tmp/spark-temp") -> SparkSession:
    """Build (or fetch, via ``getOrCreate``) a local SparkSession.

    Args:
        name: Application name passed to the session builder.
        cores: Number of local worker threads, used as ``local[cores]``.
        local_dir: Scratch directory used for ``spark.local.dir``. Checkpoints
            are written to its ``chkpts/`` subdirectory. Both directories are
            created if they do not exist. The default is a Linux-style path.

    Returns:
        The session with off-heap memory enabled (16g) and its checkpoint
        directory set.
    """
    checkpoint_dir = os.path.join(local_dir, "chkpts/")
    # Create the scratch and checkpoint directories up front instead of
    # relying on the caller to have prepared them by hand.
    os.makedirs(checkpoint_dir, exist_ok=True)

    spark = (SparkSession
             .builder
             .appName(name)
             .master("local[{}]".format(cores))
             .config("spark.memory.offHeap.enabled", True)
             .config("spark.memory.offHeap.size", "16g")
             .config("spark.local.dir", local_dir)
             .getOrCreate())

    spark.sparkContext.setCheckpointDir(checkpoint_dir)
    return spark

In [None]:
# Start the local Spark session with 4 worker threads.
spark = get_spark(cores=4)

In [None]:
# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [94]:
# Load review.json from the Yelp dataset folder into a Spark DataFrame.
df = spark.read.json("../../yelp_dataset/review.json")

In [None]:
# Print the schema Spark inferred from the JSON file.
df.printSchema()

In [None]:
# Narrow the review frame to the columns listed here.
review_columns = (
    "business_id",
    "cool",
    "date",
    "funny",
    "review_id",
    "stars",
    "useful",
    "user_id",
)
df = df.select(*review_columns)

In [None]:
# Number of rows in the review frame.
df.count()

In [None]:
# Export the selected review columns through a single partition, with a header row.
# Spark writes a directory named "ratings.csv" that holds the part file;
# "overwrite" lets this cell be re-run without failing on the existing path.
df.coalesce(1).write.mode("overwrite").option("header", "true").csv("ratings.csv")

In [None]:
import pandas as pd

In [None]:
# Spark's CSV writer produces a directory of part files rather than one file,
# so read the part file(s) inside it. If a plain file sits at this path
# (e.g. the part file was moved there by hand), read it directly.
ratings_path = "./ratings.csv"
if os.path.isdir(ratings_path):
    part_files = sorted(
        os.path.join(ratings_path, fname)
        for fname in os.listdir(ratings_path)
        if fname.startswith("part-") and fname.endswith(".csv")
    )
    df = pd.concat((pd.read_csv(part) for part in part_files), ignore_index=True)
else:
    df = pd.read_csv(ratings_path)

In [None]:
# Count the non-null review_id values per user, then turn user_id back into a column.
reviews_per_user = df.groupby("user_id")["review_id"].count()
user_count = reviews_per_user.reset_index()

In [None]:
# Keep the ids of users whose review count is at least 10.
has_enough_reviews = user_count["review_id"] >= 10
filtered_user = user_count.loc[has_enough_reviews, "user_id"]

In [None]:
# Inner-join the reviews with the retained user ids, dropping other users' rows.
merged_df = pd.merge(df, filtered_user, on="user_id")

In [None]:
# Preview the filtered reviews. The previous cell content was a lone `d`,
# which raises NameError when the notebook is run top to bottom.
merged_df.head()

# Business metadata

In [4]:
# Obtain the Spark session for this section (get_spark ends in getOrCreate()).
spark = get_spark(cores=4)

In [91]:
# Load business.json here so this cell does not depend on a later cell having
# been executed first (df_meta was otherwise undefined at this point).
df_meta = spark.read.json("../../yelp_dataset/business.json")
df_meta.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [5]:
# Load business.json from the Yelp dataset folder into a Spark DataFrame.
df_meta = spark.read.json("../../yelp_dataset/business.json")

In [103]:
# Keep only the business columns listed here.
meta_columns = ("business_id", "categories", "city", "name", "review_count", "stars", "state")
df_selected_meta = df_meta.select(*meta_columns)

In [104]:
# Preview the selected business columns.
df_selected_meta.show()

+--------------------+--------------------+------------------+--------------------+------------+-----+-----+
|         business_id|          categories|              city|                name|review_count|stars|state|
+--------------------+--------------------+------------------+--------------------+------------+-----+-----+
|1SWheh84yJXfytovI...|   Golf, Active Life|           Phoenix|Arizona Biltmore ...|           5|  3.0|   AZ|
|QXAEGFB4oINsVuTFx...|Specialty Food, R...|       Mississauga|Emerald Chinese R...|         128|  2.5|   ON|
|gnKjwL_1w79qoiV3I...|Sushi Bars, Resta...|         Charlotte|Musashi Japanese ...|         170|  4.0|   NC|
|xvX2CttrVhyG2z1dF...|Insurance, Financ...|          Goodyear|Farmers Insurance...|           3|  5.0|   AZ|
|HhyxOkGAM07SRYtlQ...|Plumbing, Shoppin...|         Charlotte| Queen City Plumbing|           4|  4.0|   NC|
|68dUKd8_8liJ7in4a...|Shipping Centers,...|       Mississauga|       The UPS Store|           3|  2.5|   ON|
|5JucpCfHZltJh5r1J.

In [8]:
import requests
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.parse import urlencode

def request(host, path, api_key, url_params=None, timeout=30):
    """Send an authenticated GET request to the API and return the decoded JSON.

    Args:
        host (str): The domain host of the API.
        path (str): The path of the API after the domain.
        api_key (str): Your API key, sent as a Bearer token.
        url_params (dict): An optional set of query parameters in the request.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook. Pass None to wait
            indefinitely.
    Returns:
        dict: The JSON body of the response. HTTP error statuses are NOT
        raised; the decoded body is returned whatever the status code, so
        callers must inspect the payload themselves.
    Raises:
        requests.exceptions.RequestException: The request could not be
            completed (connection failure, timeout, ...).
        ValueError: The response body is not valid JSON.
    """
    url_params = url_params or {}
    # Percent-encode the path before appending it to the host.
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer {0}'.format(api_key),
    }

    print('Querying {0} ...'.format(url))

    response = requests.get(url, headers=headers, params=url_params, timeout=timeout)

    return response.json()

In [9]:
# Read the Yelp API key from the environment instead of embedding it in the
# notebook. NOTE(review): the key that was previously committed here should be
# treated as leaked and revoked.
key = os.environ["YELP_API_KEY"]
cat_dict = request(host="https://api.yelp.com/v3", path="/categories", api_key=key)["categories"]

Querying https://api.yelp.com/v3/categories ...


In [10]:
# Map each category alias to its display title.
alias_to_title = dict((entry["alias"], entry["title"]) for entry in cat_dict)

In [11]:
import networkx as nx

In [12]:
# Build a directed graph with an edge from a category's parent title to its
# own title. Only the first entry of "parent_aliases" is used, and categories
# without a parent add no edge of their own.
G = nx.DiGraph()
for category in cat_dict:
    parent_aliases = category["parent_aliases"]
    if not parent_aliases:
        continue

    first_parent_title = alias_to_title[parent_aliases[0]]
    if first_parent_title:
        G.add_edge(first_parent_title, category["title"])

In [13]:
# Top-level categories are the graph nodes that no edge points to.
# G.in_degree() yields (node, in_degree) pairs for every node, which avoids
# the G.node alias that newer NetworkX releases no longer provide.
zero_in_degree_nodes = [(node, degree) for node, degree in G.in_degree() if degree == 0]
top_categories = [node for node, _degree in zero_in_degree_nodes]

In [14]:
# Display the derived top-level category titles.
top_categories

['Local Services',
 'Food',
 'Professional Services',
 'Beauty & Spas',
 'Health & Medical',
 'Shopping',
 'Education',
 'Nightlife',
 'Restaurants',
 'Religious Organizations',
 'Automotive',
 'Hotels & Travel',
 'Active Life',
 'Pets',
 'Arts & Entertainment',
 'Home Services',
 'Public Services & Government',
 'Event Planning & Services',
 'Financial Services',
 'Bicycles',
 'Local Flavor',
 'Mass Media']

In [131]:
# Persist the top-level category titles as a JSON array.
# (json is imported in the notebook's top import cell.)
with open("../../yelp_dataset/top_categories.json", "w") as f:
    json.dump(top_categories, f)

In [16]:
def find_top_category(str_cats, top_cats=None):
    """Return the first top-level category named in a comma-separated string.

    Args:
        str_cats: Comma-separated category names (e.g. "Golf, Active Life"),
            or None / empty string.
        top_cats: Optional iterable of top-level category names. Defaults to
            the notebook-level ``top_categories`` list.

    Returns:
        The first entry of ``str_cats`` (in listing order, whitespace-stripped)
        that is a top-level category, or "NotFound" when there is none. Using
        listing order keeps the result deterministic when several top-level
        categories are present, instead of depending on set iteration order.
    """
    not_found = "NotFound"
    if not str_cats:
        return not_found

    candidates = set(top_categories if top_cats is None else top_cats)
    for raw_name in str_cats.split(","):
        name = raw_name.strip()
        if name in candidates:
            return name

    return not_found

In [17]:
# Quick check on a two-category string.
find_top_category("Golf, Active Life")

'Active Life'

In [18]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Wrap the Python helper as a Spark UDF that returns a string column.
find_top_cat_udf = udf(f=find_top_category, returnType=StringType())

In [19]:
# Add a "top_category" column computed from each row's "categories" string.
top_category_column = find_top_cat_udf(col("categories"))
df_filtered = df_selected_meta.withColumn("top_category", top_category_column)

In [124]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Rank the businesses without a top-level category by review count, highest
# first; ties share a rank. Everything is referenced through F / Window so the
# cell does not rely on names imported by a later cell.
by_review_count = Window.orderBy(F.desc("review_count"))
(df_filtered
 .filter(F.col("top_category") == "NotFound")
 .select("review_count", "business_id")
 .withColumn("rank", F.dense_rank().over(by_review_count))
 .show(truncate=False))

+------------+----------------------+----+
|review_count|business_id           |rank|
+------------+----------------------+----+
|51          |KPK0Tsr7_mIZJmuVIMIXGg|1   |
|22          |KE2x93Tltbj9eQbaBqgY8g|2   |
|19          |l1gJuDUXH3erEKKuXMELkA|3   |
|17          |yo8TQab_kYgAZm_DJkRjfA|4   |
|17          |OcZvEbjudxPdlRdmFx-QSA|4   |
|15          |9Jk2NZv58bYmoxuuKUD9TQ|5   |
|15          |I0C8HvraosuTo2lRPjOedA|5   |
|14          |T38PudlpzBl8Pu46WmR5KA|6   |
|14          |h9B-VivFS_8BRX7MKF1bJw|6   |
|14          |2EcEsmpxAOXBBtH9kGVSaA|6   |
|13          |keYMjcRYI6YTR8lmhgvjPQ|7   |
|13          |rS2V2_n5pPgBUJPHFfm9Pw|7   |
|12          |yGpVxscoFR0Z51ERu3dz2w|8   |
|12          |sR_od2KqmI4A6rFx-wLb0A|8   |
|12          |I5tXvU7xf4Og26RIXdclTw|8   |
|11          |VUrBExjDoBpNURe_tfXEXw|9   |
|11          |BrkFoWcPapSxH1jvia0_FA|9   |
|11          |8E4axw7ZyXtsie2F1JNnpQ|9   |
|10          |nxQE3EkPt63yNwDZr8dDDA|10  |
|10          |sWfRvsDIt6uqNE1ByZQCxQ|10  |
+----------

In [97]:
# `df` is rebound to a pandas frame earlier in the notebook, so query the raw
# Spark reviews explicitly instead of relying on whatever `df` currently holds.
df_reviews = spark.read.json("../../yelp_dataset/review.json")
df_reviews.filter(col("business_id") == "xOR85RicYj642O3_iJ7hgg").show()

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|xOR85RicYj642O3_i...|   0|2018-05-23 02:25:30|    0|CCmarUnBTnaOGh3yP...|  5.0|Andrew is a class...|     0|5qYWXXVRHM2hP7za5...|
|xOR85RicYj642O3_i...|   0|2018-06-18 21:35:26|    0|xK4jt3ViCHohazNaP...|  5.0|Couldn't have had...|     0|g0TXe-0RY7JszAefl...|
|xOR85RicYj642O3_i...|   0|2015-05-20 04:50:39|    1|cBLClxXwaADbYVVLE...|  1.0|Andrew did not ac...|     2|KFOYvo_F3aQF2uReb...|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+



In [28]:
# Collect the rows (business_id only) whose "categories" value is null.
has_no_categories = col("categories").isNull()
df_null_cats = (
    df_selected_meta
    .filter(has_no_categories)
    .select("business_id")
    .collect()
)

In [32]:
# Replace the collected rows with their plain business_id values.
# NOTE(review): this rebinds the same name, so re-running the cell on the
# already-converted list will fail; re-run the collecting cell first.
df_null_cats = [row.business_id for row in df_null_cats]

In [33]:
# Inspect the collected business ids.
df_null_cats

['xOR85RicYj642O3_iJ7hgg',
 'tWvep1usNNI3NYkoRbSb7g',
 'llG2YxnevtFhW7LVa4QJXQ',
 'zi2CV7isWyGRxVXJxdDQXg',
 'oyPZZX2fQYpSBHk25tL26A',
 'AEpvBhxrmHJud4UgvF2yWw',
 '3egQ7WCM-zQn9SXEBXbsDA',
 'AMhPJSh9S6Qz9Q3DuwPgtA',
 'KnAV3ZhOWWNhqnvqIkxYOg',
 'TyQ5wC1XolgYclbGoga_gA',
 'm_7y1JLy-g4IDKn7wmWRtw',
 'A28Cy-EJQbSoljM8os75_Q',
 'yVGggY8DCt2MIkdfUP7Bhg',
 'ThEUWXkgLbi5RiulgybrTA',
 'pLMdNuomFNGNt88-MQjCTA',
 'zdKWo-B3xcYevFoXS1OU9g',
 'ER56uNZR258-DXFYLNIGRA',
 'Pjmnta--nDOS6jHCDhsLGg',
 '851BjaHB_fBWxTa6NWa1eQ',
 'yo8TQab_kYgAZm_DJkRjfA',
 'K523J2fkBWEIZD8MV1Awiw',
 'xI_nnDMMF6TeUe9ISRGP2A',
 'ql8_D-3jsGvzWxrLpv8QpA',
 'QvVRV-XkZeWWmRyIO1Oq6g',
 'MPkPxObPAEgVhhIhAcw7UA',
 'KE2x93Tltbj9eQbaBqgY8g',
 '7Zwh96kpZWloOnVouI0O0w',
 'CFWoroWZPq7p3pnjxZWpug',
 '-qboK8vzF3vwFM5heO70mQ',
 'ZDOW_bRF6wtCNwkakMF-lA',
 'dVXPpWBka9AT2dHNnuPp6g',
 'pu0rpDtMPltl8Q-1Ad8hOQ',
 'BeGJJZaQbGVYEZo5rJ8P2A',
 '0dN_UGbAW1Ei0wRocUfWGw',
 'rWsoiLQeFy-RCdYtQvH-JA',
 'SoiXeBUyWZQsBvjBMqkjZQ',
 'nxQE3EkPt63yNwDZr8dDDA',
 

In [41]:
# Fetch the API's business record for every id that lacks categories, keyed by
# business id. The dict is filled incrementally, so results gathered before
# any failure are kept.
augment_business = {}
business_path = "/businesses/{}"
for business_id in df_null_cats:
    augment_business[business_id] = request(
        host="https://api.yelp.com/v3",
        path=business_path.format(business_id),
        api_key=key,
    )

Querying https://api.yelp.com/v3/businesses/xOR85RicYj642O3_iJ7hgg ...
Querying https://api.yelp.com/v3/businesses/tWvep1usNNI3NYkoRbSb7g ...
Querying https://api.yelp.com/v3/businesses/llG2YxnevtFhW7LVa4QJXQ ...
Querying https://api.yelp.com/v3/businesses/zi2CV7isWyGRxVXJxdDQXg ...
Querying https://api.yelp.com/v3/businesses/oyPZZX2fQYpSBHk25tL26A ...
Querying https://api.yelp.com/v3/businesses/AEpvBhxrmHJud4UgvF2yWw ...
Querying https://api.yelp.com/v3/businesses/3egQ7WCM-zQn9SXEBXbsDA ...
Querying https://api.yelp.com/v3/businesses/AMhPJSh9S6Qz9Q3DuwPgtA ...
Querying https://api.yelp.com/v3/businesses/KnAV3ZhOWWNhqnvqIkxYOg ...
Querying https://api.yelp.com/v3/businesses/TyQ5wC1XolgYclbGoga_gA ...
Querying https://api.yelp.com/v3/businesses/m_7y1JLy-g4IDKn7wmWRtw ...
Querying https://api.yelp.com/v3/businesses/A28Cy-EJQbSoljM8os75_Q ...
Querying https://api.yelp.com/v3/businesses/yVGggY8DCt2MIkdfUP7Bhg ...
Querying https://api.yelp.com/v3/businesses/ThEUWXkgLbi5RiulgybrTA ...
Queryi

Querying https://api.yelp.com/v3/businesses/modnHkJKGUZL9XqaB9D0cA ...
Querying https://api.yelp.com/v3/businesses/jozuj1ySOk7DPs7OJloj3A ...
Querying https://api.yelp.com/v3/businesses/ry4kg-DZIDNfzqvmyjd9mw ...
Querying https://api.yelp.com/v3/businesses/gErOxDh3gMecoEmG5hYblA ...
Querying https://api.yelp.com/v3/businesses/-alO0UiSDdiNHKxbdShsKw ...
Querying https://api.yelp.com/v3/businesses/90O7AQ-qOkHKWxYkHZ-eHA ...
Querying https://api.yelp.com/v3/businesses/Ez1NHUBVDYrAUB-VtZImtQ ...
Querying https://api.yelp.com/v3/businesses/Xli12DG5UKWqcT_0rSWV8w ...
Querying https://api.yelp.com/v3/businesses/B0tuAmCiBRm66bqKV5IehA ...
Querying https://api.yelp.com/v3/businesses/JMRY7UHUHrWrWhB7IY9Lcg ...
Querying https://api.yelp.com/v3/businesses/bgRN6iUfQ8zLMTtTSkFzVw ...
Querying https://api.yelp.com/v3/businesses/VKMl433OFg1lolKdHgX5Wg ...
Querying https://api.yelp.com/v3/businesses/vxSN48M97AA08CcEbeJAxg ...
Querying https://api.yelp.com/v3/businesses/-dr5L_FCXNxtbA5h8_JZGQ ...
Queryi

Querying https://api.yelp.com/v3/businesses/KyMDPAPSBciO6cCtsEBXNg ...
Querying https://api.yelp.com/v3/businesses/20dcMWU65FpczwZjYkqF7Q ...
Querying https://api.yelp.com/v3/businesses/9mNXjexhq4OjOiyOYoV67A ...
Querying https://api.yelp.com/v3/businesses/_NvNEbQpyCPGs5CswhZsfw ...
Querying https://api.yelp.com/v3/businesses/u6wZ5dU5CmvLyevgfreyJw ...
Querying https://api.yelp.com/v3/businesses/AXnerw4iD-Konye6M89tEQ ...
Querying https://api.yelp.com/v3/businesses/JgwKfDXQk98P9ynkJxU9Iw ...
Querying https://api.yelp.com/v3/businesses/RbNfbSTdfmLvDE5tYBZOsw ...
Querying https://api.yelp.com/v3/businesses/lbkIGoU9Cb5MYpVvQHRgkA ...
Querying https://api.yelp.com/v3/businesses/b4Ylosj5CHyKRtTxvJxIwA ...
Querying https://api.yelp.com/v3/businesses/rc6ZkTxIpBWl8vZO8i5UDg ...
Querying https://api.yelp.com/v3/businesses/S_ZkBgavh2_1O38w05U4pA ...
Querying https://api.yelp.com/v3/businesses/4d9UENLYmmOjUSj6zADmJA ...
Querying https://api.yelp.com/v3/businesses/UeUQqyjgbiXIfXwkh3NRjg ...
Queryi

Querying https://api.yelp.com/v3/businesses/ILYhl0cu38IEtZNY_JfZrw ...
Querying https://api.yelp.com/v3/businesses/oYFUHj8XFM36SdPCoQaXPQ ...
Querying https://api.yelp.com/v3/businesses/jbD3a2F4GfYqXllJdgt_yg ...
Querying https://api.yelp.com/v3/businesses/bD_pIC9BvLJx0XMJSXuE2Q ...
Querying https://api.yelp.com/v3/businesses/uQBpvekquAbioiHNxaPTHw ...
Querying https://api.yelp.com/v3/businesses/h1TkuhYSSdyTwa7FJJj3yg ...
Querying https://api.yelp.com/v3/businesses/fPnBu-W0-Mcpq2ljfu7OpA ...
Querying https://api.yelp.com/v3/businesses/exKjpnTtF9GMfqdzQC4XlA ...
Querying https://api.yelp.com/v3/businesses/ufz94E6pYIKrtWv9P4zZNw ...
Querying https://api.yelp.com/v3/businesses/oJYo0mAzlasXiOnJ5UPEHQ ...
Querying https://api.yelp.com/v3/businesses/I5tXvU7xf4Og26RIXdclTw ...
Querying https://api.yelp.com/v3/businesses/Fmei46LMZP00dQ3EuTx7kg ...
Querying https://api.yelp.com/v3/businesses/gMwFWxyHCG0L8CYvHPy_zQ ...
Querying https://api.yelp.com/v3/businesses/Ovk7r6zq6ExLJvj8sGN1BA ...
Queryi

Querying https://api.yelp.com/v3/businesses/q9gSbV-hY0O7eIs--O2AfA ...
Querying https://api.yelp.com/v3/businesses/xV7Oe6ZE56qDVJ8NqSLJuQ ...
Querying https://api.yelp.com/v3/businesses/h-JSVmjhz1DRrof_M66AEA ...
Querying https://api.yelp.com/v3/businesses/0-nfZsIDDNXnlMN7qSRh5Q ...
Querying https://api.yelp.com/v3/businesses/t_VsrUZjQp-d7f-ix-0SeQ ...
Querying https://api.yelp.com/v3/businesses/YJkJZkWXOv0WsJI2u-T9uA ...
Querying https://api.yelp.com/v3/businesses/XDEoW3CfiDgFKnrTEFTQJQ ...
Querying https://api.yelp.com/v3/businesses/FQziJtb-cJ-EKIdsEpTLPg ...
Querying https://api.yelp.com/v3/businesses/8WYIG5N1tWsRtjL6DpZxqQ ...
Querying https://api.yelp.com/v3/businesses/J1poq215-JQFCSrFfUN4ZA ...
Querying https://api.yelp.com/v3/businesses/VyZuyPnFMVoK_1w-UGqAqQ ...
Querying https://api.yelp.com/v3/businesses/IWDAWWrKhH_RQJPe47a4jQ ...
Querying https://api.yelp.com/v3/businesses/VpU8gUZUWd6mleMHGekf4Q ...
Querying https://api.yelp.com/v3/businesses/mgv_2irNc-FwWGCU104d0g ...
Queryi

In [37]:
# Spot-check one alias-to-title lookup.
alias_to_title["appraisalservices"]

'Appraisal Services'

In [48]:
# Count the API results that have no "categories" key or an empty category list.
sum(
    1
    for info in augment_business.values()
    if "categories" not in info or info["categories"] == []
)

360

In [50]:
import json

# Save the fetched API records as JSON (indent=0: newlines, no indentation).
with open("../../yelp_dataset/augment_business.json", "w") as out_file:
    payload = json.dumps(augment_business, indent=0)
    out_file.write(payload)

In [58]:
# Count the fetched businesses that either have no "name" in their API result
# or whose name appears at most once in df_selected_meta.
# The per-name occurrence counts are computed with one grouped Spark job
# instead of launching a separate filter/count job for every business.
known_names = {
    info["name"]
    for info in augment_business.values()
    if info.get("name") is not None
}
name_counts = {
    row["name"]: row["count"]
    for row in (df_selected_meta
                .filter(col("name").isin(list(known_names)))
                .groupBy("name")
                .count()
                .collect())
}
# Names absent from the metadata have no entry and count as 0 occurrences.
count_ = sum(
    1
    for info in augment_business.values()
    if "name" not in info or name_counts.get(info["name"], 0) <= 1
)
print(count_)

377


In [57]:
# Number of businesses whose city is "Fairview Park".
df_selected_meta.filter(col("city") == "Fairview Park").count()

106

In [126]:
# Number of businesses per (state, top_category) pair, in a "count" column.
# NOTE(review): despite its name, this frame is grouped by state, not city.
city_count = df_filtered.groupBy("state", "top_category").count()

In [129]:
# Preview the grouped counts.
# NOTE(review): the recorded output below is a DataFrame repr rather than a
# rendered table, so it looks stale -- re-run to refresh.
city_count.show()

DataFrame[state: string, top_category: string, count: bigint]

In [134]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Rank the (state, top_category) groups from the smallest count upwards and
# show the first 100 rows.
ascending_by_count = Window.orderBy(asc("count"))
ranked_counts = city_count.withColumn("rank", rank().over(ascending_by_count))
ranked_counts.show(100)

+-----+--------------------+-----+----+
|state|        top_category|count|rank|
+-----+--------------------+-----+----+
|   VT|Event Planning & ...|    1|   1|
|   CA|        Local Flavor|    1|   1|
|   AL|    Health & Medical|    1|   1|
|   TX|         Restaurants|    1|   1|
|   FL|          Automotive|    1|   1|
|   CA|                Pets|    1|   1|
|  XWY|         Restaurants|    1|   1|
|   NY|       Beauty & Spas|    1|   1|
|   AK|           Education|    1|   1|
|   FL|     Hotels & Travel|    1|   1|
|   TN|       Home Services|    1|   1|
|   WA|       Home Services|    1|   1|
|   TX|Professional Serv...|    1|   1|
|  BAS|                Food|    1|   1|
|   GA|      Local Services|    1|   1|
|  CON|         Restaurants|    1|   1|
|  DUR|Professional Serv...|    1|   1|
|   BC|         Restaurants|    1|   1|
|   TX|            Shopping|    1|   1|
|   VT|       Home Services|    1|   1|
|   GA|       Home Services|    1|   1|
|  XGM|       Home Services|    1|   1|


In [89]:
# Number of groups with at most 100 businesses.
# NOTE(review): the recorded result (1074) exceeds the total group count
# recorded by the following cell (301), so it appears to come from an earlier
# definition of city_count -- re-run to refresh.
city_count.filter(col("count") <= 100).count()

1074

In [128]:
# Total number of (state, top_category) groups.
city_count.count()

301