### Description
In This the <b>business.json</b> file and the <b>review.json</b> file are explored together. The average stars for each business category and the top <b>n</b> categories with the highest average stars.

#### Two Versions are employed
- With Spark
- Without Spark

### Spark Implementation

In [1]:
import findspark
findspark.init("/Users/chukuemekaogudu/Documents/Dev-Spark-Apache/Apache-Spark/spark-2.4.5-bin-hadoop2.7")
import os
import json
from pyspark import SparkContext, SparkConf

In [2]:
data_dir = "/Volumes/oli2/inf533_datasets"

In [3]:
conf = SparkConf().setMaster("local[*]").setAppName("Task2")
sc = SparkContext(conf=conf).getOrCreate()

In [4]:
def loadReviews(jsonObj):
    return (jsonObj["business_id"], (float(jsonObj["stars"]), 1))

In [5]:
def loadBusiness(jsonObj):
    categories = jsonObj.get("categories")
    categories_list = None
    if categories == None or categories == "":
        categories_list = []
    else:
        categories = categories.split(",")
        categories_list = [category.strip() for category in categories]
    return (jsonObj["business_id"], categories_list)

In [6]:
businessJson = sc.textFile(os.path.join(data_dir, "business.json"))

In [7]:
reviewsJson = sc.textFile(os.path.join(data_dir, "review.json"))

In [8]:
# Create Business RDD
businessRDD = businessJson.map(json.loads).map(lambda x: loadBusiness(x))

In [9]:
businessRDD.take(5)

[('1SWheh84yJXfytovILXOAQ', ['Golf', 'Active Life']),
 ('QXAEGFB4oINsVuTFxEYKFQ',
  ['Specialty Food',
   'Restaurants',
   'Dim Sum',
   'Imported Food',
   'Food',
   'Chinese',
   'Ethnic Food',
   'Seafood']),
 ('gnKjwL_1w79qoiV3IC_xQQ', ['Sushi Bars', 'Restaurants', 'Japanese']),
 ('xvX2CttrVhyG2z1dFg_0xw', ['Insurance', 'Financial Services']),
 ('HhyxOkGAM07SRYtlQ4wMFQ',
  ['Plumbing',
   'Shopping',
   'Local Services',
   'Home Services',
   'Kitchen & Bath',
   'Home & Garden',
   'Water Heater Installation/Repair'])]

In [10]:
# Create Reviews RDD
reviewsRDD = reviewsJson.map(json.loads).map(lambda x: loadReviews(x))

In [11]:
reviewsRDD.take(5)

[('mRUVMJkUGxrByzMQ2MuOpA', (1.0, 1)),
 ('LUN6swQYa4xJKaM_UEUOEw', (4.0, 1)),
 ('NyLYY8q1-H3hfsTwuwLPCg', (4.0, 1)),
 ('6lj2BJ4tJeu7db5asGHQ4w', (5.0, 1)),
 ('Mem13A3C202RzT53npn4NA', (5.0, 1))]

### Natural Join Business with Reviews

In [12]:
business_reviews = businessRDD.join(reviewsRDD).cache()

In [13]:
business_reviews.take(10)

[('jZ23B--fu21is2zrWiy4Kg',
  (['Pet Groomers', 'Pets', 'Pet Sitting', 'Pet Services'], (5.0, 1))),
 ('jZ23B--fu21is2zrWiy4Kg',
  (['Pet Groomers', 'Pets', 'Pet Sitting', 'Pet Services'], (5.0, 1))),
 ('2ffee6OI50skuFyEVmOBZQ',
  (['Eyelash Service', 'Beauty & Spas', 'Skin Care'], (5.0, 1))),
 ('pNnHgna1cCjUEZyXRUc6oA', (['Home Services', 'Cabinetry'], (5.0, 1))),
 ('Tuu_B0QGo7SQUuXW_Nd4hg',
  (['Middle Eastern', 'Pizza', 'Sandwiches', 'Chicken Wings', 'Restaurants'],
   (1.0, 1))),
 ('Tuu_B0QGo7SQUuXW_Nd4hg',
  (['Middle Eastern', 'Pizza', 'Sandwiches', 'Chicken Wings', 'Restaurants'],
   (5.0, 1))),
 ('iMb2NjGdBO6_JOe1OwoL7A',
  (['Hair Salons',
    "Men's Hair Salons",
    'Hair Stylists',
    'Barbers',
    'Beauty & Spas'],
   (5.0, 1))),
 ('iMb2NjGdBO6_JOe1OwoL7A',
  (['Hair Salons',
    "Men's Hair Salons",
    'Hair Stylists',
    'Barbers',
    'Beauty & Spas'],
   (5.0, 1))),
 ('iMb2NjGdBO6_JOe1OwoL7A',
  (['Hair Salons',
    "Men's Hair Salons",
    'Hair Stylists',
    'Bar

In [14]:
def mapCategory(values):
    cartegories_list = values[0]
    stars_count = values[1]
    categoriesMap = [(value, stars_count) for value in cartegories_list]
    return categoriesMap

In [15]:
category_stars = business_reviews.filter(lambda x: len(x[1][0]) != 0) \
                                 .map(lambda x: mapCategory(x[1])) \
                                 .flatMap(lambda x: x) \
                                 .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

In [16]:
category_stars.take(5)

[('Pet Sitting', (19364.0, 4752)),
 ('American (Traditional)', (456663.0, 126661)),
 ('Convenience Stores', (12670.0, 3866)),
 ('Grocery', (61282.0, 16876)),
 ("Children's Clothing", (4355.0, 1293))]

In [17]:
avg_stars = category_stars.mapValues(lambda x: x[0]/x[1]) \
                          .sortBy(lambda x: x[1], ascending=False)

In [103]:
avg_stars.take(50)

[('Calabrian', 5.0),
 ('Storefront Clinics', 5.0),
 ('Safety Equipment', 5.0),
 ('Christmas Markets', 5.0),
 ('Vocal Coach', 5.0),
 ('Bocce Ball', 5.0),
 ('Hearing Aids', 5.0),
 ('Astrologers', 5.0),
 ('Registry Office', 5.0),
 ('Makerspaces', 5.0),
 ('Crane Services', 5.0),
 ('Public Adjusters', 5.0),
 ('Structural Engineers', 5.0),
 ('Duplication Services', 5.0),
 ('Surf Schools', 5.0),
 ('Mobile Home Repair', 5.0),
 ('Geneticists', 5.0),
 ('Snorkeling', 5.0),
 ('Audio/Visual Equipment Rental', 5.0),
 ('Outdoor Movies', 5.0),
 ('Chinese Martial Arts', 5.0),
 ('Japanese Sweets', 5.0),
 ('IP & Internet Law', 5.0),
 ('DIY Auto Shop', 5.0),
 ('Water Suppliers', 5.0),
 ('Pathologists', 5.0),
 ('Sledding', 5.0),
 ('Rodeo', 5.0),
 ('Trivia Hosts', 5.0),
 ('Fireworks', 5.0),
 ('Habilitative Services', 5.0),
 ('Indoor Landscaping', 5.0),
 ('Taxidermy', 4.933333333333334),
 ('Caricatures', 4.933333333333334),
 ('Party Characters', 4.873015873015873),
 ('Surfing', 4.857142857142857),
 ('Qi Gong

### No Spark

In [18]:
def loadBusinessJson():
    id_cat = {}
    cat_count = {}
    with open(os.path.join(data_dir, "business.json"), "r") as file:
        lines = file.readlines()
        
        for line in lines:
            jsonObj = json.loads(line)
        
            categories = jsonObj.get("categories")
        
            if categories != None and categories != "":
                categories = categories.split(",")
                categories_list = [category.strip() for category in categories]
                id_cat[jsonObj["business_id"]] = categories_list
            
                for category in categories_list:
                    if cat_count.get(category) == None:
                        cat_count[category] = [0.0, 0]  
    return id_cat, cat_count

In [19]:
id_cat, cat_count = loadBusinessJson()

In [22]:
def mapBusinessReviews(id_cat, cat_count):
    file = open(os.path.join(data_dir, "review.json"), "r")
    
    while True:
        line = file.readline()
        
        if not line:
            break
            
        jsonObj = json.loads(line)
        
        categories_list = id_cat.get(jsonObj["business_id"])
        
        if categories_list != None:
            for category in categories_list:
                cat_count[category][0] += float(jsonObj["stars"])
                cat_count[category][1] += 1
    return cat_count

In [23]:
cat_count = mapBusinessReviews(id_cat, cat_count)

In [24]:
cat_count

{'Golf': [10522.0, 2886],
 'Active Life': [149116.0, 38349],
 'Specialty Food': [144384.0, 36358],
 'Restaurants': [2701752.0, 726531],
 'Dim Sum': [19749.0, 5700],
 'Imported Food': [9772.0, 2608],
 'Food': [996456.0, 261005],
 'Chinese': [159911.0, 45504],
 'Ethnic Food': [45739.0, 11796],
 'Seafood': [224805.0, 59127],
 'Sushi Bars': [166348.0, 43897],
 'Japanese': [204192.0, 53507],
 'Insurance': [5651.0, 1614],
 'Financial Services': [16194.0, 4976],
 'Plumbing': [17914.0, 4601],
 'Shopping': [330031.0, 89581],
 'Local Services': [138705.0, 36241],
 'Home Services': [180694.0, 49694],
 'Kitchen & Bath': [8301.0, 2210],
 'Home & Garden': [60912.0, 17113],
 'Water Heater Installation/Repair': [8887.0, 2178],
 'Shipping Centers': [5896.0, 2090],
 'Couriers & Delivery Services': [3606.0, 1081],
 'Printing Services': [8426.0, 2412],
 'Beauty & Spas': [296102.0, 74914],
 'Hair Salons': [92904.0, 22840],
 'Hair Stylists': [27561.0, 6359],
 'Barbers': [28441.0, 6926],
 "Men's Hair Salons"

In [32]:
avg_cat = {}

for category, star_count in cat_count.items():
    try:
        avg_cat[category] = star_count[0]/star_count[1]
    except ZeroDivisionError as e:
        avg_cat[category] = 0.0

In [33]:
avg_cat = sorted(avg_cat.items(), key=lambda kv: kv[1], reverse=True)

In [34]:
avg_cat

[('Surf Schools', 5.0),
 ('Rodeo', 5.0),
 ('IP & Internet Law', 5.0),
 ('Habilitative Services', 5.0),
 ('Mobile Home Repair', 5.0),
 ('Snorkeling', 5.0),
 ('Structural Engineers', 5.0),
 ('Astrologers', 5.0),
 ('Indoor Landscaping', 5.0),
 ('DIY Auto Shop', 5.0),
 ('Pathologists', 5.0),
 ('Vocal Coach', 5.0),
 ('Audio/Visual Equipment Rental', 5.0),
 ('Japanese Sweets', 5.0),
 ('Chinese Martial Arts', 5.0),
 ('Trivia Hosts', 5.0),
 ('Fireworks', 5.0),
 ('Duplication Services', 5.0),
 ('Safety Equipment', 5.0),
 ('Outdoor Movies', 5.0),
 ('Water Suppliers', 5.0),
 ('Public Adjusters', 5.0),
 ('Crane Services', 5.0),
 ('Storefront Clinics', 5.0),
 ('Hearing Aids', 5.0),
 ('Registry Office', 5.0),
 ('Sledding', 5.0),
 ('Geneticists', 5.0),
 ('Makerspaces', 5.0),
 ('Bocce Ball', 5.0),
 ('Christmas Markets', 5.0),
 ('Calabrian', 5.0),
 ('Caricatures', 4.933333333333334),
 ('Taxidermy', 4.933333333333334),
 ('Party Characters', 4.873015873015873),
 ('Boudoir Photography', 4.857142857142857)