### This notebook handles preprocessing and first-round data cleaning only. Take the new JSON files generated here and use them separately for exploration and modelling.

In [1]:
import numpy as np
import scipy
import random
import urllib
import matplotlib.pyplot as plt
import json

In [2]:
# Seed Python's built-in `random` module up front so that any later use of it
# produces the same sequence on every run.
# NOTE: only `random` is seeded here; numpy's generator is left untouched.
random.seed(0)

In [3]:
def readJSONfromFile(fname):
    """Read a file holding one JSON document per line and return them as a list.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Returns
    -------
    list
        The decoded objects, in the order they appear in the file. An empty
        file yields an empty list.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If a non-blank line is not valid JSON.
    """
    yelp_data = []
    with open(fname) as f:
        for line in f:
            # A blank or whitespace-only line holds no record, and json.loads
            # would raise on it, so skip it instead of aborting the whole load.
            if not line.strip():
                continue
            yelp_data.append(json.loads(line))
    return yelp_data

In [4]:
# Load every business record from the raw Yelp business dump into memory.
# The path is relative, so it resolves against the notebook's working directory.
data = readJSONfromFile('data/yelp_academic_dataset_business.json')

In [5]:
# Sanity check: report how many business records were loaded.
print("The dataset has {} businesses".format(len(data)))

The dataset has 144072 businesses


In [10]:
# Peek at a single raw record to see which fields a business entry carries.
data[11]

{u'address': u'1500 N Green Valley Pkwy, Ste 230',
 u'attributes': [u'Alcohol: none',
  u"Ambience: {'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}",
  u'BusinessAcceptsCreditCards: True',
  u"BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
  u'Caters: True',
  u'GoodForKids: True',
  u"GoodForMeal: {'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True, 'breakfast': False, 'brunch': False}",
  u'HasTV: True',
  u'NoiseLevel: quiet',
  u'OutdoorSeating: False',
  u'RestaurantsAttire: casual',
  u'RestaurantsDelivery: False',
  u'RestaurantsGoodForGroups: True',
  u'RestaurantsPriceRange2: 1',
  u'RestaurantsReservations: False',
  u'RestaurantsTableService: False',
  u'RestaurantsTakeOut: True',
  u'WiFi: no'],
 u'business_id': u'42romV8altAeuZuP2OC1gw',
 u'categories': [u'Hawaiian', u'Restaurants', u'Barbeq

### Restaurant Data preprocessing

#### We only want restaurant data. The Yelp data is clean in the sense that every business which is a restaurant has at least the "Restaurants" category. Bakeries and some coffee places are not tagged as restaurants, which is acceptable.

In [11]:
# Keep only the businesses tagged "Restaurants", and collect their ids so the
# other Yelp files can be filtered against them.
restaurant_data = []
restaurant_id_set = set()   # ids of the businesses kept in restaurant_data
for business in data:
    # .get tolerates a record with no "categories" key at all; a None value
    # means the business has no categories and is skipped.
    categories = business.get("categories")
    if categories is not None and "Restaurants" in categories:
        restaurant_data.append(business)
        restaurant_id_set.add(business["business_id"])
        

In [12]:
# Report how many businesses survived the "Restaurants" category filter.
print("The number of businesses which are just restaurants is {}".format(len(restaurant_data)))

The number of businesses which are just restaurants is 48485


In [13]:
# Persist the restaurant subset. json.dump writes the whole list as a single
# JSON array, unlike the input file, which is read one JSON object per line.
with open('yelp_restaurant_data.json', 'w') as outfile:
    json.dump(restaurant_data, outfile)


### User review data processing
#### Choose only those reviews pertaining to restaurants and store the reduced set of reviews

In [16]:
# Stream the review dump one line at a time and keep only the reviews whose
# business id is in restaurant_id_set, instead of loading the whole file first.
yelp_user_review_data = []
# Mode "r" replaces "U": the "U" flag is deprecated in Python 3 and raises
# ValueError from 3.11 on. Text mode already applies universal newlines there,
# and json.loads ignores any trailing "\r" left on a line otherwise.
with open("data/yelp_academic_dataset_review.json", "r") as f:
    for line in f:
        review = json.loads(line)
        if review["business_id"] in restaurant_id_set:
            yelp_user_review_data.append(review)

In [23]:
# Report how many reviews matched a restaurant business id.
print("Number of user reviews pertaining to restaurants is {}".format(len(yelp_user_review_data)))

Number of user reviews pertaining to restaurants is 2577298


In [24]:
# Persist the restaurant-only reviews. json.dump writes the whole list as a
# single JSON array rather than one object per line.
with open('yelp_user_review_data.json', 'w') as outfile:
    json.dump(yelp_user_review_data, outfile)

In [27]:
# NOTE(review): `f` and `outfile` were both opened by `with` statements in
# earlier cells, which already close them on exit, so these calls are
# redundant no-ops. They also only work while those names are still alive in
# the kernel from the earlier cells.
f.close()
outfile.close()

### User data processing
#### The user.json file has many attributes on each JSON object that are not relevant to our task. These are removed and a new JSON file is created.

#### The retained attributes include review_count and average_stars, which are computed across all Yelp businesses. We can get the restaurant-specific counts from the restaurant review data.
#### We only want the users who have written at least one review (i.e. gone to at least one restaurant). This removes the cold-start problem for now. Note: the cell below keeps every user in the file and does not apply this filter itself.

In [57]:
# Reduce every user record to the few attributes listed in keep_property_arr.
yelp_user_data = []
keep_property_arr = ["user_id", "name", "yelping_since", "review_count", "average_stars", "fans"]  # these are the attributes to keep

# Mode "r" replaces "U": the "U" flag is deprecated in Python 3 and raises
# ValueError from 3.11 on. Text mode already applies universal newlines there.
with open("data/yelp_academic_dataset_user.json", "r") as f:
    for line in f:
        user = json.loads(line)
        # The comprehension variable is `key`, not `property`, so the builtin
        # `property` is no longer shadowed for the rest of the notebook.
        # A missing attribute still raises KeyError, as before.
        yelp_user_data.append({key: user[key] for key in keep_property_arr})
        

In [59]:
# Spot-check one trimmed record to confirm only the kept attributes remain.
yelp_user_data[100]

{'average_stars': 3.76,
 'fans': 15,
 'name': u'Derek',
 'review_count': 341,
 'user_id': u'i3aWph4gS_pUEV8f5i5lzg',
 'yelping_since': u'2013-05-02'}

In [60]:
# Number of user records collected.
len(yelp_user_data)

1029432

In [61]:
# Persist the trimmed user records; json.dump writes the list as one JSON array.
with open('yelp_user_data.json', 'w') as outfile:
    json.dump(yelp_user_data, outfile)
# Announce success only once the `with` block has flushed and closed the file.
# No explicit close() calls are needed: `outfile` is closed by the `with`
# above, and the input handle `f` was closed by its own `with` block.
print("File {} written".format("yelp_user_data.json"))

File yelp_user_data.json written
