## Wrangling the Yelp business data

In [1]:
import json
import re

import numpy as np
import pandas as pd

%matplotlib inline

In [71]:
# Load the three Toronto business extracts (all businesses, restaurants only,
# Japanese only) from the local `yelp_dataset` directory.
toronto_all_business_df = pd.read_csv(
    filepath_or_buffer='yelp_dataset/toronto_all_business.csv',
)
toronto_restaurant_business_df = pd.read_csv(
    filepath_or_buffer='yelp_dataset/toronto_restaurant_business.csv',
)
toronto_japanese_business_df = pd.read_csv(
    filepath_or_buffer='yelp_dataset/toronto_japanese_business.csv',
)

Wrangle the `attributes` column into a JSON-parsable format:

In [73]:
def wrangle_attributes(row):
    """Rewrite ``row['attributes']`` as text that ``json.loads`` can parse.

    A missing (null) value is replaced by the empty JSON object ``'{}'``.
    Otherwise a fixed sequence of plain string substitutions is applied:
    ``"u'`` prefixes are stripped, single quotes become double quotes,
    doubled double quotes are collapsed, the quotes wrapping nested ``{...}``
    values are removed, and bare `` False`` / `` True`` / `` None`` tokens
    (preceded by a space) are wrapped in double quotes.

    Parameters
    ----------
    row : object exposing ``attributes`` by attribute access and supporting
        item assignment (e.g. the row Series passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The same ``row`` object, with ``row['attributes']`` overwritten.
    """
    text = row.attributes

    # Missing attributes are normalised to an empty JSON object.
    if pd.isnull(text):
        row['attributes'] = '{}'
        return row

    # Order matters: every substitution operates on the previous one's output.
    substitutions = (
        ('"u\'', "'"),           # drop the `"u` in front of unicode-prefixed values
        ("'", '"'),              # single quotes -> double quotes
        ('""', '"'),             # collapse the doubled quotes left behind
        ('"{', '{'),             # unwrap the opening of a quoted nested object
        ('}"', '}'),             # unwrap the closing of a quoted nested object
        (' False', ' "False"'),  # bare literals become quoted strings
        (' True', ' "True"'),
        (' None', ' "None"'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    row['attributes'] = text
    return row

In [65]:
# Smoke test: every wrangled `attributes` value in the largest dataset must be
# accepted by json.loads, which raises on the first value it cannot parse.

for attributes_json in toronto_all_business_df.apply(wrangle_attributes, axis=1)['attributes']:
    json.loads(attributes_json)

In [74]:
# Rewrite the `attributes` column of each frame, row by row.
toronto_all_business_df = toronto_all_business_df.apply(wrangle_attributes, axis=1)
toronto_restaurant_business_df = toronto_restaurant_business_df.apply(wrangle_attributes, axis=1)
toronto_japanese_business_df = toronto_japanese_business_df.apply(wrangle_attributes, axis=1)

Wrangle the `hours` column into JSON-parsable text with consistently formatted times (e.g. `9:0` becomes `9:00`):

In [84]:
def wrangle_hours(row):
    """Rewrite ``row['hours']`` as JSON-parsable text with two-digit minutes.

    A missing (null) value is replaced by the empty JSON object ``'{}'``.
    Otherwise single quotes become double quotes, doubled double quotes are
    collapsed, quotes wrapping ``{...}`` are removed, and every single-digit
    minute field is zero-padded (``9:0`` -> ``9:00``, ``17:5`` -> ``17:05``).

    The function is idempotent: minutes that already have two digits are left
    untouched, so running it again on its own output (for example after the
    wrangled CSV has been written back and reloaded) does not corrupt times.
    A plain ``replace(':0', ':00')`` would turn ``9:00`` into ``9:000``.

    Parameters
    ----------
    row : object exposing ``hours`` by attribute access and supporting item
        assignment (e.g. the row Series passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The same ``row`` object, with ``row['hours']`` overwritten.
    """
    hours_data = row.hours

    # Missing opening hours are normalised to an empty JSON object.
    if pd.isnull(hours_data):
        row['hours'] = '{}'
        return row

    hours_data = hours_data.replace('\'', '\"')
    hours_data = hours_data.replace('""', '"')
    hours_data = hours_data.replace('"{', '{')
    hours_data = hours_data.replace('}"', '}')
    # Pad only a lone digit after ':' (the lookahead rejects a second digit),
    # so "9:0" -> "9:00" while "9:00", "9:05" and "11:30" stay unchanged.
    hours_data = re.sub(r':(\d)(?!\d)', r':0\1', hours_data)

    row['hours'] = hours_data
    return row

In [88]:
# Smoke test: every wrangled `hours` value in the largest dataset must be
# accepted by json.loads, which raises on the first value it cannot parse.

for hours_json in toronto_all_business_df.apply(wrangle_hours, axis=1)['hours']:
    json.loads(hours_json)

In [89]:
# Rewrite the `hours` column of each frame, row by row.
toronto_all_business_df = toronto_all_business_df.apply(wrangle_hours, axis=1)
toronto_restaurant_business_df = toronto_restaurant_business_df.apply(wrangle_hours, axis=1)
toronto_japanese_business_df = toronto_japanese_business_df.apply(wrangle_hours, axis=1)

In [91]:
# Persist the wrangled frames, without the index column, to the same CSV paths
# they were loaded from; the previous file contents are overwritten.
toronto_all_business_df.to_csv(
    path_or_buf='yelp_dataset/toronto_all_business.csv',
    index=False,
)
toronto_restaurant_business_df.to_csv(
    path_or_buf='yelp_dataset/toronto_restaurant_business.csv',
    index=False,
)
toronto_japanese_business_df.to_csv(
    path_or_buf='yelp_dataset/toronto_japanese_business.csv',
    index=False,
)