In [None]:
# convert yelp_business.json to csv after splitting attributes into their own columns and splitting each day's hours into open/close time columns
import pandas as pd

# location of the business JSON dump, relative to the notebook's working directory
input_path  = 'data/json/yelp_business.json'

# lines=True makes pandas parse the file as one JSON object per line
df = pd.read_json(input_path, lines=True)

def normalize_time(t):
    '''Zero-pad an "H:M" time string to "HH:MM".

    Missing values (anything pd.isna reports as null) are returned as None.
    A string that does not consist of exactly two integer fields separated
    by ":" raises ValueError.
    '''
    if not pd.isna(t):
        hour_text, minute_text = t.split(':')
        return '{:02d}:{:02d}'.format(int(hour_text), int(minute_text))
    return None


# expand the attributes dicts into one column per attribute key and
# put those columns alongside the remaining business columns
attribute_columns = pd.json_normalize(df['attributes'])
df = pd.concat([df.drop(columns='attributes'), attribute_columns], axis=1)

# one column per key found in the hours dicts
hours_df = pd.json_normalize(df['hours'])

for day in hours_df.columns:
    open_col, close_col = f'{day}_open', f'{day}_close'

    # each value is split on '-' into an opening part and a closing part
    hours_df[[open_col, close_col]] = hours_df[day].str.split('-', expand=True)

    # zero-pad each part, parse it as %H:%M and keep only the time-of-day
    for bound_col in (open_col, close_col):
        padded = hours_df[bound_col].apply(normalize_time)
        hours_df[bound_col] = pd.to_datetime(padded, format='%H:%M').dt.time

    # the combined column is no longer needed once it has been split
    hours_df = hours_df.drop(columns=day)

# swap the raw hours column for the per-day open/close columns
df = pd.concat([df.drop(columns='hours'), hours_df], axis=1)

# # explode categories into their own row for each business w/ multiple categories
# df['categories'] = df['categories'].dropna().str.split(',')
# df_exploded = df.explode('categories').reset_index(drop=True)
# df_exploded


df.to_csv('data/csv/yelp_business.csv', index=False)

In [39]:
# convert review jsons to csv
import pandas as pd

# columns converted in every review chunk
review_int_cols = ('stars', 'useful', 'funny', 'cool')
review_str_cols = ('review_id', 'user_id', 'business_id', 'text')

# the review dump is split into files numbered 1 through 10
for part in range(1, 11):
    input_path = f'data/json/yelp_review_{part}.json'
    df = pd.read_json(input_path, lines=True)

    # downcast the count/rating columns to the smallest integer dtype that fits
    for name in review_int_cols:
        df[name] = pd.to_numeric(df[name], downcast='integer')

    # store the id and free-text columns with pandas' string dtype
    for name in review_str_cols:
        df[name] = df[name].astype('string')

    df.to_csv(f'data/csv/yelp_review_{part}.csv', index=False)

In [5]:
# convert users json to csv
import pandas as pd

# columns converted in every user chunk
user_int_cols = (
    'review_count', 'useful', 'funny', 'cool', 'fans',
    'compliment_hot', 'compliment_more', 'compliment_profile',
    'compliment_cute', 'compliment_list', 'compliment_note',
    'compliment_plain', 'compliment_cool', 'compliment_funny',
    'compliment_writer', 'compliment_photos',
)
user_str_cols = ('user_id', 'name')

# the user dump is split into files numbered 1 through 8
for part in range(1, 9):
    input_path = f'data/json/yelp_user_{part}.json'
    df = pd.read_json(input_path, lines=True)

    # downcast the counter columns to the smallest integer dtype that fits
    for name in user_int_cols:
        df[name] = pd.to_numeric(df[name], downcast='integer')

    # store the id and display-name columns with pandas' string dtype
    for name in user_str_cols:
        df[name] = df[name].astype('string')

    # parse the signup timestamp and shrink the rating average to a smaller float
    signup = df['yelping_since']
    df['yelping_since'] = pd.to_datetime(signup)
    rating_mean = df['average_stars']
    df['average_stars'] = pd.to_numeric(rating_mean, downcast='float')

    df.to_csv(f'data/csv/yelp_user_{part}.csv', index=False)

In [10]:
# convert yelp tips json to csv
import pandas as pd

# the tips dump is a single file, read as one JSON object per line
input_path = 'data/json/yelp_tip.json'
df = pd.read_json(input_path, lines=True)

# store the id and free-text columns with pandas' string dtype
tip_str_cols = ('user_id', 'business_id', 'text')
for name in tip_str_cols:
    df[name] = df[name].astype('string')

# downcast the compliment counter to the smallest integer dtype that fits
compliments = df['compliment_count']
df['compliment_count'] = pd.to_numeric(compliments, downcast='integer')

df.to_csv('data/csv/yelp_tip.csv', index=False)