# Clean and Scale Data 
### Author: Jasmin Pena 
### Contribution and details
- I adopted Jennifer Nguyen's method of importing data and Adefemi Abimbola's feature extraction and engineering. 
- I added many more features that might be useful for analysis
- I scaled continous variables 

In [301]:
# Import libraries and load datasets
import json
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm  # For progress bars
from datetime import datetime # To get hours open
from sklearn.preprocessing import StandardScaler #To scale continous features 
import ast

In [302]:
#Reading and saving datasets 

#Paths to load datasets
business_path = 'yelp_academic_dataset_business.json'
review_path = 'yelp_academic_dataset_review.json'
user_path = 'yelp_academic_dataset_user.json'
checkin_path = 'yelp_academic_dataset_checkin.json'
tip_path = 'yelp_academic_dataset_tip.json'

#Reading JSON files 
def read_json(file_path, max_records=None):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f)):
            if max_records and i >= max_records:
                break
            data.append(json.loads(line))
    return pd.DataFrame(data)
    
# Removed 'max_records' for business and users since we have to clean it
# Saved 'max_records' for others for faster processing but can remove when utilizing entire dataset
business = read_json(business_path)
review = read_json(review_path, max_records=5000)
user = read_json(user_path)
checkin = read_json(checkin_path, max_records=5000)
tip = read_json(tip_path, max_records=5000)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [303]:
# Aggregating the dataset into a master dataset
# master dataset created each row represents one review

business.rename(columns={'name': 'business_name',
                         'address': 'business_address',
                         'city': 'business_city',
                         'stars': 'business_stars',
                         'review_count':'business_review_count',
                         'attributes':'business_attributes',
                         'categories':'business_categories',
                         'hours':'business_hours'}, inplace=True)
business.drop(columns=['is_open'], inplace=True)
review.rename(columns={'stars': 'review_stars',
                       'date': 'review_date',
                       'text': 'review'}, inplace=True)
review.drop(columns=['useful', 'funny','cool'], inplace=True)
user.rename(columns={'name': 'user_name',
                     'review_count': 'user_review_count',
                     'average_stars': 'user_average_stars',
                     'compliments_total': 'User_compliments_total'}, inplace=True)
user.drop(columns=['elite', 'fans'], inplace=True)
checkin.rename(columns={'date': 'checkin_dates'}, inplace=True)
master = review.merge(business, on='business_id', how='left')
master = master.merge(user, on='user_id', how='left')
master = master.merge(checkin, on='business_id', how='left')
master.dropna(inplace=True)

#View data 
master.head()

Unnamed: 0,review_id,user_id,business_id,review_stars,review,review_date,business_name,business_address,business_city,state,...,business_stars,business_review_count,business_attributes,business_categories,business_hours,user_name,user_review_count,user_average_stars,User_compliments_total,checkin_dates
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,Dmitri's,795 S 3rd St,Philadelphia,PA,...,4.0,273,"{'BusinessParking': '{'garage': False, 'street...","Mediterranean, Restaurants, Seafood, Greek","{'Wednesday': '17:30-21:0', 'Thursday': '17:30...",Q,4,2.0,1,"2010-03-27 22:04:48, 2010-07-13 00:09:00, 2010..."
32,40thYphUgIfvJq17QCfTwA,QzCEzH3R7Z6erOGLr3t55Q,0pMj5xUAecW9o1P35B0AMw,5.0,Great staff always helps and always nice. Alwa...,2017-05-26 13:10:24,Wawa,2544 W Main Street,Norristown,PA,...,3.5,8,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Food, Coffee & Tea, Gas Stations, Restaurants,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",Kylhalil,14,4.36,0,"2014-10-01 10:33:34, 2014-10-20 12:23:13, 2014..."
90,byblHsbxiqb1pC1cuSfslA,C_2mNjl-doRVvsL03_T57Q,18eWJFJbXyR9j_5xfcRLYA,4.0,This is the first time I tried this place and ...,2011-10-28 03:43:05,Siam Elephant,509 Linden Ave,Carpinteria,CA,...,4.5,460,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Restaurants, Thai","{'Tuesday': '17:0-21:30', 'Wednesday': '17:0-2...",Richard,28,4.07,2,"2010-03-26 19:31:36, 2010-06-14 03:47:29, 2010..."
125,quiZPC8t-iZs1uiMA1ovEQ,TTibuRAx2gxu-nVAymFijQ,-ikBycdroyTLDBHR9aC3HA,5.0,Stopped in for the lunch menu with my girlfrie...,2014-09-25 18:36:53,Sukho Thai,2450 Music Valley Dr,Nashville,TN,...,3.5,116,"{'OutdoorSeating': 'False', 'BusinessAcceptsCr...","Thai, Restaurants","{'Monday': '16:30-21:0', 'Tuesday': '16:30-21:...",Joe,32,4.67,1,"2010-04-22 00:29:12, 2010-11-20 00:03:45, 2010..."
138,zqmkEnp1kfU2vosDcG2kMg,KqKXOl0PMlZGBMlw8OUpyA,-If0ps0QhOLCYVWQWs9RYg,5.0,Yes! I love this place! Maple Street Patisseri...,2013-05-28 21:37:01,Maple Street Patisserie,7638 Maple St,New Orleans,LA,...,3.5,171,"{'Ambience': '{'romantic': False, 'intimate': ...","Restaurants, Food, Sandwiches, Bakeries","{'Tuesday': '6:0-17:0', 'Wednesday': '6:0-17:0...",Jessica,36,4.42,15,"2010-08-02 21:22:49, 2010-08-07 20:57:07, 2010..."


In [304]:
#Learn about dataset
print('Data set info:')
print(master.info())
print()

print('Data set .describe:')
print(master.describe())
print()

print('data set null count:')
print(master.isnull().sum())
print()

Data set info:
<class 'pandas.core.frame.DataFrame'>
Index: 199 entries, 5 to 4959
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   review_id               199 non-null    object 
 1   user_id                 199 non-null    object 
 2   business_id             199 non-null    object 
 3   review_stars            199 non-null    float64
 4   review                  199 non-null    object 
 5   review_date             199 non-null    object 
 6   business_name           199 non-null    object 
 7   business_address        199 non-null    object 
 8   business_city           199 non-null    object 
 9   state                   199 non-null    object 
 10  postal_code             199 non-null    object 
 11  business_stars          199 non-null    float64
 12  business_review_count   199 non-null    int64  
 13  business_attributes     199 non-null    object 
 14  business_categories     199 non

In [305]:
#Taking a look at keys to see which to save
unique_keys = set()

for row in master['business_attributes']:
    if isinstance(row, dict):
        keys = row.keys()
        unique_keys.update(keys)

print(unique_keys)

{'WheelchairAccessible', 'RestaurantsTakeOut', 'BusinessParking', 'RestaurantsTableService', 'Caters', 'BYOB', 'BestNights', 'Corkage', 'Music', 'RestaurantsReservations', 'RestaurantsDelivery', 'RestaurantsPriceRange2', 'DogsAllowed', 'RestaurantsGoodForGroups', 'GoodForDancing', 'BYOBCorkage', 'ByAppointmentOnly', 'GoodForKids', 'BusinessAcceptsCreditCards', 'Smoking', 'Ambience', 'HasTV', 'RestaurantsAttire', 'AcceptsInsurance', 'NoiseLevel', 'Alcohol', 'DriveThru', 'WiFi', 'BikeParking', 'OutdoorSeating', 'GoodForMeal', 'BusinessAcceptsBitcoin', 'HappyHour', 'CoatCheck'}


### What Variables to Keep
Here, we look at variables that  could contribute to a business's success, and based on what has been used.  
- ByAppointmentOnly
- NoiseLevel: int
- RestaurantsDelivery
- ResturantsReservations
- ResturantsGoodForGroups
- RestaurantsTakeOut
- ResturantsPriceRange2
- BusinessAcceptsCreditCards
- AcceptsInsurance
- WiFi
- DriveThru
- BikeParking

These are dictionaries that require special extracting:
- GoodForMeal: breakfast, brunch, lunch, etc...
- BusinessParking
- Music
- Alcohol: has type but will make to true/false

In [307]:
#Going to see unique values for each key, excluding keys that are dictionaries
target_keys = [ 'ByAppointmentOnly', 'NoiseLevel', 'RestaurantsDelivery',  
                'RestaurantsTakeOut', 'BusinessAcceptsCreditCards', 'WiFi',  
               'RestaurantsReservations', 'RestaurantsGoodForGroups', 
               'RestaurantsPriceRange2', 'AcceptsInsurance', 'DriveThru','BikeParking', 'Alcohol']

unique_values = defaultdict(set)

for row in master['business_attributes']:
    if isinstance(row, dict):
        for key in target_keys:
            if key in row:
                value = row[key]
                unique_values[key].add(value)

# To display nicely:
for key, values in unique_values.items():
    print(f"{key}: {sorted(values)}")

NoiseLevel: ["'average'", "'loud'", "u'average'", "u'loud'", "u'quiet'"]
RestaurantsDelivery: ['False', 'None', 'True']
RestaurantsTakeOut: ['False', 'None', 'True']
BusinessAcceptsCreditCards: ['False', 'True']
WiFi: ["'free'", "'no'", "u'free'", "u'no'"]
RestaurantsReservations: ['False', 'True']
RestaurantsGoodForGroups: ['False', 'True']
RestaurantsPriceRange2: ['1', '2', '3', '4']
BikeParking: ['False', 'True']
Alcohol: ["'beer_and_wine'", "'full_bar'", "'none'", "u'beer_and_wine'", "u'full_bar'", "u'none'"]
ByAppointmentOnly: ['False', 'True']
AcceptsInsurance: ['True']
DriveThru: ['False', 'None', 'True']


In [308]:
#Functions to extract and create features

#Cleaning unicode values
def clean_value(val):
    if isinstance(val, str):
        val = val.strip()  # remove extra spaces
        if val.lower() in ["'none'", "none", "u'none'"]:
            return np.nan
        if val.startswith("u'") and val.endswith("'"):
            val = val[2:-1]  # remove leading u
        if val.startswith("'") and val.endswith("'"):
            val = val[1:-1]  # remove extra quotes
        val = val.lower()  # optional: make everything lowercase
    return val

def simplify_category(cat_string):
    if not isinstance(cat_string, str):
        return 'Other'
    
    cat_string = cat_string.lower()

    if 'restaurant' in cat_string or 'food' in cat_string:
        return 'Restaurant'
    elif 'shopping' in cat_string or 'store' in cat_string:
        return 'Retail'
    elif 'health' in cat_string or 'medical' in cat_string or 'nutritionist' in cat_string:
        return 'Health'
    elif 'beauty' in cat_string or 'spa' in cat_string or 'salon' in cat_string:
        return 'Beauty'
    elif 'bar' in cat_string or 'nightlife' in cat_string or 'club' in cat_string:
        return 'Nightlife'
    elif 'education' in cat_string or 'school' in cat_string:
        return 'Education'
    elif 'service' in cat_string or 'notary' in cat_string:
        return 'Professional Services'
    elif 'public service' in cat_string or 'government' in cat_string:
        return 'Government'
    elif 'automotive' in cat_string or 'car' in cat_string or 'mechanic' in cat_string or 'tire' in cat_string:
        return 'Automotive'
    elif 'gym' in cat_string or 'fitness' in cat_string or 'yoga' in cat_string or 'active life' in cat_string:
        return 'Fitness'
    elif 'pets' in cat_string or 'animal' in cat_string or 'veterinary' in cat_string:
        return 'Veterinary'
    elif 'hotel' in cat_string or 'lodging' in cat_string or 'travel' in cat_string:
        return 'Travel'
    elif 'church' in cat_string or 'temple' in cat_string or 'synagogue' in cat_string or 'place of worship' in cat_string:
        return 'Religious'
    elif 'movie' in cat_string or 'cinema' in cat_string or 'theater' in cat_string or 'art' in cat_string or 'museum' in cat_string or 'entertainment' in cat_string:
        return 'Entertainment'
    elif 'media' in cat_string or 'news' in cat_string or 'radio' in cat_string or 'television' in cat_string:
        return 'Media/Station'
    else:
        return 'Other'

def extract_parking_type(attr_str, parking_type):
    try:
        # If attr_str is a string, convert it to a dictionary
        if isinstance(attr_str, str):
            attr_dict = ast.literal_eval(attr_str)
        else:
            # If it's already a dictionary, use it as-is
            attr_dict = attr_str

        # Now extract 'BusinessParking' and parse it
        parking_str = attr_dict.get('BusinessParking')
        parking_dict = ast.literal_eval(parking_str) if parking_str else {}
        return parking_dict.get(parking_type, None)
    except:
        return None


#Extract number of hours and days open 
def extract_open_info(hours_dict):
    days_open = 0
    hours_open = 0.0
    for day, hours in hours_dict.items():
        if hours != '0:0-0:0' and hours:
            days_open += 1 #Increment day

            #Get hours string
            open_str, close_str = hours.split('-')

            #Convert to datetime 
            open = datetime.strptime(open_str, '%H:%M')
            close = datetime.strptime(close_str, '%H:%M')

            #Get time difference by converting to seconds then convert to hour
            time_diff = (close - open).seconds / 3600

            #Save to hours_open
            hours_open += time_diff

    return pd.Series([days_open, hours_open])

In [309]:
#Building new columns to add to database
for row in master['business_attributes']:
    if isinstance(row, dict):
        for key in target_keys:
            if key in row:
                value = clean_value(row[key])
                unique_values[key].add(value)

for key in target_keys:
    master[key] = master['business_attributes'].apply(
        lambda d: clean_value(d.get(key)) if isinstance(d, dict) else np.nan)
    
# simplifying the business categories
master['category_simple'] = master['business_categories'].apply(simplify_category)

# Add number of hours and days open
master[['days_open_count', 'hours_open_count']] = master['business_hours'].apply(extract_open_info).apply(pd.Series)

# Add cleaned columns for 'Alcohol' and 'Music'
master['Alcohol'] = master['business_attributes'].apply(
    lambda x: any([val not in [None, 'None', ''] for val in x.get('Alcohol', [])]) if isinstance(x, dict) else False)
master['Music'] = master['business_attributes'].apply(
    lambda x: any([val not in [None, 'None', ''] for val in x.get('Music', [])]) if isinstance(x, dict) else False)

#Review Length
master['review_length'] = master['review'].apply(lambda x: len(str(x).split()))

# Add parking type 
master['parking_garage'] = master['business_attributes'].apply(lambda x: extract_parking_type(x, 'garage'))
master['parking_lot'] = master['business_attributes'].apply(lambda x: extract_parking_type(x, 'lot'))
master['parking_street'] = master['business_attributes'].apply(lambda x: extract_parking_type(x, 'street'))

#Drop column business_hours
master = master.drop('business_hours', axis = 1)

#Rename credit_cards column and drop old one
master = master.rename(columns = {'BusinessAcceptsCreditCards': 'credit_cards'})
master

Unnamed: 0,review_id,user_id,business_id,review_stars,review,review_date,business_name,business_address,business_city,state,...,BikeParking,Alcohol,category_simple,days_open_count,hours_open_count,Music,review_length,parking_garage,parking_lot,parking_street
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,I am a long term frequent customer of this est...,2015-09-23 23:10:31,Dmitri's,795 S 3rd St,Philadelphia,PA,...,true,True,Restaurant,5.0,20.0,False,65,False,False,True
32,40thYphUgIfvJq17QCfTwA,QzCEzH3R7Z6erOGLr3t55Q,0pMj5xUAecW9o1P35B0AMw,5.0,Great staff always helps and always nice. Alwa...,2017-05-26 13:10:24,Wawa,2544 W Main Street,Norristown,PA,...,true,False,Restaurant,0.0,0.0,False,28,False,False,False
90,byblHsbxiqb1pC1cuSfslA,C_2mNjl-doRVvsL03_T57Q,18eWJFJbXyR9j_5xfcRLYA,4.0,This is the first time I tried this place and ...,2011-10-28 03:43:05,Siam Elephant,509 Linden Ave,Carpinteria,CA,...,true,True,Restaurant,6.0,37.0,False,90,False,True,True
125,quiZPC8t-iZs1uiMA1ovEQ,TTibuRAx2gxu-nVAymFijQ,-ikBycdroyTLDBHR9aC3HA,5.0,Stopped in for the lunch menu with my girlfrie...,2014-09-25 18:36:53,Sukho Thai,2450 Music Valley Dr,Nashville,TN,...,true,True,Restaurant,6.0,29.5,False,24,False,True,False
138,zqmkEnp1kfU2vosDcG2kMg,KqKXOl0PMlZGBMlw8OUpyA,-If0ps0QhOLCYVWQWs9RYg,5.0,Yes! I love this place! Maple Street Patisseri...,2013-05-28 21:37:01,Maple Street Patisserie,7638 Maple St,New Orleans,LA,...,true,True,Restaurant,6.0,61.0,False,73,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4725,MdGvGVtFZdU-ZLoBgIFo3Q,7ctnUScX9B362qXOebNk9w,-TCa3KBib07_1ko9L2Z0fQ,2.0,Foods good but don't order online for delivery...,2018-09-25 15:34:41,Yeoman's Cask and Lion,"200 1st Ave S, Ste B",St. Petersburg,FL,...,,True,Restaurant,7.0,83.0,True,18,False,False,False
4811,LYBQCkNwLY-Z_2By42vwMg,wcvwEyMLZViMP2rRZAaiZA,1FURjeGJi_LBXcJQg8eskw,4.0,This place is in a great and fun location and ...,2016-06-23 02:21:01,Padaro Beach Grill,3765 Santa Claus Ln,Carpinteria,CA,...,true,True,Restaurant,6.0,52.0,True,85,False,False,True
4920,toLAMehjvvW3JV33WjboRA,1ZhcB8kduDlsC3j70GYAOg,1-z7wd860Rii4kbEMCT8DA,3.0,Any breakfast place that I can remember six mo...,2013-10-25 06:49:06,Moon's Kitchen Cafe,712 W Idaho St,Boise,ID,...,true,True,Restaurant,5.0,42.0,False,73,True,False,True
4930,r-YKEJT6JjiLIz2YmQ-qNw,uUcNCwf86aea7F9AGFBJNQ,0zf4KKZqbqoxiuKzeZLDdg,2.0,I don't know what so special about this place....,2011-01-09 23:01:13,PrimoHoagies,128 S 11th St,Philadelphia,PA,...,true,True,Restaurant,7.0,83.5,False,25,False,False,True


In [310]:
master.dtypes

review_id                    object
user_id                      object
business_id                  object
review_stars                float64
review                       object
review_date                  object
business_name                object
business_address             object
business_city                object
state                        object
postal_code                  object
business_stars              float64
business_review_count         int64
business_attributes          object
business_categories          object
user_name                    object
user_review_count             int64
user_average_stars          float64
User_compliments_total        int64
checkin_dates                object
ByAppointmentOnly            object
NoiseLevel                   object
RestaurantsDelivery          object
RestaurantsTakeOut           object
credit_cards                 object
WiFi                         object
RestaurantsReservations      object
RestaurantsGoodForGroups    

In [311]:
master.columns

Index(['review_id', 'user_id', 'business_id', 'review_stars', 'review',
       'review_date', 'business_name', 'business_address', 'business_city',
       'state', 'postal_code', 'business_stars', 'business_review_count',
       'business_attributes', 'business_categories', 'user_name',
       'user_review_count', 'user_average_stars', 'User_compliments_total',
       'checkin_dates', 'ByAppointmentOnly', 'NoiseLevel',
       'RestaurantsDelivery', 'RestaurantsTakeOut', 'credit_cards', 'WiFi',
       'RestaurantsReservations', 'RestaurantsGoodForGroups',
       'RestaurantsPriceRange2', 'AcceptsInsurance', 'DriveThru',
       'BikeParking', 'Alcohol', 'category_simple', 'days_open_count',
       'hours_open_count', 'Music', 'review_length', 'parking_garage',
       'parking_lot', 'parking_street'],
      dtype='object')

In [312]:
#Checking unique values of each column. 

master_columns = ['ByAppointmentOnly', 'NoiseLevel',
       'RestaurantsDelivery', 'RestaurantsTakeOut', 'credit_cards', 'WiFi',
       'RestaurantsReservations', 'RestaurantsGoodForGroups',
       'RestaurantsPriceRange2', 'AcceptsInsurance', 'DriveThru',
       'BikeParking', 'category_simple', 'days_open_count', 'hours_open_count',
       'Alcohol', 'Music', 'parking_garage', 'parking_lot', 'parking_street']

for col in master_columns:
    print(f'{col}: {master[col].unique()}')

ByAppointmentOnly: [None 'false' 'true']
NoiseLevel: ['average' None 'quiet' 'loud']
RestaurantsDelivery: ['false' 'true' None nan]
RestaurantsTakeOut: ['true' None nan 'false']
credit_cards: ['false' 'true' None]
WiFi: ['no' 'free' None]
RestaurantsReservations: ['false' None 'true']
RestaurantsGoodForGroups: ['false' None 'true']
RestaurantsPriceRange2: ['2' '1' None '3' '4']
AcceptsInsurance: [None 'true']
DriveThru: [None 'true' 'false' nan]
BikeParking: ['true' 'false' None]
category_simple: ['Restaurant' 'Beauty' 'Retail' 'Professional Services' 'Automotive'
 'Health' 'Fitness' 'Nightlife' 'Veterinary']
days_open_count: [5. 0. 6. 7. 2. 4.]
hours_open_count: [ 20.    0.   37.   29.5  61.   76.  119.   70.   56.5  45.   73.   36.
  64.   57.5  62.   42.   88.   32.   40.   58.5  85.   66.   63.  112.
  44.   67.   25.   60.   74.   35.   64.5  87.   30.   52.   23.   92.
  80.5  39.   80.   50.   55.  105.   47.   49.   68.   89.5  56.   54.
 113.   98.   95.   51.5  65.  108.5  43

In [313]:
#Converting booleans/strings (True/false) to numerical (0/1)

def convert_true_false_to_int(value):
    # Check if the value is a string and lower case it for comparison
    if isinstance(value, str):
        value = value.lower()

    # Convert 'true'/'false' (as strings) or True/False to 1/0
    if value in ['true', '1', True]:
        return 1
    elif value in ['false', '0', False]:
        return 0
    else:
        # Handle any other values that might not fit
        return value

convert_columns = ['ByAppointmentOnly',
                    'RestaurantsDelivery',
                    'RestaurantsTakeOut',
                    'credit_cards',
                    'RestaurantsReservations',
                    'RestaurantsGoodForGroups',
                    'DriveThru',
                    'BikeParking',
                    'Alcohol',
                    'Music',
                    'parking_garage',
                    'parking_lot',
                    'parking_street']

# Iterate over each column and apply the conversion function
for col in convert_columns:
    master[col] = master[col].apply(lambda x: convert_true_false_to_int(x))

#Encode NoiseLevel
# Define the mapping
sound_level_mapping = {
    'quiet': 0,
    'average': 1,
    'loud': 2,
    None: np.nan  
}

# Apply the mapping
master['NoiseLevel_Encode'] = master['NoiseLevel'].map(sound_level_mapping)

# Check the result
for col in convert_columns:
    print(f'{col}: {master[col].unique()}')

ByAppointmentOnly: [nan  0.  1.]
RestaurantsDelivery: [ 0.  1. nan]
RestaurantsTakeOut: [ 1. nan  0.]
credit_cards: [ 0.  1. nan]
RestaurantsReservations: [ 0. nan  1.]
RestaurantsGoodForGroups: [ 0. nan  1.]
DriveThru: [nan  1.  0.]
BikeParking: [ 1.  0. nan]
Alcohol: [1 0]
Music: [0 1]
parking_garage: [ 0.  1. nan]
parking_lot: [ 0.  1. nan]
parking_street: [ 1.  0. nan]


In [314]:
#Scale continuous values

#Columns to be scaled
scale_columns = [
    'review_stars',
    'business_review_count',
    'user_review_count',
    'user_average_stars',
    'User_compliments_total',
    'review_length',
    'days_open_count',
    'hours_open_count'
]

#initilize and apply scaler. Chose standard scaler but if another is needed please let me know. 
scaler = StandardScaler()
master[scale_columns] = scaler.fit_transform(master[scale_columns])
master

Unnamed: 0,review_id,user_id,business_id,review_stars,review,review_date,business_name,business_address,business_city,state,...,Alcohol,category_simple,days_open_count,hours_open_count,Music,review_length,parking_garage,parking_lot,parking_street,NoiseLevel_Encode
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,-2.058394,I am a long term frequent customer of this est...,2015-09-23 23:10:31,Dmitri's,795 S 3rd St,Philadelphia,PA,...,1,Restaurant,-0.701262,-1.594000,0,-0.397852,0.0,0.0,1.0,1.0
32,40thYphUgIfvJq17QCfTwA,QzCEzH3R7Z6erOGLr3t55Q,0pMj5xUAecW9o1P35B0AMw,0.883225,Great staff always helps and always nice. Alwa...,2017-05-26 13:10:24,Wawa,2544 W Main Street,Norristown,PA,...,0,Restaurant,-4.155499,-2.408916,0,-0.843522,0.0,0.0,0.0,
90,byblHsbxiqb1pC1cuSfslA,C_2mNjl-doRVvsL03_T57Q,18eWJFJbXyR9j_5xfcRLYA,0.147820,This is the first time I tried this place and ...,2011-10-28 03:43:05,Siam Elephant,509 Linden Ave,Carpinteria,CA,...,1,Restaurant,-0.010415,-0.901322,0,-0.096724,0.0,1.0,1.0,1.0
125,quiZPC8t-iZs1uiMA1ovEQ,TTibuRAx2gxu-nVAymFijQ,-ikBycdroyTLDBHR9aC3HA,0.883225,Stopped in for the lunch menu with my girlfrie...,2014-09-25 18:36:53,Sukho Thai,2450 Music Valley Dr,Nashville,TN,...,1,Restaurant,-0.010415,-1.206915,0,-0.891702,0.0,1.0,0.0,0.0
138,zqmkEnp1kfU2vosDcG2kMg,KqKXOl0PMlZGBMlw8OUpyA,-If0ps0QhOLCYVWQWs9RYg,0.883225,Yes! I love this place! Maple Street Patisseri...,2013-05-28 21:37:01,Maple Street Patisserie,7638 Maple St,New Orleans,LA,...,1,Restaurant,-0.010415,0.076578,0,-0.301491,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4725,MdGvGVtFZdU-ZLoBgIFo3Q,7ctnUScX9B362qXOebNk9w,-TCa3KBib07_1ko9L2Z0fQ,-1.322989,Foods good but don't order online for delivery...,2018-09-25 15:34:41,Yeoman's Cask and Lion,"200 1st Ave S, Ste B",St. Petersburg,FL,...,1,Restaurant,0.680433,0.972985,1,-0.963973,0.0,0.0,0.0,
4811,LYBQCkNwLY-Z_2By42vwMg,wcvwEyMLZViMP2rRZAaiZA,1FURjeGJi_LBXcJQg8eskw,0.147820,This place is in a great and fun location and ...,2016-06-23 02:21:01,Padaro Beach Grill,3765 Santa Claus Ln,Carpinteria,CA,...,1,Restaurant,-0.010415,-0.290135,1,-0.156950,0.0,0.0,1.0,1.0
4920,toLAMehjvvW3JV33WjboRA,1ZhcB8kduDlsC3j70GYAOg,1-z7wd860Rii4kbEMCT8DA,-0.587585,Any breakfast place that I can remember six mo...,2013-10-25 06:49:06,Moon's Kitchen Cafe,712 W Idaho St,Boise,ID,...,1,Restaurant,-0.701262,-0.697593,0,-0.301491,1.0,0.0,1.0,1.0
4930,r-YKEJT6JjiLIz2YmQ-qNw,uUcNCwf86aea7F9AGFBJNQ,0zf4KKZqbqoxiuKzeZLDdg,-1.322989,I don't know what so special about this place....,2011-01-09 23:01:13,PrimoHoagies,128 S 11th St,Philadelphia,PA,...,1,Restaurant,0.680433,0.993358,0,-0.879657,0.0,0.0,1.0,1.0


In [315]:
master.dtypes

review_id                    object
user_id                      object
business_id                  object
review_stars                float64
review                       object
review_date                  object
business_name                object
business_address             object
business_city                object
state                        object
postal_code                  object
business_stars              float64
business_review_count       float64
business_attributes          object
business_categories          object
user_name                    object
user_review_count           float64
user_average_stars          float64
User_compliments_total      float64
checkin_dates                object
ByAppointmentOnly           float64
NoiseLevel                   object
RestaurantsDelivery         float64
RestaurantsTakeOut          float64
credit_cards                float64
WiFi                         object
RestaurantsReservations     float64
RestaurantsGoodForGroups    

In [339]:
# master.to_csv('sample_dataset.csv')
# business = read_json(business_path, max_records=5000)
# review = read_json(review_path, max_records=5000)
# user = read_json(user_path, max_records=5000)
# checkin = read_json(checkin_path, max_records=5000)
# tip = read_json(tip_path, max_records=5000)

# business.to_csv('business_sample.csv')
# review.to_csv('review_sample.csv')
# user.to_csv('user_sample.csv')
# checkin.to_csv('checkin_sample.csv')
# tip.to_csv('tip_sample.csv')