In [1]:
import pandas as pd
import json
from datetime import datetime
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [7]:
# business preprocessing

def avgStartTime(hours_dict):
    # returns average start time from dictionary of start times
    if hours_dict and len(hours_dict) > 0:
        daysOpen = len(hours_dict)
        sum_hours = 0
        for weekday, hours in hours_dict.items():
            start_time = hours.split('-')[0]
            hour = int(start_time.split(':')[0])
            minute = int(start_time.split(':')[1])/60
            sum_hours += hour+minute
        return sum_hours/daysOpen

def avgEndTime(hours_dict):
    # returns average end time from dictionary of end times
    if hours_dict and len(hours_dict) > 0:
        daysOpen = len(hours_dict)
        sum_hours = 0
        for weekday, hours in hours_dict.items():
            end_time = hours.split('-')[1]
            hour = int(end_time.split(':')[0])
            minute = int(end_time.split(':')[1])/60
            sum_hours += hour+minute
        return sum_hours/daysOpen


business_descriptors = pd.read_csv("data/business_labels.csv",encoding='ISO-8859–1')
business_descriptors = business_descriptors[['business_id','businessLatentCategory']]
business_descriptors.rename(columns={'businessLatentCategory':'business_latent_category'},inplace=True)
business_df_wo_latent_cat = pd.read_json("data/business.json",lines=True)
business_df_wo_latent_cat.columns = ['business_{}'.format(col) if col != 'business_id' else col for col in business_df_wo_latent_cat.columns]
business_df = business_df_wo_latent_cat.join(business_descriptors.set_index('business_id'), on='business_id')
business_df['business_days_open_weekly'] = business_df['business_hours'].apply(lambda hours: len(hours) if hours else None)
business_df['business_average_open_time'] = business_df['business_hours'].apply(avgStartTime)
business_df['business_average_close_time'] = business_df['business_hours'].apply(avgEndTime)
business_df.drop(['business_hours','business_postal_code','business_categories'], axis=1, inplace=True)

In [8]:
business_df.head()

Unnamed: 0,business_id,business_city,business_latitude,business_longitude,business_review_count,business_stars,business_state,business_latent_category,business_days_open_weekly,business_average_open_time,business_average_close_time
0,1SWheh84yJXfytovILXOAQ,Phoenix,33.522143,-112.018481,5,3.0,AZ,Active Life,,,
1,QXAEGFB4oINsVuTFxEYKFQ,Mississauga,43.605499,-79.652289,128,2.5,ON,Bars,7.0,9.0,0.0
2,gnKjwL_1w79qoiV3IC_xQQ,Charlotte,35.092564,-80.859132,170,4.0,NC,Bars,6.0,17.0,21.0
3,xvX2CttrVhyG2z1dFg_0xw,Goodyear,33.455613,-112.395596,3,5.0,AZ,,5.0,8.0,17.0
4,HhyxOkGAM07SRYtlQ4wMFQ,Charlotte,35.190012,-80.887223,4,4.0,NC,,7.0,7.0,23.0


In [9]:
# user preprocessing
users_df = pd.read_json("data/user.json",lines=True)
users_df.columns = ['user_{}'.format(col) if col != 'user_id' else col for col in users_df.columns]
users_df['user_num_elite_years'] = users_df['user_elite'].apply(lambda years: len(years) if years else 0)
users_df['user_first_elite_year'] = users_df['user_elite'].apply(lambda years: years.split(',')[0] if years else 0)
users_df['user_last_elite_year'] = users_df['user_elite'].apply(lambda years: years.split(',')[-1] if years else 0)
users_df['user_friends_count'] = users_df['user_friends'].apply(len)
users_df['user_begin_yelping_year'] = users_df['user_yelping_since'].apply(lambda date: datetime.strptime(date, '%Y-%m-%d %H:%M:%S').year)
users_df['user_begin_yelping_month'] = users_df['user_yelping_since'].apply(lambda date: datetime.strptime(date, '%Y-%m-%d %H:%M:%S').month)
users_df['user_begin_yelping_day'] = users_df['user_yelping_since'].apply(lambda date: datetime.strptime(date, '%Y-%m-%d %H:%M:%S').day)
users_df.drop(['user_friends','user_elite','user_yelping_since','user_name'], axis=1, inplace=True)
users_df.head()

Unnamed: 0,user_average_stars,user_compliment_cool,user_compliment_cute,user_compliment_funny,user_compliment_hot,user_compliment_list,user_compliment_more,user_compliment_note,user_compliment_photos,user_compliment_plain,...,user_review_count,user_useful,user_id,user_num_elite_years,user_first_elite_year,user_last_elite_year,user_friends_count,user_begin_yelping_year,user_begin_yelping_month,user_begin_yelping_day
0,4.03,1,0,1,2,0,0,1,0,1,...,95,84,l6BmjZMeQD3rDxWUbiAiow,14,2015,2017,2374,2013,10,8
1,3.63,1,0,1,1,0,0,0,0,0,...,33,48,4XChL029mKr5hydo79Ljxg,0,0,0,27646,2013,2,21
2,3.71,0,0,0,0,0,0,1,0,0,...,16,28,bc8C_eETBWL0olvFSJJd0w,0,0,0,358,2013,10,4
3,4.85,0,0,0,1,0,0,0,0,2,...,17,30,dD0gZpBctWGdWo9WlGuhlA,0,0,0,12598,2014,5,22
4,4.08,80,0,80,28,1,1,16,5,57,...,361,1114,MM4RJAeH6yuaN8oZDSt0RA,19,2015,2018,5542,2013,10,23


In [30]:
# training data transform
reviews_df = pd.read_json("data/review.json",lines=True)
business_join = reviews_df.join(business_df.set_index('business_id'), on='business_id')
data = business_join.join(users_df.set_index('user_id'), on='user_id')
data = data.astype({'user_first_elite_year': 'int64','user_last_elite_year': 'int64'})
data['business_city'] = data['business_city'].astype('category').cat.codes
data['business_state'] = data['business_state'].astype('category').cat.codes
data['business_latent_category'] = data['business_latent_category'].astype('category').cat.codes
data['business_days_open_weekly'] = data['business_days_open_weekly'].fillna(data['business_days_open_weekly'].mode()[0])
data['business_average_open_time'] = data['business_average_open_time'].fillna(data['business_average_open_time'].mode().iloc[0])
data['business_average_close_time'] = data['business_average_close_time'].fillna(data['business_average_close_time'].mode().iloc[0])
data['like'] = data['stars'].apply(lambda star_rating: 1 if star_rating > 3 else 0)
data.head()

Unnamed: 0,business_id,stars,user_id,business_city,business_latitude,business_longitude,business_review_count,business_stars,business_state,business_latent_category,...,user_funny,user_review_count,user_useful,user_num_elite_years,user_first_elite_year,user_last_elite_year,user_friends_count,user_begin_yelping_year,user_begin_yelping_month,user_begin_yelping_day
0,ujmEBvifdJM6h6RLv4wQIg,1,hG7b0MtEbXx5QzbzE6C_VA,451,36.215546,-115.248798,238,2.5,19,2,...,4,10,18,0,0,0,22,2008,7,10
1,NZnhc2sEQy3RmzKTZnqtwQ,5,yXQM5uF2jS6es16SJzNHfg,451,36.110083,-115.153871,66,4.5,19,2,...,36,68,178,4,2009,2009,1246,2008,8,26
2,WTqjgwHlXbSFevF32_DJVw,5,n6-Gk65cPZL6Uz8qRm3NYw,157,33.259702,-111.790203,39,3.5,4,6,...,1,10,13,0,0,0,46,2012,10,30
3,ikCg8xy5JIg_NGPx-MSIDA,5,dacAIZ6fTM6mqwW5uxkskg,133,50.969721,-114.070115,13,3.0,0,-1,...,1,4,2,0,0,0,550,2016,9,26
4,b1b1eb3uo-w561D0ZfCEiQ,1,ssoyf2_x0EQMed6fgHeMyQ,959,33.496713,-111.927958,75,4.5,4,2,...,0,2,7,0,0,0,8038,2016,12,6


In [37]:
scalar = StandardScaler()
X = data.drop(['business_id','user_id','stars', 'like'], axis=1)
X_cols = X.columns
X = scalar.fit_transform(X)
X = pd.DataFrame(X, columns=X_cols)
ids = data[['business_id','user_id']]
y = data['like', 'stars']
data_final = pd.concat([ids,y,X], axis=1, join='inner')
data_final.to_csv("data/training_data.csv")