In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.font_manager as fm
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set(rc={'figure.figsize':(13.7,10.27)})
sns.set_style("whitegrid")
sns.set_color_codes()

In [2]:
from dask.distributed import Client
import dask.bag as db
import dask.dataframe as dd
import dask

from ast import literal_eval
from collections.abc import MutableMapping
import dask.array as da
# import h5py
import os

import reverse_geocode

In [3]:
Client()

0,1
Client  Scheduler: tcp://127.0.0.1:33039  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 20.70 GB


# Load all datasets

## Respective schemas

In [4]:
users={'userName': str,
      'jobs': object,
      'currentPlace': object,
      'previousPlaces': object,
      'education': object,
      'gPlusUserId': str}
reviews={'rating': float,
       'reviewerName': str,
       'reviewText': str,
       'categories': object,
       'gPlusPlaceId': str,
       'unixReviewTime': object,
       'reviewTime': str,
       'gPlusUserId': str}
places={'name': str,
       'price': str,
       'address': str,
       'hours': object,
       'phone': str,
       'closed': bool,
       'gPlusPlaceId': str,
       'gps': object}

# del users, reviews, places

## Fastload of smaller files

In [9]:
%%time

# 1 min

users_df = dd.read_csv('users/*.csv',
                 blocksize='128MiB',
                 dtype=users,
                 encoding='utf-8',
                 lineterminator='\n').compute()
# reviews_df = dd.read_csv('reviews/*.csv',
reviews_df = dd.read_csv('reviews_restaurants/*.csv',
                 blocksize='32MiB',
                 dtype=reviews,
                 encoding='utf-8',
                 lineterminator='\n').compute()
places_df = dd.read_csv('places/*.csv',
                 blocksize='64MiB',
                 dtype=places,
                 encoding='utf-8',
                 lineterminator='\n').compute()
# del users_df, reviews_df, places_df

CPU times: user 12 s, sys: 4.42 s, total: 16.4 s
Wall time: 43.1 s


## Preprocessing

- Users: currentPlace => user_lat, user_long
- Places: gps => places_lat, places_long
- Joined: 'Country' column based on place reviewed

**IDEA**: Merge restaurant types

e.g.
- Ice cream shop, Bakery, Dessert Shop => Dessert Shop
- Thai, South Asian, Southeast Asian, Japanese, Chinese => Asian
- etc...

In [48]:
print('Users: ', users_df.columns.to_list())
print('Places: ', places_df.columns.to_list())
print('Reviews: ', reviews_df.columns.to_list(), '\n')

# extract userName, currentPlace, gPlusUserId from users
users_df2 = users_df.drop(['jobs', 'previousPlaces', 'education'], axis=1)

# extract name, price, address, gPlusPlaceId, gps
places_df2 = places_df.drop(['hours', 'phone', 'closed'], axis=1)

# extract rating, categories, gPlusPlaceId, gPlusUserId
reviews_df2 = reviews_df.drop(['reviewerName', 'reviewText', 'unixReviewTime', 'reviewTime'], axis=1)

print('After dropping...')
print('Users: ', users_df2.columns.to_list())
print('Places: ', places_df2.columns.to_list())
print('Reviews: ', reviews_df2.columns.to_list())

# users_df2
# places_df2
reviews_df2.head()
# del users_df2, places_df2, reviews_df2

Users:  ['userName', 'jobs', 'currentPlace', 'previousPlaces', 'education', 'gPlusUserId']
Places:  ['name', 'price', 'address', 'hours', 'phone', 'closed', 'gPlusPlaceId', 'gps']
Reviews:  ['rating', 'reviewerName', 'reviewText', 'categories', 'gPlusPlaceId', 'unixReviewTime', 'reviewTime', 'gPlusUserId'] 

After dropping...
Users:  ['userName', 'currentPlace', 'gPlusUserId']
Places:  ['name', 'price', 'address', 'gPlusPlaceId', 'gps']
Reviews:  ['rating', 'categories', 'gPlusPlaceId', 'gPlusUserId']


Unnamed: 0,rating,categories,gPlusPlaceId,gPlusUserId
0,4.0,"['Asian Restaurant', 'Chinese Restaurant']",106591714648856494903,100000032416892623125
1,5.0,"['European Restaurant', 'Italian Restaurant', ...",109420033090810328045,100000032416892623125
2,5.0,['Barbecue Restaurant'],111623070919810985923,100000032416892623125
3,4.0,['Restaurant'],113854191152597312098,100000032416892623125
4,5.0,['Mexican Restaurant'],115827996910815192564,100000032416892623125


### Users: Gps column => Lat, Long columns

In [49]:
%%time

def string_to_coord(currentPlace):
    coord = None
    lat = long = None
    try:
        coord = eval(currentPlace)[1][1:3]
        lat, long = coord[0]/10000000, coord[1]/10000000
    except Exception as e:
        return (lat, long)
    return (lat, long)

users_df2['Coordinates'] = users_df2['currentPlace'].map(string_to_coord)
users_df2['user_lat'] = users_df2['Coordinates'].apply(lambda x : x[0])
users_df2['user_long'] = users_df2['Coordinates'].apply(lambda x : x[1])

users_df2.head()

CPU times: user 12.3 s, sys: 222 ms, total: 12.5 s
Wall time: 12.4 s


Unnamed: 0,userName,currentPlace,gPlusUserId,Coordinates,user_lat,user_long
0,an lam,"['Thành phố Hồ Chí Minh, Việt Nam', [[], 10823...",100000010817154263736,"(10.823099, 106.629664)",10.823099,106.629664
1,HALİL TURGUT,"['Adana', [[], 370000000, 353213330, 1]]",100000013500285534661,"(37.0, 35.321333)",37.0,35.321333
2,森田さとこ,,100000021336848867366,"(None, None)",,
3,amey kore,,100000030557048145331,"(None, None)",,
4,william spindler,,100000032416892623125,"(None, None)",,


## Places: Gps column => Lat, Long columns

In [50]:
%%time

def string_to_coord2(gps):
    coord = None
    lat = long = None
    try:
        coord = eval(gps)
        lat, long = coord[0]/1000000, coord[1]/1000000
    except Exception as e:
        return (lat, long)
    return (lat, long)

places_df2['Coordinates'] = places_df2['gps'].map(string_to_coord2)
places_df2['place_lat'] = places_df2['Coordinates'].apply(lambda x : x[0])
places_df2['place_long'] = places_df2['Coordinates'].apply(lambda x : x[1])

places_df2.head()

CPU times: user 20.7 s, sys: 337 ms, total: 21.1 s
Wall time: 20.9 s


Unnamed: 0,name,price,address,gPlusPlaceId,gps,Coordinates,place_lat,place_long
0,Diamond Valley Lake Marina,,"['2615 Angler Ave', 'Hemet, CA 92545']",104699454385822125632,"[33.703804, -117.003209]","(3.3703804e-05, -0.000117003209)",3.4e-05,-0.000117
1,Blue Ribbon Cleaners,,"['Parole', 'Annapolis, MD']",103054478949000078829,"[38.979759, -76.547538]","(3.8979759000000004e-05, -7.6547538e-05)",3.9e-05,-7.7e-05
2,Portofino,,"['ул. Тутаева, 1', 'Nazran, Ingushetia, Russia...",109810290098030327104,"[43.22776, 44.762726]","(4.3227760000000004e-05, 4.4762726e-05)",4.3e-05,4.5e-05
3,T C's Referee Sports Bar,$$,"['5322 W 26th St', 'Sioux Falls, SD 57106']",100327153115986850675,"[43.529494, -96.792244]","(4.3529494e-05, -9.679224399999999e-05)",4.4e-05,-9.7e-05
4,Carrefour - Palembang Square,,"['Jl. Angkatan 45', 'Kompleks Palembang Square...",103368487323937936043,"[-2.976256, 104.742662]","(-2.976256e-06, 0.00010474266199999999)",-3e-06,0.000105


### Drop redundant columns and join

In [53]:
%%time
users_df3 = users_df2.drop(['currentPlace', 'Coordinates'], axis=1)
places_df3 = places_df2.drop(['gps', 'Coordinates'], axis=1)

joined_df = reviews_df.merge(users_df3,
                            how='left',
                            on='gPlusUserId').merge(places_df3,
                                                   how='left',
                                                   on='gPlusPlaceId')
joined_df

CPU times: user 14.2 s, sys: 1.19 s, total: 15.4 s
Wall time: 15.5 s


Unnamed: 0,rating,reviewerName,reviewText,categories,gPlusPlaceId,unixReviewTime,reviewTime,gPlusUserId,userName,user_lat,user_long,name,price,address,place_lat,place_long
0,4.0,william spindler,Best War Wanton soup in Red Bluff,"['Asian Restaurant', 'Chinese Restaurant']",106591714648856494903,1394669496.0,"Mar 12, 2014",100000032416892623125,william spindler,,,Peking Chinese Restaurant,$$$,"['860 Main St', 'Red Bluff, CA 96080']",0.000040,-0.000122
1,5.0,william spindler,This is a review that is long overdo. I've bee...,"['European Restaurant', 'Italian Restaurant', ...",109420033090810328045,1394826388.0,"Mar 14, 2014",100000032416892623125,william spindler,,,Firehouse Pizza,$$,"['734 Main St', 'Red Bluff, CA 96080']",0.000040,-0.000122
2,5.0,william spindler,"Some authentic rub BBQ, great food and don't m...",['Barbecue Restaurant'],111623070919810985923,1394671215.0,"Mar 12, 2014",100000032416892623125,william spindler,,,Two Buds Barbeque,,"['22825 Antelope Blvd', 'Red Bluff, CA 96080']",0.000040,-0.000122
3,4.0,william spindler,Truly a Red Bluff standard. Great old fashione...,['Restaurant'],113854191152597312098,1394670357.0,"Mar 12, 2014",100000032416892623125,william spindler,,,Bud's Jolly Kone,,"['455 Antelope Blvd', 'Red Bluff, CA 96080']",0.000040,-0.000122
4,5.0,william spindler,"Long time favorite Mexican food, always consis...",['Mexican Restaurant'],115827996910815192564,1394669713.0,"Mar 12, 2014",100000032416892623125,william spindler,,,La Corona,$$,"['914 Walnut St', 'Red Bluff, CA 96080']",0.000040,-0.000122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4110767,2.0,charles mckinney,Too expensive for a fast food burger.,"['Hamburger Restaurant', 'Fast Food Restaurant']",116458473784504954830,1364706351.0,"Mar 30, 2013",118446742455312620560,,,,Fatburger,,"['6780 Cherry Ave', 'Long Beach, CA 90805']",0.000034,-0.000118
4110768,2.0,charles mckinney,,"['Mexican Restaurant', 'Latin American Restaur...",117332598175065149705,1368434855.0,"May 13, 2013",118446742455312620560,,,,Super Mex,,"['5660 Atlantic Ave', 'Long Beach, CA 90805']",0.000034,-0.000118
4110769,4.0,charles mckinney,,"['Hot Dog Restaurant', 'Takeout Restaurant', '...",117868066122653879601,1317841949.0,"Oct 5, 2011",118446742455312620560,,,,Wienerschnitzel,$$$,"['1300 E Rosecrans Ave', 'Compton, CA 90221']",0.000034,-0.000118
4110770,1.0,charles mckinney,i will never go back. food and customer servic...,"['Buffet Restaurant', 'American Restaurant']",117952004983617019485,1315602371.0,"Sep 9, 2011",118446742455312620560,,,,HomeTown Buffet,$$,"['3102 E Imperial Hwy', 'Lynwood, CA 90262']",0.000034,-0.000118


# Finish the joined EDA