In [5]:
import gzip
from pathlib import Path
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [None]:
## handling ca_gmap_infos
#
# Load the California place metadata, keep a readable subset of columns,
# drop rows missing key fields, and remove places tagged with rare categories.

# A category that occurs at most this many times is considered too sparse, and
# every place carrying such a category is removed.
# NOTE(review): the earlier comment said "less than 20 times" while the code
# filtered at `<= 10`; the executed threshold (10) is preserved here — confirm
# which value was intended.
SPARSE_CATEGORY_MAX_COUNT = 10

# Columns kept, in reading order. Compared with the raw file:
#   - 'state' is left out (author's rationale: avoid multicollinearity)
#   - 'url' is left out (irrelevant for the analysis)
GMAP_INFO_COLUMNS = [
    'gmap_id',
    'gmap_name',
    'address',
    'latitude',
    'longitude',
    'description',
    'category',
    'avg_rating',
    'num_of_reviews',
    'price',
    'hours',
    'MISC',
    'relative_results',
]

ca_gmap_infos = pd.read_json('data/meta-California.json.gz', compression='gzip', lines=True)

# 'name' is renamed so it cannot be confused with the reviewer 'name' column
# of the reviews data that is merged in a later cell.
ca_gmap_infos = ca_gmap_infos.rename(columns={'name': 'gmap_name'})

## reorder columns to make them more readable (and drop 'state' / 'url')
ca_gmap_infos = ca_gmap_infos[GMAP_INFO_COLUMNS]

# clean ca_gmap_infos

## missingness handling: rows lacking any of these fields are discarded
ca_gmap_infos = ca_gmap_infos.dropna(subset=['address', 'category', 'gmap_name', 'relative_results'])
# presumably meant to normalise remaining missing markers (e.g. None) to NaN — TODO confirm
ca_gmap_infos = ca_gmap_infos.fillna(np.nan)

# NOTE(review): if the metadata contains repeated gmap_id rows, the inner merge
# with the reviews in the next cell will repeat each matching review — verify,
# and consider drop_duplicates(subset='gmap_id') if so.

## remove places having any category that appears at most SPARSE_CATEGORY_MAX_COUNT times
# 'category' holds a list per place, so explode() yields one row per (place, category).
categories = ca_gmap_infos['category'].explode().value_counts()
spare_categories = categories[categories <= SPARSE_CATEGORY_MAX_COUNT].index

# True for places whose categories are all frequent enough. The explicit bool
# cast keeps the mask a valid boolean indexer even when the frame is empty.
has_only_common_categories = ca_gmap_infos['category'].apply(
    lambda place_categories: not any(cate in spare_categories for cate in place_categories)
).astype(bool)

ca_gmap_infos = ca_gmap_infos[has_only_common_categories]

In [None]:
results = []

# Stream the reviews file in pieces of 1,000,000 lines; each piece is cleaned
# and joined to the place metadata, and the joined pieces are collected in `results`.
with pd.read_json(
    'data/review-California_10.json.gz',
    compression='gzip',
    lines=True,
    chunksize=1000000,
) as review_reader:
    for chunk in review_reader:
        chunk = (
            chunk
            # give the reviewer-side columns explicit names
            .rename(columns={
                'user_id': 'reviewer_id',
                'name': 'reviewer_name',
                'time': 'review_time(unix)',
            })
            # images are not analysed, so the pics column is discarded
            .drop(columns=['pics'])
            # has_rep is True where the 'resp' field of the review is non-null
            .assign(has_rep=lambda reviews: reviews['resp'].notna())
        )

        # inner join: only reviews whose gmap_id is present in ca_gmap_infos survive
        ca_gmap_reviews = pd.merge(
            chunk,
            ca_gmap_infos,
            how='inner',
            left_on='gmap_id',
            right_on='gmap_id',
        )

        results.append(ca_gmap_reviews)

In [None]:
# the resulting transformed data

# Every merged chunk in `results` comes back from merge() with its own default
# 0-based index, so a plain concat would repeat index labels across chunks and
# make label-based lookups ambiguous. ignore_index=True gives the combined
# frame a single, unique RangeIndex instead.
x = pd.concat(results, ignore_index=True)