# Data Cleaning 📊🧹

In [3]:
import gzip
from pathlib import Path
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings("ignore")

## handling `ca_gmap_infos`

In [5]:
# Data files are resolved relative to the parent of the current working directory.
base_path = Path.cwd().parent
meta_file_path = base_path.joinpath("rdsb", "data", "hawaii_metadata.json.gz")

# Read the gzipped, line-delimited JSON metadata (one record per line) and
# give the generic `name` column the more specific label `gmap_name`.
ca_gmap_infos = pd.read_json(
    meta_file_path,
    compression='gzip',
    lines=True,
).rename(columns={'name': 'gmap_name'})

## Reorder columns to make them more readable

In [3]:
# Column layout: identifiers first, then location, descriptive fields,
# rating/price/hours, and the remaining fields last.
column_order = [
    'gmap_id', 'gmap_name',
    'address', 'latitude', 'longitude',
    'description', 'category',
    'avg_rating', 'num_of_reviews', 'price', 'hours',
    'state', 'MISC', 'relative_results', 'url',
]

# Selecting with the full list both reorders the columns and raises a
# KeyError if any expected column is absent from the metadata.
ca_gmap_infos = ca_gmap_infos[column_order]

# clean `ca_gmap_infos`
1. Remove the `state` column: to avoid multicollinearity
2. Remove the `url` column: irrelevant column

In [4]:
# Discard the columns this notebook does not carry forward.
unused_columns = ['state', 'url']
ca_gmap_infos = ca_gmap_infos.drop(columns=unused_columns)

Handle missing values: drop rows that are missing `address`, `category`, `gmap_name`, or `relative_results`

In [5]:
# Rows lacking any of these fields are removed outright.
required_columns = ['address', 'category', 'gmap_name', 'relative_results']

ca_gmap_infos = (
    ca_gmap_infos
    .dropna(subset=required_columns)
    # Fill the remaining missing entries with NaN so they share one marker.
    .fillna(np.nan)
)

Remove places (`gmap_id`s) that have any category appearing 10 times or fewer

In [6]:
# Count how often each individual category label occurs across all places.
# Each `category` cell is treated as a collection of labels, hence the explode.
categories = ca_gmap_infos['category'].explode().value_counts()

# Labels occurring at most this many times are considered too rare to keep.
# NOTE(review): confirm this cutoff is the one intended by the notebook text.
MAX_RARE_CATEGORY_COUNT = 10
spare_categories = categories[categories <= MAX_RARE_CATEGORY_COUNT].index

# A plain set gives constant-time membership checks inside the row-wise test.
rare_labels = set(spare_categories)


def _only_common_categories(labels):
    """Return True when none of the labels in ``labels`` is a rare category."""
    return not any(label in rare_labels for label in labels)


# Keep only the places whose categories are all sufficiently common.
# The dropna/fillna step from the previous cell is not repeated here:
# filtering rows cannot introduce new missing values.
ca_gmap_infos = ca_gmap_infos[ca_gmap_infos['category'].apply(_only_common_categories)]

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the cleaned metadata.
ca_gmap_infos.info()

# Chunk transformation of large dataset
Note: this step took about 3 min 45 s to run for Kevin.

In [8]:
file_path = base_path.joinpath("rdsb", "data", "hawaii_data.json.gz")
results = []

# Review-side columns get reviewer-specific names before the merge.
review_column_names = {
    'user_id': 'reviewer_id',
    'name': 'reviewer_name',
    'time': 'review_time(unix)',
}

# Stream the gzipped, line-delimited reviews file in chunks rather than
# reading it into memory in one go.
with pd.read_json(file_path, compression='gzip', lines=True, chunksize=1000000) as reader:
    for chunk in reader:
        chunk = (
            chunk
            .rename(columns=review_column_names)
            # Pictures are dropped: images are not handled in this notebook.
            .drop(columns=['pics'])
            # Flag the reviews whose `resp` field is present.
            .assign(has_rep=lambda frame: frame['resp'].notna())
        )

        # The inner join keeps only reviews of places that survived the
        # metadata cleaning, and attaches that metadata to each review.
        ca_gmap_reviews = chunk.merge(
            ca_gmap_infos,
            how='inner',
            left_on='gmap_id',
            right_on='gmap_id',
        )
        results.append(ca_gmap_reviews)

The resulting transformed data

In [9]:
# Stack the merged chunks into one frame. Every merged chunk has its own
# 0-based index, so a fresh index is built here to avoid repeated labels.
x = pd.concat(results, ignore_index=True)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the combined frame.
x.info()

# Export

In [11]:
# Write the cleaned metadata back out as gzipped, line-delimited JSON.
# NOTE(review): the metadata was read from a `hawaii_*` file, but this output
# is named `california_*`; confirm the file name is intended.
metadata_out_path = base_path.joinpath("rdsb", "data", "california_clean_metadata.json.gz")
ca_gmap_infos.to_json(
    metadata_out_path,
    orient='records',
    lines=True,
    compression='gzip',
)

In [None]:
# Write the merged review data out as gzipped, line-delimited JSON.
# NOTE(review): the reviews were read from a `hawaii_*` file, but this output
# is named `california_*`; confirm the file name is intended.
data_out_path = base_path.joinpath("rdsb", "data", "california_clean_data.json.gz")
x.to_json(
    data_out_path,
    orient='records',
    lines=True,
    compression='gzip',
)

# Report both export locations.
print(f"DataFrame exported to {data_out_path} and {metadata_out_path}")