# Data Cleaning 📊🧹

In [2]:
import gzip
from pathlib import Path
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings("ignore")

## handling `ca_gmap_infos`

In [3]:
base_path = Path.cwd().parent
meta_file_path = base_path / "data" / "california_metadata.json.gz"

ca_gmap_infos = pd.read_json(meta_file_path, compression = 'gzip', lines = True)

ca_gmap_infos = ca_gmap_infos.rename(columns={
    'name': 'gmap_name'
})

## Reorder column to make them more readable

In [4]:
ca_gmap_infos = ca_gmap_infos[[
    'gmap_id',
    'gmap_name',
    'address',
    'latitude',
    'longitude',
    'description',
    'category',
    'avg_rating',
    'num_of_reviews',
    'price',
    'hours',
    'state',
    'MISC',
    'relative_results',
    'url'
]]

# clean `ca_gmap_infos`
1. remove state col: avoid multicolumnity
2. remove url col: inrevelant col

In [5]:
ca_gmap_infos = ca_gmap_infos.drop(columns=['state', 'url'])

missingness handlation

In [6]:
ca_gmap_infos = ca_gmap_infos.dropna(subset=['address', 'category', 'gmap_name', 'relative_results'])
ca_gmap_infos = ca_gmap_infos.fillna(np.nan)

remove `gmap_id` that has categories that appear less than 20 times

In [7]:
categories = ca_gmap_infos['category'].explode().value_counts()
spare_categories = categories[categories <= 10].index

ca_gmap_infos = ca_gmap_infos[ca_gmap_infos['category'].apply(lambda categories: not any([cate in spare_categories for cate in categories]))]

ca_gmap_infos = ca_gmap_infos.dropna(subset=['address', 'category', 'gmap_name', 'relative_results'])
ca_gmap_infos = ca_gmap_infos.fillna(np.nan)

In [8]:
ca_gmap_infos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 463084 entries, 1 to 515960
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   gmap_id           463084 non-null  object 
 1   gmap_name         463084 non-null  object 
 2   address           463084 non-null  object 
 3   latitude          463084 non-null  float64
 4   longitude         463084 non-null  float64
 5   description       103690 non-null  object 
 6   category          463084 non-null  object 
 7   avg_rating        463084 non-null  float64
 8   num_of_reviews    463084 non-null  int64  
 9   price             106491 non-null  object 
 10  hours             383908 non-null  object 
 11  MISC              404784 non-null  object 
 12  relative_results  463084 non-null  object 
dtypes: float64(3), int64(1), object(9)
memory usage: 49.5+ MB


# Chunk transformation of large dataset
3min 45sec for kevin to load this process

In [11]:
file_path = base_path / "data" / "california_data.json.gz"
results = []

# read reviews data by Chunks
with pd.read_json(file_path, compression='gzip', lines=True, chunksize=1000000) as reader:
    for chunk in reader:
        ## rename columns
        chunk = chunk.rename(columns={
            'user_id': 'reviewer_id',
            'name': 'reviewer_name',
            'time': 'review_time(unix)',
        })

        ## remove pics column: since we don't want to deal with image
        chunk = chunk.drop(columns=['pics'])

        ## adding feature
        chunk = chunk.assign(has_rep = chunk['resp'].notna())

        # merging
        ca_gmap_reviews = chunk.merge(
                ca_gmap_infos,
                how = 'inner',
                right_on = 'gmap_id',
                left_on = 'gmap_id'
                )
        results.append(ca_gmap_reviews)

The resulting transformed data

In [12]:
x = pd.concat(results)

In [27]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44504776 entries, 0 to 476889
Data columns (total 22 columns):
 #   Column             Dtype  
---  ------             -----  
 0   reviewer_id        float64
 1   reviewer_name      object 
 2   review_time(unix)  int64  
 3   rating             int64  
 4   text               object 
 5   resp               object 
 6   gmap_id            object 
 7   has_rep            bool   
 8   gmap_name          object 
 9   address            object 
 10  description        object 
 11  latitude           float64
 12  longitude          float64
 13  category           object 
 14  avg_rating         float64
 15  num_of_reviews     int64  
 16  price              object 
 17  hours              object 
 18  MISC               object 
 19  state              object 
 20  relative_results   object 
 21  url                object 
dtypes: bool(1), float64(4), int64(3), object(14)
memory usage: 7.3+ GB


# Export

In [10]:
metadata_out_path = base_path / "data" / "california_clean_metadata.json.gz"
ca_gmap_infos.to_json(metadata_out_path, orient='records', lines=True, compression='gzip')

In [13]:
data_out_path = base_path / "data" / "california_clean_data.json.gz"

x.to_json(data_out_path, orient='records', lines=True, compression='gzip')

print(f"DataFrame exported to {data_out_path} and {metadata_out_path}")