# Exploratory Data Analysis

In [5]:
import gzip
from sklearn import svm
import numpy as np
import string
from sklearn import linear_model
from pathlib import Path
import pandas as pd
import json

import warnings
# Suppress every warning emitted after this point.
# NOTE(review): this also hides deprecation/runtime warnings from the imported
# libraries — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## EDA Goals:
1. Merge the DataFrames to get the data we need, and identify the questions we are interested in.
2. Build a preprocessing pipeline.

## Data Loader
Please do not push the data to GitHub (it is listed in `.gitignore`); doing so will cause large-file issues. The actual dataset is available [here](https://cseweb.ucsd.edu/~jmcauley/datasets.html#google_local).
- **Data**: Data itself.
    - The reviews provided by `users`, such as ratings, review text, and pictures.
    - Directly informs recommendations or sentiment analysis.
    - `Give what we are looking at from a user's perspective.`
- **MetaData**: Data that explains about the data.
    - `Business details`, such as name, address, average rating, category, and operational hours, describing the businesses associated with the reviews.
    - Enriches the analysis by providing business context.
    - `Extra information we can use to perform joins.`

In [6]:
def parseData(file_path):
    """Read a gzip-compressed JSON Lines file into a list of records.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Location of a gzip file holding one JSON document per line.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or read as gzip data.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry no
        # record; skip them instead of letting json.loads raise on them.
        data = [json.loads(line) for line in f if line.strip()]
    return data

# Build the dataset locations from the parent of the current working
# directory, so no absolute path is written into the notebook.
base_path = Path.cwd().parent
file_path = base_path / "data" / "data.json.gz"
meta_file_path = base_path / "data" / "metadata.json.gz"

# Both archives are parsed below, so both must exist; checking only the
# reviews file would let a missing metadata file raise inside parseData.
missing_paths = [path for path in (file_path, meta_file_path) if not path.exists()]

if not missing_paths:
    print(f"Loading data from: {file_path}")
    print(f"Loading metadata from: {meta_file_path}")
    metadata = parseData(meta_file_path)
    data = parseData(file_path)
    print(f"Loaded {len(data)} review entries.")
    print(f"Loaded {len(metadata)} metadata entries.")

    df = pd.DataFrame(data)
    meta_df = pd.DataFrame(metadata)
    print("DataFrame created. Here's the structure:")
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would append a stray "None".
    df.info()
    meta_df.info()
else:
    # Report every missing archive, not just the first one.
    for missing_path in missing_paths:
        print(f"File not found: {missing_path}. Please ensure the file exists at the specified location.")

Loading data from: /Users/kevinb/Desktop/cse158/TBR/data/data.json.gz
Loading metadata from: /Users/kevinb/Desktop/cse158/TBR/data/metadata.json.gz


In [4]:
# Bare last expression: the notebook renders meta_df as this cell's output.
meta_df

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,City Textile,"City Textile, 3001 E Pico Blvd, Los Angeles, C...",0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,,34.018891,-118.215290,[Textile exporter],4.5,6,,,,Open now,"[0x80c2c624136ea88b:0xb0315367ed448771, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
1,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.292130,[Korean restaurant],4.4,18,,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...","{'Service options': ['Takeout', 'Dine-in', 'De...",Open ⋅ Closes 6PM,"[0x80c2c78249aba68f:0x35bf16ce61be751d, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
2,Nova Fabrics,"Nova Fabrics, 2200 E 11th St, Los Angeles, CA ...",0x80c2c89923b27a41:0x32041559418d447,,34.023669,-118.232930,[Fabric store],3.3,6,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...","{'Service options': ['In-store shopping'], 'Pa...",Open ⋅ Closes 5PM,"[0x80c2c8811477253f:0x23a8a492df1918f7, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
3,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,,34.036694,-118.249421,[Fabric store],4.3,7,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",{'Service options': ['In-store pickup']},Open ⋅ Closes 5PM,"[0x80c2c62c496083d1:0xdefa11317fe870a1, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
4,Matrix International Textiles,"Matrix International Textiles, 1363 S Bonnie B...",0x80c2cf163db6bc89:0x219484e2edbcfa41,,34.015505,-118.181839,[Fabric store],3.5,6,,"[[Thursday, 8:30AM–5:30PM], [Friday, 8:30AM–5:...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 5:30PM,"[0x80c2cf042a5d9561:0xd0024ad6f81f1335, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515956,McDonald's,"McDonald's, 1728 Lomita Blvd, Lomita, CA 90717",0x80dd4a7afea27289:0xe49cfab49567e5cb,"Classic, long-running fast-food chain known fo...",33.797530,-118.308527,"[Fast food restaurant, Breakfast restaurant, C...",4.1,830,$,"[[Wednesday, 5AM–12AM], [Thursday, 5AM–12AM], ...","{'Service options': ['Curbside pickup', 'No-co...",,"[0x80dd4a72339feac7:0xcb7398ed660df0b1, 0x80dd...",https://www.google.com/maps/place//data=!4m2!3...
515957,California Citrus State Historic Park,"California Citrus State Historic Park, 9400 Du...",0x80dcba7983a059af:0x2a006c069483d3d2,Park dedicated to preserving the history of Ca...,33.898611,-117.425703,"[Park, Tourist attraction]",4.7,763,,"[[Wednesday, 8AM–5PM], [Thursday, 8AM–5PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,,"[0x80dcb21eefbd19f5:0x8ebb31f8a91dc2d3, 0x80dc...",https://www.google.com/maps/place//data=!4m2!3...
515958,California Citrus,"California Citrus, 1999 Van Buren Boulevard, R...",0x80dcb09e3af6228b:0xa55fc2f742364e02,,33.898299,-117.428067,[State park],4.8,96,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x80dcb09dfb76baef:0x5156f914ebbbe, 0x80dcb1e...",https://www.google.com/maps/place//data=!4m2!3...
515959,Recreation Park Playground,"Recreation Park Playground, 701-939 Federation...",0x80dd31c81d5f153d:0x501886193d0102e7,,33.775862,-118.135669,[Playground],4.3,17,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x80dd31044c86de89:0xea5a9038c8f9e2eb, 0x80dd...",https://www.google.com/maps/place//data=!4m2!3...


#  Merging
The metadata provides information about each business; merging on `gmap_id` should work.