In [474]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import collections
import re, string
import sys
import time
import os


In [2]:
import json
import csv

def init_dataset(json) -> tuple[dict, list]:
    """Create an empty column store shaped like one decoded JSON record.

    Args:
        json: A mapping (one decoded JSON object) whose keys become the
            column names. NOTE: the parameter name shadows the ``json``
            module inside this function; it is kept for compatibility with
            existing callers.

    Returns:
        A ``(columns, keys)`` pair: ``columns`` maps every key to its own
        fresh empty list, and ``keys`` is a list of the keys in the
        record's iteration order.
    """
    # Materialise the keys so the result matches the annotated `list` type
    # and is not a live view onto the caller's record.
    keys = list(json)
    ds: dict = {k: [] for k in keys}
    return ds, keys

def read_json(file) -> pd.DataFrame:
    """Load a JSON-lines file (one JSON object per line) into a DataFrame.

    The columns are the keys of the first record, in that record's order.
    Every later record must contain those keys; extra keys are ignored.
    Values are stored exactly as ``json.loads`` decodes them.

    Args:
        file: Path of the JSON-lines file to read.

    Returns:
        A DataFrame with one row per non-blank line. An empty (or
        all-blank) file yields an empty DataFrame.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the first record's keys.
    """
    columns = None
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(file, encoding='utf-8') as file_lines:
        for line in file_lines:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) are not records.
                continue
            record = json.loads(line)
            if columns is None:
                # The first record fixes the column set.
                columns = {key: [] for key in record}
            for key, values in columns.items():
                values.append(record[key])
    return pd.DataFrame(columns if columns is not None else {})

def read_csv(file) -> pd.DataFrame:
    """Load a CSV file with a header row into a DataFrame of strings.

    Parsing is done with ``csv.DictReader``, so no type inference takes
    place: every cell stays a string (an empty field is ``''``).

    Args:
        file: Path of the CSV file to read. It is opened with the platform
            default text encoding.

    Returns:
        A DataFrame whose columns are the header fields, in header order.
        A completely empty file yields an empty DataFrame.
    """
    with open(file, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        # `fieldnames` is None when the file has no header line at all.
        keys = reader.fieldnames or []
        dataset = {k: [] for k in keys}
        for row in reader:
            for k in keys:
                dataset[k].append(row[k])
    return pd.DataFrame(dataset)


In [146]:
# Load the Yelp review table from its CSV export (all cells are read as strings).
# Alternative source, the JSON-lines file:
#   read_json('data/yelp_academic_dataset_review.json')
yelp_review = read_csv(file='data/yelp_academic_dataset_review.csv')

In [145]:
# Load the Yelp business table from its CSV export (all cells are read as strings).
# Alternative source, the JSON-lines file:
#   read_json('data/yelp_academic_dataset_business.json')
yelp_business = read_csv(file='data/yelp_academic_dataset_business.csv')

In [341]:
# Sample data: the restaurants reviewed by "karen", i.e. the user who wrote
# the most restaurant reviews.

# Businesses whose `categories` text contains 'Restaurant'
# (rows with a missing category count as non-matches).
business_restaurant = yelp_business[
    yelp_business['categories'].str.contains('Restaurant', na=False)
]

# Reviews that belong to one of those restaurant businesses.
review_restaurant = yelp_review.loc[
    yelp_review['business_id'].isin(business_restaurant['business_id'])
]

# value_counts() orders by descending count, so the first index entry is
# the user_id with the most restaurant reviews.
karen = review_restaurant['user_id'].value_counts().index[0]

# Karen's restaurant reviews ...
review_restaurant_karen = review_restaurant[review_restaurant['user_id'].eq(karen)]

# ... and the restaurant businesses those reviews refer to.
business_restaurant_karen = business_restaurant.loc[
    business_restaurant['business_id'].isin(review_restaurant_karen['business_id'])
]

In [455]:
## Clean Data: remove missing rows and irrelevant columns
# Strings that stand for "no value" in the string-typed business table.
MISSING_MARKERS = ['', 'None']
# A column is kept only if strictly less than this percentage of its cells is missing.
MAX_MISSING_PCT = 20

df = business_restaurant_karen.set_index('business_id')

# Remove columns with 20% or more missing fields.
# Vectorized `isin` replaces the deprecated element-wise DataFrame.applymap.
mask = df.isin(MISSING_MARKERS).sum()
features = ((mask / len(df)) * 100) < MAX_MISSING_PCT

# Remove non-attribute columns (business_id is already the index).
# regex=False: the '.' must match a literal dot, not "any character".
features.loc[~features.index.str.contains('attributes.', regex=False)] = False
dataset = df.loc[:, features]

# Remove rows with missing data
mask = dataset.isin(MISSING_MARKERS)
dataset = dataset.loc[~mask.any(axis=1)]

# Remove all non-boolean columns: a column is boolean only if EVERY remaining
# value is 'True' or 'False'. (Requiring just one such value would keep mixed
# columns and silently turn their other values into False below.)
mask = dataset.isin(['True', 'False']).all()
# Convert the surviving 'True'/'False' strings into real booleans.
dataset = dataset.loc[:, mask].eq('True')

In [463]:
# Transform Data: add targets
# Karen's reviews keyed by business, limited to businesses that survived
# cleaning, with the star rating converted from text to float.
df = (
    review_restaurant_karen
    .set_index('business_id')
    .pipe(lambda reviews: reviews.loc[reviews.index.intersection(dataset.index)])
    .astype({'stars': 'float'})
)
# Target is True when the mean star rating for a business is above 3.
# The assignment aligns on the business_id index.
dataset['target'] = df.groupby(level=0)['stars'].mean().gt(3)

In [475]:
# Delineate between training and testing sets: 20% of the rows are held out
# for testing, and the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

In [470]:
# Write the cleaned sample to disk, creating the output directory if needed.
os.makedirs('data_host/SAMPLES', exist_ok=True)
# Optional exports of the intermediate frames:
# review_restaurant_karen.to_csv('data_host/SAMPLES/review_restaurant_karen.csv')
# business_restaurant_karen.to_csv('data_host/SAMPLES/business_restaurant_karen.csv')

# Encode every cell as 1 where it equals True and 0 otherwise. Vectorized
# eq/astype replaces the deprecated element-wise DataFrame.applymap.
dataset = dataset.eq(True).astype(int)
dataset.to_csv('data_host/SAMPLES/dataset.csv')
# NOTE(review): train_df/test_df from the split cell are not written here,
# only the full dataset is -- confirm that this is intended.