# COMS 4111 project 1 Database preparation

> Note: this notebook can be opened at this [Google Colab](https://colab.research.google.com/drive/1UHwwJFimwo2k8wVBSu5kVon2yh7YctRC?usp=sharing)

Teammates:
- Jace Yang (uni: jy3174)
- Binghong Yu (uni: by2325)

The UNI used to create the schema on the course database server: by2325

## 0. Environment Setup



```python
!pip3 install sqlalchemy # ORM for databases
!pip3 install ipython-sql # SQL magic function
````

In [2]:
%load_ext sql

In [3]:
import pandas as pd
import os
from sqlalchemy import *
JACE_DB = True


if JACE_DB:
    DATABASEURI = "postgresql://jy3174:JaceYJH@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2"
    %sql postgresql://jy3174:JaceYJH@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
else:
    DATABASEURI = "postgresql://by2325:0316@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2"
    %sql postgresql://by2325:0316@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
engine = create_engine(DATABASEURI)

In [4]:
%%sql

SELECT *
FROM business_wide
WHERE business_id = 'uqWI27TGjAFOPHQTrf6quQ'
LIMIT 5

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
1 rows affected.


business_id,average_stars,name,address,city,state,postal_code,latitude,longitude,is_open,is_takeout,rounded_average_stars
uqWI27TGjAFOPHQTrf6quQ,0.75,Taco Bell,7680 E. 96th St.,Fishers,IN,46038,40,-86,True,True,1.0


## 1 SQL schema

### 1.1 Latest ER diagram

<center> <img src="https://cdn.mathpix.com/snip/images/jmnhrmtITyHgShxBw3h88ZWcf4kPffOi1bkbgz9T4Vk.original.fullsize.png" width="80%"/></center>



All the changes we made from Part 1:
1. Delete attrbute `hour` in `Business` that requires json-type data importing with additional package like `pgspecial` (not allowed according to [Ed #431](https://edstem.org/us/courses/17037/discussion/1311446))
1. Change several `int` type id column into `text` to be consistent to the original dataset
1. Add `email` attribute of `Users` table in order to allow costumer to log in. The user_id will be a long text that only used in back-end.
1. Constrain the `password` attribute of `Users` to be in length [8, 16] in order to be more robust

### 1.2 CREATE commands

In [7]:
%%sql
DROP TABLE IF EXISTS Users, Collection_of_User, Review_of_Business, Category, Business, Photo_contained_Business CASCADE;
CREATE TABLE Users(
    user_id text PRIMARY KEY,
    email text UNIQUE NOT NULL,
    name text NOT NULL,
    password text NOT NULL,
    yealping_since date,
    CHECK (length(password) >= 8 AND length(password) <= 16)
);

CREATE TABlE Collection_of_User(
    user_id text,
    collection_id int,
    created_date date,
    PRIMARY KEY(user_id, collection_id),
    FOREIGN KEY(user_id) REFERENCES Users(user_id) ON DELETE CASCADE
);

CREATE TABlE Business(
    business_id text PRIMARY KEY,
    name text,
    address text,
    city text,
    state text,
    postal_code text,
    latitude numeric(4),
    longitude numeric(4),
    is_open boolean,
    is_takeout boolean
);

CREATE TABlE Review_of_Business(   
    review_id int PRIMARY KEY,
    review_date date,
    business_id text NOT NULL,
    -- Attributes of Tip
    short_tip text,
    likes int,
    -- Attributes of Long Review
    detailed_review text,
    stars int,
    useful int,
    funny int,
    cool int,

    CHECK (stars >= 0 AND stars <= 5),
    CHECK (
           ((short_tip IS NULL AND likes IS NULL)  
            OR 
            (detailed_review IS NULL AND stars IS NULL AND useful IS NULL AND funny IS NULL AND cool IS NULL))
           AND
           ((short_tip IS NOT NULL) 
            OR 
            (detailed_review IS NOT NULL))
          ),
    CHECK (length(detailed_review) >= 30 OR detailed_review is NULL),

    FOREIGN KEY(business_id) REFERENCES Business(business_id) ON DELETE CASCADE
);

CREATE TABlE Category(
    name varchar(255) PRIMARY KEY
);

CREATE TABLE Photo_contained_Business(
    photo_id text PRIMARY KEY,
    business_id text NOT NULL,
    caption text,
    label text,
    FOREIGN KEY(business_id) REFERENCES Business
        ON DELETE CASCADE
);

 * postgresql://jy3174:***@w4111-4-14.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

In [8]:
%%sql
DROP TABLE IF EXISTS Users_favorite_Business, Users_follow_Collection, Collection_contain_Business,
                     Users_write_Review, Collection_contain_Business,
                     Users_follow_Users, Business_tagged_Category CASCADE;

CREATE TABLE Users_favorite_Business(
    user_id text REFERENCES Users(user_id),
    business_id text REFERENCES Business(business_id),
    PRIMARY KEY(user_id, business_id)
);

CREATE TABLE Users_follow_Collection(
    fan_user_id text REFERENCES Users(user_id),
    followee_user_id text,
    collection_id int,
    PRIMARY KEY(fan_user_id, followee_user_id, collection_id),
    FOREIGN KEY(followee_user_id, collection_id) REFERENCES Collection_of_User(user_id, collection_id)
);

CREATE TABLE Collection_contain_Business(
    collection_owner_id text,
    collection_id int,
    business_id text REFERENCES Business(business_id),
    PRIMARY KEY(collection_owner_id, collection_id, business_id),
    FOREIGN KEY(collection_owner_id, collection_id) REFERENCES Collection_of_User(user_id, collection_id)
);


CREATE TABLE Users_write_Review(
    user_id text NOT NULL REFERENCES Users(user_id) ON DELETE CASCADE, -- allow users to cancel their account
    review_id int REFERENCES Review_of_Business(review_id) ON DELETE CASCADE,
    PRIMARY KEY(review_id)
);

CREATE TABLE Business_tagged_Category(
    business_id text REFERENCES Business,
    name text REFERENCES Category,
    PRIMARY KEY(business_id, name)
);

CREATE TABLE Users_follow_Users(
    follwee_user_id text REFERENCES Users(user_id),
    fan_user_id text REFERENCES Users(user_id),
    follow_since date,
    PRIMARY KEY (follwee_user_id, fan_user_id)
);

 * postgresql://jy3174:***@w4111-4-14.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

## 2 Data Preparation

In this part, we clean the Yelp dataset which contains mostly real-word data. That being said, we still need to generate some of the features, like `password` of User but all with reasonable assumptions described.

To re-run the code, download the [Yelp Dataset](https://www.yelp.com/dataset/documentation/main) into a folder named "data" in the parent level of this file.

To access the cleaned data output from this part, we upload the dataframes in pickle format into a [Google Drive](https://drive.google.com/drive/folders/1RvL6q7U1eeMvFVpNyH-dJ1AdkIf_E7l0?usp=sharing).

In [76]:
import pickle
pickle.HIGHEST_PROTOCOL = 4
import pandas as pd
import numpy as np
import random, string
from tqdm import tqdm
from datetime import datetime

In [77]:
N_BUSINESS = 1000
N_USER = 6666

### 2.1 Entity

#### Business

In [140]:
business = pd.read_json("../data/yelp_dataset/yelp_academic_dataset_business.json", lines=True)
photo = pd.read_json('../data/yelp_photos/photos.json', lines=True, dtype={'photo_id':str,'business_id':str,'caption':str,'label':str})

In [146]:
business.state.value_counts()

PA     34039
FL     26330
TN     12056
IN     11247
MO     10913
LA      9924
AZ      9912
NJ      8536
NV      7715
AB      5573
CA      5203
ID      4467
DE      2265
IL      2145
TX         4
CO         3
WA         2
HI         2
MA         2
NC         1
UT         1
MT         1
MI         1
SD         1
XMS        1
VI         1
VT         1
Name: state, dtype: int64

In [79]:
# Only include restaurant of main category
business_category = business[['business_id', 'categories']]
business_category = business_category.assign(category = business.categories.str.split(', ')).explode('category').drop('categories', axis=1)
MAIN_FOOD_CATEGORIES = """Bars
Sandwiches
Fast Food
Pizza
Coffee & Tea
Breakfast & Brunch
Burgers
Mexican
Specialty Food
Italian
Seafood
Chicken Wings
Chinese
Salad
Bakeries
Cafes""".split('\n')
business_category = business_category[business_category.category.isin(MAIN_FOOD_CATEGORIES)]
business = business.merge(business_category[['business_id']].drop_duplicates(),
                          on="business_id",
                          how="inner").drop(['categories'], axis=1)

# Remove business that doesn't have any photo.
business = business.merge(photo['business_id'].drop_duplicates(), on = "business_id", how = "inner")

In [80]:
# Select N_USER business with moderate amount of reviews
## Note: We finished the data cleaning pipeline, but for the purpose of this homework, we only includes 100 restaurant and 200 users that has wrote reviews on them!
## For project 1 part 3, we will request more resources to allow us populate the whole dataset ^.^
business = business[(business['review_count'] >= business['review_count'].quantile(.25)) & 
                    (business['review_count'] <= business['review_count'].quantile(.75))].sample(n=N_BUSINESS, random_state=4111)
business = business.drop(['review_count', 'stars', 'hours'], axis=1).reset_index(drop=True)

# Extract whether allow takeout information
attribute_values = ['False' if attributes is None else attributes.get('RestaurantsTakeOut') for attributes in business.attributes]
business['is_takeout'] =  ['False' if value is None or value == 'None' else value for value in attribute_values]
business['is_open'] = business['is_open'].astype(bool)
business = business.drop('attributes', axis=1)

In [81]:
business

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,is_open,is_takeout
0,uqWI27TGjAFOPHQTrf6quQ,Taco Bell,7680 E. 96th St.,Fishers,IN,46038,39.926872,-86.033039,True,True
1,MOy8qGtBOoJzRnHnv3Y3JQ,Ants Pants on 4th,526 S 4th St,Philadelphia,PA,19147,39.941910,-75.149237,False,True
2,cKfonqHkDUx-G0_ksHASbw,Jamba,2314 Elliston Place,Nashville,TN,37203,36.149573,-86.806265,True,True
3,NaA1lTPxouNLq-zFI3IOYA,Charlie Brown's Steakhouse,1001 Baltimore Pike,Springfield,PA,19064,39.913367,-75.341729,False,True
4,DBV7IgOZrrSCzeyNOPN2Ew,Bonchon West Chester,124 East Market St,West Chester,PA,19382,39.960552,-75.602380,True,True
...,...,...,...,...,...,...,...,...,...,...
995,YK3Kt4It6ACDYC0yrDQdCA,Retrograde Coffee,1305 Dickerson Pike,Nashville,TN,37207,36.195047,-86.771829,True,True
996,Flj91i3ODKW-RDupjwWFqQ,Firebirds Wood Fired Grill,1220 Bethlehem Pike,North Wales,PA,19454,40.209885,-75.225479,True,True
997,z0EAMaXAnBa3KHxq9ulZIA,Mad Pub at Madeira Beach,12933 Village Blvd,Madeira Beach,FL,33708,27.786477,-82.783227,True,True
998,qhoPBAD0RP6F9bWKo6mjcA,Mutts Premium Hot Dogs & Sausages,"806 E University Blvd, Ste 16",Tucson,AZ,85719,32.231317,-110.958950,False,True


In [82]:
business.to_pickle('data/Business.pickle')

##### Business_tagged_Category

In [83]:
business_category = business_category.merge(business[['business_id']],
                                            on="business_id",
                                            how="inner")

In [84]:
business_category

Unnamed: 0,business_id,category
0,WYYdQDjx-DsCanlP0DpImQ,Seafood
1,cm0bhBDPM3bjBefmEXa3BQ,Breakfast & Brunch
2,SBgr-5n-kV3EeZztYsstUQ,Bars
3,zjQDk4tZyhEroyqtkgvx0g,Breakfast & Brunch
4,zjQDk4tZyhEroyqtkgvx0g,Italian
...,...,...
1752,2gFPQCmKKVi1aHgGTLRZvA,Breakfast & Brunch
1753,2gFPQCmKKVi1aHgGTLRZvA,Cafes
1754,2gFPQCmKKVi1aHgGTLRZvA,Burgers
1755,35JPTJ69zVoTGuk6TBX5qA,Bars


In [85]:
business_category.to_pickle('data/Business_tagged_Category.pickle')

##### Photo_contained_Business

In [86]:
photo = pd.merge(photo, business[['business_id']], on=['business_id'], how='inner')

In [87]:
photo

Unnamed: 0,photo_id,business_id,caption,label
0,H52Er-uBg6rNrHcReWTD2w,Gzur0f0XMkrVxIwYJvOt2g,,food
1,tVbtLBTGYncy28W-Pmxk4Q,Gzur0f0XMkrVxIwYJvOt2g,"Inside dining seating, needs wall art, The Pok...",inside
2,ofCa9RalF_jFhkPJ5dmfiQ,Gzur0f0XMkrVxIwYJvOt2g,,food
3,WArOCPqdBi6HF8pqawukBA,Gzur0f0XMkrVxIwYJvOt2g,,food
4,vW-C15YJuMxnxn6LGjjICQ,Gzur0f0XMkrVxIwYJvOt2g,,food
...,...,...,...,...
3809,jMqY3RsXsaLvlpyJUKOvnQ,AMVL7n_M6MrIywrnyA0C0A,,inside
3810,9W_EqQEycs_1wDcgkVH37Q,AMVL7n_M6MrIywrnyA0C0A,,food
3811,dM_chUVOSmKVEfBbz9ryHg,b_E_AvagvVUFoR0Sn0u9Pw,,inside
3812,jOzxPiRo0lHF_OJyrRML2A,qC8Whdafj7-DQ29b7Rlcdw,,inside


In [88]:
photo.to_pickle("data/Photo_contained_Business.pickle")

#### Category

In [89]:
category = pd.DataFrame({'name': MAIN_FOOD_CATEGORIES})

In [90]:
category

Unnamed: 0,name
0,Bars
1,Sandwiches
2,Fast Food
3,Pizza
4,Coffee & Tea
5,Breakfast & Brunch
6,Burgers
7,Mexican
8,Specialty Food
9,Italian


In [91]:
category.to_pickle('data/Category.pickle')

#### Users

In [92]:
chunksize = 500000
# Sample N_USER users among those who have commented on those business 
review_json_path = '../data/yelp_dataset/yelp_academic_dataset_review.json'
review = pd.read_json(review_json_path, lines=True, 
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=chunksize)
chunk_list = []
for chunk in tqdm(review):
    chunk = chunk[['business_id', 'user_id']].merge(business[['business_id']], how='inner')
    chunk_list.append(chunk)
user_id_list = pd.concat(chunk_list, ignore_index=True, axis=0)[["user_id"]].drop_duplicates().sample(n=N_USER)

14it [00:46,  3.34s/it]


In [93]:
users_raw = pd.read_pickle('../data/users_raw.pickle')
users = pd.merge(users_raw, user_id_list, how="inner")

In [94]:
# Extract date of registrating
users["yealping_since"] = pd.to_datetime(users["yelping_since"]).dt.date

In [95]:
users["name"].str.replace(' ', '')

0          Nadine
1            Sara
2            Jack
3          Brooke
4           Damen
          ...    
6661          Ann
6662        Robin
6663        Jason
6664    Elizabeth
6665            W
Name: name, Length: 6666, dtype: object

In [96]:
'asfdljdaslfjadslkjfasdlf'

'asfdljdaslfjadslkjfasdlf'

In [97]:
# Generate a password for existing users
def password_generator(prefix, length_min=8, length_max=16):
    chars = string.ascii_letters + string.digits + '!@#$%*'
    # Generate a [1, 6], but mostly 1~3 random suffix
    random_suffix_length = round(np.clip(np.random.normal(2, 1), a_min=1, a_max=6))
    random_suffix_length = max(length_min-len(prefix), random_suffix_length)
    random_suffix_length = min(length_max-len(prefix), random_suffix_length)
    random_suffix = ''.join(random.choice(chars) for _ in range(random_suffix_length))
    password = prefix + random_suffix
    if len(password) > 16:
        password = password[:16]
    return password

In [98]:
users['email'] = users['name'].str.replace(' ', '')
users["password"] = users.apply(lambda x: password_generator(prefix=x['email']), axis=1)

# Generate an email address for existing users
EMAIL_DOMAIN_OPTIONS = ["@gmail.com", "@hotmail.com", "@outlook.com", "@inbox.com", "@qq.com"]
users['rank_samename'] = users.groupby("email")["email"].rank(method="first", ascending=True).astype(int)
users['email'] = users.apply(lambda x: x['email'] + "_" +str(x['rank_samename']) + random.choice(EMAIL_DOMAIN_OPTIONS), axis=1).str.replace("_1@", "@", regex=False)

# Select the columns
users = users[["user_id", "email", "name", "password", "yealping_since"]].reset_index(drop=True)

In [99]:
users

Unnamed: 0,user_id,email,name,password,yealping_since
0,hHDltEEGlvuhRjDvfbYpqg,Nadine@inbox.com,Nadine,NadinecG,2007-05-05
1,BKCyaw3fLry1268fwpQeIw,Sara@inbox.com,Sara,Sara7#t4,2009-10-12
2,7LmJ87EijrHcoYRu9WgoHg,Jack@hotmail.com,Jack,Jack5igu,2007-01-24
3,Qa2NoSc6DphQArihBgY9Gg,Brooke@qq.com,Brooke,Brookek3%,2007-05-28
4,9HZ8hY5q2ESYA36uF2lKvQ,Damen@hotmail.com,Damen,DamenItr,2011-06-13
...,...,...,...,...,...
6661,0GDiZKbAUUOGDonc8vWKbQ,Ann_11@gmail.com,Ann,Anny74GJ,2012-10-12
6662,L3WfDpl9tMjgUbliZF6R1w,Robin_6@qq.com,Robin,RobinERS,2019-10-19
6663,cV7J16TDV1SNA2qHfwBISw,Jason_36@hotmail.com,Jason,JasonwGq,2012-06-12
6664,r3ViFKPLXT7dGwQrABMOqw,Elizabeth_25@outlook.com,Elizabeth,Elizabeth4B,2017-10-22


In [100]:
users.to_pickle('data/Users.pickle')

#### Collection_of_User

Assume 85% of users don't have collection, and 15% of the active users have min(1, $N(2, 4)$) collections that created from a random date from after signup date till the globally maximum signup date.

In [101]:
collection = users[['user_id', 'yealping_since']].sample(frac=0.15).reset_index(drop=True)

# Generate local id for each user_id
collection['n_collection'] = pd.Series(np.around(np.random.normal(2, 4, len(collection)))).clip(1, 20)
collection['collection_id'] = collection['n_collection'].apply(lambda x: list(range(1, int(x)+1)))
collection = collection.explode('collection_id').reset_index(drop=True)

# Generate a random collection create date
def random_dates(start, end=max(users.yealping_since), n=10):
    random.seed(4111)
    d = random.randint(0, (end - start).days)
    return start + pd.DateOffset(days=d)
collection['created_date'] = collection.apply(lambda x: random_dates(x['yealping_since']), axis=1)


# Select columns
collection = collection[["user_id", "collection_id", "created_date"]]

In [102]:
collection

Unnamed: 0,user_id,collection_id,created_date
0,iJr6yrVNVFJb0kAp6RmI7g,1,2021-09-01
1,_6b_7pVofSMtiDhjz2SjOQ,1,2020-06-19
2,8eDzunKGCsPeppZ3JIIqVA,1,2019-07-27
3,SvW51nqYAbivaXilf9qQoA,1,2021-11-28
4,SvW51nqYAbivaXilf9qQoA,2,2021-11-28
...,...,...,...
3097,GDosrD0iVZkXw01t9DXRsg,1,2021-04-10
3098,GDosrD0iVZkXw01t9DXRsg,2,2021-04-10
3099,GDosrD0iVZkXw01t9DXRsg,3,2021-04-10
3100,GDosrD0iVZkXw01t9DXRsg,4,2021-04-10


In [103]:
collection.to_pickle("data/Collection_of_User.pickle")

#### Review_of_Business

In [104]:
chunksize = 500000
review_json_path = '../data/yelp_dataset/yelp_academic_dataset_review.json'
review = pd.read_json(review_json_path, lines=True, dtype={'review_id':str,'user_id':str,'business_id':str,'stars':int,'date':str,'text':str,'useful':int,'funny':int,'cool':int},
                      chunksize=chunksize)

chunk_list = []
for chunk in review:
    chunk = chunk.drop(['review_id'], axis=1)
    chunk = chunk.rename(columns={'stars': 'review_stars'})
    chunk = chunk.rename(columns={'text': 'review'})
    chunk_merged = pd.merge(chunk, business[['business_id']], on='business_id', how='inner')
    chunk_merged = pd.merge(chunk_merged, users[['user_id']], on='user_id', how='inner')
    print(f"{chunk_merged.shape[0]} out of {chunksize:,} related reviews")
    chunk_list.append(chunk_merged)

review_df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

541 out of 500,000 related reviews
593 out of 500,000 related reviews
671 out of 500,000 related reviews
596 out of 500,000 related reviews
650 out of 500,000 related reviews
660 out of 500,000 related reviews
543 out of 500,000 related reviews
505 out of 500,000 related reviews
595 out of 500,000 related reviews
649 out of 500,000 related reviews
547 out of 500,000 related reviews
460 out of 500,000 related reviews
530 out of 500,000 related reviews
579 out of 500,000 related reviews


In [105]:
chunksize = 500000
tip_json_path = '../data/yelp_dataset/yelp_academic_dataset_tip.json'
tip = pd.read_json(tip_json_path, lines=True, dtype={'text':str,'date':str,'compliment_count':int,'business_id':str,'user_id': int},
                   chunksize=chunksize)

chunk_list_tip = []
for chunk in tip:
    chunk = chunk.rename(columns={'text': 'tip'})
    chunk_merged = pd.merge(chunk, business[['business_id']], on='business_id', how='inner')
    chunk_merged = pd.merge(chunk_merged, users[['user_id']], on='user_id', how='inner')
    print(f"{chunk_merged.shape[0]} out of {chunksize:,} related tips")
    chunk_list_tip.append(chunk_merged)

tip_df = pd.concat(chunk_list_tip, ignore_index=True, join='outer', axis=0)

448 out of 500,000 related tips
147 out of 500,000 related tips


In [106]:
review_df = review_df[review_df['review'].str.len()>=30]
review_dfs = pd.merge(tip_df, review_df, on=['user_id','business_id','date'], how='outer')
review_dfs = review_dfs.rename(columns={'tip': 'short_tip','date':'review_date','review_stars':'stars','review':'detailed_review','compliment_count':'likes'})
review_dfs['review_date'] = pd.to_datetime(review_dfs['review_date']).dt.date
# review_dfs[['likes','stars','useful','funny','cool','detailed_review']] = review_dfs[['likes','stars','useful','funny','cool', 'detailed_review']]
# review_dfs= review_dfs.where(review_dfs.notnull(),None)

In [107]:
# Add globally unique review_id
review_dfs['review_id'] = [i + 100000 for i in range(len(review_dfs))]

# Select columns
review = review_dfs[['review_id','review_date','business_id','short_tip','likes','detailed_review','stars','useful','funny','cool']]

In [108]:
review

Unnamed: 0,review_id,review_date,business_id,short_tip,likes,detailed_review,stars,useful,funny,cool
0,100000,2011-11-16,d-5kMts7CzenaPgDzesQDg,They're extremely busy and backed up during di...,0.0,,,,,
1,100001,2011-11-15,V0SSA36N_FpGaikuo4H7hQ,I'm pretty sure they don't have a deli with me...,0.0,,,,,
2,100002,2011-09-26,V0SSA36N_FpGaikuo4H7hQ,Free wifi! Also I'm pretty sure they don't hav...,0.0,,,,,
3,100003,2013-05-13,WXLpEC9h4tlI57Wy26cdrw,Great place for salad or sandwichs!!!,0.0,,,,,
4,100004,2013-12-15,wqmAnbS2roEnhKHCHygrSw,Great cheese crisp,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
8708,108708,2021-01-02,oZFddbA1tlw4CHrLvg4mwg,,,"Whenever I have a craving for comfort food, I ...",5.0,5.0,1.0,1.0
8709,108709,2020-03-13,oZFddbA1tlw4CHrLvg4mwg,,,My husband and I ate here today. We were look...,2.0,0.0,0.0,0.0
8710,108710,2021-06-10,oZFddbA1tlw4CHrLvg4mwg,,,Just delicious! I've eaten here several times ...,5.0,0.0,0.0,0.0
8711,108711,2021-10-01,oZFddbA1tlw4CHrLvg4mwg,,,Stopped by carry out on a busy weeknight. I ca...,5.0,1.0,0.0,0.0


In [109]:
review.to_pickle('data/Review_of_Business.pickle')

##### Users_write_Review

In [110]:
users_write_review = review_dfs[['user_id', 'review_id']]

In [111]:
users_write_review

Unnamed: 0,user_id,review_id
0,acYG_L-chbfRYGMdmERXpA,100000
1,acYG_L-chbfRYGMdmERXpA,100001
2,acYG_L-chbfRYGMdmERXpA,100002
3,UYjrcERZkGi8IddYuQCSeQ,100003
4,UYjrcERZkGi8IddYuQCSeQ,100004
...,...,...
8708,vr0gmPyRIXo9jpdJ3_pO5Q,108708
8709,vlKusLSetbCBpeIVIK0l-Q,108709
8710,Yd3LdJjHZYj8xIRnQWwiBA,108710
8711,Hp1Spk3Bpyouc2-jgXDHmA,108711


In [112]:
users_write_review.to_pickle('data/Users_write_Review.pickle')

### 2.2 Relationship

#### Users_favorite_Business

Assume 60% of users don't have collection, and 40% of the active users have min(1,  𝑁(3, 5)) business favorated that they have reviewed, which happened at a random date from after signup date till the globally maximum signup date.

In [113]:
user_business = users[['user_id']].sample(frac=0.4).reset_index(drop=True)

# Generate local id for each user_id
user_business['n_bz_follow'] = pd.Series(np.around(np.random.normal(3, 5, len(user_business)))).clip(lower=1).astype('int')
user_business['business_id'] = user_business['n_bz_follow'].apply(lambda n: business.sample(n).business_id.tolist())
user_business = user_business.explode(["business_id"])
user_business = user_business[['user_id', 'business_id']]

In [114]:
user_business

Unnamed: 0,user_id,business_id
0,ien1BKNnAeJsL1gciNadpA,N2ftvATcrj6x3PRGxyHzFw
1,RLW98dloIcfl57JPK5rMXQ,kPy6jsVWuMREMRWRGTvEyw
1,RLW98dloIcfl57JPK5rMXQ,_8dFWEVSVxU6nIW3yM0xRA
1,RLW98dloIcfl57JPK5rMXQ,C2XXTlG9rIqLjjJrn2ODVA
1,RLW98dloIcfl57JPK5rMXQ,WK7fLsrqNk9gxAOupNXa9A
...,...,...
2664,6Dn80MzsYpLTasm0cH2r8A,3D0bjMq82o92hQylTEkDug
2664,6Dn80MzsYpLTasm0cH2r8A,fBI981fKx1k1Ju8QRfMGPg
2664,6Dn80MzsYpLTasm0cH2r8A,LCxZ5RkXU9pQvKLInTLCrQ
2664,6Dn80MzsYpLTasm0cH2r8A,QIOxYlhQQGX7B_bGvHKdXw


In [115]:
user_business.to_pickle("data/Users_favorite_Business.pickle")

#### Users_follow_Collection

Assume 80% of users don't follow any collection, and 20% of the active users follow max(1, 𝑁(2, 5)) collections (include theirselves)>

In [116]:
users_collection = pd.DataFrame()
users_collection['fan_user_id'] = users[['user_id']]

# Random some collections for user to follow
users_collection['n_collection_follow'] = np.round(np.maximum(1, np.random.normal(2, 5, len(users_collection)))).astype('int')
def get_collection(fan):
    df = collection.sample(fan['n_collection_follow'])
    fan['followee_user_id'] = df.user_id.tolist()
    fan['collection_id'] = df.collection_id.tolist()
    return fan
users_collection = users_collection.apply(get_collection, axis=1).explode(["followee_user_id", "collection_id"])
users_collection = users_collection[['fan_user_id','followee_user_id','collection_id']]

In [117]:
collection

Unnamed: 0,user_id,collection_id,created_date
0,iJr6yrVNVFJb0kAp6RmI7g,1,2021-09-01
1,_6b_7pVofSMtiDhjz2SjOQ,1,2020-06-19
2,8eDzunKGCsPeppZ3JIIqVA,1,2019-07-27
3,SvW51nqYAbivaXilf9qQoA,1,2021-11-28
4,SvW51nqYAbivaXilf9qQoA,2,2021-11-28
...,...,...,...
3097,GDosrD0iVZkXw01t9DXRsg,1,2021-04-10
3098,GDosrD0iVZkXw01t9DXRsg,2,2021-04-10
3099,GDosrD0iVZkXw01t9DXRsg,3,2021-04-10
3100,GDosrD0iVZkXw01t9DXRsg,4,2021-04-10


In [118]:
users_collection.to_pickle('data/Users_follow_Collection.pickle')

#### Collection_contain_Business

Assume every collections collects max(1, 𝑁(3, 5)) business.

In [119]:
collection_business = pd.DataFrame()
collection_business['collection_owner_id'] = collection['user_id']
collection_business['collection_id'] = collection['collection_id']
collection_business['n_business_contain'] = np.round(np.maximum(1, np.random.normal(3, 5, len(collection_business)))).astype('int')
collection_business['business_id'] = collection_business['n_business_contain'].apply(lambda n: business.sample(n).business_id.tolist())
collection_business = collection_business.explode('business_id')
collection_business = collection_business[['collection_owner_id','collection_id', 'business_id']]

In [120]:
collection_business

Unnamed: 0,collection_owner_id,collection_id,business_id
0,iJr6yrVNVFJb0kAp6RmI7g,1,NA3oFJQBle5iulhYu2b1tA
1,_6b_7pVofSMtiDhjz2SjOQ,1,h_Pu984hndy2gPc4yvQLng
1,_6b_7pVofSMtiDhjz2SjOQ,1,sb-YFqRw3pc406U0_d5R8A
1,_6b_7pVofSMtiDhjz2SjOQ,1,I9v-617xe2zTTahd9IobyQ
1,_6b_7pVofSMtiDhjz2SjOQ,1,eM8kmbbkjtvYKL2kHsARsQ
...,...,...,...
3101,GDosrD0iVZkXw01t9DXRsg,5,3966MAWoL1GbB8Zjvspx1g
3101,GDosrD0iVZkXw01t9DXRsg,5,TyML9sqJmsHO0Mh3S1_f7A
3101,GDosrD0iVZkXw01t9DXRsg,5,LCxZ5RkXU9pQvKLInTLCrQ
3101,GDosrD0iVZkXw01t9DXRsg,5,PlEnI02D0iKpMt6gS7h_HQ


In [121]:
collection_business.to_pickle('data/Collection_contain_Business.pickle')

#### Users_follow_Users

Assume 70% of users don't follow any users, and 30% of the active users have min(0,  $N(3, 5)$) fans, and follow_since happened at a random date from after signup date till the globally maximum signup date.

In [122]:
users_follow= pd.DataFrame()
users_follow = users[['user_id']].sample(frac=0.3).reset_index(drop=True) # only 30%users have fans
users_follow = users_follow.rename(columns={'user_id': 'fan_user_id'})

# Generate follower
users_follow['n_followers'] = np.round(np.maximum(1, np.random.normal(3, 5, len(users_follow)))).astype('int')
def get_fan(fan):
    df = users.sample(fan['n_followers'])
    fan['followee_user_id'] = df.user_id.tolist()
    #fan['collection_id'] = df.collection_id.tolist()
    return fan
users_follow = users_follow.apply(get_fan, axis=1).explode(["followee_user_id"])

# Generate follow date
def random_dates(start, end=max(users.yealping_since), n=10):
    random.seed(4111)
    d = random.randint(0, (end - start).days)
    return start + pd.DateOffset(days=d)
users_follow['follow_since'] = users.apply(lambda x: random_dates(x['yealping_since']), axis=1)

users_follow = users_follow[['fan_user_id','followee_user_id','follow_since']]

In [123]:
users_follow

Unnamed: 0,fan_user_id,followee_user_id,follow_since
0,ACBem-7OPZPnUF2CsnxQIg,NuWEbLM5khmsmAMfND6cKQ,2020-05-27
0,ACBem-7OPZPnUF2CsnxQIg,ew1oSfz0zCGkBiIgwxZeXw,2020-05-27
0,ACBem-7OPZPnUF2CsnxQIg,10G9QMeySHqgJZptIkNf6A,2020-05-27
0,ACBem-7OPZPnUF2CsnxQIg,h_XCqLu_tGo2h58hPi29sA,2020-05-27
0,ACBem-7OPZPnUF2CsnxQIg,DRXZfC3za33O-mhdQCeGJw,2020-05-27
...,...,...,...
1997,TowWCfViNQjxnqwNLHDUIA,ZbTqN6xcPcaRovRWDt6Kow,2018-10-11
1998,M5N53R-RuKHRsTsIyN8LzQ,RWldkY40Bv21Q2VNLXKxMg,2020-06-18
1999,aVgTyVrjlM1YT52o5f1qqw,0sAUF_yzU7nlac8Hg-FUvg,2020-11-04
1999,aVgTyVrjlM1YT52o5f1qqw,jk3Q6HfJZWLWTBDcZi846A,2020-11-04


In [124]:
users_follow.to_pickle("data/Users_follow_Users.pickle")

## 3 Data Population

To avoid directly populating sql table by pandas dataframe, we first generate this code automatically for all tables:
```
for each table:
    for each row in table:
        for each column in the table:
            extract the value
        %sql insert the row
```

and then run it!

### 3.1 Generate data population SQLs automatically

To avoid using additional package to achieve data importing from dataframe/csv, we automatic the method suggested in [ED #450](https://edstem.org/us/courses/17037/discussion/1317472) that INSERT tuples table-by-table and line-by-line.

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')
table_names = ['Users',
               'Collection_of_User',
               'Business',
               'Review_of_Business',
               'Category',
               'Photo_contained_Business',
               'Users_favorite_Business',
               'Users_follow_Collection',
               'Users_write_Review', 
               'Collection_contain_Business', 
               'Business_tagged_Category',
               'Users_follow_Users']

folder_path = "data"
file_names = [table_name + '.pickle' for table_name in table_names]
file_paths = [folder_path + '/' + file_name for file_name in file_names]
file_paths

['data/Users.pickle',
 'data/Collection_of_User.pickle',
 'data/Business.pickle',
 'data/Review_of_Business.pickle',
 'data/Category.pickle',
 'data/Photo_contained_Business.pickle',
 'data/Users_favorite_Business.pickle',
 'data/Users_follow_Collection.pickle',
 'data/Users_write_Review.pickle',
 'data/Collection_contain_Business.pickle',
 'data/Business_tagged_Category.pickle',
 'data/Users_follow_Users.pickle']

In [5]:
def get_code(table_name):
    df = pd.read_pickle(folder_path + '/' + table_name + '.pickle')
    data_import_code = f'{table_name} = pd.read_pickle(folder_path + "/{table_name}.pickle")'
    #for_loop_code = f'for index, row in {table_name}.iterrows():'
    clean_null_code = f'{table_name} = {table_name}.astype(object).where({table_name}.notna(), None)'
#    table_columns = ", ".join(df.columns)
    value_inser_code = f'engine.execute("""INSERT INTO {table_name} VALUES (' + ', '.join([f'%s' for i in range(len(df.columns))]) + f');""", list({table_name}.itertuples(index=False, name=None)))'
    print('\n'.join([data_import_code, clean_null_code, value_inser_code]))

for table_name in table_names:
    print(f'# Populate {table_name} table')
    get_code(table_name)
    print('')

# Populate Users table
Users = pd.read_pickle(folder_path + "/Users.pickle")
Users = Users.astype(object).where(Users.notna(), None)
engine.execute("""INSERT INTO Users VALUES (%s, %s, %s, %s, %s);""", list(Users.itertuples(index=False, name=None)))

# Populate Collection_of_User table
Collection_of_User = pd.read_pickle(folder_path + "/Collection_of_User.pickle")
Collection_of_User = Collection_of_User.astype(object).where(Collection_of_User.notna(), None)
engine.execute("""INSERT INTO Collection_of_User VALUES (%s, %s, %s);""", list(Collection_of_User.itertuples(index=False, name=None)))

# Populate Business table
Business = pd.read_pickle(folder_path + "/Business.pickle")
Business = Business.astype(object).where(Business.notna(), None)
engine.execute("""INSERT INTO Business VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""", list(Business.itertuples(index=False, name=None)))

# Populate Review_of_Business table
Review_of_Business = pd.read_pickle(folder_path + "/Review_of_Business.


```python
# Initial SQL INSERT code generating method we try without using the "for looop %sql"
table_nam = "Users"
for index, row in Users[:5].iterrows():
    INSERT_str = f"INSERT INTO {table_name} VALUES ("
    for column, column_type in zip(Users.columns, Users.dtypes):
        if column_type != 'object':
            INSERT_str += str(row[column]) + ", "
        else:
            INSERT_str += "'" + str(row[column]) + "', "
    INSERT_str = INSERT_str[:-2] + ");"
    print(INSERT_str)
```

```python
# Oder version of code generating
def get_code(table_name):
    df = pd.read_pickle(folder_path + '/' + table_name + '.pickle')
    data_import_code = f'{table_name} = pd.read_pickle(folder_path + "/{table_name}.pickle")'
    for_loop_code = f'for index, row in {table_name}.iterrows():'
    clean_null_code = f'{table_name} = {table_name}.astype(object).where({table_name}.notna(), None)'
    value_extract_code = '\n'.join([f'\tvalue{col_idx} = row["{column}"]' for col_idx, column in enumerate(df.columns)])
    value_inser_code = f'\t%sql INSERT INTO {table_name} VALUES (' + ', '.join([f':value{i}' for i in range(len(df.columns))]) + ')'
    print('\n'.join([data_import_code, clean_null_code, for_loop_code, value_extract_code, value_inser_code]))

for table_name in table_names:
    print(f'# Populate {table_name} table')
    get_code(table_name)
    print('')
```

### 3.2 Run the above code to populate the data

In [7]:
engine = create_engine(DATABASEURI)

In [8]:
# Populate Users table
Users = pd.read_pickle(folder_path + "/Users.pickle")
Users = Users.astype(object).where(Users.notna(), None)
engine.execute("""INSERT INTO Users VALUES (%s, %s, %s, %s, %s);""", list(Users.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x106fe8370>

In [9]:
# Populate Collection_of_User table
Collection_of_User = pd.read_pickle(folder_path + "/Collection_of_User.pickle")
Collection_of_User = Collection_of_User.astype(object).where(Collection_of_User.notna(), None)
engine.execute("""INSERT INTO Collection_of_User VALUES (%s, %s, %s);""", list(Collection_of_User.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1245041f0>

In [10]:
# Populate Business table
Business = pd.read_pickle(folder_path + "/Business.pickle")
Business = Business.astype(object).where(Business.notna(), None)
engine.execute("""INSERT INTO Business VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""", list(Business.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x127a4d0d0>

In [11]:
# Populate Review_of_Business table
Review_of_Business = pd.read_pickle(folder_path + "/Review_of_Business.pickle")
Review_of_Business = Review_of_Business.astype(object).where(Review_of_Business.notna(), None)
engine.execute("""INSERT INTO Review_of_Business VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""", list(Review_of_Business.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x127a8e2b0>

In [12]:
# Populate Category table
Category = pd.read_pickle(folder_path + "/Category.pickle")
Category = Category.astype(object).where(Category.notna(), None)
engine.execute("""INSERT INTO Category VALUES (%s);""", list(Category.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x127d0b580>

In [13]:
# Populate Photo_contained_Business table
Photo_contained_Business = pd.read_pickle(folder_path + "/Photo_contained_Business.pickle")
Photo_contained_Business = Photo_contained_Business.astype(object).where(Photo_contained_Business.notna(), None)
engine.execute("""INSERT INTO Photo_contained_Business VALUES (%s, %s, %s, %s);""", list(Photo_contained_Business.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x106fdccd0>

In [14]:
# Populate Users_favorite_Business table
Users_favorite_Business = pd.read_pickle(folder_path + "/Users_favorite_Business.pickle")
Users_favorite_Business = Users_favorite_Business.astype(object).where(Users_favorite_Business.notna(), None)
engine.execute("""INSERT INTO Users_favorite_Business VALUES (%s, %s);""", list(Users_favorite_Business.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1278adc10>

In [15]:
# Populate Users_follow_Collection table
Users_follow_Collection = pd.read_pickle(folder_path + "/Users_follow_Collection.pickle")
Users_follow_Collection = Users_follow_Collection.astype(object).where(Users_follow_Collection.notna(), None)
engine.execute("""INSERT INTO Users_follow_Collection VALUES (%s, %s, %s);""", list(Users_follow_Collection.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x124505310>

In [16]:
# Populate Users_write_Review table
Users_write_Review = pd.read_pickle(folder_path + "/Users_write_Review.pickle")
Users_write_Review = Users_write_Review.astype(object).where(Users_write_Review.notna(), None)
engine.execute("""INSERT INTO Users_write_Review VALUES (%s, %s);""", list(Users_write_Review.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x12789d7f0>

In [17]:
# Populate Collection_contain_Business table
Collection_contain_Business = pd.read_pickle(folder_path + "/Collection_contain_Business.pickle")
Collection_contain_Business = Collection_contain_Business.astype(object).where(Collection_contain_Business.notna(), None)
engine.execute("""INSERT INTO Collection_contain_Business VALUES (%s, %s, %s);""", list(Collection_contain_Business.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x127a49a00>

In [18]:
# Populate Business_tagged_Category table
Business_tagged_Category = pd.read_pickle(folder_path + "/Business_tagged_Category.pickle")
Business_tagged_Category = Business_tagged_Category.astype(object).where(Business_tagged_Category.notna(), None)
engine.execute("""INSERT INTO Business_tagged_Category VALUES (%s, %s);""", list(Business_tagged_Category.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x124524a30>

In [19]:
# Populate Users_follow_Users table
Users_follow_Users = pd.read_pickle(folder_path + "/Users_follow_Users.pickle")
Users_follow_Users = Users_follow_Users.astype(object).where(Users_follow_Users.notna(), None)
engine.execute("""INSERT INTO Users_follow_Users VALUES (%s, %s, %s);""", list(Users_follow_Users.itertuples(index=False, name=None)))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x124504e50>

### 3.3 Preparing static files

##### Photo of restaurants

In [101]:
Photo_contained_Business = %sql SELECT * FROM Photo_contained_Business

 * postgresql://jy3174:***@w4111-4-14.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
3814 rows affected.


In [107]:
Photo_contained_Business = pd.DataFrame(Photo_contained_Business)

In [108]:
photo_paths = '../data/yelp_photos/photos/'
photo_names = Photo_contained_Business.photo_id.tolist()

In [111]:
import shutil

front_end_folder = '../yealp_frontend/flaskblog/static/business_photos/'
try:
    shutil.rmtree(front_end_folder)
except:
    pass
os.mkdir(front_end_folder)
for photo_name in photo_names:
    shutil.copy(photo_paths + photo_name + '.jpg' , front_end_folder + photo_name + '.jpg' )

##### Calculating all the states available in the current dataset

In [112]:
STATES_OPTIONS = [state[0] for state in engine.execute("SELECT state FROM Business GROUP BY state").fetchall()]
STATES_OPTIONS

['CA',
 'PA',
 'AB',
 'LA',
 'IL',
 'NV',
 'TN',
 'NJ',
 'IN',
 'MO',
 'FL',
 'DE',
 'AZ',
 'ID']

In [113]:
import json
with open("../yealp_frontend/flaskblog/static/STATES_OPTIONS.json", 'w') as f:
    json.dump(STATES_OPTIONS, f, indent=2) 

# with open("../yealp_frontend/flaskblog/static/STATES_OPTIONS.json", 'r') as f:
#     STATES_OPTIONS = json.load(f)
# STATES_OPTIONS = [(state, state) for state in STATES_OPTIONS]
# STATES_OPTIONS

In [114]:
%%sql

SELECT *
FROM Business Where name = 'Lemon & Coriander'

 * postgresql://jy3174:***@w4111-4-14.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
1 rows affected.


business_id,name,address,city,state,postal_code,latitude,longitude,is_open,is_takeout
yL1yC5OOawjSamAb9Wjm5A,Lemon & Coriander,231 Magnolia Ave,Goleta,CA,93117,34,-120,False,False


In [115]:
%%sql

SELECT *
FROM Photo_contained_Business Where business_id = 'yL1yC5OOawjSamAb9Wjm5A'

 * postgresql://jy3174:***@w4111-4-14.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
2 rows affected.


photo_id,business_id,caption,label
ALSW66CPzpFsYEh5LKEr_w,yL1yC5OOawjSamAb9Wjm5A,"Egg Sandwich - poppy seed brioche bun, fried egg, bacon, mushrooms, sliced tomato, cheddar",food
pbuIN7TnOyQWvsWuAqALJA,yL1yC5OOawjSamAb9Wjm5A,Lemon Poppy Seed Muffin & Cappuccino,drink


## 4 Interesting SELECT queries.

### 4.1 Find the top 5 rated open resturant in average that has at least 5 review!

In [25]:
%%sql

SELECT name, address, city, round(AVG(stars), 2) AS average_stars
FROM Review_of_Business JOIN Business USING(business_id)
WHERE detailed_review IS NOT NULL AND is_open = True
GROUP BY business_id, name, address, city
HAVING count(*) >= 5
ORDER BY average_stars DESC LIMIT 5

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
5 rows affected.


name,address,city,average_stars
Abel's Del Real Pub & Grill,6148 Mae Anne Ave,Reno,5.0
Bazaar European Deli & Cafe,3652 S Virginia St,Reno,5.0
Decibel Coffee Works,"267 S Avenida Del Convento, Bldg 9",Tucson,5.0
Three Palms Bar & Grill,3813 Tulane Ave,New Orleans,5.0
Aguitas Bar & Grill Night Club,1825 Prater Way,Sparks,5.0


### 4.2 Find the TOP 5 users with most fans

In [26]:
%%sql

WITH user_fans_cnt AS(
    SELECT follwee_user_id as user_id, COUNT(fan_user_id) as fans_cnt
    FROM Users_follow_Users
    GROUP BY follwee_user_id)
SELECT user_id, name, email, yealping_since, fans_cnt
FROM user_fans_cnt JOIN Users USING(user_id)
ORDER BY fans_cnt DESC, yealping_since ASC
LIMIT 5

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
5 rows affected.


user_id,name,email,yealping_since,fans_cnt
hwZ8fDGkcB395BFfC_TO_w,Ant,Ant@outlook.com,2009-04-01,19
T8UFtXuNejmWXLQskQFAcw,Rob,Rob_11@hotmail.com,2012-08-17,18
KSLS1_701x517q0CtU_WwA,Errin,Errin@inbox.com,2017-01-24,18
Gwfd8y-6nmhXL3hce0Et1g,Elise,Elise_2@qq.com,2019-06-13,18
-xm7Q5WmwJTcD3yVW9CyEA,Michael,Michael_34@qq.com,2021-07-12,18


### 4.3 For all the categories, calculate the number of followers users through `Collection` that contain the restaurant

In [27]:
%%sql

WITH collection_cnt AS(
    SELECT followee_user_id AS collection_owner_id, collection_id, COUNT(fan_user_id) AS fans_cnt
    FROM Users_follow_Collection  
    GROUP BY followee_user_id, collection_id)

SELECT cate.name, 
       COUNT(CONCAT(collection_owner_id, collection_id)) AS n_collections,
       SUM(fans_cnt) as total_fans_among_all_collections
FROM collection_cnt 
    JOIN Collection_contain_Business USING(collection_owner_id, collection_id) 
    JOIN Business USING(business_id)
    JOIN Business_tagged_Category cate USING(business_id)
WHERE is_open = True
GROUP BY cate.name
ORDER BY total_fans_among_all_collections DESC

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
16 rows affected.


name,n_collections,total_fans_among_all_collections
Bars,1868,13794
Sandwiches,1805,13380
Breakfast & Brunch,1638,12109
Fast Food,1512,11299
Coffee & Tea,1497,11118
Pizza,1168,8731
Burgers,1144,8510
Salad,1004,7474
Mexican,981,7269
Italian,907,6801


## 5. Generate PostGreSQL View

### Business wide table

In [5]:
%%sql

DROP VIEW IF EXISTS business_wide CASCADE;
CREATE VIEW business_wide AS 
    WITH bizs_cal_star AS(
        SELECT business_id, 
               round(AVG(stars), 2) AS average_stars,
               round(round(AVG(stars) * 2) / 2, 1) AS rounded_average_stars
        FROM Review_of_Business JOIN Business USING(business_id)
        WHERE detailed_review IS NOT NULL
        GROUP BY business_id),
    
    bizs_join_category AS(
        SELECT business_id, array_agg(name) as category_names
        FROM Business_tagged_Category
        GROUP BY business_id
    ),
    
    bizs_join_photo AS(
        SELECT business_id, array_agg(photo_id) as photo_ids
        FROM Photo_contained_Business
        GROUP BY business_id
    )
        
    SELECT *
    FROM Business 
        LEFT JOIN bizs_cal_star USING(business_id)
        LEFT JOIN bizs_join_category USING(business_id)
        LEFT JOIN bizs_join_photo USING(business_id)

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
Done.
Done.


[]

In [6]:
%%sql

SELECT *
FROM business_wide
LIMIT 5

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
5 rows affected.


business_id,name,address,city,state,postal_code,latitude,longitude,is_open,is_takeout,average_stars,rounded_average_stars,category_names,photo_ids
0cg8BdPanf-DAA8AYuk6Zg,China Chef,11440 US-301,Riverview,FL,33569,28,-82,True,True,5.0,5.0,['Chinese'],['MC2POiY7NQ6VJL6UyHb8Eg']
0DBf72U_oR_iDDXdOx4a4g,Waffle House,4937 Knights Way,Indianapolis,IN,46217,40,-86,True,True,2.8,3.0,"['Breakfast & Brunch', 'Fast Food']","['gKO_RcBKMCxu9zU-Sr4qLw', 'cuwnVxb7ufE4mlZLzPNyKA', '1x5NJj5KUMQeiPWRK7Rt3A', '4sagaT0rwqULkPpq3Mrydg', 'NOkBXten2b2VRZbSJ158mA']"
0dKjWJG5X9YGTNoFCfNedA,Mario & Franks 1,2083 US Rt 130 S,Florence,NJ,8518,40,-75,True,True,2.0,2.0,"['Sandwiches', 'Italian', 'Pizza']","['LNb-vlkSKoGVcmHcR2nrtA', 'jWpTAgmwv_SfYCNCHFbTQw', 'qbdCBqheocxpK9EuZWg-Cw']"
0f0l62WKla-j2cfnPn7P2Q,The Coffee Bean & Tea Leaf,5745 Calle Real,Goleta,CA,93117,34,-120,True,True,3.3,3.5,['Coffee & Tea'],"['htJQ5JR8CiCtKGIqHxlasQ', 'QA8iXvwQqd5quufBv2juyQ', 'p_NZ1qiLn-cwc9XpFLCXdw', '8aFitXnVzkHa-6AZ3nChtA']"
0gdnntqYGYhUCTTf0a7Xcg,Tumerico On 4th Ave,402 E 4th St,Tucson,AZ,85705,32,-111,True,True,4.8,5.0,['Mexican'],['pWewmgFuoqvToVzqmP2HuQ']


### Review wide table

In [7]:
%%sql

DROP VIEW IF EXISTS reviews_wide CASCADE;
CREATE VIEW reviews_wide AS 

WITH detailed_reviews AS (
    SELECT *
    FROM Review_of_Business
    WHERE detailed_review IS NOT NULL)
    
SELECT review_id, 
       user_id, Users.name as user_name, 
       business_id,
       Business.name as business_name,
       detailed_review, review_date,
       useful, funny, cool	
FROM detailed_reviews
    LEFT JOIN Users_write_Review USING(review_id)
    LEFT JOIN Users USING(user_id)
    LEFT JOIN Business USING(business_id)
ORDER BY review_date DESC

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
Done.
Done.


[]

In [8]:
%%sql

SELECT *
FROM reviews_wide
WHERE user_id = 'cPT-E02ZVvzevldYI-vrAg'

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
3 rows affected.


review_id,user_id,user_name,business_id,business_name,detailed_review,review_date,useful,funny,cool
102300,cPT-E02ZVvzevldYI-vrAg,Stan,p4M3JKUfalVIolRvC2vZEg,Casa Don Alfonso,"I think I know Italian food pretty well, I've lived in Italy for extended periods of time and married to an excellent Italian cook, we normally do not eat out Italian because the food at home is usually better. So I was skeptical about this place but we gave it a try. It is indeed down home Italian food BUT, its all pretty much taken to the next level. Top to bottom the entire meal was really pretty great and the service excellent. The setting a really well done and most important for me, not too loud. Now it is expensive, it is for us a spurge kind of place, but in the end we walked away feeling like we had a great evening there and the hype for this place deserved.",2022-01-19,1,0,0
104161,cPT-E02ZVvzevldYI-vrAg,Stan,hxyedIXhDM48mLF8LAnX0Q,DiGregorio's Market,"I'm a big fan of Italian culture and cooking, and of all the places to get your Italian imports, this is the place, IMHO. There are other places, like Viviano's down the street, but in my experience, you'll have a better shopping experience here. Very clean and well organized and the staff very helpful. Saturday it's very busy here, so bring some extra patience with you if you insist on shopping here on weekends.",2010-11-30,3,0,0
104870,cPT-E02ZVvzevldYI-vrAg,Stan,S7CHy4U1Mv-4JEb963ouPA,Duff's Restaurant,"The staff here are very nice, the service, generally good, but can unexpectantly just disapear. It's a good place for brunch or lunch, I've not eaten dinner here in a while.",2010-11-30,0,0,0


### Short tips wider table

In [9]:
%%sql

DROP VIEW IF EXISTS tips_wide CASCADE;
CREATE VIEW tips_wide AS 

WITH tips AS (
    SELECT *
    FROM Review_of_Business
    WHERE short_tip IS NOT NULL)
SELECT review_id, 
       user_id, Users.name as user_name, 
       business_id,
       Business.name as business_name,
       short_tip, review_date, likes
FROM tips
    LEFT JOIN Users_write_Review USING(review_id)
    LEFT JOIN Users USING(user_id)
    LEFT JOIN Business USING(business_id)
ORDER BY review_date DESC

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
Done.
Done.


[]

In [10]:
%%sql

SELECT *
FROM business_wide
LIMIT 10

 * postgresql://jy3174:***@w4111.cisxo09blonu.us-east-1.rds.amazonaws.com/proj1part2
10 rows affected.


business_id,name,address,city,state,postal_code,latitude,longitude,is_open,is_takeout,average_stars,rounded_average_stars,category_names,photo_ids
0cg8BdPanf-DAA8AYuk6Zg,China Chef,11440 US-301,Riverview,FL,33569,28,-82,True,True,5.0,5.0,['Chinese'],['MC2POiY7NQ6VJL6UyHb8Eg']
0DBf72U_oR_iDDXdOx4a4g,Waffle House,4937 Knights Way,Indianapolis,IN,46217,40,-86,True,True,2.8,3.0,"['Breakfast & Brunch', 'Fast Food']","['gKO_RcBKMCxu9zU-Sr4qLw', 'cuwnVxb7ufE4mlZLzPNyKA', '1x5NJj5KUMQeiPWRK7Rt3A', '4sagaT0rwqULkPpq3Mrydg', 'NOkBXten2b2VRZbSJ158mA']"
0dKjWJG5X9YGTNoFCfNedA,Mario & Franks 1,2083 US Rt 130 S,Florence,NJ,8518,40,-75,True,True,2.0,2.0,"['Sandwiches', 'Italian', 'Pizza']","['LNb-vlkSKoGVcmHcR2nrtA', 'jWpTAgmwv_SfYCNCHFbTQw', 'qbdCBqheocxpK9EuZWg-Cw']"
0f0l62WKla-j2cfnPn7P2Q,The Coffee Bean & Tea Leaf,5745 Calle Real,Goleta,CA,93117,34,-120,True,True,3.3,3.5,['Coffee & Tea'],"['htJQ5JR8CiCtKGIqHxlasQ', 'QA8iXvwQqd5quufBv2juyQ', 'p_NZ1qiLn-cwc9XpFLCXdw', '8aFitXnVzkHa-6AZ3nChtA']"
0gdnntqYGYhUCTTf0a7Xcg,Tumerico On 4th Ave,402 E 4th St,Tucson,AZ,85705,32,-111,True,True,4.8,5.0,['Mexican'],['pWewmgFuoqvToVzqmP2HuQ']
0gRoCRYcjOFKLg3S2c8dmg,Chipotle Mexican Grill,2391 N Hwy 67,Florissant,MO,63033,39,-90,True,True,2.0,2.0,"['Fast Food', 'Mexican']","['W5Esz3Lor45LyalCm4olaQ', 'Bp-xbWCd4cJ6ItpghGQhtA', '4dHeiE-5a97RhOGWLLqgFA']"
0LXdnyZy4ae8J28Ckjvv7w,Sly Fox Taphouse at The Grove,"20 Liberty Blvd, Ste 100",Malvern,PA,19355,40,-76,True,True,3.86,4.0,"['Pizza', 'Bars']","['ZEpysUsBH8fr8F8vIr8nOw', 'DmPnMPi9i_MWcCLZjWmbug']"
0OVhJp8cS3fWFLa35SESOQ,Jackson Tavern,2900 N Swan Rd,Tucson,AZ,85716,32,-111,False,True,2.71,2.5,"['Seafood', 'Breakfast & Brunch']",['ucLmAd34VQQHpUapZkKBPA']
0R3SRPEYpqoUi3kq5ZsELw,SuVege,615 S Trooper Rd,Audubon,PA,19403,40,-75,False,True,5.0,5.0,['Fast Food'],"['EpEJIffRX_tzoVF5htDE7Q', 'HxQBZSBfdVNik7NdUI1GlQ', 'iP08wc2mZQaUqyurMx-qRg']"
0VYfnlvP5LsCEennaTYJkw,Newbolds Food & Libations,211 York Rd,Jenkintown,PA,19046,40,-75,True,True,4.6,4.5,"['Pizza', 'Sandwiches', 'Bars']","['6AmVF_b8ui6qYlmpZCoP0Q', 'r03NUhD9W0eat1WXfhPnEg']"
