In [17]:
import psycopg

In [18]:
%reload_ext sql

In [19]:
%sql postgresql://michellelin@localhost:5432/yelp

In [30]:
%config SqlMagic.displaylimit = 30

# Query 1
#### What are the top 5 cities with the most 5-star business?

## Denormalized Data

In [21]:
%%sql
SELECT city, state, COUNT(*) five_star_count
FROM business 
WHERE stars = 5
GROUP BY city, state
ORDER BY five_star_count DESC
LIMIT 5;

city,state,five_star_count
Santa Barbara,CA,9
Reno,NV,7
Tucson,AZ,6
Tampa,FL,5
Philadelphia,PA,5


In [22]:
%%sql 
EXPLAIN ANALYZE 
SELECT city, state, COUNT(*) five_star_count
FROM business 
WHERE stars = 5
GROUP BY city, state
ORDER BY five_star_count DESC
LIMIT 5;

QUERY PLAN
Limit (cost=102.70..102.71 rows=5 width=21) (actual time=1.069..1.070 rows=5 loops=1)
-> Sort (cost=102.70..102.92 rows=90 width=21) (actual time=1.068..1.068 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> HashAggregate (cost=100.30..101.20 rows=90 width=21) (actual time=1.037..1.049 rows=56 loops=1)
"Group Key: city, state"
Batches: 1 Memory Usage: 24kB
-> Seq Scan on business (cost=0.00..99.50 rows=107 width=13) (actual time=0.009..0.983 rows=107 loops=1)
Filter: (stars = '5'::numeric)
Rows Removed by Filter: 893


## Normalized Data

In [23]:
%%sql
SELECT city, state, COUNT(*) five_star_count
FROM nor_business b
JOIN location l
ON b.business_id = l.business_id
WHERE stars = 5 
GROUP BY city, state
ORDER BY five_star_count DESC
LIMIT 5;

city,state,five_star_count
Santa Barbara,CA,9
Reno,NV,7
Tucson,AZ,6
Tampa,FL,5
Philadelphia,PA,5


In [24]:
%%sql 
EXPLAIN ANALYZE 
SELECT city, state, COUNT(*) five_star_count
FROM nor_business b
JOIN location l
ON b.business_id = l.business_id
WHERE stars = 5 
GROUP BY city, state
ORDER BY five_star_count DESC
LIMIT 5;

QUERY PLAN
Limit (cost=124.12..124.14 rows=5 width=21) (actual time=1.563..1.565 rows=5 loops=1)
-> Sort (cost=124.12..124.39 rows=107 width=21) (actual time=1.551..1.553 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> HashAggregate (cost=121.28..122.35 rows=107 width=21) (actual time=1.506..1.517 rows=56 loops=1)
"Group Key: l.city, l.state"
Batches: 1 Memory Usage: 24kB
-> Hash Join (cost=93.84..120.47 rows=107 width=13) (actual time=1.083..1.460 rows=107 loops=1)
Hash Cond: ((l.business_id)::text = (b.business_id)::text)
-> Seq Scan on location l (cost=0.00..24.00 rows=1000 width=36) (actual time=0.021..0.165 rows=1000 loops=1)


# Query 2
### Which years had the most elite users?

## Denormalized Data

In [28]:
%%sql
WITH exploded_elite AS (
    SELECT 
        UNNEST(STRING_TO_ARRAY(elite, ','))::INTEGER AS elite_year 
    FROM 
        yelp_user 
    WHERE 
        elite IS NOT NULL 
        AND elite != '' 
        AND elite NOT LIKE '%NaN%' 
)

SELECT 
    elite_year, 
    COUNT(*) AS user_count 
FROM 
    exploded_elite 
GROUP BY 
    elite_year 
ORDER BY 
    user_count DESC 
LIMIT 5;


elite_year,user_count
20,7936
2019,4387
2018,4198
2021,4171
2017,3763


In [31]:
%%sql
EXPLAIN ANALYZE
WITH exploded_elite AS (
    SELECT 
        UNNEST(STRING_TO_ARRAY(elite, ','))::INTEGER AS elite_year 
    FROM 
        yelp_user 
    WHERE 
        elite IS NOT NULL 
        AND elite != '' 
        AND elite NOT LIKE '%NaN%'  
)

SELECT 
    elite_year, 
    COUNT(*) AS user_count 
FROM 
    exploded_elite 
GROUP BY 
    elite_year 
ORDER BY 
    user_count DESC 
LIMIT 5;

QUERY PLAN
Limit (cost=5877.88..5877.89 rows=5 width=12) (actual time=28.643..29.898 rows=5 loops=1)
-> Sort (cost=5877.88..5878.38 rows=200 width=12) (actual time=28.642..29.897 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> Finalize GroupAggregate (cost=5848.55..5874.55 rows=200 width=12) (actual time=28.619..29.881 rows=16 loops=1)
"Group Key: (((unnest(string_to_array(yelp_user.elite, ','::text))))::integer)"
-> Gather Merge (cost=5848.55..5871.55 rows=200 width=12) (actual time=28.615..29.873 rows=32 loops=1)
Workers Planned: 1
Workers Launched: 1
-> Sort (cost=4848.54..4849.04 rows=200 width=12) (actual time=24.205..24.206 rows=16 loops=2)


## Normalized Data

In [34]:
%%sql
SELECT elite_year, COUNT(*) user_count
FROM elite 
GROUP BY elite_year
ORDER BY user_count DESC
LIMIT 5;

elite_year,user_count
2019,4387
2018,4198
2021,4171
20,3968
2017,3763


In [35]:
%%sql
EXPLAIN ANALYZE
SELECT elite_year, COUNT(*) user_count
FROM elite 
GROUP BY elite_year
ORDER BY user_count DESC
LIMIT 5;

QUERY PLAN
Limit (cost=765.41..765.42 rows=5 width=12) (actual time=19.547..19.549 rows=5 loops=1)
-> Sort (cost=765.41..765.45 rows=16 width=12) (actual time=19.546..19.547 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> HashAggregate (cost=764.98..765.14 rows=16 width=12) (actual time=19.510..19.514 rows=16 loops=1)
Group Key: elite_year
Batches: 1 Memory Usage: 24kB
-> Seq Scan on elite (cost=0.00..593.99 rows=34199 width=4) (actual time=0.045..9.467 rows=34199 loops=1)
Planning Time: 0.499 ms
Execution Time: 19.718 ms


# Query 3
### Find businesses open in multiple states

### Denormalized Data

In [42]:
%%sql
SELECT name
FROM business 
GROUP BY name
HAVING COUNT(state) > 1;

name
Forever 21
Pet Valu
Great Basin Federal Credit Union
CVS Pharmacy
Tim Hortons
Sprouts Farmers Market
Dairy Queen Grill & Chill
QDOBA Mexican Eats
Pizza Hut
Dunkin'


In [43]:
%%sql
EXPLAIN ANALYZE
SELECT name
FROM business 
GROUP BY name
HAVING COUNT(state) > 1;

QUERY PLAN
HashAggregate (cost=102.00..113.96 rows=319 width=19) (actual time=6.011..6.138 rows=27 loops=1)
Group Key: name
Filter: (count(state) > 1)
Batches: 1 Memory Usage: 193kB
Rows Removed by Filter: 930
-> Seq Scan on business (cost=0.00..97.00 rows=1000 width=22) (actual time=0.324..4.623 rows=1000 loops=1)
Planning Time: 0.899 ms
Execution Time: 7.183 ms


### Normalized Data

In [46]:
%%sql
SELECT name
FROM nor_business b
JOIN location l 
ON b.business_id = l.business_id
GROUP BY name
HAVING COUNT(state) > 1;

name
Forever 21
Pet Valu
Great Basin Federal Credit Union
CVS Pharmacy
Tim Hortons
Sprouts Farmers Market
Dairy Queen Grill & Chill
QDOBA Mexican Eats
Pizza Hut
Dunkin'


In [47]:
%%sql
EXPLAIN ANALYZE
SELECT name
FROM nor_business b
JOIN location l 
ON b.business_id = l.business_id
GROUP BY name
HAVING COUNT(state) > 1;

QUERY PLAN
HashAggregate (cost=134.14..146.10 rows=319 width=19) (actual time=7.332..7.442 rows=27 loops=1)
Group Key: b.name
Filter: (count(l.state) > 1)
Batches: 1 Memory Usage: 193kB
Rows Removed by Filter: 930
-> Hash Join (cost=102.50..129.14 rows=1000 width=22) (actual time=5.624..6.212 rows=1000 loops=1)
Hash Cond: ((l.business_id)::text = (b.business_id)::text)
-> Seq Scan on location l (cost=0.00..24.00 rows=1000 width=26) (actual time=2.685..2.839 rows=1000 loops=1)
-> Hash (cost=90.00..90.00 rows=1000 width=42) (actual time=2.824..2.824 rows=1000 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 82kB


# Query 4
### Find the user that has the most number of friends?

## Denormazlied Data

In [50]:
%%sql
SELECT 
    user_id, 
    name, 
    CARDINALITY(STRING_TO_ARRAY(friends, ',')) AS num_friends 
FROM 
    yelp_user 
ORDER BY 
    num_friends DESC 
LIMIT 5;


user_id,name,num_friends
Oi1qbcz2m2SnwUeztGYcnQ,Steven,10072
djxnI8Ux8ZYQJhiOQkrRhA,Abby,8858
MeDuKsZcnI3IU2g7OlV-hQ,Frank,7945
UsXqCXRZwSCSw0AT7y1uBg,Carl,7413
QiciohgD8N6vCty4vGL3pQ,Emily,7048


In [51]:
%%sql
EXPLAIN ANALYZE
SELECT 
    user_id, 
    name, 
    CARDINALITY(STRING_TO_ARRAY(friends, ',')) AS num_friends 
FROM 
    yelp_user 
ORDER BY 
    num_friends DESC 
LIMIT 5;

QUERY PLAN
Limit (cost=3400.35..3400.36 rows=5 width=33) (actual time=583.745..583.746 rows=5 loops=1)
-> Sort (cost=3400.35..3506.83 rows=42593 width=33) (actual time=583.744..583.745 rows=5 loops=1)
"Sort Key: (cardinality(string_to_array(friends, ','::text))) DESC"
Sort Method: top-N heapsort Memory: 25kB
-> Seq Scan on yelp_user (cost=0.00..2692.89 rows=42593 width=33) (actual time=2.448..576.911 rows=42593 loops=1)
Planning Time: 0.197 ms
Execution Time: 583.976 ms


## Normalized Data

In [52]:
%%sql
SELECT user_id, COUNT(*) num_friends
FROM friends 
GROUP BY user_id
ORDER BY num_friends DESC
LIMIT 5;

user_id,num_friends
Oi1qbcz2m2SnwUeztGYcnQ,10072
djxnI8Ux8ZYQJhiOQkrRhA,8858
MeDuKsZcnI3IU2g7OlV-hQ,7945
UsXqCXRZwSCSw0AT7y1uBg,7413
QiciohgD8N6vCty4vGL3pQ,7048


In [53]:
%%sql
EXPLAIN ANALYZE
SELECT user_id, COUNT(*) num_friends
FROM friends 
GROUP BY user_id
ORDER BY num_friends DESC
LIMIT 5;

QUERY PLAN
Limit (cost=62992.92..62992.93 rows=5 width=31) (actual time=1012.822..1012.853 rows=5 loops=1)
-> Sort (cost=62992.92..63026.60 rows=13471 width=31) (actual time=1012.821..1012.852 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> Finalize HashAggregate (cost=62634.46..62769.17 rows=13471 width=31) (actual time=1005.569..1010.302 rows=29918 loops=1)
Group Key: user_id
Batches: 5 Memory Usage: 4657kB Disk Usage: 224kB
-> Gather (cost=59670.84..62499.75 rows=26942 width=31) (actual time=992.837..996.129 rows=31275 loops=1)
Workers Planned: 2
Workers Launched: 2
