In [1]:
import psycopg

In [2]:
%reload_ext sql

In [3]:
%sql postgresql://michellelin@localhost:5432/yelp

In [4]:
%config SqlMagic.displaylimit = 30

# Query 1
#### What are the top 5 cities with the most 5-star business?

## Denormalized Data

In [5]:
%%sql
SELECT city, state, COUNT(*) five_star_count
FROM business 
WHERE stars = 5
GROUP BY city, state
ORDER BY five_star_count DESC
LIMIT 5;

city,state,five_star_count
Philadelphia,PA,12
Reno,NV,9
New Orleans,LA,9
Tampa,FL,9
Nashville,TN,8


In [6]:
%%sql 
EXPLAIN ANALYZE 
SELECT city, state, COUNT(*) five_star_count
FROM business 
WHERE stars = 5
GROUP BY city, state
ORDER BY five_star_count DESC
LIMIT 5;

QUERY PLAN
Limit (cost=103.53..103.54 rows=5 width=21) (actual time=0.624..0.625 rows=5 loops=1)
-> Sort (cost=103.53..103.74 rows=85 width=21) (actual time=0.622..0.623 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> HashAggregate (cost=101.27..102.11 rows=85 width=21) (actual time=0.589..0.597 rows=44 loops=1)
"Group Key: city, state"
Batches: 1 Memory Usage: 24kB
-> Seq Scan on business (cost=0.00..100.50 rows=102 width=13) (actual time=0.018..0.542 rows=102 loops=1)
Filter: (stars = '5'::numeric)
Rows Removed by Filter: 898


## Normalized Data

In [7]:
%%sql
SELECT city, state, COUNT(*) five_star_count
FROM nor_business b
JOIN location l
ON b.business_id = l.business_id
WHERE stars = 5 
GROUP BY city, state
ORDER BY five_star_count DESC
LIMIT 5;

city,state,five_star_count
Philadelphia,PA,12
Reno,NV,9
New Orleans,LA,9
Tampa,FL,9
Nashville,TN,8


In [8]:
%%sql 
EXPLAIN ANALYZE 
SELECT city, state, COUNT(*) five_star_count
FROM nor_business b
JOIN location l
ON b.business_id = l.business_id
WHERE stars = 5 
GROUP BY city, state
ORDER BY five_star_count DESC
LIMIT 5;

QUERY PLAN
Limit (cost=123.89..123.90 rows=5 width=21) (actual time=4.779..4.782 rows=5 loops=1)
-> Sort (cost=123.89..124.15 rows=102 width=21) (actual time=4.778..4.780 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> HashAggregate (cost=121.18..122.20 rows=102 width=21) (actual time=4.733..4.761 rows=44 loops=1)
"Group Key: l.city, l.state"
Batches: 1 Memory Usage: 24kB
-> Hash Join (cost=93.78..120.41 rows=102 width=13) (actual time=4.316..4.663 rows=102 loops=1)
Hash Cond: ((l.business_id)::text = (b.business_id)::text)
-> Seq Scan on location l (cost=0.00..24.00 rows=1000 width=36) (actual time=0.291..0.406 rows=1000 loops=1)


# Query 2
### Which years had the most elite users?

## Denormalized Data

In [9]:
%%sql
WITH exploded_elite AS (
    SELECT 
        UNNEST(STRING_TO_ARRAY(elite, ','))::INTEGER AS elite_year 
    FROM 
        yelp_user 
    WHERE 
        elite IS NOT NULL 
        AND elite != '' 
        AND elite NOT LIKE '%NaN%' 
)

SELECT 
    elite_year, 
    COUNT(*) AS user_count 
FROM 
    exploded_elite 
GROUP BY 
    elite_year 
ORDER BY 
    user_count DESC 
LIMIT 5;


elite_year,user_count
20,7202
2019,3954
2021,3822
2018,3777
2017,3416


In [10]:
%%sql
EXPLAIN ANALYZE
WITH exploded_elite AS (
    SELECT 
        UNNEST(STRING_TO_ARRAY(elite, ','))::INTEGER AS elite_year 
    FROM 
        yelp_user 
    WHERE 
        elite IS NOT NULL 
        AND elite != '' 
        AND elite NOT LIKE '%NaN%'  
)

SELECT 
    elite_year, 
    COUNT(*) AS user_count 
FROM 
    exploded_elite 
GROUP BY 
    elite_year 
ORDER BY 
    user_count DESC 
LIMIT 5;

QUERY PLAN
Limit (cost=5555.51..5555.52 rows=5 width=12) (actual time=24.424..26.343 rows=5 loops=1)
-> Sort (cost=5555.51..5556.01 rows=200 width=12) (actual time=24.423..26.341 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> Finalize GroupAggregate (cost=5526.18..5552.18 rows=200 width=12) (actual time=24.403..26.333 rows=16 loops=1)
"Group Key: (((unnest(string_to_array(yelp_user.elite, ','::text))))::integer)"
-> Gather Merge (cost=5526.18..5549.18 rows=200 width=12) (actual time=24.395..26.321 rows=32 loops=1)
Workers Planned: 1
Workers Launched: 1
-> Sort (cost=4526.17..4526.67 rows=200 width=12) (actual time=21.531..21.533 rows=16 loops=2)


## Normalized Data

In [11]:
%%sql
SELECT elite_year, COUNT(*) user_count
FROM elite 
GROUP BY elite_year
ORDER BY user_count DESC
LIMIT 5;

elite_year,user_count
2019,3954
2021,3822
2018,3777
20,3601
2017,3416


In [12]:
%%sql
EXPLAIN ANALYZE
SELECT elite_year, COUNT(*) user_count
FROM elite 
GROUP BY elite_year
ORDER BY user_count DESC
LIMIT 5;

QUERY PLAN
Limit (cost=726.82..726.83 rows=5 width=12) (actual time=10.378..10.380 rows=5 loops=1)
-> Sort (cost=726.82..726.86 rows=16 width=12) (actual time=10.377..10.378 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> HashAggregate (cost=726.39..726.55 rows=16 width=12) (actual time=10.363..10.366 rows=16 loops=1)
Group Key: elite_year
Batches: 1 Memory Usage: 24kB
-> Seq Scan on elite (cost=0.00..563.93 rows=32493 width=4) (actual time=0.009..3.084 rows=32493 loops=1)
Planning Time: 0.078 ms
Execution Time: 10.428 ms


# Query 3
### Find businesses open in multiple states

### Denormalized Data

In [13]:
%%sql
SELECT name
FROM business 
GROUP BY name
HAVING COUNT(state) > 1;

name
Firestone Complete Auto Care
Michaels
Planet Fitness
Culver's
Subway
Jack in the Box
Wendy's
Chipotle Mexican Grill
Lowe's Home Improvement
Midas


In [14]:
%%sql
EXPLAIN ANALYZE
SELECT name
FROM business 
GROUP BY name
HAVING COUNT(state) > 1;

QUERY PLAN
HashAggregate (cost=103.00..114.86 rows=316 width=19) (actual time=1.138..1.247 rows=32 loops=1)
Group Key: name
Filter: (count(state) > 1)
Batches: 1 Memory Usage: 193kB
Rows Removed by Filter: 917
-> Seq Scan on business (cost=0.00..98.00 rows=1000 width=22) (actual time=0.009..0.576 rows=1000 loops=1)
Planning Time: 0.085 ms
Execution Time: 1.295 ms


### Normalized Data

In [15]:
%%sql
SELECT name
FROM nor_business b
JOIN location l 
ON b.business_id = l.business_id
GROUP BY name
HAVING COUNT(state) > 1;

name
Firestone Complete Auto Care
Michaels
Planet Fitness
Culver's
Subway
Jack in the Box
Wendy's
Chipotle Mexican Grill
Lowe's Home Improvement
Midas


In [16]:
%%sql
EXPLAIN ANALYZE
SELECT name
FROM nor_business b
JOIN location l 
ON b.business_id = l.business_id
GROUP BY name
HAVING COUNT(state) > 1;

QUERY PLAN
HashAggregate (cost=134.14..146.00 rows=316 width=19) (actual time=4.309..4.528 rows=32 loops=1)
Group Key: b.name
Filter: (count(l.state) > 1)
Batches: 1 Memory Usage: 193kB
Rows Removed by Filter: 917
-> Hash Join (cost=102.50..129.14 rows=1000 width=22) (actual time=1.744..3.477 rows=1000 loops=1)
Hash Cond: ((l.business_id)::text = (b.business_id)::text)
-> Seq Scan on location l (cost=0.00..24.00 rows=1000 width=26) (actual time=0.020..0.859 rows=1000 loops=1)
-> Hash (cost=90.00..90.00 rows=1000 width=42) (actual time=1.712..1.714 rows=1000 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 81kB


# Query 4
### Find the user that has the most number of friends?

## Denormazlied Data

In [17]:
%%sql
SELECT 
    user_id, 
    name, 
    CARDINALITY(STRING_TO_ARRAY(friends, ',')) AS num_friends 
FROM 
    yelp_user 
ORDER BY 
    num_friends DESC 
LIMIT 5;


user_id,name,num_friends
hizGc5W1tBHPghM5YKCAtg,Katie,9390
djxnI8Ux8ZYQJhiOQkrRhA,Abby,8858
JjXuiru1_ONzDkYVrHN0aw,Richard,7228
5MCBLBxr10NLUKZ4AboAMg,Colleen,7179
YttDgOC9AlM4HcAlDsbB2A,Phil,6941


In [18]:
%%sql
EXPLAIN ANALYZE
SELECT 
    user_id, 
    name, 
    CARDINALITY(STRING_TO_ARRAY(friends, ',')) AS num_friends 
FROM 
    yelp_user 
ORDER BY 
    num_friends DESC 
LIMIT 5;

QUERY PLAN
Limit (cost=3103.28..3103.29 rows=5 width=33) (actual time=795.674..795.693 rows=5 loops=1)
-> Sort (cost=3103.28..3200.58 rows=38921 width=33) (actual time=795.673..795.691 rows=5 loops=1)
"Sort Key: (cardinality(string_to_array(friends, ','::text))) DESC"
Sort Method: top-N heapsort Memory: 25kB
-> Seq Scan on yelp_user (cost=0.00..2456.82 rows=38921 width=33) (actual time=0.704..784.680 rows=38921 loops=1)
Planning Time: 0.374 ms
Execution Time: 795.723 ms


## Normalized Data

In [19]:
%%sql
SELECT user_id, COUNT(*) num_friends
FROM friends 
GROUP BY user_id
ORDER BY num_friends DESC
LIMIT 5;

user_id,num_friends
hizGc5W1tBHPghM5YKCAtg,9390
djxnI8Ux8ZYQJhiOQkrRhA,8858
JjXuiru1_ONzDkYVrHN0aw,7228
5MCBLBxr10NLUKZ4AboAMg,7179
YttDgOC9AlM4HcAlDsbB2A,6941


In [20]:
%%sql
EXPLAIN ANALYZE
SELECT user_id, COUNT(*) num_friends
FROM friends 
GROUP BY user_id
ORDER BY num_friends DESC
LIMIT 5;

QUERY PLAN
Limit (cost=59389.93..59389.94 rows=5 width=31) (actual time=1169.483..1169.526 rows=5 loops=1)
-> Sort (cost=59389.93..59421.23 rows=12520 width=31) (actual time=1169.481..1169.525 rows=5 loops=1)
Sort Key: (count(*)) DESC
Sort Method: top-N heapsort Memory: 25kB
-> Finalize HashAggregate (cost=59056.78..59181.98 rows=12520 width=31) (actual time=1161.287..1165.804 rows=27468 loops=1)
Group Key: user_id
Batches: 1 Memory Usage: 3857kB
-> Gather (cost=56302.38..58931.58 rows=25040 width=31) (actual time=1142.185..1148.168 rows=29651 loops=1)
Workers Planned: 2
Workers Launched: 2
