In [1]:
import pandas as pd
import plotly.express as px
from plotly import graph_objects as go

## Connect to Snowflake

In [2]:
import snowflake.connector
from snowflake_credentials import snowflake_params

conn = snowflake.connector.connect(
    user=snowflake_params['user'],
    password=snowflake_params['password'],
    account=snowflake_params['account'],
    warehouse=snowflake_params['warehouse'],
    database=snowflake_params['database'],
    schema=snowflake_params['schema'],
    role=snowflake_params['role']
)

cur = conn.cursor()

## Preview the Table Data

In [21]:
businesses_count = cur.execute('''
                               SELECT COUNT(*)
                               FROM yelp_businesses
                               ''').fetch_pandas_all()

print(f'There are {businesses_count['COUNT(*)'][0]:,} records in the Yelp businesses table.')

df = cur.execute('''
                 SELECT *
                 FROM yelp_businesses 
                 LIMIT 5
                 ''').fetch_pandas_all()

display(df)

There are 150,346 records in the Yelp businesses table.


Unnamed: 0,NAME,BUSINESS_ID,IS_OPEN,REVIEW_COUNT,STARS,CATEGORIES,STATE,CITY,POSTAL_CODE,LATITUDE,LONGITUDE
0,"Abby Rappoport, LAC, CMQ",Pns2l4eNsfO8kk83dixA6A,0,7,5,"Doctors, Traditional Chinese Medicine, Naturop...",CA,Santa Barbara,93101,34.426679,-119.711197
1,The UPS Store,mpf3x-BjTdTEA3yCZrAYPw,1,15,3,"Shipping Centers, Local Services, Notaries, Ma...",MO,Affton,63123,38.551126,-90.335695
2,Target,tUFrWirKiKi_TAnsVWINQQ,0,22,4,"Department Stores, Shopping, Fashion, Home & G...",AZ,Tucson,85711,32.223236,-110.880452
3,St Honore Pastries,MTSW4McQd7CbVtyjqoe9mw,1,80,4,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",PA,Philadelphia,19107,39.955505,-75.155564
4,Perkiomen Valley Brewery,mWMc6_wTdE0EUBKIGXDVfA,1,13,5,"Brewpubs, Breweries, Food",PA,Green Lane,18054,40.338183,-75.471659


In [22]:
reviews_count = cur.execute('''
                               SELECT COUNT(*)
                               FROM yelp_reviews
                               ''').fetch_pandas_all()

print(f'There are {reviews_count['COUNT(*)'][0]:,} records in the Yelp reviews table.')

df = cur.execute('''
                 SELECT *
                 FROM yelp_reviews 
                 LIMIT 5
                 ''').fetch_pandas_all()

display(df)

There are 6,990,280 records in the Yelp reviews table.


Unnamed: 0,BUSINESS_ID,REVIEW_ID,USER_ID,REVIEW_DATE,USER_REVIEW,STARS,COOL_VOTES,FUNNY_VOTES,USEFUL_VOTES
0,iRIHK8-EwpeffwvoO4nzIA,oXpWjOG2rue-2bHbjvDZIQ,1uHAAjOX18Px4OibCMI3pg,2018-10-04,Awesome spot. This is one of my favorite place...,5,0,0,0
1,U9TJ13_B6mySAb1RI4lhug,QVL6hH7M0nv5ljHCjRfK0w,KZXmUq6y_0k4guwAfPmQcQ,2017-05-13,"I like Rumor, but some nights, Idk why they ch...",3,0,0,0
2,oBNrLz4EDhiscSlbOl8uAw,PAjnd_I5pkN2m9Mg5cJTUg,C8jSfs5zj3NkqBx-YRb3tA,2018-04-15,This place has the rudest server I think I hav...,1,0,0,1
3,IL7MEBxfei9pOaGR1g6joQ,H_gK3gJrBSMlxELs2BC7sQ,YOIorhfo8hHhilRyam2c4w,2017-12-29,I've had the ice cream from here a number of t...,4,0,0,0
4,t-cT3vDv6vB0AAow_XIK8g,kViZwGpixoAnpN7xpsaJyQ,y6RQ8oBEtryoXVTxBddNng,2012-06-26,This is a very nice store to visit. They have ...,5,0,0,0


In [49]:
users_count = cur.execute('''
                               SELECT COUNT(*)
                               FROM yelp_users
                               ''').fetch_pandas_all()

print(f'There are {users_count['COUNT(*)'][0]:,} records in the Yelp users table.')

df = cur.execute('''
                 SELECT *
                 FROM yelp_users 
                 LIMIT 5
                 ''').fetch_pandas_all()

display(df)

There are 1,987,897 records in the Yelp users table.


Unnamed: 0,FIRST_NAME,USER_ID,YELPING_SINCE,REVIEW_COUNT,AVG_STARS,FANS,COOL_COMPLIMENTS,CUTE_COMPLIMENTS,FUNNY_COMPLIMENTS,HOT_COMPLIMENTS,...,MORE_COMPLIMENTS,NOTE_COMPLIMENTS,PHOTO_COMPLIMENTS,PLAIN_COMPLIMENTS,PROFILE_COMPLIMENTS,WRITER_COMPLIMENTS,COOL_VOTES,ELITE,FUNNY_VOTES,USEFUL_VOTES
0,Thomas,dXZLLcy6klelF1O1wL_vzw,2015-07-24,11,4.45,1,0,0,0,0,...,0,0,0,0,0,0,0,,0,2
1,Jeremias JMan,eaiIXArF07R0MGtu4h-6kg,2015-10-29,19,4.42,1,0,0,0,0,...,0,0,0,0,0,0,2,,9,9
2,Dolly,T_Fsemvyy8IsoB90-sS0sQ,2014-05-05,48,4.32,5,1,1,1,2,...,0,0,0,1,0,0,13,,10,38
3,Ellyse,EaRd_4nFnPs8Fei-H2ZmVA,2013-07-18,427,4.31,61,31,0,31,5,...,2,8,17,15,0,23,420,2017201820192020.0,108,823
4,Julie,yqpqFUXIkKBYP5QQ-5jZpA,2013-06-13,48,3.34,1,1,0,1,0,...,0,1,0,0,0,0,17,,28,62


In [24]:
tips_count = cur.execute('''
                               SELECT COUNT(*)
                               FROM yelp_tips
                               ''').fetch_pandas_all()

print(f'There are {tips_count['COUNT(*)'][0]:,} records in the Yelp tips table.')

df = cur.execute('''
                 SELECT *
                 FROM yelp_tips 
                 LIMIT 5
                 ''').fetch_pandas_all()

display(df)

There are 908,915 records in the Yelp tips table.


Unnamed: 0,BUSINESS_ID,USER_ID,DATE,TIP,COMPLIMENT_COUNT
0,WnHG_obpmHazbngTdu570Q,_Wkf2RNDnBuqMOpzSqmZXw,2015-08-14,Only a couple more weeks,0
1,c-CAcdH2-12g1sQUQfy5xw,xnuEEuMfvCNLFMtaZf7asw,2016-03-20,Get here early!!,0
2,wj8XtPyuREj8_0GQz3LZ6w,W0DJOPsSwcAj0uqCJG8iLw,2015-08-14,If you have not tried this place what are you ...,0
3,l8vwslfqKq1yxlmgVi-M0Q,AED6tP0v4aMPd9-YtjPcPg,2015-05-11,Check out the third floor lounge for nice city...,0
4,KZe5kpIHWVuJl6WgNYzceg,9UZlB3fHVe2JBvPw4kRb5g,2012-03-03,Pancakes with Breakfast Trick?,0


## Category Analysis

#### Number of Businesses by Category

In [None]:
# Top 20 categories of all businesses on Yelp.
df = cur.execute('''
                 WITH cte AS (
                    -- split the categories column so each business has a row for each category it has
                    SELECT business_id, stars,
                    TRIM(A.value) AS category
                    FROM yelp_businesses,
                    LATERAL SPLIT_TO_TABLE(categories, ',') A
                )

                SELECT category, COUNT(DISTINCT business_id) AS num_of_businesses, ROUND(AVG(stars), 2) AS avg_rating
                FROM cte
                GROUP BY 1
                ORDER BY 2 DESC
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,CATEGORY,NUM_OF_BUSINESSES,AVG_STARS
0,Restaurants,52268,3.77
1,Food,27781,3.93
2,Shopping,24395,3.86
3,Home Services,14356,3.69
4,Beauty & Spas,14292,4.05
5,Nightlife,12281,3.91
6,Health & Medical,11890,3.86
7,Local Services,11198,3.85
8,Bars,11065,3.9
9,Automotive,10773,3.75


#### Review Volume by Category

In [None]:
# Top 20 categories by number of reviews
df = cur.execute('''
                 WITH cte AS (
                    -- split the categories column so each business has a row for each category it has
                    SELECT business_id, stars, review_count,
                    TRIM(A.value) AS category
                    FROM yelp_businesses,
                    LATERAL SPLIT_TO_TABLE(categories, ',') A
                )

                SELECT category, SUM(review_count) AS num_of_reviews, COUNT(DISTINCT business_id) AS num_of_businesses, ROUND(AVG(stars), 2) AS avg_rating
                FROM cte
                GROUP BY 1  
                ORDER BY 2 DESC
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,CATEGORY,NUM_OF_REVIEWS,NUM_OF_BUSINESSES,AVG_STARS
0,Restaurants,4561279,52268,3.77
1,Food,1752281,27781,3.93
2,Nightlife,1488163,12281,3.91
3,Bars,1406415,11065,3.9
4,American (Traditional),976483,8139,3.65
5,American (New),950560,6097,3.84
6,Breakfast & Brunch,839467,6239,3.82
7,Sandwiches,669139,8366,3.8
8,Seafood,600183,3539,3.92
9,Event Planning & Services,591442,9895,3.93


#### Average Star Rating by Category

In [None]:
# Top 20 categories by average star rating (minimum 1000 reviews)
df = cur.execute('''
                 WITH cte AS (
                    -- split the categories column so each business has a row for each category it has
                    SELECT business_id, stars, review_count,
                    TRIM(A.value) AS category
                    FROM yelp_businesses,
                    LATERAL SPLIT_TO_TABLE(categories, ',') A
                )

                SELECT category, SUM(review_count) AS num_of_reviews, COUNT(DISTINCT business_id) AS num_of_businesses, ROUND(AVG(stars), 2) AS avg_rating
                FROM cte
                GROUP BY 1  
                HAVING SUM(review_count) >= 1000
                ORDER BY 4 DESC
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,CATEGORY,NUM_OF_REVIEWS,NUM_OF_BUSINESSES,AVG_STARS
0,Conveyor Belt Sushi,1128,1,5.0
1,Art Tours,1962,18,4.94
2,Bike tours,1714,32,4.88
3,Sugaring,1549,39,4.85
4,Cideries,1650,25,4.84
5,Beer Tours,2949,51,4.82
6,Reiki,3048,201,4.82
7,Barre Classes,2926,158,4.8
8,Axe Throwing,1856,53,4.79
9,Meditation Centers,1451,99,4.79


In [None]:
# Bottom 20 categories by average star rating (minimum 1000 reviews)
df = cur.execute('''
                 WITH cte AS (
                    -- split the categories column so each business has a row for each category it has
                    SELECT business_id, stars, review_count,
                    TRIM(A.value) AS category
                    FROM yelp_businesses,
                    LATERAL SPLIT_TO_TABLE(categories, ',') A
                )

                SELECT category, SUM(review_count) AS num_of_reviews, COUNT(DISTINCT business_id) AS num_of_businesses, ROUND(AVG(stars), 2) AS avg_rating
                FROM cte
                GROUP BY 1  
                HAVING SUM(review_count) >= 1000
                ORDER BY 4
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,CATEGORY,NUM_OF_REVIEWS,NUM_OF_BUSINESSES,AVG_STARS
0,Television Service Providers,4253,159,2.33
1,University Housing,2630,122,2.53
2,Internet Service Providers,7947,356,2.6
3,Property Management,12282,759,2.74
4,Post Offices,5284,327,2.79
5,Apartments,28510,1921,2.82
6,Truck Rental,3633,206,2.84
7,Fast Food,224529,6472,2.89
8,Airlines,6018,118,2.89
9,Utilities,2276,102,2.9


In [None]:
# Categories with high stars and low engagement
df = cur.execute('''
                 WITH cte AS (
                    -- split the categories column so each business has a row for each category it has
                    SELECT business_id, stars, review_count,
                    TRIM(A.value) AS category
                    FROM yelp_businesses,
                    LATERAL SPLIT_TO_TABLE(categories, ',') A
                )

                SELECT category, SUM(review_count) AS num_of_reviews, COUNT(DISTINCT business_id) AS num_of_businesses, ROUND(AVG(stars), 2) AS avg_rating
                FROM cte
                GROUP BY 1  
                ORDER BY 4 DESC, 2
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,CATEGORY,NUM_OF_REVIEWS,NUM_OF_BUSINESSES,AVG_STARS
0,Metal Detector Services,6,1,5.0
1,Patent Law,6,1,5.0
2,Karaoke Rental,7,1,5.0
3,Circus Schools,9,1,5.0
4,Bubble Soccer,13,1,5.0
5,Mohels,14,1,5.0
6,Art Consultants,15,2,5.0
7,Calligraphy,16,2,5.0
8,Water Suppliers,17,1,5.0
9,Silent Disco,37,2,5.0


## Reviews Analysis

#### Total Review Volume by Star Rating

In [45]:
# Count of reviews for each star value 1-5
df = cur.execute('''
                 SELECT '1 Star' AS review_rating, COUNT(review_id) AS num_reviews FROM yelp_reviews WHERE stars = 1
                 UNION
                 SELECT '2 Star' AS review_rating, COUNT(review_id) AS num_reviews FROM yelp_reviews WHERE stars = 2
                 UNION
                 SELECT '3 Star' AS review_rating, COUNT(review_id) AS num_reviews FROM yelp_reviews WHERE stars = 3
                 UNION
                 SELECT '4 Star' AS review_rating, COUNT(review_id) AS num_reviews FROM yelp_reviews WHERE stars = 4
                 UNION
                 SELECT '5 Star' AS review_rating, COUNT(review_id) AS num_reviews FROM yelp_reviews WHERE stars = 5
                 ORDER BY num_reviews DESC;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,REVIEW_RATING,NUM_REVIEWS
0,5 Star,3231627
1,4 Star,1452918
2,1 Star,1069561
3,3 Star,691934
4,2 Star,544240


#### Volume of Each Rating by Business

In [None]:
# Count of reviews with each star value by business
df = cur.execute('''
                 SELECT 
                    business_id, 
                    SUM(CASE WHEN stars=1 THEN 1 ELSE 0 END) AS one_star_reviews,
                    SUM(CASE WHEN stars=2 THEN 1 ELSE 0 END) AS two_star_reviews,
                    SUM(CASE WHEN stars=3 THEN 1 ELSE 0 END) AS three_star_reviews,
                    SUM(CASE WHEN stars=4 THEN 1 ELSE 0 END) AS four_star_reviews,
                    SUM(CASE WHEN stars=5 THEN 1 ELSE 0 END) AS five_star_reviews,
                    COUNT(*) AS total_reviews,
                    SUM(stars) AS total_stars
                 FROM yelp_reviews
                 GROUP BY 1;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,BUSINESS_ID,ONE_STAR_REVIEWS,TWO_STAR_REVIEWS,THREE_STAR_REVIEWS,FOUR_STAR_REVIEWS,FIVE_STAR_REVIEWS,TOTAL_REVIEWS,TOTAL_STARS
0,lCjNNCJKMEeAHjSUFqEH5A,0,0,0,3,6,9,42
1,nwN92Uje-xIKE5voPTTvBQ,18,28,36,102,425,609,2715
2,8OGLDw2Z0UK0aURh8eRVMQ,15,8,5,2,5,35,79
3,S8ZFYEgMejpChID8tzKo9A,79,115,236,658,1074,2162,9019
4,YRw-uBpdzRZngN6zzoC4WA,2,2,1,2,7,14,52
...,...,...,...,...,...,...,...,...
150341,I6p837glbhDog1XB4NSvKA,0,0,2,2,1,5,19
150342,5LIBxc9PNCGjtGk1I7HOpg,0,0,0,0,5,5,25
150343,615-GZzUg2Xpw-AkzOeXKA,3,0,0,0,2,5,13
150344,HEP7VxLY2v0mRDXPARrPyQ,1,0,0,0,4,5,21


In [None]:
# True star average for each business and percentage of five star and one star reviews to total reviews
df = cur.execute('''
                WITH cte AS (
                    SELECT 
                        business_id, 
                        SUM(CASE WHEN stars=1 THEN 1 ELSE 0 END) AS one_star_reviews,
                        SUM(CASE WHEN stars=2 THEN 1 ELSE 0 END) AS two_star_reviews,
                        SUM(CASE WHEN stars=3 THEN 1 ELSE 0 END) AS three_star_reviews,
                        SUM(CASE WHEN stars=4 THEN 1 ELSE 0 END) AS four_star_reviews,
                        SUM(CASE WHEN stars=5 THEN 1 ELSE 0 END) AS five_star_reviews,
                        COUNT(*) AS total_reviews,
                        SUM(stars) AS total_stars,
                        ROUND(AVG(stars), 2) AS true_avg_rating
                    FROM yelp_reviews
                    GROUP BY 1
                 )

                SELECT 
                    yb.name, yb.city, yb.state, total_reviews, total_stars,
                    true_avg_rating,
                    yb.stars AS rounded_avg_rating,
                    ROUND(five_star_reviews / total_reviews * 100, 2) AS five_star_pct,
                    ROUND(one_star_reviews / total_reviews * 100, 2) AS one_star_pct,
                FROM cte
                JOIN yelp_businesses yb
                    ON cte.business_id = yb.business_id
                LIMIT 20;
                ''').fetch_pandas_all()

display(df)

Unnamed: 0,NAME,CITY,STATE,TOTAL_REVIEWS,TOTAL_STARS,TRUE_AVG_STARS,ROUNDED_AVG_STARS,FIVE_STAR_PCT,ONE_STAR_PCT
0,The Gables Apartments,Greenwood,IN,12,16,1.33,1.5,0.0,83.33
1,Schiano's Pizza,Warminster,PA,29,83,2.86,3.0,31.03,41.38
2,Caliber Collision,Largo,FL,10,34,3.4,3.5,50.0,30.0
3,Gail Marcus,Philadelphia,PA,9,45,5.0,5.0,100.0,0.0
4,Employ Health,Nashville,TN,10,50,5.0,5.0,100.0,0.0
5,Value Vet,Nashville,TN,51,184,3.61,3.5,47.06,15.69
6,America's Mattress of Tucson,Tucson,AZ,7,35,5.0,5.0,100.0,0.0
7,Terry House Bed & Breakfast,New Castle,DE,5,22,4.4,4.5,40.0,0.0
8,Little Caesers Pizza,Souderton,PA,5,17,3.4,3.5,60.0,40.0
9,Piano Gastrolounge,Santa Barbara,CA,24,78,3.25,3.5,25.0,29.17


In [None]:
# Months with the highest total reviews and highest average star rating
df = cur.execute('''
                SELECT 
                    MONTH(review_date) AS month,
                    SUM(CASE WHEN stars=1 THEN 1 ELSE 0 END) AS one_star_reviews,
                    SUM(CASE WHEN stars=2 THEN 1 ELSE 0 END) AS two_star_reviews,
                    SUM(CASE WHEN stars=3 THEN 1 ELSE 0 END) AS three_star_reviews,
                    SUM(CASE WHEN stars=4 THEN 1 ELSE 0 END) AS four_star_reviews,
                    SUM(CASE WHEN stars=5 THEN 1 ELSE 0 END) AS five_star_reviews,
                    COUNT(*) AS total_reviews,
                    SUM(stars) AS total_stars,
                    ROUND(AVG(stars), 2) avg_rating
                FROM yelp_reviews
                GROUP BY 1
                ORDER BY 1;
                 ''').fetch_pandas_all()

display(df)

# Rename columns for readability in charts
df.rename(columns={'ONE_STAR_REVIEWS': '1 Star', 'TWO_STAR_REVIEWS': '2 Star', 'THREE_STAR_REVIEWS': '3 Star', 'FOUR_STAR_REVIEWS': '4 Star', 'FIVE_STAR_REVIEWS': '5 Star', 'TOTAL_REVIEWS': 'Total Reviews',
                   'AVG_RATING': 'Average Rating'}, inplace=True)

# Generate line chart for review count for each star value by month
fig = px.line(
    df,
    x='MONTH',
    y=[
        '1 Star',
        '2 Star',
        '3 Star',
        '4 Star',
        '5 Star'
    ],
    labels={
        'value': 'Review Count',
        'variable': 'Star Rating',
        'MONTH': 'Month'
    },
    title='Monthly Yelp Review Star Ratings Trends'
)
fig.show()

# Generate bar chart for monthly review volume by month
fig = px.bar(
    df, 
    x='MONTH', 
    y='Total Reviews', 
    color='Total Reviews', 
    labels={'MONTH': 'Month', 'Total Reviews': 'Review Count'}, 
    title='Monthly Yelp Review Volume Trends'
)
fig.show()

Unnamed: 0,MONTH,ONE_STAR_REVIEWS,TWO_STAR_REVIEWS,THREE_STAR_REVIEWS,FOUR_STAR_REVIEWS,FIVE_STAR_REVIEWS,TOTAL_REVIEWS,TOTAL_STARS,AVG_STARS
0,1,86852,46047,61474,131145,279014,604532,2283018,3.78
1,2,80051,42503,54620,116561,250390,544125,2047111,3.76
2,3,87976,46933,60527,127924,275195,598555,2251094,3.76
3,4,82676,43029,55083,116372,254311,551471,2071026,3.76
4,5,91994,46318,58076,120991,269196,586575,2188802,3.73
5,6,93282,46701,58791,123158,279805,601737,2254714,3.75
6,7,100135,51170,64669,135171,303482,654627,2454576,3.75
7,8,98608,49976,62289,129710,295801,636384,2383272,3.75
8,9,89705,44552,55136,114482,261499,565374,2109640,3.73
9,10,89243,44267,55856,115923,266520,571809,2141637,3.75


In [None]:
# Total reviews by each star amount and the average star review for each month in every year
df = cur.execute('''
                SELECT 
                    YEAR(review_date) AS year,
                    MONTH(review_date) AS month,
                    SUM(CASE WHEN stars=1 THEN 1 ELSE 0 END) AS one_star_reviews,
                    SUM(CASE WHEN stars=2 THEN 1 ELSE 0 END) AS two_star_reviews,
                    SUM(CASE WHEN stars=3 THEN 1 ELSE 0 END) AS three_star_reviews,
                    SUM(CASE WHEN stars=4 THEN 1 ELSE 0 END) AS four_star_reviews,
                    SUM(CASE WHEN stars=5 THEN 1 ELSE 0 END) AS five_star_reviews,
                    COUNT(*) AS total_reviews,
                    SUM(stars) AS total_stars,
                    ROUND(AVG(stars), 2) avg_rating
                FROM yelp_reviews
                GROUP BY 1, 2
                ORDER BY 1, 2;
                 ''').fetch_pandas_all()

display(df)

# Rename columns for readability in charts
df.rename(columns={'ONE_STAR_REVIEWS': '1 Star', 'TWO_STAR_REVIEWS': '2 Star', 'THREE_STAR_REVIEWS': '3 Star', 'FOUR_STAR_REVIEWS': '4 Star', 'FIVE_STAR_REVIEWS': '5 Star', 'TOTAL_REVIEWS': 'Total Reviews',
                   'AVG_RATING': 'Average Rating'}, inplace=True)

# Create a date column from year and month
df['date'] = df['MONTH'].astype(str).str.zfill(2) + '/' + df['YEAR'].astype(str)

# Melt into long format for counts of reviews with each star value
long_df = df.melt(
    id_vars=['date'],
    value_vars=['1 Star','2 Star','3 Star','4 Star','5 Star'],
    var_name='star',
    value_name='count'
)

# Plot time series of star value counts by year and month
fig = px.line(
    long_df,
    x='date',
    y='count',
    color='star',
    labels={
        'date':'Month-Year',
        'count':'Review Count',
        'star':'Star Rating'
    },
    title='Monthly Yelp Star Rating Counts by Year'
)
fig.show()

# Melt into long format for total review counts for each month
long_df = df.melt(
    id_vars=['date'],
    value_vars=['Total Reviews'],
    var_name='reviews',
    value_name='count'
)

# Plot time series of total reviews by year and month
fig = px.line(
    long_df, 
    x='date', 
    y='count', 
    labels={
        'date': 'Month-Year',
        'count': 'Review Count'
        }, 
    title='Monthly Yelp Review Volume by Year'
)
fig.show()

# Melt into long format for average star rating for each month
long_df = df.melt(
    id_vars=['date'],
    value_vars=['Average Rating'],
    var_name='avg_rating',
    value_name='count'
)

# Plot time series of average star rating by year and month
fig = px.line(
    long_df, 
    x='date', 
    y='count', 
    labels={
        'date': 'Month-Year',
        'count': 'Average Star Rating'
        }, 
    title='Monthly Average Yelp Star Rating by Year'
)
fig.show()

Unnamed: 0,YEAR,MONTH,ONE_STAR_REVIEWS,TWO_STAR_REVIEWS,THREE_STAR_REVIEWS,FOUR_STAR_REVIEWS,FIVE_STAR_REVIEWS,TOTAL_REVIEWS,TOTAL_STARS,AVG_STARS
0,2005,2,0,0,0,2,1,3,13,4.33
1,2005,3,5,8,18,20,23,74,270,3.65
2,2005,4,1,0,4,10,11,26,108,4.15
3,2005,5,1,7,16,42,42,108,441,4.08
4,2005,6,2,2,4,15,15,38,153,4.03
...,...,...,...,...,...,...,...,...,...,...
199,2021,9,10470,3295,3348,5775,23509,46397,167749,3.62
200,2021,10,10539,3510,3567,6507,25181,49304,180193,3.65
201,2021,11,9426,3180,3193,6290,23135,45224,166200,3.68
202,2021,12,10175,3219,3601,6768,24716,48479,178068,3.67


In [None]:
# Number of businesses and avg star rating by city
df = cur.execute('''
                SELECT 
                    city,
                    COUNT(DISTINCT yb.business_id) num_businesses,
                    ROUND(AVG(yr.stars), 2) as avg_rating
                FROM yelp_reviews yr
                JOIN yelp_businesses yb
                    ON yr.business_id = yb.business_id
                GROUP BY city
                ORDER BY 2 DESC
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,CITY,NUM_BUSINESSES,AVG_STARS
0,Philadelphia,14569,3.79
1,Tucson,9250,3.7
2,Tampa,9050,3.75
3,Indianapolis,7540,3.82
4,Nashville,6971,3.82
5,New Orleans,6209,3.94
6,Reno,5935,3.75
7,Edmonton,5054,3.57
8,Saint Louis,4827,3.81
9,Santa Barbara,3829,4.0


In [None]:
# Number of businesses and avg star rating by state
df = cur.execute('''
                SELECT 
                    state,
                    COUNT(DISTINCT yb.business_id) num_businesses,
                    ROUND(AVG(yr.stars), 2) as avg_rating
                FROM yelp_reviews yr
                JOIN yelp_businesses yb
                    ON yr.business_id = yb.business_id
                GROUP BY state
                ORDER BY 2 DESC;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,STATE,NUM_BUSINESSES,AVG_STARS
0,PA,34039,3.7
1,FL,26330,3.76
2,TN,12056,3.76
3,IN,11247,3.79
4,MO,10913,3.73
5,LA,9924,3.88
6,AZ,9912,3.69
7,NJ,8536,3.56
8,NV,7715,3.73
9,AB,5573,3.57


In [None]:
# Reviews by quarter
df = cur.execute('''
                SELECT 
                    QUARTER(review_date) AS quarter,
                    SUM(CASE WHEN stars=1 THEN 1 ELSE 0 END) AS one_star_reviews,
                    SUM(CASE WHEN stars=2 THEN 1 ELSE 0 END) AS two_star_reviews,
                    SUM(CASE WHEN stars=3 THEN 1 ELSE 0 END) AS three_star_reviews,
                    SUM(CASE WHEN stars=4 THEN 1 ELSE 0 END) AS four_star_reviews,
                    SUM(CASE WHEN stars=5 THEN 1 ELSE 0 END) AS five_star_reviews,
                    COUNT(*) AS total_reviews,
                    SUM(stars) AS total_stars,
                    ROUND(AVG(stars), 2) avg_rating
                FROM yelp_reviews
                GROUP BY 1
                ORDER BY 1;
                 ''').fetch_pandas_all()

display(df)

# Rename columns for readability in charts
df.rename(columns={'ONE_STAR_REVIEWS': '1 Star', 'TWO_STAR_REVIEWS': '2 Star', 'THREE_STAR_REVIEWS': '3 Star', 'FOUR_STAR_REVIEWS': '4 Star', 'FIVE_STAR_REVIEWS': '5 Star', 'TOTAL_REVIEWS': 'Total Reviews',
                   'AVG_RATING': 'Average Rating'}, inplace=True)

# Generate line chart for review count for each star value by quarter
fig = px.line(
    df,
    x='QUARTER',
    y=[
        '1 Star',
        '2 Star',
        '3 Star',
        '4 Star',
        '5 Star'
    ],
    labels={
        'value': 'Review Count',
        'variable': 'Star Rating',
        'QUARTER': 'Quarter'
    },
    title='Quarterly Yelp Review Star Rating Trends'
)
fig.update_xaxes(dtick=1)
fig.show()

# Generate bar chart for monthly review volume by quarter
fig = px.bar(
    df,
    x='QUARTER', 
    y='Total Reviews', 
    color='Total Reviews', 
    labels={'QUARTER': 'Quarter', 'Total Reviews': 'Review Count'},
    title='Quarterly Yelp Review Volume Trends'
)
fig.update_xaxes(dtick=1)
fig.show()

Unnamed: 0,QUARTER,ONE_STAR_REVIEWS,TWO_STAR_REVIEWS,THREE_STAR_REVIEWS,FOUR_STAR_REVIEWS,FIVE_STAR_REVIEWS,TOTAL_REVIEWS,TOTAL_STARS,AVG_STARS
0,1,254879,135483,176621,375630,804599,1747212,6581223,3.77
1,2,267952,136048,171950,360521,803312,1739783,6514542,3.74
2,3,288448,145698,182094,379363,860782,1856385,6947488,3.74
3,4,258282,127011,161269,337404,762934,1646900,6160397,3.74


In [None]:
# Reviews by year and quarter
df = cur.execute('''
                SELECT 
                    YEAR(review_date) AS year,
                    QUARTER(review_date) AS quarter,
                    SUM(CASE WHEN stars=1 THEN 1 ELSE 0 END) AS one_star_reviews,
                    SUM(CASE WHEN stars=2 THEN 1 ELSE 0 END) AS two_star_reviews,
                    SUM(CASE WHEN stars=3 THEN 1 ELSE 0 END) AS three_star_reviews,
                    SUM(CASE WHEN stars=4 THEN 1 ELSE 0 END) AS four_star_reviews,
                    SUM(CASE WHEN stars=5 THEN 1 ELSE 0 END) AS five_star_reviews,
                    COUNT(*) AS total_reviews,
                    SUM(stars) AS total_stars,
                    ROUND(AVG(stars), 2) avg_rating
                FROM yelp_reviews
                GROUP BY 1, 2
                ORDER BY 1, 2;
                 ''').fetch_pandas_all()

display(df)

# Rename columns for readability in charts
df.rename(columns={'ONE_STAR_REVIEWS': '1 Star', 'TWO_STAR_REVIEWS': '2 Star', 'THREE_STAR_REVIEWS': '3 Star', 'FOUR_STAR_REVIEWS': '4 Star', 'FIVE_STAR_REVIEWS': '5 Star', 'TOTAL_REVIEWS': 'Total Reviews',
                   'AVG_RATING': 'Average Rating'}, inplace=True)

# Create a date column from year and quarter
df['date'] = 'Q' + df['QUARTER'].astype(str) + '-' + df['YEAR'].astype(str)

# Melt into long format for counts of reviews with each star value
long_df = df.melt(
    id_vars=['date'],
    value_vars=['1 Star','2 Star','3 Star','4 Star','5 Star'],
    var_name='star',
    value_name='count'
)

# Plot time series of star value counts by year and quarter
fig = px.line(
    long_df,
    x='date',
    y='count',
    color='star',
    labels={
        'date':'Quarter-Year',
        'count':'Review Count',
        'star':'Star Rating'
    },
    title='Quarterly Yelp Star Rating Counts by Year'
)
fig.show()

# Melt into long format for total review counts for each quarter
long_df = df.melt(
    id_vars=['date'],
    value_vars=['Total Reviews'],
    var_name='reviews',
    value_name='count'
)

# Plot time series of total reviews by year and quarter
fig = px.line(
    long_df, 
    x='date', 
    y='count', 
    labels={
        'date': 'Quarter-Year',
        'count': 'Review Count'
        }, 
    title='Quarterly Yelp Review Volume by Year'
)
fig.show()

# Melt into long format for average star rating for each quarter
long_df = df.melt(
    id_vars=['date'],
    value_vars=['Average Rating'],
    var_name='avg_stars',
    value_name='count'
)

# Plot time series of average star rating by year and quarter
fig = px.line(
    long_df, 
    x='date', 
    y='count', 
    labels={
        'date': 'Quarter-Year',
        'count': 'Average Star Rating'
        }, 
    title='Quarterly Average Yelp Star Rating by Year'
)
fig.show()

Unnamed: 0,YEAR,QUARTER,ONE_STAR_REVIEWS,TWO_STAR_REVIEWS,THREE_STAR_REVIEWS,FOUR_STAR_REVIEWS,FIVE_STAR_REVIEWS,TOTAL_REVIEWS,TOTAL_STARS,AVG_STARS
0,2005,1,5,8,18,22,24,77,283,3.68
1,2005,2,4,9,24,67,68,172,702,4.08
2,2005,3,15,26,97,144,112,394,1494,3.79
3,2005,4,6,11,32,82,80,211,852,4.04
4,2006,1,23,27,88,217,242,597,2419,4.05
...,...,...,...,...,...,...,...,...,...,...
64,2021,1,26701,9094,9298,17966,79443,142502,541862,3.80
65,2021,2,34641,11786,12022,21433,89507,169389,627546,3.70
66,2021,3,34968,11705,12006,21099,83513,163291,596357,3.65
67,2021,4,30140,9909,10361,19565,73032,143007,524461,3.67


In [None]:
# Most recent reviews for each business
df = cur.execute('''
                WITH cte AS (
                    SELECT 
                        business_id,
                        review_date,
                        user_review,
                        stars,
                        ROW_NUMBER() OVER(PARTITION BY business_id ORDER BY review_date DESC) AS rn
                    FROM yelp_reviews
                )
                SELECT 
                    yb.name,
                    yb.city,
                    yb.state,
                    yb.postal_code,
                    review_date,
                    cte.stars,
                    user_review
                FROM cte
                JOIN yelp_businesses yb
                    ON cte.business_id = yb.business_id
                WHERE rn=1;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,NAME,CITY,STATE,POSTAL_CODE,REVIEW_DATE,STARS,USER_REVIEW
0,The Gables Apartments,Greenwood,IN,46143,2021-11-28,2,"If you think you might be denied, don't apply ..."
1,Schiano's Pizza,Warminster,PA,18974,2021-12-20,1,I love this place but tonight got rudely treat...
2,Caliber Collision,Largo,FL,33771,2022-01-15,1,At first caliber collision was on it! Sending ...
3,Gail Marcus,Philadelphia,PA,19102,2021-05-10,5,Gail was great to speak with. She allowed me t...
4,Employ Health,Nashville,TN,37203,2016-12-16,5,I was on vacation and needed a doctor at 9am o...
...,...,...,...,...,...,...,...
150341,MIZU Sushi Bar,Philadelphia,PA,19103,2017-11-02,2,How someone can screw up something as simple a...
150342,Beniki Nail Spa,Mount Juliet,TN,37122,2021-10-01,4,Do you know what heartbreak is? I'll tell you ...
150343,Arby's,Nashville,TN,37115,2019-12-25,5,Nothing more then a regular Arby's but clean a...
150344,Jet's Pizza,Madison,TN,37115,2020-01-30,1,I ordered two pizzas with extra cheese by web ...


In [None]:
# Most reviewed businesses
df = cur.execute('''
                SELECT yb.name, yb.city, yb.state, COUNT(yr.review_id) AS review_count, ROUND(AVG(yr.stars), 2) AS avg_rating
                FROM yelp_reviews yr
                JOIN yelp_businesses yb
                ON yr.business_id = yb.business_id
                GROUP BY yb.name, yb.city, yb.state
                ORDER BY COUNT(yr.review_id) DESC
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,NAME,CITY,STATE,REVIEW_COUNT,AVG_STARS
0,Acme Oyster House,New Orleans,LA,7673,4.12
1,Oceana Grill,New Orleans,LA,7516,4.15
2,Hattie B’s Hot Chicken - Nashville,Nashville,TN,6160,4.45
3,Reading Terminal Market,Philadelphia,PA,5778,4.61
4,Ruby Slipper Cafe,New Orleans,LA,5523,4.21
5,Ruby Slipper - New Orleans,New Orleans,LA,5264,4.29
6,Mother's Restaurant,New Orleans,LA,5254,3.44
7,Royal House,New Orleans,LA,5146,3.79
8,Commander's Palace,New Orleans,LA,4969,4.29
9,Los Agaves,Santa Barbara,CA,4718,4.44


## Users

In [21]:
# Get distinct number of users
df = cur.execute('''
                SELECT COUNT(*) AS registered_users
                FROM yelp_users;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,REGISTERED_USERS
0,1987897


In [None]:
# Get number of users with at least 1 review
df = cur.execute('''
                SELECT COUNT(user_id) AS reviewers
                FROM yelp_users
                WHERE review_count >= 1;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,REVIEWERS
0,1987843


In [34]:
# Get number of users with more reviews than average
df = cur.execute('''
                SELECT COUNT(user_id) as highly_active_users
                FROM yelp_users
                WHERE review_count >= (
                SELECT ROUND(AVG(review_count), 2) AS avg_reviews
                FROM yelp_users
                );
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,HIGHLY_ACTIVE_USERS
0,374650


In [None]:
# Get number of users with at least one tip
df = cur.execute('''
                SELECT COUNT(user_id) AS tippers
                FROM yelp_tips;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,TIPPERS
0,301758


In [None]:
# Get number of users with more fans than average
df = cur.execute('''
                SELECT COUNT(user_id) as influential_users
                FROM yelp_users
                WHERE fans >= (
                SELECT ROUND(AVG(fans), 2) AS avg_fans
                FROM yelp_users
                );
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,INFLUENTIAL_USERS
0,210208


In [30]:
# Get number of users with at least one year of elite status
df = cur.execute('''
                SELECT COUNT(user_id) AS elite_users
                FROM yelp_users
                WHERE elite != '';
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,ELITE_USERS
0,91198


In [41]:
# Get number of users with an account created in the the latest year of the database
df = cur.execute('''
                SELECT COUNT(user_id) AS new_users
                FROM yelp_users
                WHERE EXTRACT(year from yelping_since) >= 2021;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,NEW_USERS
0,43267


In [98]:
# Create user engagement funnel
df = cur.execute('''
                SELECT 'Registered Users' AS User_Type, COUNT(*) AS Number_Of_Users, 100 AS Pct_Of_Users
                FROM yelp_users
                UNION
                SELECT 'Reviewers', COUNT(user_id), ROUND(COUNT(user_id) / (SELECT COUNT(*) FROM yelp_users) * 100, 3)
                FROM yelp_users
                WHERE review_count >= 1
                UNION
                SELECT 'Highly Active Reviewers', COUNT(user_id), ROUND(COUNT(user_id) / (SELECT COUNT(*) FROM yelp_users) * 100, 3)
                FROM yelp_users
                WHERE review_count >= (
                SELECT ROUND(AVG(review_count), 2) AS avg_reviews
                FROM yelp_users
                )
                UNION
                SELECT 'Tippers', COUNT(user_id), ROUND(COUNT(user_id) / (SELECT COUNT(*) FROM yelp_users) * 100, 3)
                FROM yelp_tips
                UNION
                SELECT 'Influential Users', COUNT(user_id), ROUND(COUNT(user_id) / (SELECT COUNT(*) FROM yelp_users) * 100, 3)
                FROM yelp_users
                WHERE fans >= (
                SELECT ROUND(AVG(fans), 2) AS avg_fans
                FROM yelp_users
                )
                UNION
                SELECT 'Elite Users', COUNT(user_id), ROUND(COUNT(user_id) / (SELECT COUNT(*) FROM yelp_users) * 100, 3)
                FROM yelp_users
                WHERE elite != ''
                UNION
                SELECT 'New Users', COUNT(user_id), ROUND(COUNT(user_id) / (SELECT COUNT(*) FROM yelp_users) * 100, 3)
                FROM yelp_users
                WHERE EXTRACT(year from yelping_since) >= 2021
                ORDER BY 2 DESC
                ;
                 ''').fetch_pandas_all()

display(df)

fig = go.Figure(go.Funnel(
    y=df['USER_TYPE'].values,
    x=df['NUMBER_OF_USERS'].values,
    textinfo='value+percent initial',
    marker= {
        'color': ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b', '#e377c2']
        },
    textfont={'color': 'black', 'size': 14},
    textposition='outside'
    )
)
fig.update_layout(title_text='User Engagement Funnel')
fig.show()


Unnamed: 0,USER_TYPE,NUMBER_OF_USERS,PCT_OF_USERS
0,Registered Users,1987897,100.0
1,Reviewers,1987843,99.997
2,Tippers,908915,45.722
3,Highly Active Reviewers,374650,18.847
4,Influential Users,210208,10.574
5,Elite Users,91198,4.588
6,New Users,43267,2.177


In [None]:
# Highest rated businesses for the selected state by average stars and total stars
df = cur.execute('''
                WITH ranked AS (
                SELECT
                    yb.name,
                    yb.state,
                    ROUND(AVG(yr.stars),2) AS avg_rating,
                    COUNT(yr.review_id) AS total_reviews
                FROM yelp_businesses yb
                JOIN yelp_reviews yr
                    ON yb.business_id = yr.business_id
                WHERE yb.state = 'IL'
                GROUP BY 1,2
                )
                SELECT
                    name,
                    state,
                    avg_rating,
                    total_reviews,
                    avg_rating * total_reviews AS total_stars
                FROM ranked
                ORDER BY avg_rating DESC, total_reviews DESC
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,NAME,STATE,AVG_STARS,TOTAL_REVIEWS,TOTAL_STARS
0,Grit & Tonic Handcrafted,IL,5.0,22,110.0
1,CP Pinball,IL,5.0,18,90.0
2,Walton's Ice Cream and More,IL,5.0,13,65.0
3,Mike's iPhone Repair Service,IL,5.0,12,60.0
4,"Michael Murphy, OD",IL,5.0,12,60.0
5,CBC Home Inspections,IL,5.0,12,60.0
6,Eaker's Family Barber Shop,IL,5.0,11,55.0
7,Post Pack & Ship,IL,5.0,11,55.0
8,Vintage Wine Bar,IL,5.0,10,50.0
9,Jacks 66 Auto Repair,IL,5.0,10,50.0


In [None]:
# Highest rated businesses for the selected city by average stars and total stars
df = cur.execute('''
                WITH ranked AS (
                SELECT
                    yb.name,
                    yb.city,
                    yb.state,
                    ROUND(AVG(yr.stars),2) AS avg_rating,
                    COUNT(yr.review_id) AS total_reviews
                FROM yelp_businesses yb
                JOIN yelp_reviews yr
                    ON yb.business_id = yr.business_id
                WHERE yb.city = 'New Orleans'
                GROUP BY 1, 2, 3
                )
                SELECT
                    name,
                    city,
                    state,
                    avg_rating,
                    total_reviews,
                    avg_rating * total_reviews AS total_stars
                FROM ranked
                ORDER BY avg_rating DESC, total_reviews DESC
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,NAME,CITY,STATE,AVG_STARS,TOTAL_REVIEWS,TOTAL_STARS
0,Drink & Learn,New Orleans,LA,5.0,90,450.0
1,New Orleans Streetwalkers Tours,New Orleans,LA,5.0,52,260.0
2,Hidden History Tours,New Orleans,LA,5.0,42,210.0
3,Casa Pelican B&B and Cooking School,New Orleans,LA,5.0,42,210.0
4,Slate Detail,New Orleans,LA,5.0,41,205.0
5,RachelLauren Massage,New Orleans,LA,5.0,36,180.0
6,Terrell House,New Orleans,LA,5.0,35,175.0
7,New Orleans Architecture Tours,New Orleans,LA,5.0,34,170.0
8,WobbeMassage,New Orleans,LA,5.0,31,155.0
9,Eversaint Salon,New Orleans,LA,5.0,30,150.0


In [None]:
# Average rating for closed vs currently operating businesses
df = cur.execute('''
                SELECT 
                    'Open' AS business_status, 
                    ROUND(AVG(yr.stars), 2) as avg_rating
                FROM yelp_businesses yb
                JOIN yelp_reviews yr
                    ON yb.business_id = yr.business_id
                WHERE is_open = 1
                UNION
                SELECT 
                    'Closed',
                    ROUND(AVG(yr.stars), 2) as avg_rating
                FROM yelp_businesses yb
                JOIN yelp_reviews yr
                    ON yb.business_id = yr.business_id
                WHERE is_open = 0;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,BUSINESS_STATUS,AVG_STARS
0,Open,3.77
1,Closed,3.62


In [None]:
# Average rating for closed vs currently operating businesses by category
df = cur.execute('''
                WITH status AS (
                    -- split the categories column so each business has a row for each category it has
                    SELECT 
                        business_id,
                        TRIM(A.value) AS category,
                        CASE
                            WHEN is_open = 1 THEN 'open'
                            ELSE 'closed'
                        END AS business_status
                    FROM yelp_businesses,
                    LATERAL SPLIT_TO_TABLE(categories, ',') A
                )
                SELECT 
                    category,
                    ROUND(AVG(CASE WHEN business_status = 'open' THEN yr.stars END), 2) as avg_open_rating,
                    ROUND(AVG(CASE WHEN business_status = 'closed' THEN yr.stars END), 2) as avg_closed_rating
                FROM status
                JOIN yelp_reviews yr
                    ON status.business_id = yr.business_id
                GROUP BY category;
                 ''').fetch_pandas_all()

display(df.head(10))

print(df.info())

open_higher = df[df['AVG_OPEN_RATING'] > df['AVG_CLOSED_RATING']].shape[0]
closed_higher = df[df['AVG_OPEN_RATING'] <= df['AVG_CLOSED_RATING']].shape[0]

print(f'Categories with avg_open_RATING greater than avg_closed_rating: {open_higher}\nCategories with avg_open_rating less than or equal to avg_closed_rating: {closed_higher}')

print(f'''
      Out of the 1311 categories with at least one review for both closed and open businesses, there are {open_higher} categories where the average
      star rating for open businesses is greater than closed businesses, and {closed_higher} categories where the average star rating for open 
      businesses is less than or equal to closed businesses.
      ''')

Unnamed: 0,CATEGORY,AVG_OPEN_STARS,AVG_CLOSED_STARS
0,Restaurants,3.83,3.64
1,Venezuelan,4.38,4.12
2,Day Spas,3.87,3.61
3,American (Traditional),3.71,3.49
4,Medical Spas,3.99,3.68
5,Windows Installation,3.36,3.26
6,Cosmetic Dentists,3.75,3.08
7,Specialty Food,4.05,3.9
8,Gluten-Free,4.01,3.82
9,Transportation,3.18,2.87


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   CATEGORY          1311 non-null   object
 1   AVG_OPEN_STARS    1302 non-null   object
 2   AVG_CLOSED_STARS  1048 non-null   object
dtypes: object(3)
memory usage: 30.9+ KB
None
Categories with avg_open_stars greater than avg_closed_stars: 715
Categories with avg_open_stars less than or equal to avg_closed_stars: 324

      Out of the 1311 categories with at least one review for both closed and open businesses, there are 715 categories where the average
      star rating for open businesses is greater than closed businesses, and 324 categories where the average star rating for open 
      businesses is less than or equal to closed businesses.
      


In [None]:
# Businesses with lowest average stars (min 100 reviews)
df = cur.execute('''
                SELECT 
                    name,
                    city,
                    state,
                    COUNT(yr.review_id) AS review_count,
                    ROUND(AVG(yr.stars), 2) AS avg_rating 
                FROM yelp_businesses yb
                JOIN yelp_reviews yr
                    ON yb.business_id = yr.business_id
                GROUP BY name, city, state
                HAVING COUNT(yr.review_id) >= 100
                ORDER BY AVG(yr.stars), COUNT(yr.review_id) DESC
                LIMIT 20;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,NAME,CITY,STATE,REVIEW_COUNT,AVG_STARS
0,First Advantage Corporation,Saint Petersburg,FL,103,1.0
1,717 Parking Enterprises,Tampa,FL,106,1.04
2,International Medical Group,Indianapolis,IN,168,1.04
3,PHH Mortgage,Mount Laurel,NJ,167,1.04
4,Sensible Home Warranty,Sparks,NV,175,1.05
5,Express Scripts,Maryland Heights,MO,318,1.06
6,Sears Home Services,Fenton,MO,587,1.08
7,Defender Security Company,Indianapolis,IN,428,1.08
8,Penrose Hotel,Philadelphia,PA,152,1.09
9,EXPRESS SCRIPTS,St. Louis,MO,224,1.09


In [45]:
# Comparing elite and non-elite user reviews
df = cur.execute('''
                WITH review_stats AS (
                    SELECT 
                        IFF(LENGTH(elite) > 0, 'Elite', 'Regular') AS user_status,
                        yr.stars,
                        yr.useful_votes,
                        yr.funny_votes,
                        yr.cool_votes,
                        yr.user_review
                    FROM yelp_users yu
                    JOIN yelp_reviews yr
                        ON yu.user_id = yr.user_id
                )
                SELECT 
                    user_status,
                    ROUND(AVG(stars), 2) AS avg_rating,
                    ROUND(AVG(useful_votes), 2) AS avg_useful_votes,
                    ROUND(AVG(funny_votes), 2) AS avg_funny_votes,
                    ROUND(AVG(cool_votes), 2) AS avg_cool_votes,
                    ROUND(AVG(LENGTH(user_review)), 2) AS avg_review_length,
                    COUNT(*) AS total_reviews
                FROM review_stats
                GROUP BY user_status;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,USER_STATUS,AVG_RATING,AVG_USEFUL_VOTES,AVG_FUNNY_VOTES,AVG_COOL_VOTES,AVG_REVIEW_LENGTH,TOTAL_REVIEWS
0,Regular,3.68,0.82,0.18,0.21,498.78,5264589
1,Elite,3.97,2.3,0.76,1.38,778.23,1725658


In [44]:
# Comparing engagement on reviews of each star value
df = cur.execute('''
                WITH review_stats AS (
                    SELECT 
                        CASE
                            WHEN stars = 1 THEN '1 Star'
                            WHEN stars = 2 THEN '2 Star'
                            WHEN stars = 3 THEN '3 Star'
                            WHEN stars = 4 THEN '4 Star'
                            ELSE '5 Star'
                        END AS rating,
                        yr.stars,
                        yr.useful_votes,
                        yr.funny_votes,
                        yr.cool_votes,
                        yr.user_review
                    FROM yelp_reviews yr
                )
                SELECT 
                    rating,
                    ROUND(AVG(useful_votes), 2) AS avg_useful_votes,
                    ROUND(AVG(funny_votes), 2) AS avg_funny_votes,
                    ROUND(AVG(cool_votes), 2) AS avg_cool_votes,
                    COUNT(*) AS total_reviews,
                    ROUND(AVG(LENGTH(user_review)), 2) AS avg_review_length
                FROM review_stats
                GROUP BY rating
                ORDER BY rating
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,RATING,AVG_USEFUL_VOTES,AVG_FUNNY_VOTES,AVG_COOL_VOTES,TOTAL_REVIEWS,AVG_REVIEW_LENGTH
0,1 Star,1.67,0.42,0.15,1069561,713.78
1,2 Star,1.35,0.43,0.26,544240,721.08
2,3 Star,1.18,0.4,0.47,691934,669.2
3,4 Star,1.23,0.38,0.74,1452918,587.5
4,5 Star,0.97,0.24,0.55,3231627,463.02


In [None]:
# Exploring stats for elite users
df = cur.execute('''
                WITH review_stats AS (
                    SELECT 
                        yu.user_id,
                        LENGTH(elite) AS elite_size,
                        yr.stars,
                        yr.useful_votes,
                        yr.funny_votes,
                        yr.cool_votes,
                        yr.user_review,
                        yu.fans,
                        yu.yelping_since
                    FROM yelp_users yu
                    JOIN yelp_reviews yr
                        ON yu.user_id = yr.user_id
                    WHERE elite != ''
                )
                SELECT 
                    user_id,
                    elite_size,
                    fans,
                    ROUND((CURRENT_DATE - yelping_since) / 365, 2) AS account_age,
                    ROUND(AVG(stars), 2) AS avg_rating,
                    ROUND(AVG(useful_votes), 2) AS avg_useful_votes,
                    ROUND(AVG(funny_votes), 2) AS avg_funny_votes,
                    ROUND(AVG(cool_votes), 2) AS avg_cool_votes,
                    ROUND(AVG(LENGTH(user_review)), 2) AS avg_review_length,
                    COUNT(*) AS total_reviews
                FROM review_stats
                GROUP BY user_id, elite_size, fans, yelping_since
                LIMIT 25;
                 ''').fetch_pandas_all()

display(df)

Unnamed: 0,USER_ID,ELITE_SIZE,FANS,ACCOUNT_AGE,AVG_RATING,AVG_USEFUL_VOTES,AVG_FUNNY_VOTES,AVG_COOL_VOTES,AVG_REVIEW_LENGTH,TOTAL_REVIEWS
0,b47MFJu3LYjv6xmCRv0dUA,25,17,9.91,3.83,1.38,0.62,0.76,677.72,95
1,3t_Jf5R5uRllg9ERHnsgRQ,19,18,11.45,3.71,1.37,0.28,0.49,1249.46,126
2,skdAsWlekOcaA8WvAyYztQ,4,2,10.76,3.67,0.79,0.5,0.56,388.54,48
3,1U7QQcz5gv4Wu33Og_pKHw,29,17,14.93,3.88,0.79,0.13,0.51,230.7,76
4,MOLgSNgnkYktFbXrwchbdg,40,34,12.53,4.16,1.39,0.28,0.71,714.17,189
5,HQTuvMrM4_MIWU1PSoKPDA,44,24,16.72,3.94,0.75,0.06,0.19,663.31,16
6,fDOmWO4KP03mmtBu1yby7g,4,8,12.92,3.49,2.03,0.69,0.81,614.53,93
7,dEvFqqQImRAaE1dT5wYMJg,14,23,12.06,3.76,1.15,0.3,0.37,665.13,176
8,dcFF9akYubn_UIwsXX_JTw,65,85,18.01,2.72,3.72,1.72,1.61,602.39,18
9,33C5Hrjuya2qUd8rgZSiFw,34,18,17.96,3.8,1.07,0.4,0.2,856.73,15
