In [5]:
from pyhive import hive
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import seaborn as sns

In [None]:
# Join `review` to `business_basicdata` on business_id and pull every row into memory.
query = 'select r.business_id,r.`date`,r.stars,b.simplified_category from review r join business_basicdata b on r.business_id = b.business_id'

# Open the Hive connection used for the query
conn = hive.Connection(
    host = 'hadoop.rcc.uchicago.edu',
    port = 10000,
    username = 'mtoolsidas',
    database = 'dmp_yelp_rs',
)

# The cursor is kept as a global: its `description` is read when the DataFrame is built
cursor = conn.cursor()
cursor.execute(query)
res = cursor.fetchall()


In [None]:
# Create df from Hive result.
# Result columns are expected to arrive qualified with the table alias (e.g. 'r.stars')
# -- presumably a Hive server setting; confirm. Keeping only the text after the last '.'
# yields the bare column name whether or not that prefix is present, whereas slicing off
# a fixed two characters would corrupt names that come back unqualified.
df = pd.DataFrame(res, columns = [col[0].rsplit('.', 1)[-1] for col in cursor.description])

In [None]:
# Parse the `date` strings into datetimes, then derive calendar fields from them

df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d %H:%M:%S')

# One datetime accessor, reused for every derived column below
review_time = df['date'].dt

# Plain calendar components
df['just_date'] = review_time.date
df['year'] = review_time.year
df['month'] = review_time.month
df['day'] = review_time.day
df['hour'] = review_time.hour
df['day_name'] = review_time.day_name()

# Period labels stored as strings (monthly and weekly buckets)
df['month_year'] = review_time.to_period('M').astype(str)
df['week'] = review_time.to_period('W').astype(str)

In [None]:
# Per (category, month_year) group: number of reviews and mean star rating

category_month = df.groupby(['simplified_category','month_year'])
group_by_cat = (
    category_month
    .agg({'business_id':'count', 'stars':'mean'})
    .rename(columns = {'business_id':'num_reviews'})
)

# Replace the month_year strings in index level 1 with parsed timestamps; level 0 is untouched
month_level = pd.to_datetime(group_by_cat.index.levels[1])
group_by_cat.index = group_by_cat.index.set_levels(month_level, level = 1)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

dfs = []

# Build one gap-free monthly series per category (index level 0).
# Categories are read from the index in order of first appearance instead of from a set():
# iteration order of a set of strings can differ between kernel runs, which would make the
# order of `dfs` (and of anything concatenated from it) non-reproducible.
for i in group_by_cat.index.get_level_values(0).unique():

    print('starting category {}...'.format(i))

    # select this category's rows; dropping level 0 leaves a single (month) index
    tmp = group_by_cat.xs(i, level = 0, drop_level = True)
    # changing index (month_year) to monthly period
    tmp.index = tmp.index.to_period('M')
    # filling in any missing months between the first and last index entries and
    # interpolating the values for review count and average stars
    # (uses index[0] / index[-1], so the month index is assumed to be in ascending order)
    tmp = tmp.reindex(pd.period_range(tmp.index[0],tmp.index[-1],freq='M')).interpolate('time')
    # turning index back to datetime
    tmp.index = tmp.index.to_timestamp()
    # the category was dropped from the index above, so carry it as a regular column
    tmp['simplified_category'] = i
    dfs.append(tmp)

    print('finished with category {}.'.format(i))
    print('--- --- --- ---')

In [None]:
# Stack the frames collected in `dfs` into one long DataFrame
final_df = pd.concat(objs = dfs)

In [None]:
# Function to determine whether pre-covid (1) or not (0)

# Parsed once here rather than on every call; the same instant that '2020-02' parses to.
PRE_COVID_CUTOFF = pd.Timestamp('2020-02-01')

def time_test(x, cutoff = PRE_COVID_CUTOFF):
    """Flag whether a timestamp falls before the pre-covid cutoff.

    Parameters
    ----------
    x : timestamp-like
        Value compared against ``cutoff`` with ``<``.
    cutoff : timestamp or date string, optional
        First instant that no longer counts as pre-covid. Defaults to
        2020-02-01; strings are parsed with ``pd.Timestamp``.

    Returns
    -------
    int
        1 if ``x`` is strictly earlier than ``cutoff``, otherwise 0.
    """
    return int(x < pd.Timestamp(cutoff))

In [None]:
# Apply function to date index.
# pd.to_datetime leaves a datetime index unchanged and parses the index back to timestamps
# if it has already been formatted as 'YYYY-MM' strings, so this cell can be re-run
# without time_test failing on a str-vs-Timestamp comparison.
final_df['is_pre_covid'] = pd.to_datetime(final_df.index).map(time_test)

In [None]:
# Convert date format to yyyy-mm.
# Guarded so that re-running the cell once the index is already text is a no-op
# rather than an AttributeError (a plain string Index has no .strftime).
if isinstance(final_df.index, pd.DatetimeIndex):
    final_df.index = final_df.index.strftime('%Y-%m')

In [None]:
# Give the two aggregate columns their output names
column_labels = {'num_reviews':'review_counts', 'stars':'avg_stars'}
final_df = final_df.rename(columns = column_labels)

In [None]:
# Name the index 'month_year'
final_df.index = final_df.index.set_names('month_year')

In [None]:
# Write the result to CSV, with the index stored as a 'month_year' column
output_path = 'time_series_cat_final2.csv'
final_df.to_csv(output_path, index = True, index_label = 'month_year')