In [None]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Overview

This notebook demonstrates the initial steps of understanding a time series dataset: exploring the data and visualizing it.

### Dataset

The [Iowa Liquor Sales](https://console.cloud.google.com/marketplace/details/iowa-department-of-commerce/iowa-liquor-sales) dataset from BigQuery Public Datasets is used in this example. The dataset contains wholesale liquor purchases in the state of Iowa from 2012 to the present.

### Objective

We will show how to use BigQuery to query data, and then use the `statsmodels` statistics package and the `seaborn` visualization package to explore the data.


## Install packages and dependencies

Restarting the kernel may be required to use new packages.

In [None]:
%pip install -U statsmodels --user

**Note:** To restart the Kernel, navigate to Kernel > Restart Kernel... on the Jupyter menu.

### Import libraries and define constants

In [None]:
from google.cloud import bigquery as bq
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from pandas.plotting import register_matplotlib_converters
from statsmodels.tsa.seasonal import seasonal_decompose

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Enter your project and region, then run this cell so that the BigQuery
# client created later in this notebook is bound to the right project.

PROJECT = "your-gcp-project-here"  # REPLACE WITH YOUR PROJECT NAME
REGION = "us-central1"  # REPLACE WITH YOUR BUCKET REGION e.g. us-central1

# Don't change the following check - it stops the notebook early if the
# placeholder project name above has not been replaced.
# (The message uses double quotes: 'Don''t' is two adjacent literals and
# would be concatenated into "Dont".)
assert PROJECT != "your-gcp-project-here", "Don't forget to change the project variables!"

In [None]:
# Column names used throughout the notebook:
# the value being predicted, the time series column, and the holiday flag.
target_col, ts_col, holiday_col = 'y', 'ds', 'holiday'

# CSV files written by the export cells of the monthly and daily sections.
monthly_file = 'iowa_monthly.csv'
daily_file = 'iowa_daily.csv'

## Explore monthly data

In [None]:
# Peek at a handful of raw rows before aggregating anything.

client = bq.Client(project=PROJECT)

sql = 'SELECT * FROM `bigquery-public-data.iowa_liquor_sales.sales` LIMIT 5'
df = (
    client
    .query(sql)
    .to_dataframe()
)

df.head()

In [None]:
# Aggregate all sales by month.
# Category names are very specific (e.g. "Straight Rye Whiskies"), so let's group them
# into broader buckets. The CASE branches are evaluated in order, so a name lands in
# the first bucket whose pattern it matches; anything unmatched becomes 'other'.
# date_trunc(date, month) truncates each date to its month, so all sales of the
# same month share a single `ds` value for grouping.

sql_monthly = """
select
  case 
    when lower(category_name) like '%vodka%' then 'vodka'
    when lower(category_name) like '%liqueur%' then 'liqueur'
    when lower(category_name) like '%bourbon%' then 'bourbon'
    when lower(category_name) like '%scotch%' then 'scotch'    
    when lower(category_name) like '%whisk%' then 'whisky'
    when lower(category_name) like '%rum%' then 'rum'
    when lower(category_name) like '%tequila%' then 'tequila'
    when lower(category_name) like '%brand%' then 'brandy'
    when lower(category_name) like '%schnapps%' then 'schnapps'
    when lower(category_name) like '%gin%' then 'gin'
    else 'other'
  end as category,
  sum(sale_dollars) as y,
  date_trunc(date, month) as ds  
from `bigquery-public-data.iowa_liquor_sales.sales`
GROUP by category, ds ORDER BY ds asc, category asc
"""

# Run the query and load the result (one row per category and month) into a DataFrame
df_monthly_by_category = client.query(sql_monthly).to_dataframe()

# Print the first few rows to see what is returned
df_monthly_by_category.head() 

In [None]:
# Let's look at the popularity by category: total sales per category, largest first.

# Sum only the target column. A bare groupby(...).sum() would also try to add up
# the `ds` date column, which newer pandas versions no longer silently drop.
df_category = (
    df_monthly_by_category
    .groupby('category')[[target_col]]
    .sum()
    .sort_values(by=target_col, ascending=False)
)

# Horizontal bars: categories on the y axis, summed sales on the x axis.
_ = sns.barplot(x=df_category[target_col], y=df_category.index)

#### TODO 1: Analyze the patterns

In [None]:
# Let's look at the trends for a few categories.

# TODO-1: What patterns do you notice? Are there different trajectories? Are there differences in seasonality?

register_matplotlib_converters()  # Addresses a warning

# Keep only the rows that belong to the categories we want to compare.
sample_categories = ['whisky', 'vodka', 'rum']
df_monthly_sample_categories = df_monthly_by_category.loc[
    df_monthly_by_category['category'].isin(sample_categories)
]

# Use a wide figure for this plot only; the rc settings are restored when the block exits.
with plt.rc_context({'figure.figsize': (20, 6)}):
    _ = sns.lineplot(
        data=df_monthly_sample_categories,
        x=ts_col,
        y=target_col,
        hue='category',
    )

In [None]:
# Let's now group the data into total sales by month (aggregating away the category level).
# Only the target column is summed: a bare groupby(...).sum() would also "sum" the
# string `category` column in newer pandas versions, leaving a non-numeric column
# in the frame that the plots and the decomposition further down cannot handle.
df_monthly = df_monthly_by_category.groupby(ts_col)[[target_col]].sum()

df_monthly.index = pd.DatetimeIndex(df_monthly.index)  # Set index explicitly to a datetime index for future graphing

df_monthly.head()

In [None]:
# Summary statistics of the monthly values, shown as whole numbers with thousands separators

whole_number = '{:,.0f}'.format
with pd.option_context('display.float_format', whole_number):
    print(df_monthly[target_col].describe())

In [None]:
# Line chart of the monthly series (the datetime index goes on the x axis)

_ = sns.lineplot(
    data=df_monthly,
)

In [None]:
# Box plot of the values grouped by calendar month (1-12):
# each box summarizes the distribution of the values observed in that month.

months = df_monthly.index.to_series().dt.month

_ = sns.boxplot(
    x=months,
    y=df_monthly[target_col],
)

In [None]:
# Decompose the data into trend and seasonal components.
# The target column is passed explicitly so the decomposition always receives a
# single numeric series, even if the frame carries other columns.
# period=12 treats twelve consecutive monthly observations as one seasonal cycle.

result = seasonal_decompose(df_monthly[target_col], period=12)
fig = result.plot()

In [None]:
# Save the monthly series as a csv file for the next labs of this quest.
# The datetime index is written out as a column labelled 'ds'.
# Open the csv file afterwards to see what the data looks like.

df_monthly.to_csv(
    monthly_file,
    index=True,
    index_label='ds',
)

## Explore Daily Data

In [None]:
# Aggregate all sales transactions by day:
# one row per date (`ds`) holding the sum of sale_dollars for that date (`y`),
# ordered chronologically. This cell only defines the query text.

sql_daily = """
SELECT SUM(sale_dollars) as y, date as ds FROM `bigquery-public-data.iowa_liquor_sales.sales`
group by ds
order by ds
"""

In [None]:
# Execute the daily aggregation and load the result into a DataFrame

daily_job = client.query(sql_daily)
df_daily = daily_job.to_dataframe()

df_daily.head()

# Are all days provided in the dataset?

In [None]:
# Fill in missing days in the index

# Move the date column into a DatetimeIndex. The guard keeps the cell safe to
# re-run: once the column has been moved there is nothing left to convert
# (the previous pop()-based version raised a KeyError on a second run).
if ts_col in df_daily.columns:
    df_daily = df_daily.set_index(ts_col)
    df_daily.index = pd.DatetimeIndex(df_daily.index)

# Build every calendar day between the first and the last date in the data, then
# reindex so that days absent from the query result appear as rows of NaN.
# The new index comes from date_range, so it already carries a daily frequency.
index_with_missing_vals = pd.date_range(
    start=df_daily.index.min(),
    end=df_daily.index.max(),
    freq='D',
)
df_daily = df_daily.reindex(index_with_missing_vals)

df_daily.head()

In [None]:
# Days that were added by the reindexing hold NaN; replace those NaNs with 0

df_daily = df_daily.fillna(value=0)
df_daily.head()

In [None]:
# Summary statistics of the daily values, shown as whole numbers with thousands separators

whole_number = '{:,.0f}'.format
with pd.option_context('display.float_format', whole_number):
    print(df_daily[target_col].describe())

In [None]:
# Let's plot the data. Note the outlier in 2013.

# sns.lineplot returns the matplotlib Axes it drew on, so no further .plot()
# call is needed (Axes.plot() without data draws nothing and returns an empty list).
_ = sns.lineplot(data=df_daily)

#### TODO 2: Create another line plot

In [None]:
# Let's look at a subset of the data, to see if there's any weekly pattern.

# TODO-2: Create another line plot using only the first 60 days of data
# Hint: to get the first n rows of data you can use df_daily[0:n]

In [None]:
# Plot the overall data distribution

# sns.histplot replaces the deprecated sns.distplot. kde=True overlays a kernel
# density estimate and stat='density' puts the histogram on the same scale as it.
# Only the target column is plotted, so other columns of the frame cannot end up
# in the histogram.
_ = sns.histplot(df_daily[target_col], kde=True, stat='density')

#### TODO 3: Create another box plot as you did with monthly data

In [None]:
# Show the distribution by day-of-week

# TODO-3: Create another box plot as you did with monthly data, but looking at the distribution by day of week.
# Hint: you can use the dt.dayofweek property (no parentheses) to get the day of week (0-6),
# e.g. df_daily.index.to_series().dt.dayofweek

In [None]:
# See any meaningful difference on holidays?

# Flag each day of the index as a US federal holiday (1) or not (0).
# The holiday calendar is queried for the span covered by the data.
cal = calendar()
holidays = cal.holidays(start=df_daily.index.min(), end=df_daily.index.max())
df_daily[holiday_col] = df_daily.index.isin(holidays).astype(int)

# sns.boxplot returns the Axes it drew on; no extra .plot() call is needed.
_ = sns.boxplot(x=holiday_col, y=target_col, data=df_daily)

In [None]:
# Weekly seasonality: decompose only the first 90 rows so the 7-day pattern is clearer

first_90_days = df_daily[target_col].iloc[:90]
result = seasonal_decompose(first_90_days, period=7)
fig = result.plot()

#### TODO 4: Try another seasonal decomposition

In [None]:
# There can be multiple layers of seasonality - now decomposing by year

# TODO-4: Try another seasonal decomposition, but this time apply it to all data (remove the [0:90] slice), and set the period to 365

In [None]:
# Save the daily series as a csv file for the next labs of this quest.
# The datetime index is written out as a column labelled 'ds'.
# Open the csv file afterwards to see what the data looks like.

holiday_as_float = df_daily[holiday_col].astype(float)  # Avoids warnings in future labs
df_daily[holiday_col] = holiday_as_float
df_daily.to_csv(
    daily_file,
    index=True,
    index_label='ds',
)

## Conclusion

You've successfully completed the exploration and visualization lab.
We've learned how to:
* Create a query that groups data into a time series
* Fill missing values
* Visualize data
* Decompose time series into trend and seasonal components