In [1]:
# Authenticate the current Colab user; the BigQuery clients created in later
# cells presumably rely on these credentials (confirm for non-Colab runtimes).
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [2]:
import pandas_gbq
import pandas as pd  # You'll likely still use pandas for DataFrame operations

In [3]:
from google.cloud import bigquery

# GCP project the BigQuery client is bound to.
project_id = 'lookerstudio-seminar'  # Replace with your project ID
# Client used by the direct SQL query cell below.
client = bigquery.Client(project=project_id)

In [4]:
# Count the rows of the public Iowa liquor sales table through the BigQuery client.
query = """
    SELECT COUNT(*) as total_rows
    FROM `bigquery-public-data.iowa_liquor_sales.sales`
"""
query_job = client.query(query)
results = query_job.result()  # row iterator consumed by the loop below
for row in results:
    print(f"Total rows: {row.total_rows}")

# Alternatively, the same count via the %%bigquery magic command in its own cell:
# %%bigquery --project your-gcp-project-id
# SELECT COUNT(*) as total_rows
# FROM `bigquery-public-data.iowa_liquor_sales.sales`

Total rows: 31339341


In [5]:
# Define (without running it here) the full-table query that later cells load
# into a DataFrame.
# NOTE: the LIMIT line inside the string starts with `--`, i.e. it is an SQL
# comment, so the query selects every row of the table.
query_table = """
    SELECT *
    FROM `bigquery-public-data.iowa_liquor_sales.sales`
    --LIMIT 2000000
"""

# Disabled alternative: load the query result through pandas_gbq instead.
# df = pandas_gbq.read_gbq(query_table, project_id)

In [6]:
from graphviz import Digraph

In [7]:
import bigframes.pandas as bf
# Reuse the project id configured for the BigQuery client above so both
# clients always target the same project (change it in one place only).
bf.options.bigquery.project = project_id

In [8]:
# Build the BigQuery DataFrames frame from the full-table query in `query_table`.
df_bf = bf.read_gbq(query_table)

In [9]:
# Preview the first five rows of the loaded table.
df_bf.head(5)

Unnamed: 0,invoice_and_item_number,date,store_number,store_name,address,city,zip_code,store_location,county_number,county,...,item_number,item_description,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons
0,S06647500053,2012-07-18,2595,HY-VEE WINE AND SPIRITS / DENISON,"1620 4TH AVE, SOUTH",DENISON,51442.0,POINT (-95.3486 42.0124),24,CRAWFORD,...,41279,BURNETT'S GRAPE VODKA,12,750,4.55,6.82,2,13.64,1.5,0.4
1,INV-04030300055,2017-03-28,2666,HY-VEE #2 / ANKENY,2510 SW STATE ST,ANKENY,50023.0,POINT (-93.62182 41.70519),77,POLK,...,33672,GREY GOOSE LE MELON,6,750,18.49,27.74,3,83.22,2.25,0.59
2,INV-42509200008,2021-12-01,2233,FORBES LIQUOR LOCKER / REMI,2508 ENTERPRISE AVE,SPIRIT LAKE,51360.0,POINT (-95.12679 43.41643),30,DICKINSON,...,52563,E & J XO,12,750,8.24,12.36,2,24.72,1.5,0.39
3,S12910100019,2013-06-18,4230,FAREWAY STORES #106 / CLIVE,10151 UNIVERSITY AVE,CLIVE,50325.0,POINT (-93.75649 41.60051),77,POLK,...,27474,BIRD DOG BLACKBERRY WHISKEY,6,750,8.0,13.0,4,52.0,3.0,0.79
4,INV-07420000004,2017-09-25,5128,PRICE CHOPPER / BEAVER,1819 BEAVER AVE,DES MOINES,50310.0,POINT (-93.6727 41.61052),77,POLK,...,64866,FIREBALL CINNAMON WHISKEY,12,750,9.0,13.5,12,161.64,9.0,2.38


In [10]:
# List the column labels available for the analysis below.
df_bf.columns

Index(['invoice_and_item_number', 'date', 'store_number', 'store_name',
       'address', 'city', 'zip_code', 'store_location', 'county_number',
       'county', 'category', 'category_name', 'vendor_number', 'vendor_name',
       'item_number', 'item_description', 'pack', 'bottle_volume_ml',
       'state_bottle_cost', 'state_bottle_retail', 'bottles_sold',
       'sale_dollars', 'volume_sold_liters', 'volume_sold_gallons'],
      dtype='object')

In [11]:
# Check the dtype of 'date' before the next cell uses its `.dt` accessor.
print(df_bf['date'].dtype)

date32[day][pyarrow]


In [12]:
# Attach to every row the total sales of its calendar month (month number
# 1-12, so the same month of different years is pooled together).
# Assumes df_bf has 'date' and 'sale_dollars' columns.

# Keep the cell re-runnable: merging a second time while the column already
# exists would yield suffixed `monthly_total_sales_x` / `monthly_total_sales_y`
# columns and break later lookups of `monthly_total_sales`.
if 'monthly_total_sales' in df_bf.columns:
    df_bf = df_bf.drop(columns=['monthly_total_sales'])

# 1. Extract the month from the 'date' column
df_bf['month'] = df_bf['date'].dt.month

# 2. Group by 'month' and sum 'sale_dollars' into one row per month
monthly_sales_bf = (
    df_bf.groupby('month')['sale_dollars']
    .sum()
    .reset_index()
    .rename(columns={'sale_dollars': 'monthly_total_sales'})
)

# 3. Merge the monthly totals back onto every row of df_bf
df_bf = bf.merge(df_bf, monthly_sales_bf, on='month', how='left')

In [13]:
import pandas as pd
from graphviz import Digraph, Graph


In [14]:

# Input data: the function below expects a DataFrame with the Iowa liquor sales columns.
# This notebook passes `df_bf`; to use an in-memory pandas frame instead, load one with e.g.:
# df = pandas_gbq.read_gbq(query_table, project_id)

# --------------------------------------
# 1. Sales Performance Flowchart
# --------------------------------------
def create_sales_performance_flow(df):
    """Build a Graphviz flowchart summarising overall sales performance.

    Args:
        df: DataFrame with ``sale_dollars``, ``item_description``, ``county``,
            ``month`` and ``monthly_total_sales`` columns. Any frame offering
            the pandas groupby API works; if its Series expose
            ``to_pandas()`` the monthly totals are materialised once before
            they are iterated.

    Returns:
        graphviz.Digraph: left-to-right chart with a total-sales node linked
        to the top product, the top county and a trends node, plus a dashed
        cluster holding one note per month. The graph is not rendered here.
    """
    flowchart = Digraph("Sales_Performance", format="png")
    flowchart.attr(rankdir="LR", bgcolor="white")

    # Calculate key metrics
    total_sales = df['sale_dollars'].sum()
    top_product = df.groupby('item_description')['sale_dollars'].sum().idxmax()
    top_county = df.groupby('county')['sale_dollars'].sum().idxmax()

    # Define nodes with dynamic metrics.
    # Graphviz only paints `fillcolor` when the node style includes "filled".
    flowchart.node("Sales", f"Total Sales\n${total_sales:,.0f}",
                   shape="box", style="filled", fillcolor="#e6f3ff")
    flowchart.node("Product", f"Top Product\n{top_product}",
                   shape="box", style="filled", fillcolor="#d4f7d4")
    flowchart.node("County", f"Top County\n{top_county}",
                   shape="box", style="filled", fillcolor="#d4f7d4")
    flowchart.node("Trends", "Monthly Trends",
                   shape="ellipse", style="filled", fillcolor="#ffe6e6")

    # Add edges
    flowchart.edge("Sales", "Product", label="Driven by")
    flowchart.edge("Sales", "County", label="Top Region")
    flowchart.edge("Sales", "Trends", label="Over Time")

    # Monthly totals, one note per month, grouped in a dashed cluster.
    with flowchart.subgraph(name="cluster_trends") as c:
        c.attr(label="Monthly Sales", style="dashed")

        # 'monthly_total_sales' is expected to be constant within a month
        # (it is merged in per month), so max() just collapses each group
        # to that single value.
        monthly_totals = df.groupby('month')['monthly_total_sales'].max()
        if hasattr(monthly_totals, "to_pandas"):
            # Fetch the small aggregated result once instead of iterating
            # a remote Series.
            monthly_totals = monthly_totals.to_pandas()

        # Distinct loop names: do not shadow `total_sales` computed above.
        for month, month_total in monthly_totals.items():
            c.node(f"Month_{month}", f"Month {month}\n${month_total:,.0f}", shape="note")

    return flowchart


# --------------------------------------
# Execute & Render
# --------------------------------------
sales_flow = create_sales_performance_flow(df_bf)

# render() writes the PNG and returns its path. No external viewer is
# requested (view=True targets a desktop application); the chart is shown
# inline by the last expression of the cell instead.
sales_flow_path = sales_flow.render("sales_performance_flow")
print(f"Graphs generated: {sales_flow_path}")

sales_flow

Graphs generated: sales_performance_flow.png


In [15]:
# Date range of the data. Use the Series aggregates rather than the built-in
# min()/max(), which iterate over every element of the column on the client.
df_bf['date'].min(), df_bf['date'].max()

(datetime.date(2012, 1, 3), datetime.date(2025, 3, 31))

In [16]:

# --------------------------------------
# 2. Geographic Sales Hierarchy
# --------------------------------------
def create_geo_hierarchy(df):
    """Build a Graphviz hierarchy: top counties -> top stores -> top product.

    Args:
        df: DataFrame with ``county``, ``store_number``, ``store_name``,
            ``item_description`` and ``sale_dollars`` columns. Works with a
            plain pandas frame as well as a frame whose Series expose
            ``to_pandas()``.

    Returns:
        graphviz.Graph: top-to-bottom undirected graph of the 5 counties with
        the highest total ``sale_dollars``, each linked to its 3
        highest-selling stores, each store linked to its best-selling
        product. The graph is not rendered here.
    """
    hierarchy = Graph("Sales_Hierarchy", format="png")
    hierarchy.attr(rankdir="TB", bgcolor="white")

    def top_keys(rows, key, count):
        """Return the `count` values of `key` with the largest summed sale_dollars."""
        ranked = rows.groupby(key)['sale_dollars'].sum().nlargest(count)
        # Materialise the small ranked result once; plain pandas Series have
        # no to_pandas(), so only call it when it exists.
        if hasattr(ranked, "to_pandas"):
            ranked = ranked.to_pandas()
        return ranked.index.tolist()

    # Aggregate data: County -> Store -> Top Product
    for county in top_keys(df, 'county', 5):
        # Node identifiers are passed to Graphviz as strings.
        county_id = str(county)
        # `fillcolor` is only painted when the style includes "filled".
        hierarchy.node(county_id, shape="folder", style="filled", fillcolor="#e6f3ff")

        # Top 3 stores of the county ranked by sales (not merely the first
        # three store numbers encountered).
        county_rows = df[df['county'] == county]
        for store in top_keys(county_rows, 'store_number', 3):
            store_id = str(store)
            # Filter once per store and reuse it for the name and the product.
            store_rows = df[df['store_number'] == store]

            store_name = store_rows['store_name'].iloc[0]
            hierarchy.node(store_id, f"Store: {store_name}",
                           shape="box", style="filled", fillcolor="#d4f7d4")
            hierarchy.edge(county_id, store_id)

            # Add top product per store
            top_product = store_rows.groupby('item_description')['sale_dollars'].sum().idxmax()
            product_id = f"{store_id}_product"
            hierarchy.node(product_id, f"Top Product\n{top_product}",
                           shape="ellipse", style="filled", fillcolor="#ffe6e6")
            hierarchy.edge(store_id, product_id)

    return hierarchy

geo_hierarchy = create_geo_hierarchy(df_bf)

# render() writes the PNG and returns its path. No external viewer is
# requested (view=True targets a desktop application); the chart is shown
# inline by the last expression of the cell instead.
geo_hierarchy_path = geo_hierarchy.render("geo_sales_hierarchy")
print(f"Graphs generated:  {geo_hierarchy_path}")

geo_hierarchy

Graphs generated:  geo_sales_hierarchy.png
