<a href="https://colab.research.google.com/github/JonasWetzel94/google_collab_sql/blob/main/SQL_Window_Functions_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a target="_blank" href="https://colab.research.google.com/github/lukebarousse/Int_SQL_Data_Analytics_Course/blob/main/Resources/Blank_SQL_Notebook.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Blank SQL Notebook

#### Import Libraries & Database

In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# If running in Google Colab, install PostgreSQL and restore the database
if 'google.colab' in sys.modules:
    # Install PostgreSQL
    !sudo apt-get install postgresql -qq > /dev/null 2>&1

    # Start PostgreSQL service (suppress output)
    !sudo service postgresql start > /dev/null 2>&1

    # Set password for the 'postgres' user to avoid authentication errors (suppress output)
    !sudo -u postgres psql -c "ALTER USER postgres WITH PASSWORD 'password';" > /dev/null 2>&1

    # Create the 'colab_db' database (suppress output)
    !sudo -u postgres psql -c "CREATE DATABASE contoso_100k;" > /dev/null 2>&1

    # Download the PostgreSQL .sql dump
    !wget -q -O contoso_100k.sql https://github.com/lukebarousse/Int_SQL_Data_Analytics_Course/releases/download/v.0.0.0/contoso_100k.sql

    # Restore the dump file into the PostgreSQL database (suppress output)
    !sudo -u postgres psql contoso_100k < contoso_100k.sql > /dev/null 2>&1

    # Shift libraries from ipython-sql to jupysql
    !pip uninstall -y ipython-sql > /dev/null 2>&1
    !pip install jupysql > /dev/null 2>&1

# Load the sql extension for SQL magic
%load_ext sql

# Connect to the PostgreSQL database
%sql postgresql://postgres:password@localhost:5432/contoso_100k

# Enable automatic conversion of SQL results to pandas DataFrames
%config SqlMagic.autopandas = True

# Disable named parameters for SQL magic
%config SqlMagic.named_parameters = "disabled"

# Display pandas number to two decimal places
pd.options.display.float_format = '{:.2f}'.format

In [2]:
%%sql

SELECT
    storekey,
    customerkey,
    quantity,
    SUM(quantity) OVER(PARTITION BY storekey) total_quantity_per_store,
    SUM(quantity) OVER(PARTITION BY customerkey, storekey) total_quantity_per_customer_store
FROM sales
ORDER BY storekey, customerkey
LIMIT 5;

Unnamed: 0,storekey,customerkey,quantity,total_quantity_per_store,total_quantity_per_customer_store
0,10,545,8,2395,29
1,10,545,7,2395,29
2,10,545,3,2395,29
3,10,545,6,2395,29
4,10,545,4,2395,29


In [3]:
%%sql
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'


Unnamed: 0,table_name
0,currencyexchange
1,customer
2,sales
3,date
4,product
5,store


In [5]:
%%sql
SELECT * FROM sales LIMIT 2;

Unnamed: 0,orderkey,linenumber,orderdate,deliverydate,customerkey,storekey,productkey,quantity,unitprice,netprice,unitcost,currencycode,exchangerate
0,1000,0,2015-01-01,2015-01-01,947009,400,48,1,112.46,98.97,57.34,GBP,0.64
1,1000,1,2015-01-01,2015-01-01,947009,400,460,1,749.75,659.78,382.25,GBP,0.64


In [9]:
%%sql
SELECT customerkey,
       ROUND(AVG(quantity) OVER(PARTITION BY customerkey),2),
       ROUND(AVG(quantity) OVER(),2)
FROM sales
ORDER BY customerkey
LIMIT 10

Unnamed: 0,customerkey,round,round.1
0,15,5.0,3.14
1,180,2.0,3.14
2,180,2.0,3.14
3,180,2.0,3.14
4,185,3.0,3.14
5,243,5.0,3.14
6,387,2.89,3.14
7,387,2.89,3.14
8,387,2.89,3.14
9,387,2.89,3.14


In [10]:
%%sql
SELECT * FROM sales LIMIT 2;

Unnamed: 0,orderkey,linenumber,orderdate,deliverydate,customerkey,storekey,productkey,quantity,unitprice,netprice,unitcost,currencycode,exchangerate
0,1000,0,2015-01-01,2015-01-01,947009,400,48,1,112.46,98.97,57.34,GBP,0.64
1,1000,1,2015-01-01,2015-01-01,947009,400,460,1,749.75,659.78,382.25,GBP,0.64


In [30]:
%%sql
WITH weekly_revenue AS (
    SELECT
        EXTRACT(WEEK FROM orderdate) AS order_week,
        SUM(quantity * netprice * exchangerate) AS week_revenue
    FROM sales
    WHERE EXTRACT(YEAR FROM orderdate) = 2023
    GROUP BY order_week
)
SELECT
    order_week,
    week_revenue,
    AVG(week_revenue) OVER () AS avg_weekly_revenue,
    100.0 * week_revenue / AVG(week_revenue) OVER () AS pct_of_avg
FROM weekly_revenue
ORDER BY order_week
LIMIT 5;



Unnamed: 0,order_week,week_revenue,avg_weekly_revenue,pct_of_avg
0,1,1101256.97,636703.18,172.96
1,2,779868.87,636703.18,122.49
2,3,785218.52,636703.18,123.33
3,4,803262.51,636703.18,126.16
4,5,703916.98,636703.18,110.56


In [32]:
%%sql
WITH sales_with_cohort AS (
    SELECT
        customerkey,
        DATE_TRUNC('month', orderdate) AS order_month,
        EXTRACT(YEAR FROM MIN(orderdate) OVER (PARTITION BY customerkey)) AS cohort_year,
        quantity * netprice * exchangerate AS revenue
    FROM sales
),
monthly_revenue AS (
    SELECT
        cohort_year,
        order_month,
        SUM(revenue) AS monthly_revenue
    FROM sales_with_cohort
    GROUP BY cohort_year, order_month
)
SELECT
    cohort_year,
    TO_CHAR(order_month, 'YYYY-MM') AS year_month,
    monthly_revenue,
    SUM(monthly_revenue) OVER (
        PARTITION BY cohort_year
    ) AS total_cohort_revenue,
    100 * monthly_revenue / SUM(monthly_revenue) OVER (
        PARTITION BY cohort_year
    ) AS pct_of_cohort_revenue
FROM monthly_revenue
ORDER BY
    cohort_year,
    order_month
LIMIT 5;

Unnamed: 0,cohort_year,year_month,monthly_revenue,total_cohort_revenue,pct_of_cohort_revenue
0,2015,2015-01,384092.66,14892230.47,2.58
1,2015,2015-02,706374.12,14892230.47,4.74
2,2015,2015-03,332961.59,14892230.47,2.24
3,2015,2015-04,160767.0,14892230.47,1.08
4,2015,2015-05,548632.63,14892230.47,3.68
