In [12]:
import pandas as pd
from pandasql import sqldf

In [13]:
df = pd.read_excel("0 - Python Practice/Practice Datasets/Superstore.xlsx")

In [14]:
# view the results

df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2013-152156,2013-11-09,2013-11-12,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2013-152156,2013-11-09,2013-11-12,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2013-138688,2013-06-13,2013-06-17,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2012-108966,2012-10-11,2012-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2012-108966,2012-10-11,2012-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [6]:
df.dtypes

Row ID                    int64
Order ID                 object
Order Date       datetime64[ns]
Ship Date        datetime64[ns]
Ship Mode                object
Customer ID              object
Customer Name            object
Segment                  object
Country                  object
City                     object
State                    object
Postal Code               int64
Region                   object
Product ID               object
Category                 object
Sub-Category             object
Product Name             object
Sales                   float64
Quantity                  int64
Discount                float64
Profit                  float64
dtype: object

## Calculating Key Monthly Metrics using SQL 

Now we use SQL in python using the sqldf from the pandasql library

💡Note: this function uses SQLite syntax

In [15]:
# using the raw orders data to compute some key metrics on a monthly level

# using common aggregate functions

query = """
SELECT
  DATE(strftime('%Y', `Order Date`) || '-' || strftime('%m', `Order Date`) || '-' || '01') AS month,
  COUNT(*) AS num_of_rows,
  COUNT(DISTINCT `Order ID`) AS num_of_orders,
  ROUND(SUM(`Sales`)) AS revenue,
  ROUND(SUM(`Profit`)) AS profit,
  ROUND(SUM(`Sales`) / COUNT(DISTINCT `Order ID`)) AS avg_order_value,
  COUNT(DISTINCT `Customer ID`) AS active_customers,
  ROUND(COUNT(DISTINCT `Order ID`) / COUNT(DISTINCT `Customer ID`), 1) AS orders_per_active_customer,
  ROUND(SUM(`Sales`) / COUNT(DISTINCT `Customer ID`)) AS revenue_per_active_customer,
  ROUND(SUM(`Profit`) / COUNT(DISTINCT `Customer ID`)) AS profit_per_active_customer

FROM df
GROUP BY month
ORDER BY month DESC
"""

monthly_metrics = sqldf(query)
monthly_metrics.head()

Unnamed: 0,month,num_of_rows,num_of_orders,revenue,profit,avg_order_value,active_customers,orders_per_active_customer,revenue_per_active_customer,profit_per_active_customer
0,2014-12-01,477,235,90475.0,8533.0,385.0,203,1.0,446.0,42.0
1,2014-11-01,447,252,112326.0,9683.0,446.0,211,1.0,532.0,46.0
2,2014-10-01,302,150,77794.0,9441.0,519.0,139,1.0,560.0,68.0
3,2014-09-01,463,229,90489.0,11395.0,395.0,200,1.0,452.0,57.0
4,2014-08-01,218,110,61516.0,8894.0,559.0,103,1.0,597.0,86.0


In [20]:
# using LAG window functions to fetch previous month's revenue and use it to compute revenue growth rate

sqldf("""
SELECT 
    m.*,
    LAG(revenue, 1) OVER(ORDER BY month) AS revenue_previous_month,
    ROUND((revenue / (LAG(revenue, 1) OVER(ORDER BY month))) - 1, 3) AS revenue_growth_month_over_month
FROM monthly_metrics AS m
ORDER BY month DESC
"""
).head()

Unnamed: 0,month,num_of_rows,num_of_orders,revenue,profit,avg_order_value,active_customers,orders_per_active_customer,revenue_per_active_customer,profit_per_active_customer,revenue_previous_month,revenue_growth_month_over_month
0,2014-12-01,477,235,90475.0,8533.0,385.0,203,1.0,446.0,42.0,112326.0,-0.195
1,2014-11-01,447,252,112326.0,9683.0,446.0,211,1.0,532.0,46.0,77794.0,0.444
2,2014-10-01,302,150,77794.0,9441.0,519.0,139,1.0,560.0,68.0,90489.0,-0.14
3,2014-09-01,463,229,90489.0,11395.0,395.0,200,1.0,452.0,57.0,61516.0,0.471
4,2014-08-01,218,110,61516.0,8894.0,559.0,103,1.0,597.0,86.0,48428.0,0.27


💡 In a similar fashion, you write any other query for your data analysis in a syntax that is supported by SQLite