In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
conn = sqlite3.connect("superstore.db")
cursor = conn.cursor()
cursor.execute("PRAGMA foreign_keys = ON;")

<sqlite3.Cursor at 0x26ffe460c40>

# GOAL: identify actionable discount strategies that maximize profits with driving sales.

In [3]:
query = """
    SELECT COUNT(DISTINCT order_id) AS num_orders, SUM(sales) AS total_sales, SUM(profit) AS total_profit, SUM(profit)/COUNT(DISTINCT order_id) AS avg_profit
    FROM order_details
"""

df_statistics = pd.read_sql_query(query, conn)
df_statistics

Unnamed: 0,num_orders,total_sales,total_profit,avg_profit
0,5009,2297201.0,286397.0217,57.176487


## How does the discount percentage affect sales volume and profit?

In [4]:
query = """
    SELECT discount, COUNT(*) AS num_orders, SUM(sales)/1000000 AS "total_sales(in millions)", AVG(sales) AS avg_sales, SUM(profit) AS total_profit, SUM(profit)/COUNT(DISTINCT order_id) AS avg_profit
    FROM order_details
    GROUP BY discount
    ORDER BY discount;
"""
df_discounts = pd.read_sql_query(query, conn)
df_discounts

Unnamed: 0,discount,num_orders,total_sales(in millions),avg_sales,total_profit,avg_profit
0,0.0,4798,1.087908,226.742074,320987.6032,121.40227
1,0.1,94,0.054369,578.397351,9029.177,101.451427
2,0.15,52,0.027559,529.971567,1418.9915,27.823363
3,0.2,3657,0.764594,209.07694,90337.306,37.531079
4,0.3,227,0.103227,454.742974,-10369.2774,-49.143495
5,0.32,27,0.014493,536.79477,-2391.1377,-88.560656
6,0.4,206,0.116418,565.134874,-23057.0504,-124.632705
7,0.45,11,0.005485,498.634,-2493.1111,-249.31111
8,0.5,66,0.058919,892.705152,-20506.4281,-320.412939
9,0.6,138,0.006645,48.15,-5944.6552,-46.808309


## Which discount ranges (e.g., 0–10%, 10–20%, 20–50%) are most profitable?

In [5]:
cursor.execute("DROP VIEW IF EXISTS discount_range_cte")
conn.commit()

In [6]:
cursor.execute("""
CREATE VIEW discount_range_cte AS
SELECT
    od.*,
    CASE
        WHEN discount == 0 THEN "None"
        WHEN discount > 0 AND discount <= 0.2 THEN "Low(0-20%)"
        WHEN discount > 0.2 AND discount <= 0.35 THEN "Medium(20-35%)"
        WHEN discount > 0.35 AND discount <= 0.5 THEN "High(35-50%)"
        WHEN discount > 0.5 THEN "Mega(>50%)"
    END AS discount_range
FROM order_details od;
""")

conn.commit()

In [7]:
query = """
    SELECT
        drc.discount_range, COUNT(DISTINCT order_id) AS num_orders, SUM(sales)/1000000 AS "total_sales(in millions)", AVG(sales) AS avg_sales, SUM(profit) AS total_profit,
        SUM(profit)/COUNT(DISTINCT order_id) AS avg_profit
    FROM discount_range_cte drc
    GROUP BY drc.discount_range
    ORDER BY 
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_discounts_ranges = pd.read_sql_query(query, conn)
df_discounts_ranges

Unnamed: 0,discount_range,num_orders,total_sales(in millions),avg_sales,total_profit,avg_profit
0,,2644,1.087908,226.742074,320987.6032,121.40227
1,Low(0-20%),2507,0.846522,222.593279,100785.4745,40.201625
2,Medium(20-35%),234,0.11772,463.465015,-12760.4151,-54.531688
3,High(35-50%),254,0.180821,638.944516,-46056.5896,-181.325156
4,Mega(>50%),685,0.064229,75.033572,-76559.0513,-111.765038


## What are the different categories and respective sub-categories of products sold?

In [8]:
query = """
    SELECT category, sub_category
    FROM products
    GROUP BY category, sub_category
    ORDER BY category
"""
df_products = pd.read_sql_query(query, conn)
df_products

Unnamed: 0,category,sub_category
0,Furniture,Bookcases
1,Furniture,Chairs
2,Furniture,Furnishings
3,Furniture,Tables
4,Office Supplies,Appliances
5,Office Supplies,Art
6,Office Supplies,Binders
7,Office Supplies,Envelopes
8,Office Supplies,Fasteners
9,Office Supplies,Labels


In [9]:
query = """
    SELECT p.category, COUNT(DISTINCT od.order_id) AS num_orders, SUM(od.sales) AS total_sales, SUM(od.profit) AS total_profit, SUM(od.profit)/COUNT(DISTINCT od.order_id) AS avg_profit
    FROM products p
    JOIN order_details od ON od.product_id = p.product_id
    GROUP BY category
    ORDER BY category
"""
df_products_stats = pd.read_sql_query(query, conn)
df_products_stats

Unnamed: 0,category,num_orders,total_sales,total_profit,avg_profit
0,Furniture,1764,741999.7953,18451.2728,10.459905
1,Office Supplies,3742,719047.032,122490.8008,32.734046
2,Technology,1544,836154.033,145454.9481,94.206573


### Which discount ranges are most profitable in each Product Categories?

In [10]:
query = """
    SELECT
        drc.discount_range, p.category, COUNT(DISTINCT drc.order_id) AS num_orders, SUM(drc.sales) as total_sakeles, SUM(drc.profit) AS total_profit,
        SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit,
        (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN products p ON drc.product_id = p.product_id
    GROUP BY drc.discount_range, p.category
    ORDER BY
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_product_category_discounts_ranges = pd.read_sql_query(query, conn)
df_product_category_discounts_ranges

Unnamed: 0,discount_range,category,num_orders,total_sakeles,total_profit,avg_profit,profit_margin
0,,Furniture,739,256025.27,58133.0764,78.664515,22.705992
1,,Office Supplies,2020,442150.0,130506.1064,64.606983,29.516252
2,,Technology,718,389733.2,132348.4204,184.329276,33.958724
3,Low(0-20%),Furniture,654,290823.7845,14795.9525,22.623781,5.087601
4,Low(0-20%),Office Supplies,1677,237373.885,39124.832,23.330252,16.482366
5,Low(0-20%),Technology,746,318324.571,46864.69,62.8213,14.722297
6,Medium(20-35%),Furniture,230,113963.8088,-13086.4546,-56.897629,-11.482992
7,Medium(20-35%),Technology,5,3756.305,326.0395,65.2079,8.679793
8,High(35-50%),Furniture,134,72082.85,-31551.7069,-235.460499,-43.771448
9,High(35-50%),Technology,129,108738.448,-14504.8827,-112.440951,-13.33924


### Which discount ranges are most profitable in each Product Subcategories?

In [11]:
query = """
    SELECT
        drc.discount_range, p.category, p.sub_category, COUNT(DISTINCT drc.order_id) AS num_orders, SUM(drc.profit) AS total_profit, SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit,
        (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN products p ON drc.product_id = p.product_id
    GROUP BY drc.discount_range, p.category, p.sub_category
    ORDER BY
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_product_category_and_subcategory_discounts_ranges = pd.read_sql_query(query, conn)
df_product_category_and_subcategory_discounts_ranges

Unnamed: 0,discount_range,category,sub_category,num_orders,total_profit,avg_profit,profit_margin
0,,Furniture,Bookcases,60,6075.7117,101.261862,19.02466
1,,Furniture,Chairs,124,21933.0961,176.879807,24.086229
2,,Furniture,Furnishings,529,16847.9689,31.848712,27.417451
3,,Furniture,Tables,70,13276.2997,189.661424,18.54782
4,,Office Supplies,Appliances,265,23183.7361,87.485797,29.697538
5,,Office Supplies,Art,452,5380.6006,11.903984,29.867933
6,,Office Supplies,Binders,295,39314.4507,133.269324,48.044361
7,,Office Supplies,Envelopes,148,4976.9844,33.628273,46.92413
8,,Office Supplies,Fasteners,128,652.2052,5.095353,35.779619
9,,Office Supplies,Labels,225,4422.096,19.65376,47.860461


## What are the different customer Segments that order products

In [12]:
query = """
    SELECT segment
    FROM customers
    GROUP BY segment;
"""
df_customer_segments = pd.read_sql_query(query, conn)
df_customer_segments

Unnamed: 0,segment
0,Consumer
1,Corporate
2,Home Office


In [13]:
query = """
    SELECT c.segment, COUNT(DISTINCT od.order_id) AS num_orders, SUM(od.sales) AS total_sales, SUM(od.profit) AS total_profit, SUM(od.profit)/COUNT(DISTINCT od.order_id) AS avg_profit
    FROM order_details od
    JOIN orders o ON od.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    GROUP BY segment
    ORDER BY segment
"""
df_customer_segments_stats = pd.read_sql_query(query, conn)
df_customer_segments_stats

Unnamed: 0,segment,num_orders,total_sales,total_profit,avg_profit
0,Consumer,2586,1161401.0,134119.2092,51.863577
1,Corporate,1514,706146.4,91979.134,60.7524
2,Home Office,909,429653.1,60298.6785,66.33518


### Which Customer Segment generates the highest total and average profits  at different Discount Ranges, and how do their profit margins compare?

In [14]:
query = """
    SELECT
        drc.discount_range, c.segment, COUNT(DISTINCT drc.order_id) AS num_orders, SUM(drc.profit) AS total_profit, SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit,
        (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN orders o ON drc.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    GROUP BY drc.discount_range, c.segment
    ORDER BY
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_customer_segments_discount_ranges = pd.read_sql_query(query, conn)
df_customer_segments_discount_ranges

Unnamed: 0,discount_range,segment,num_orders,total_profit,avg_profit,profit_margin
0,,Consumer,1351,157901.9614,116.87784,29.651998
1,,Corporate,784,102150.7824,130.294365,28.465538
2,,Home Office,509,60934.8594,119.714851,31.004797
3,Low(0-20%),Consumer,1321,48107.4086,36.417418,10.941919
4,Low(0-20%),Corporate,761,29599.1148,38.895026,11.939182
5,Low(0-20%),Home Office,425,23078.9511,54.303414,14.520093
6,Medium(20-35%),Consumer,130,-7673.6266,-59.027897,-10.741519
7,Medium(20-35%),Corporate,63,-3370.296,-53.496762,-12.097096
8,Medium(20-35%),Home Office,41,-1716.4925,-41.865671,-9.318226
9,High(35-50%),Consumer,129,-19654.438,-152.359984,-24.044758


## What are the different Regions that customers order products from? 

In [15]:
query = """
    SELECT region, COUNT(region) AS region_freq
    FROM customers
    GROUP BY region;
"""
df_customer_regions = pd.read_sql_query(query, conn)
df_customer_regions

Unnamed: 0,region,region_freq
0,Central,160
1,East,245
2,South,109
3,West,279


In [16]:
query = """
    SELECT c.region, COUNT(DISTINCT od.order_id) AS num_orders, SUM(od.sales) AS total_sales, SUM(od.profit) AS total_profit, SUM(od.profit)/COUNT(DISTINCT od.order_id) AS avg_profit
    FROM order_details od
    JOIN orders o ON od.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    GROUP BY region
    ORDER BY region
"""
df_customer_region_stats = pd.read_sql_query(query, conn)
df_customer_region_stats

Unnamed: 0,region,num_orders,total_sales,total_profit,avg_profit
0,Central,1021,461993.302,62040.4592,60.764407
1,East,1504,681673.2008,84575.0085,56.233383
2,South,656,319778.5651,33706.3291,51.381599
3,West,1828,833755.7924,106075.2249,58.028022


### Which Region generates the highest total and average profits at different Discount Ranges, and how do their profit margins compare?

In [17]:
query = """
    SELECT
        drc.discount_range, c.region, COUNT(DISTINCT drc.order_id) AS num_orders, SUM(drc.profit) AS total_profit, SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit,
        (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN orders o ON drc.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    GROUP BY drc.discount_range, c.region
    ORDER BY
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_customer_regions_discount_ranges = pd.read_sql_query(query, conn)
df_customer_regions_discount_ranges

Unnamed: 0,discount_range,region,num_orders,total_profit,avg_profit,profit_margin
0,,Central,485,75845.9007,156.3833,30.157762
1,,East,792,94135.8569,118.858405,29.349557
2,,South,329,44419.007,135.012179,31.739785
3,,West,1038,106586.8386,102.684816,28.368413
4,Low(0-20%),Central,497,14216.7564,28.605144,10.863923
5,Low(0-20%),East,743,30027.9738,40.4145,13.094809
6,Low(0-20%),South,306,11518.2905,37.641472,10.741687
7,Low(0-20%),West,961,45022.4538,46.849588,11.875569
8,Medium(20-35%),Central,76,-4575.7804,-60.207637,-11.134125
9,Medium(20-35%),East,77,-3793.4012,-49.264951,-9.654375


## Which States do customers, who purchase products, reside in?

In [18]:
query = """
    SELECT state, COUNT(state) AS state_freq
    FROM customers
    GROUP BY state
    ORDER BY state_freq DESC;
"""
df_customer_states = pd.read_sql_query(query, conn)
df_customer_states

Unnamed: 0,state,state_freq
0,California,211
1,New York,107
2,Texas,88
3,Pennsylvania,42
4,Ohio,38
5,Washington,33
6,Illinois,31
7,Florida,25
8,North Carolina,23
9,Michigan,15


In [19]:
query = """
    SELECT c.state, c.region, COUNT(DISTINCT od.order_id) AS num_orders, SUM(od.sales) AS total_sales, SUM(od.profit) AS total_profit, SUM(od.profit)/COUNT(DISTINCT od.order_id) AS avg_profit
    FROM order_details od
    JOIN orders o ON od.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    GROUP BY state, region
    ORDER BY region
"""
df_customer_states_stats = pd.read_sql_query(query, conn)
df_customer_states_stats

Unnamed: 0,state,region,num_orders,total_sales,total_profit,avg_profit
0,Arizona,Central,8,3241.8980,161.2263,20.153287
1,California,Central,91,27865.3648,4243.1309,46.627812
2,Colorado,Central,9,2523.2660,191.4898,21.276644
3,Florida,Central,9,1972.5990,117.6429,13.071433
4,Georgia,Central,4,688.3240,218.1597,54.539925
...,...,...,...,...,...,...
86,Texas,West,73,23870.2400,1257.0908,17.220422
87,Utah,West,16,11658.1620,2434.8515,152.178219
88,Virginia,West,6,2453.2840,280.7822,46.797033
89,Washington,West,175,76618.7207,14130.0230,80.742989


In [20]:
cursor.execute("DROP VIEW IF EXISTS states_ranked_by_overall_profits")
conn.commit()

In [21]:
cursor.execute("""
CREATE VIEW states_ranked_by_overall_profits AS
WITH state_summary AS (
    SELECT
        c.state, SUM(drc.profit) AS total_profit, SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit, (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN orders o ON drc.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    GROUP BY c.state
)
SELECT
    state, total_profit, avg_profit, profit_margin,
    RANK() OVER (ORDER BY total_profit DESC) AS rank_total_profit,
    RANK() OVER (ORDER BY avg_profit DESC) AS rank_avg_profit,
    RANK() OVER (ORDER BY profit_margin DESC) AS rank_profit_margin,
    (RANK() OVER (ORDER BY total_profit DESC) + RANK() OVER (ORDER BY avg_profit DESC) + RANK() OVER (ORDER BY profit_margin DESC)) AS combined_rank
FROM state_summary;   
""")

conn.commit()

### Which States generates the highest total and average profits, and how do their profit margins compare?

In [22]:
query = """
    SELECT *
    FROM states_ranked_by_overall_profits
    ORDER BY combined_rank ASC
    LIMIT 10;
"""

df_highest_overall_regions = pd.read_sql_query(query, conn)
df_highest_overall_regions

Unnamed: 0,state,total_profit,avg_profit,profit_margin,rank_total_profit,rank_avg_profit,rank_profit_margin,combined_rank
0,Indiana,13489.5697,259.414802,34.547628,5,1,2,8
1,Arkansas,7607.7888,195.071508,33.727967,8,4,3,15
2,Massachusetts,4797.2961,149.915503,27.905032,12,7,7,26
3,Georgia,8907.6688,108.630107,25.821881,6,12,11,29
4,Alabama,1825.8816,228.2352,40.468239,25,3,1,29
5,Montana,1869.9294,233.741175,30.620045,24,2,4,30
6,Virginia,7947.3635,120.414598,23.593045,7,11,13,31
7,Wisconsin,2720.8422,143.202221,27.588895,20,9,8,37
8,South Carolina,2342.2244,180.171108,26.388115,22,5,10,37
9,Washington,18240.2336,83.288738,17.920573,3,18,21,42


### How has different Discount Ranges affected the total, average profits, and profit margins of the states with the highest overall profits?

In [23]:
query = """
    SELECT c.state, drc.discount_range, COUNT(DISTINCT drc.order_id) AS num_orders, SUM(drc.profit) AS total_profit, SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit,
    (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN orders o ON drc.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    JOIN states_ranked_by_overall_profits sr ON c.state = sr.state
    JOIN (
        SELECT state 
        FROM states_ranked_by_overall_profits
        ORDER BY combined_rank ASC
        LIMIT 10
    ) top_states ON c.state = top_states.state
    GROUP BY drc.discount_range, c.state 
    ORDER BY sr.combined_rank,
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_highest_customer_regions_discount_ranges = pd.read_sql_query(query, conn)
df_highest_customer_regions_discount_ranges

Unnamed: 0,state,discount_range,num_orders,total_profit,avg_profit,profit_margin
0,Indiana,,27,13397.9793,496.221456,37.620011
1,Indiana,Low(0-20%),22,404.6808,18.394582,20.411786
2,Indiana,Medium(20-35%),2,-117.882,-58.941,-11.512836
3,Indiana,High(35-50%),1,-13.978,-13.978,-6.666667
4,Indiana,Mega(>50%),7,-181.2304,-25.890057,-83.84241
5,Arkansas,,26,7025.4838,270.210915,42.093795
6,Arkansas,Low(0-20%),12,693.4392,57.7866,13.203316
7,Arkansas,Medium(20-35%),2,-96.9582,-48.4791,-20.649351
8,Arkansas,High(35-50%),1,15.992,15.992,13.333333
9,Arkansas,Mega(>50%),3,-30.168,-10.056,-121.851523


### Which States generates the lowest total and average profits, and how do their profit margins compare?

In [24]:
query = """
    SELECT *
    FROM states_ranked_by_overall_profits
    ORDER BY combined_rank DESC
    LIMIT 10;
"""

df_lowest_overall_regions = pd.read_sql_query(query, conn)
df_lowest_overall_regions

Unnamed: 0,state,total_profit,avg_profit,profit_margin,rank_total_profit,rank_avg_profit,rank_profit_margin,combined_rank
0,Colorado,-3301.0048,-38.383777,-9.093238,41,41,41,123
1,Ohio,-1255.1646,-5.251735,-1.545024,40,40,40,120
2,Tennessee,447.0073,6.040639,1.532177,36,39,39,114
3,Florida,1652.68,11.166757,1.940695,26,37,38,101
4,Oklahoma,21.897,10.9485,16.496406,39,38,24,101
5,North Carolina,1569.9738,12.661079,3.713464,27,36,36,99
6,Mississippi,912.7173,32.597046,8.328368,32,32,33,97
7,Pennsylvania,3274.3222,12.891032,2.715898,17,35,37,89
8,Iowa,324.8052,46.400743,21.014816,38,31,15,84
9,Illinois,5428.8462,27.983743,6.571504,11,33,34,78


### How has different Discount Ranges affected the total, average profits, and profit margins of the states with the lowest overall profits?

In [25]:
query = """
    SELECT c.state, drc.discount_range, COUNT(DISTINCT drc.order_id) AS num_orders, SUM(drc.profit) AS total_profit, SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit,
    (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN orders o ON drc.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    JOIN states_ranked_by_overall_profits sr ON c.state = sr.state
    JOIN (
        SELECT state 
        FROM states_ranked_by_overall_profits
        ORDER BY combined_rank DESC
        LIMIT 10
    ) worst_states ON c.state = worst_states.state
    GROUP BY drc.discount_range, c.state 
    ORDER BY sr.combined_rank DESC,
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_lowest_customer_regions_discount_ranges = pd.read_sql_query(query, conn)
df_lowest_customer_regions_discount_ranges

Unnamed: 0,state,discount_range,num_orders,total_profit,avg_profit,profit_margin
0,Colorado,,34,2137.9316,62.880341,20.538037
1,Colorado,Low(0-20%),45,1604.9231,35.664958,9.099063
2,Colorado,Medium(20-35%),4,-141.4643,-35.366075,-10.897906
3,Colorado,High(35-50%),5,-491.4762,-98.29524,-24.060621
4,Colorado,Mega(>50%),23,-6410.919,-278.735609,-130.487426
5,Ohio,,96,6807.0728,70.907008,26.39801
6,Ohio,Low(0-20%),124,3021.9693,24.37072,10.766781
7,Ohio,Medium(20-35%),19,-871.2324,-45.854337,-7.300119
8,Ohio,High(35-50%),31,-4165.226,-134.362129,-42.050245
9,Ohio,Mega(>50%),47,-6047.7483,-128.675496,-109.05757


## Which discount ranges generate the most profit (total, average, and margin) during different periods of time?

In [26]:
query = """
    SELECT strftime("%Y", order_date) AS year
    FROM orders
    GROUP BY year
    ORDER BY year;
"""

df_year = pd.read_sql_query(query, conn)
df_year

Unnamed: 0,year
0,2014
1,2015
2,2016
3,2017


In [27]:
query = """
    SELECT strftime("%Y", o.order_date) AS year, COUNT(DISTINCT od.order_id) AS num_orders, SUM(od.sales) AS total_sales, SUM(od.profit) AS total_profit,
    SUM(od.profit)/COUNT(DISTINCT od.order_id) AS avg_profit
    FROM order_details od
    JOIN orders o ON od.order_id = o.order_id
    GROUP BY year
    ORDER BY year
"""
df_year_stats = pd.read_sql_query(query, conn)
df_year_stats

Unnamed: 0,year,num_orders,total_sales,total_profit,avg_profit
0,2014,969,484247.4981,49543.9741,51.128972
1,2015,1038,470532.509,61618.6037,59.362817
2,2016,1315,609205.598,81795.1743,62.201653
3,2017,1687,733215.2552,93439.2696,55.38783


In [28]:
query = """
    SELECT strftime("%Y", o.order_date) AS year, drc.discount_range, COUNT(DISTINCT drc.order_id) AS num_orders, SUM(drc.profit) AS total_profit, 
    SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit,
    (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN orders o ON drc.order_id = o.order_id
    GROUP BY year, drc.discount_range
    ORDER BY
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_discount_range_stats_each_year = pd.read_sql_query(query, conn)
df_discount_range_stats_each_year

Unnamed: 0,year,discount_range,num_orders,total_profit,avg_profit,profit_margin
0,2014,,500,58617.1665,117.234333,27.268906
1,2015,,559,68870.7566,123.2035,30.403851
2,2016,,684,89341.9679,130.616912,30.019739
3,2017,,901,104157.7122,115.602344,29.860176
4,2014,Low(0-20%),498,18970.6859,38.093747,11.499999
5,2015,Low(0-20%),512,21153.0116,41.314476,11.74975
6,2016,Low(0-20%),662,23810.3198,35.96725,10.606161
7,2017,Low(0-20%),835,36851.4572,44.133482,13.302089
8,2014,Medium(20-35%),49,-2111.5976,-43.093829,-7.746576
9,2015,Medium(20-35%),48,-2996.4969,-62.427019,-13.085223


In [29]:
query = """
    SELECT strftime("%Y-%m", order_date) AS year_month
    FROM orders
    GROUP BY year_month
    ORDER BY year_month;
"""

df_time = pd.read_sql_query(query, conn)
df_time

Unnamed: 0,year_month
0,2014-01
1,2014-02
2,2014-03
3,2014-04
4,2014-05
5,2014-06
6,2014-07
7,2014-08
8,2014-09
9,2014-10


In [30]:
query = """
    SELECT strftime("%Y-%m", o.order_date) AS year_month, COUNT(DISTINCT od.order_id) AS num_orders, SUM(od.sales) AS total_sales, SUM(od.profit) AS total_profit, SUM(od.profit)/COUNT(DISTINCT od.order_id) AS avg_profit
    FROM order_details od
    JOIN orders o ON od.order_id = o.order_id
    GROUP BY year_month
    ORDER BY year_month
"""
df_time_stats = pd.read_sql_query(query, conn)
df_time_stats

Unnamed: 0,year_month,num_orders,total_sales,total_profit,avg_profit
0,2014-01,32,14236.895,2450.1907,76.568459
1,2014-02,28,4519.892,862.3084,30.796729
2,2014-03,71,55691.009,498.7299,7.024365
3,2014-04,66,28295.345,3488.8352,52.861139
4,2014-05,69,23648.287,2738.7096,39.691443
5,2014-06,66,34595.1276,4976.5244,75.401885
6,2014-07,65,33946.393,-841.4826,-12.945886
7,2014-08,72,27909.4685,5318.105,73.862569
8,2014-09,130,81777.3508,8328.0994,64.062303
9,2014-10,78,31453.393,3448.2573,44.208427


In [31]:
query = """
    SELECT strftime("%Y-%m", o.order_date) AS year_month, drc.discount_range, COUNT(DISTINCT drc.order_id) AS num_orders, SUM(drc.profit) AS total_profit, 
    SUM(drc.profit)/COUNT(DISTINCT drc.order_id) AS avg_profit,
    (SUM(drc.profit)/SUM(drc.sales)) * 100 AS profit_margin
    FROM discount_range_cte drc
    JOIN orders o ON drc.order_id = o.order_id
    GROUP BY year_month, drc.discount_range
    ORDER BY year_month,
        CASE drc.discount_range
            WHEN "Low(0-20%)" THEN 1
            WHEN "Medium(20-35%)" THEN 2
            WHEN "High(35-50%)" THEN 3
            WHEN "Mega(>50%)" THEN 4
        END;
"""

df_discount_range_over_time = pd.read_sql_query(query, conn)
df_discount_range_over_time

Unnamed: 0,year_month,discount_range,num_orders,total_profit,avg_profit,profit_margin
0,2014-01,,18,2858.7547,158.819706,24.565721
1,2014-01,Low(0-20%),14,165.2015,11.800107,8.178952
2,2014-01,High(35-50%),2,-84.3356,-42.167800,-45.302750
3,2014-01,Mega(>50%),6,-489.4299,-81.571650,-124.306603
4,2014-02,,13,836.8803,64.375408,22.311394
...,...,...,...,...,...,...
231,2017-12,,127,12373.6771,97.430528,25.632432
232,2017-12,Low(0-20%),104,3308.8037,31.815420,13.854905
233,2017-12,Medium(20-35%),11,-307.0586,-27.914418,-8.440242
234,2017-12,High(35-50%),11,-1166.7366,-106.066964,-29.359834


In [32]:
conn.close()