In [1]:
from sqlalchemy import create_engine
import pandas as pd
from config import server, database, driver 

connection_string = f"mssql+pyodbc://@{server}/{database}?driver={driver}"

engine = create_engine(connection_string)


When is the peak season of our ecommerce ?

In [2]:

query = '''
    select 
        d.season
        ,d.year
        ,count(o.order_id) orders_count
        ,sum(o.total_order_value) total_revenu
    from order_fact o
    join date_dim d
        on d.date_key = o.order_date_key
    group by d.season, d.year
    order by d.year, total_revenu desc;
'''
PeakSeason = pd.read_sql(query, con=engine)
PeakSeason

Unnamed: 0,season,year,orders_count,total_revenu
0,Fall,2016,292,49601770.0
1,Fall,2017,25115,4044382000.0
2,Summer,2017,11982,1809600000.0
3,Spring,2017,8982,1471280000.0
4,Spring,2018,21424,3518763000.0
5,Summer,2018,19267,3148395000.0
6,Fall,2018,14253,2144670000.0


In [3]:
query = '''
    select 
        d.quarter
        ,d.year
        ,count(o.order_id) orders_count
        ,sum(o.total_order_value) total_revenu
    from order_fact o
    join date_dim d
        on d.date_key = o.order_date_key
    group by d.quarter, d.year
    order by d.year, total_revenu desc;
'''
PeakSeason = pd.read_sql(query, con=engine)
PeakSeason

Unnamed: 0,quarter,year,orders_count,total_revenu
0,4,2016,291,49458310.0
1,3,2016,1,143460.0
2,4,2017,18145,2872016000.0
3,3,2017,13023,2038887000.0
4,2,2017,9626,1547296000.0
5,1,2017,5285,867063400.0
6,2,2018,20439,3410036000.0
7,1,2018,21588,3317627000.0
8,3,2018,12917,2084164000.0


What time users are most likely make an order or using the ecommerce app?

In [4]:
query = ''' 

    select 
        t.period_of_day
        ,count(*) orders_count
    from order_fact o
    join time_dim t
    on o.order_time_key = t.time_key
    group by t.period_of_day
    order by orders_count desc
'''

MostOrderingTime = pd.read_sql(query, con=engine)
MostOrderingTime.head()

Unnamed: 0,period_of_day,orders_count
0,Afternoon,38928
1,Evening,34938
2,Morning,22589
3,Night,4860


What is the preferred way to pay in the ecommerce?

In [5]:
query = '''
    select 
        payment_type
        ,count(order_id) payment_type_count
    from order_fact
    where payment_type is not null
    group by payment_type
    order by payment_type_count desc;
'''

MostPopularPayment = pd.read_sql(query, con=engine)
MostPopularPayment


Unnamed: 0,payment_type,payment_type_count
0,credit_card,74974
1,blipay,19306
2,voucher,5544
3,debit_card,1490


How many installment is usually done when paying in the ecommerce?

In [6]:
query = '''
    select 
        avg(payment_installments) avg_payment_installments
    from order_fact;
'''
AVGPaymentInstallments = pd.read_sql(query, con=engine)
AVGPaymentInstallments

Unnamed: 0,avg_payment_installments
0,2


What is the average spending time for user for our ecommerce?

In [7]:
query = '''
    select
        avg(datediff(day , do.full_date,dd.full_date)) avg_spending_time_in_days
    from order_fact o
    join date_dim do
		on o.order_date_key = do.date_key
    join date_dim dd
		on o.delivered_date_key = dd.date_key
    where o.order_status = 'delivered'
'''
AVGUserSpendingTime = pd.read_sql(query, con=engine)
AVGUserSpendingTime

Unnamed: 0,avg_spending_time_in_days
0,12


What is the frequency of purchase on each state?

In [9]:
query = '''
    select 
        s.seller_state
        ,count(o.order_key) frequency_of_purchase
    from order_item_fact o
    join seller_dim s
    on o.seller_key = s.seller_key
    group by s.seller_state
    order by frequency_of_purchase desc;
    
'''

PurchaseFrequencyOfState = pd.read_sql(query, con=engine)
PurchaseFrequencyOfState

Unnamed: 0,seller_state,frequency_of_purchase
0,BANTEN,34684
1,JAWA BARAT,11347
2,JAWA TENGAH,10550
3,KALIMANTAN TIMUR,9458
4,JAWA TIMUR,8520
5,DKI JAKARTA,7841
6,SUMATERA UTARA,5845
7,SULAWESI SELATAN,4409
8,RIAU,2904
9,LAMPUNG,2343


Which logistic route that have heavy traffic in our ecommerce?

In [13]:
query = '''
    select 
        s.seller_state
        ,c.customer_state
        ,count(o.order_key) order_total_count
    from order_fact o
	join order_item_fact i on o.order_key = i.order_key
    join seller_dim s on i.seller_key = s.seller_key
    join customer_dim c on o.customer_key = c.customer_key
    group by c.customer_state, s.seller_state
    order by order_total_count desc;
'''

StatesTraffic = pd.read_sql(query, con=engine)
StatesTraffic.head(10)

Unnamed: 0,seller_state,customer_state,order_total_count
0,BANTEN,BANTEN,7736
1,BANTEN,JAWA BARAT,4596
2,BANTEN,DKI JAKARTA,4013
3,BANTEN,JAWA TIMUR,3106
4,BANTEN,JAWA TENGAH,3026
5,JAWA BARAT,BANTEN,2491
6,JAWA TENGAH,BANTEN,2349
7,KALIMANTAN TIMUR,BANTEN,2112
8,JAWA TIMUR,BANTEN,1793
9,JAWA BARAT,JAWA BARAT,1619


How many late delivered order in our ecommerce? Are late order affecting the customer satisfaction?

In [14]:
query = '''

select 
    case when dd.full_date > ed.full_date then 'Late' else 'On Time' end as delivery_status,
    count(o.order_key) orders_total_count,
    avg(o.feedback_score) avg_feedback_score,
    count(o.feedback_score) feedback_count
from order_fact o
join date_dim dd on o.delivered_date_key = dd.date_key
join date_dim ed on o.estimated_delivery_date_key = ed.date_key
where o.feedback_score is not null
group by case when dd.full_date > ed.full_date then 'Late' else 'On Time' end

'''

OnTime_Late_Delivery = pd.read_sql(query, con=engine)
OnTime_Late_Delivery

Unnamed: 0,delivery_status,orders_total_count,avg_feedback_score,feedback_count
0,Late,6815,2,6815
1,On Time,94500,4,94500


How long are the delay for delivery / shipping process in each state?

In [15]:
query = ''' 
select
    c.customer_state  state,
    count(o.order_key)  total_orders,
    sum(case when dd.full_date > ed.full_date then 1 else 0 end)  late_orders_count,
    avg(case when dd.full_date > ed.full_date then datediff(day, ed.full_date, dd.full_date) else null end)  avg_delay_days
from order_fact o
join customer_dim c on o.customer_key = c.customer_key
join date_dim dd on o.delivered_date_key = dd.date_key
join date_dim ed on o.estimated_delivery_date_key = ed.date_key
group by c.customer_state
order by avg_delay_days desc
'''

LateDeliveryRoutes = pd.read_sql(query, con=engine)
LateDeliveryRoutes

Unnamed: 0,state,total_orders,late_orders_count,avg_delay_days
0,BENGKULU,573,22,20
1,KEPULAUAN BANGKA BELITUNG,339,30,16
2,NUSA TENGGARA BARAT,387,22,16
3,KEPULAUAN RIAU,747,62,14
4,JAMBI,1101,80,14
5,BALI,1429,146,13
6,SULAWESI TENGGARA,914,64,13
7,SULAWESI BARAT,473,30,13
8,SULAWESI UTARA,1266,86,12
9,MALUKU UTARA,547,33,12


How long are the difference between estimated delivery time and actual delivery time in each state?

In [16]:
query = '''
   select 
        c.customer_state as state,
        avg(datediff(day, ed.full_date, dd.full_date)) avg_delivery_diff,
        max(datediff(day, ed.full_date, dd.full_date)) max_delivery_diff,
        min(datediff(day, ed.full_date, dd.full_date)) min_delivery_diff,
        count(o.order_key) total_orders
    from order_fact o
    join customer_dim c on o.customer_key = c.customer_key
    join date_dim dd on o.delivered_date_key = dd.date_key
    join date_dim ed on o.estimated_delivery_date_key = ed.date_key
    group by c.customer_state
    order by avg_delivery_diff desc
'''

DeliveryTimeByState = pd.read_sql(query, con=engine)
DeliveryTimeByState


Unnamed: 0,state,avg_delivery_diff,max_delivery_diff,min_delivery_diff,total_orders
0,JAMBI,-11,162,-42,1101
1,JAWA BARAT,-11,167,-84,13159
2,JAWA TENGAH,-11,161,-68,8723
3,JAWA TIMUR,-11,153,-147,8583
4,BALI,-11,69,-52,1429
5,BANTEN,-11,165,-78,21536
6,DI YOGYAKARTA,-11,105,-49,1810
7,MALUKU UTARA,-11,106,-41,547
8,SULAWESI TENGAH,-11,45,-66,1015
9,SULAWESI UTARA,-12,155,-77,1266
