In [1]:
from sqlalchemy import create_engine
import pandas as pd
from config import server, database, driver 

connection_string = f"mssql+pyodbc://@{server}/{database}?driver={driver}"

engine = create_engine(connection_string)


When is the peak season of our ecommerce ?

In [2]:

query = '''
    select 
        d.year
        ,d.quarter
        ,count(o.order_date_key) total_orders
        ,sum(o.shipping_cost + o.price) total_revenue
    from order_fact o
    join date_dim d
    on o.order_date_key = d.date_key
    group by d.quarter, d.year
    order by total_revenue desc
'''
PeakSeason_Quarter = pd.read_sql(query, con=engine)
PeakSeason_Quarter.head()

Unnamed: 0,year,quarter,total_orders,total_revenue
0,2018,2,23872,3465842000.0
1,2018,1,25011,3383463000.0
2,2017,4,21150,2921045000.0
3,2018,3,14794,2133773000.0
4,2017,3,15109,2091842000.0


In [3]:
query = '''
    select 
        d.year
        ,d.month
        ,count(o.order_date_key) total_orders
        ,sum(o.shipping_cost + o.price) total_revenue
    from order_fact o
    join date_dim d
    on o.order_date_key = d.date_key
    group by d.month, d.year
    order by total_revenue desc
'''
PeakSeason_Month = pd.read_sql(query, con=engine)
PeakSeason_Month.head()

Unnamed: 0,year,month,total_orders,total_revenue
0,2017,11,9016,1221835000.0
1,2018,3,8544,1204543000.0
2,2018,4,8261,1198573000.0
3,2018,5,8221,1192512000.0
4,2018,1,8545,1151137000.0


What time users are most likely make an order or using the ecommerce app?

In [4]:
query = ''' 

    select 
        hour + 1 hour
        ,count(*) orders_count
    from order_fact o
    join time_dim t
    on o.order_time_key = t.time_key
    group by t.hour
    order by orders_count desc
    
'''

MostOrderingTime = pd.read_sql(query, con=engine)
MostOrderingTime.head()

Unnamed: 0,hour,orders_count
0,17,7990
1,15,7896
2,12,7769
3,14,7671
4,16,7612


What is the preferred way to pay in the ecommerce?

In [5]:
query = '''
    select
        payment_type
        ,count(*) count
    from payment_dim
    group by payment_type
    order by count desc
'''

MostPopularPayment = pd.read_sql(query, con=engine)
MostPopularPayment['payment_type'][0]


'credit_card'

How many installment is usually done when paying in the ecommerce?

In [6]:
query = '''
    select 
        avg(payment_installments) avg_payment_installments
    from payment_dim;
'''
AVGPaymentInstallments = pd.read_sql(query, con=engine)
AVGPaymentInstallments['avg_payment_installments'][0]

2

What is the average spending time for user for our ecommerce?

In [7]:
query = '''
    select
        avg(datediff(day , do.date,dd.date)) avg_time
    from order_fact o
    join date_dim do
    on o.order_date_key = do.date_key
    join date_dim dd
    on o.delivered_date_key = dd.date_key
    where o.order_status = 'delivered'
'''
AVGUserSpendingTime = pd.read_sql(query, con=engine)
AVGUserSpendingTime['avg_time'][0]

12

What is the frequency of purchase on each state?

In [8]:
query = '''

    select 
        s.seller_state
        ,count(o.order_key) count
    from order_fact o
    join seller_dim s
    on o.seller_key = s.seller_key
    group by s.seller_state
    order by count desc;
    
'''

PurchaseFrequencyOfState = pd.read_sql(query, con=engine)
PurchaseFrequencyOfState.head(10)

Unnamed: 0,seller_state,count
0,BANTEN,35325
1,JAWA BARAT,11570
2,JAWA TENGAH,10688
3,KALIMANTAN TIMUR,9482
4,JAWA TIMUR,8648
5,DKI JAKARTA,8021
6,SUMATERA UTARA,5952
7,SULAWESI SELATAN,4502
8,RIAU,2907
9,LAMPUNG,2369


Which logistic route that have heavy traffic in our ecommerce?

In [9]:
query = '''
    select 
        s.seller_state
        ,s.seller_city
        ,u.user_state
        ,u.user_city
        ,count(o.order_key) count
    from order_fact o
    join seller_dim s
    on o.seller_key = s.seller_key
    join user_dim u
    on o.user_key = u.user_key
    group by u.user_city, u.user_state, s.seller_city, s.seller_state
    order by count desc;
'''

RoutesTraffic = pd.read_sql(query, con=engine)
RoutesTraffic.head(10)

Unnamed: 0,seller_state,seller_city,user_state,user_city,count
0,BANTEN,KOTA TANGERANG,BANTEN,KOTA TANGERANG,4995
1,BANTEN,KOTA TANGERANG,DKI JAKARTA,KOTA JAKARTA BARAT,1712
2,KALIMANTAN TIMUR,KABUPATEN BERAU,BANTEN,KOTA TANGERANG,1437
3,BANTEN,KOTA TANGERANG,BANTEN,KABUPATEN TANGERANG,740
4,KALIMANTAN TIMUR,KABUPATEN BERAU,DKI JAKARTA,KOTA JAKARTA BARAT,658
5,BANTEN,KOTA TANGERANG,JAWA BARAT,KABUPATEN BEKASI,567
6,JAWA BARAT,KABUPATEN BOGOR,BANTEN,KOTA TANGERANG,506
7,BANTEN,KOTA TANGERANG,DKI JAKARTA,KOTA JAKARTA SELATAN,463
8,BANTEN,KOTA TANGERANG,DKI JAKARTA,KOTA JAKARTA TIMUR,450
9,BANTEN,KOTA TANGERANG,DKI JAKARTA,KOTA JAKARTA UTARA,422


In [10]:
query = '''
    select 
        s.seller_state
        ,u.user_state
        ,count(o.order_key) count
    from order_fact o
    join seller_dim s
    on o.seller_key = s.seller_key
    join user_dim u
    on o.user_key = u.user_key
    group by u.user_state, s.seller_state
    order by count desc;
'''

StatesTraffic = pd.read_sql(query, con=engine)
StatesTraffic.head(10)

Unnamed: 0,seller_state,user_state,count
0,BANTEN,BANTEN,7885
1,BANTEN,JAWA BARAT,4670
2,BANTEN,DKI JAKARTA,4104
3,BANTEN,JAWA TIMUR,3162
4,BANTEN,JAWA TENGAH,3081
5,JAWA BARAT,BANTEN,2537
6,JAWA TENGAH,BANTEN,2390
7,KALIMANTAN TIMUR,BANTEN,2095
8,JAWA TIMUR,BANTEN,1818
9,JAWA BARAT,JAWA BARAT,1634


How many late delivered order in our ecommerce? Are late order affecting the customer satisfaction?

In [11]:
query = '''

    select 
        'Late' orders_type
        ,count(o.order_key) orders_count
        ,avg(feedback_score) avg_score
    from order_fact o
    join date_dim dd on o.delivered_date_key = dd.date_key
    join date_dim ed on o.estimated_time_delivery_key = ed.date_key
    left join feedback_dim f on o.order_key = f.order_key
    where dd.date > ed.date
    and f.feedback_score is not null
    union 
    select 
        'On Time' orders_type
        ,count(o.order_key) orders_count
        ,avg(feedback_score) avg_score
    from order_fact o
    join date_dim dd on o.delivered_date_key = dd.date_key
    join date_dim ed on o.estimated_time_delivery_key = ed.date_key
    left join feedback_dim f on o.order_key = f.order_key
    where dd.date <= ed.date
    and f.feedback_score is not null;

'''

OnTime_Late_Delivery = pd.read_sql(query, con=engine)
OnTime_Late_Delivery.head()

Unnamed: 0,orders_type,orders_count,avg_score
0,Late,6475,2
1,On Time,88040,4


How long are the delay for delivery / shipping process in each state?

In [12]:
query = ''' 
    select 
        s.seller_state [from]
        ,u.user_state [to]
        ,count(o.order_key) total_orders
        ,avg(datediff(day, ed.date, dd.date)) avg_delivery_days
    from order_fact o
    join seller_dim s on o.seller_key = s.seller_key
    join user_dim u on o.user_key = u.user_key
    join date_dim dd on o.delivered_date_key = dd.date_key
    join date_dim ed on o.estimated_time_delivery_key = ed.date_key
    where dd.date > ed.date
    group by u.user_state, s.seller_state
    order by avg_delivery_days desc
'''

LateDeliveryRoutes = pd.read_sql(query, con=engine)
LateDeliveryRoutes.head(10)

Unnamed: 0,from,to,total_orders,avg_delivery_days
0,ACEH,JAMBI,1,162
1,NUSA TENGGARA BARAT,DI YOGYAKARTA,1,96
2,BALI,KALIMANTAN SELATAN,1,94
3,JAWA TENGAH,KEPULAUAN BANGKA BELITUNG,2,87
4,KALIMANTAN TIMUR,SULAWESI BARAT,4,82
5,KALIMANTAN TENGAH,RIAU,1,76
6,SULAWESI BARAT,JAWA BARAT,1,68
7,PAPUA,RIAU,2,63
8,BALI,KEPULAUAN BANGKA BELITUNG,1,59
9,SULAWESI UTARA,KALIMANTAN TENGAH,2,58


In [13]:
query = ''' 

    select 
        u.user_state 
        ,count(o.order_key) total_orders
        ,avg(datediff(day, ed.date, dd.date)) avg_delivery_days
    from order_fact o
    join user_dim u on o.user_key = u.user_key
    join date_dim dd on o.delivered_date_key = dd.date_key
    join date_dim ed on o.estimated_time_delivery_key = ed.date_key
    where dd.date > ed.date
    group by u.user_state
    order by avg_delivery_days desc
    
'''

LateDeliveryStates = pd.read_sql(query, con=engine)
LateDeliveryStates.head(10)

Unnamed: 0,user_state,total_orders,avg_delivery_days
0,BENGKULU,22,20
1,SULAWESI BARAT,33,17
2,NUSA TENGGARA BARAT,22,16
3,KEPULAUAN BANGKA BELITUNG,31,15
4,KEPULAUAN RIAU,68,14
5,JAMBI,84,14
6,SULAWESI UTARA,92,14
7,SULAWESI TENGGARA,67,13
8,BALI,155,13
9,DKI JAKARTA,1326,11


How long are the difference between estimated delivery time and actual delivery time in each state?

In [14]:
query = '''

    select 
        u.user_state as state,
        avg(datediff(day, ed.date, dd.date)) as avg_delivery_diff,
        max(datediff(day, ed.date, dd.date)) as max_delivery_diff,
        min(datediff(day, ed.date, dd.date)) as min_delivery_diff,
        count(o.order_key) as total_orders
    from order_fact o
    join user_dim u on o.user_key = u.user_key
    join date_dim dd on o.delivered_date_key = dd.date_key
    join date_dim ed on o.estimated_time_delivery_key = ed.date_key
    group by u.user_state
    order by avg_delivery_diff desc

'''

DeliveryTimeByState = pd.read_sql(query, con=engine)
DeliveryTimeByState.head(10)


Unnamed: 0,state,avg_delivery_diff,max_delivery_diff,min_delivery_diff,total_orders
0,BALI,-11,69,-52,1557
1,BANTEN,-11,165,-78,24357
2,DI YOGYAKARTA,-11,105,-49,2052
3,JAMBI,-11,162,-42,1250
4,JAWA BARAT,-11,167,-84,14459
5,JAWA TENGAH,-11,161,-68,9720
6,JAWA TIMUR,-11,153,-147,9506
7,MALUKU UTARA,-11,24,-41,601
8,SULAWESI TENGAH,-11,45,-66,1097
9,SULAWESI TENGGARA,-12,52,-54,1008
