# Setup and imports

In [13]:
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px


# Project root is taken as the directory one level above the current working
# directory (presumably the notebook runs from a subfolder of the project —
# TODO confirm).
ROOT = Path.cwd().parents[0]

# Processed analytics table that the notebook reads.
DATA = ROOT.joinpath("data", "processed", "analytics_table.parquet")

# Directory for exported figures; created up front so later saves cannot fail
# on a missing folder.
FIGS = ROOT.joinpath("reports", "figures")
FIGS.mkdir(parents=True, exist_ok=True)

def save_fig(fig, path: Path, *, scale: int = 2) -> None:
    """Write a figure to ``path`` as a static image.

    Args:
        fig: Object exposing ``write_image(path, scale=...)``; the notebook
            passes plotly figures.
        path: Destination file. Missing parent directories are created.
        scale: Keyword-only factor forwarded unchanged to ``write_image``.

    Returns:
        None.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    # write_image is handed a plain string rather than the Path object.
    destination = str(path)
    fig.write_image(destination, scale=scale)


# Loading the file
### And printing a small summary of the data

In [25]:
# Read the analytics table from the path configured in the setup cell.
df = pd.read_parquet(DATA)

# Gather the pieces of the summary first, then print them.
row_count = len(df)
leading_dtypes = df.dtypes.head(15)
most_missing = df.isna().sum().sort_values(ascending=False).head()

print("Number Of Rows : " , " " , row_count)
print("-----------------------------------------------------------")
print("Data Types : " , "\n" , leading_dtypes)
print("-----------------------------------------------------------")

print("Top missing values : " , " " , most_missing)
print("-------------------------------------------------------------------")

# Last expression of the cell: rich display of the first rows.
df.head(15)


Number Of Rows :    15
-----------------------------------------------------------
Data Types :  
 order_id               string[python]
user_id                string[python]
amount                        Float64
quantity                        Int64
created_at        datetime64[ns, UTC]
status                         object
status_clean           string[python]
amount__isna                     bool
quantity__isna                   bool
country                string[python]
signup_date                    object
amount_winsor                 Float64
dtype: object
-----------------------------------------------------------
Top missing values :    quantity         3
amount           3
amount_winsor    3
created_at       2
order_id         0
dtype: int64
-------------------------------------------------------------------


Unnamed: 0,order_id,user_id,amount,quantity,created_at,status,status_clean,amount__isna,quantity__isna,country,signup_date,amount_winsor
0,A0001,1,12.5,1.0,2025-12-01 10:05:00+00:00,Paid,paid,False,False,SA,2025-11-15,12.5
1,A0002,2,8.0,2.0,2025-12-01 11:10:00+00:00,paid,paid,False,False,SA,2025-11-20,8.495
2,A0003,3,,1.0,2025-12-02 09:00:00+00:00,Refund,refund,True,False,AE,2025-11-22,
3,A0004,1,25.0,,2025-12-03 14:30:00+00:00,PAID,paid,False,True,SA,2025-11-15,25.0
4,A0005,4,100.0,1.0,NaT,paid,paid,False,False,SA,2025-11-25,100.0
5,A0006,5,50.0,2.0,2025-12-05 13:00:00+00:00,Payment complete,payment complete,False,False,AE,2025-11-25,50.0
6,A0007,6,15.75,1.0,2025-12-06 16:45:00+00:00,paid,paid,False,False,US,2025-11-26,15.75
7,A0008,7,200.0,3.0,2025-12-07 10:00:00+00:00,PAID,paid,False,False,UK,2025-11-26,189.0
8,A0009,8,,2.0,2025-12-08 11:00:00+00:00,Paid,paid,True,False,SA,2025-11-27,
9,A0010,9,30.0,,2025-12-09 14:00:00+00:00,PENDING,pending,False,True,AE,2025-11-27,30.0


# Questions
- How does the total order amount change over time (daily)? 
- What is the average order amount per country?
- How many orders are refunded or have missing data?


# Question 1: How does the total order amount change over time (daily)?


In [None]:
# Derive a calendar-day key from the order timestamp so orders can be grouped per day.
df["Day"] = df["created_at"].dt.date

# Per-day order count and summed winsorized amount.
# dropna=False keeps orders without a timestamp as their own group instead of
# dropping them.
# Note: "sum" skips missing amounts, so a day whose orders all lack an amount
# is reported as 0 rather than as missing.
daily_revenu = (
    df.groupby("Day", dropna=False)
    .agg(num_orders=("order_id", "size"), revenu=("amount_winsor", "sum"))
    .reset_index()
    .sort_values("Day")
)
print(daily_revenu.head(10))

# A line chart is used because it shows the trend over time.
fig1 = px.line(
    daily_revenu,
    x="Day",
    y="revenu",
    title="Total Order Amount Over Time (Daily)",
)

# Save into the shared FIGS directory instead of rebinding FIGS to a
# cwd-relative path, and use a file name that describes this chart (the
# previous name, "revenue_by_country.png", described a different one).
save_fig(fig1, FIGS / "daily_revenue.png")


          Day  num_orders  revenu
0  2025-12-01           2  20.995
1  2025-12-02           1     0.0
2  2025-12-03           1    25.0
3  2025-12-05           1    50.0
4  2025-12-06           1   15.75
5  2025-12-07           1   189.0
6  2025-12-08           1     0.0
7  2025-12-09           1    30.0
8  2025-12-10           1    70.0
9  2025-12-11           1    65.0


# Insight from total order amount change over time
- From December 1st to 5th, daily revenue rose from about 21 SAR to 50 SAR, indicating healthy early momentum, before dipping to 15.75 SAR on December 6th.
- A dramatic spike to 189.0 SAR occurred on December 7th, likely driven by a large bulk order, a promotional event, or a special sale.
- Revenue plummeted to 0 SAR on December 8th, immediately after the high. This could reflect a data recording gap: the only order that day has a missing amount, and the same is true of the 0 SAR recorded on December 2nd.
### Caveat : 
- One possible caveat is that the data may be sparse or incomplete for some days (like Dec 2 and Dec 8), which could affect the accuracy of the trend. Always double-check for missing or erroneous entries. 

# Question 2: What is the average order amount per country?

In [24]:
# Build the table: per-country order count and mean order amount.
# dropna=False keeps orders with a missing country as their own group.
avg_amo_co = (
    df.groupby("country", dropna=False)
    .agg(order_num=("order_id", "size"), avg_amount=("amount", "mean"))
    .reset_index()
)
print(avg_amo_co.head(10))

# Bar chart of the average amount, one bar per country.
fig2 = px.bar(
    avg_amo_co,
    x="country",
    y="avg_amount",
    title="Average Order Amount per Country",
)

# Save into the shared FIGS directory instead of rebinding FIGS to a
# cwd-relative path here.
# NOTE(review): the file name keeps its original spelling ("averge") so any
# existing references to it keep working — rename deliberately if desired.
save_fig(fig2, FIGS / "averge_order_by_country.png")


  country  order_num  avg_amount
0      AE          3        40.0
1      EG          1        90.0
2      SA          7        42.1
3      UK          2       135.0
4      US          2      47.875


# Average Order Amount by Country – What I Found
- UK had the highest spend per order – around 135 SAR on average. That means either people there are buying pricier items, or they place fewer orders but spend more each time.
- Saudi Arabia (SA) had the most orders (7) but a lower average spend of ~42 SAR. This means customers there buy more often but spend less per order.
- UAE (AE), SA, and US had the lowest average order values, at ~40 SAR, ~42 SAR, and ~48 SAR respectively. They also weren’t the lowest in total orders, so it’s not a lack of traffic; it’s about how much people spend each time.

### Caveat:
- Countries like Egypt and UK have very few orders, which can make their averages misleading or not representative

# Question 3: How many orders are refunded or have missing data?

