# Setup and imports

In [13]:
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px


# Project root is taken as the parent of the current working directory
# (presumably the notebook is launched from a sub-folder of the repo — confirm).
ROOT = Path.cwd().parents[0]

# Input: the processed analytics table read by the loading cell.
DATA = ROOT.joinpath("data", "processed", "analytics_table.parquet")

# Output: folder for exported figures; created up front so saving never fails on a missing directory.
FIGS = ROOT.joinpath("reports", "figures")
FIGS.mkdir(parents=True, exist_ok=True)

def save_fig(fig, path: Path, *, scale: int = 2) -> None:
    """Export a figure to an image file, creating the target folder if needed.

    Args:
        fig: Figure object exposing ``write_image(file, scale=...)``
            (a plotly figure in this notebook).
        path: Destination file path; missing parent directories are created.
        scale: Scale factor forwarded to ``write_image`` (keyword-only).
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = str(path)
    fig.write_image(destination, scale=scale)


# Loading the file
### And printing a small summary of the data

In [14]:
# Read the processed analytics table into memory.
df = pd.read_parquet(DATA)

# Gather the summary pieces first, then print them as one short report.
row_count = len(df)
leading_dtypes = df.dtypes.head(15)
missing_per_column = df.isna().sum()
top_missing = missing_per_column.sort_values(ascending=False).head()

print("Number Of Rows : " , " " , row_count)
print("-----------------------------------------------------------")
print("Data Types : " , "\n" , leading_dtypes)
print("-----------------------------------------------------------")

print("Top missing values : " , " " , top_missing)
print("-------------------------------------------------------------------")

# Last expression of the cell: rich preview of the first rows.
df.head()


Number Of Rows :    15
-----------------------------------------------------------
Data Types :  
 order_id               string[python]
user_id                string[python]
amount                        Float64
quantity                        Int64
created_at        datetime64[ns, UTC]
status                         object
status_clean           string[python]
amount__isna                     bool
quantity__isna                   bool
country                string[python]
signup_date                    object
amount_winsor                 Float64
dtype: object
-----------------------------------------------------------
Top missing values :    quantity         3
amount           3
amount_winsor    3
created_at       2
order_id         0
dtype: int64
-------------------------------------------------------------------


Unnamed: 0,order_id,user_id,amount,quantity,created_at,status,status_clean,amount__isna,quantity__isna,country,signup_date,amount_winsor
0,A0001,1,12.5,1.0,2025-12-01 10:05:00+00:00,Paid,paid,False,False,SA,2025-11-15,12.5
1,A0002,2,8.0,2.0,2025-12-01 11:10:00+00:00,paid,paid,False,False,SA,2025-11-20,8.495
2,A0003,3,,1.0,2025-12-02 09:00:00+00:00,Refund,refund,True,False,AE,2025-11-22,
3,A0004,1,25.0,,2025-12-03 14:30:00+00:00,PAID,paid,False,True,SA,2025-11-15,25.0
4,A0005,4,100.0,1.0,NaT,paid,paid,False,False,SA,2025-11-25,100.0


# Questions
- Which columns have the most missing values?
- How does the total order amount change over time (daily)? 
- What is the average order amount per country?
- What is the monthly trend in total order amount over time?


# Question 1: How does the total order amount change over time (daily)?


In [15]:
# Derive a calendar-day key from the order timestamp so orders can be grouped per day.
# Rows with a missing created_at get a missing Day as well.
df["Day"] = df["created_at"].dt.date

# One row per day: number of orders and summed amount.
# dropna=False keeps the missing-Day group instead of silently dropping those orders.
# NOTE: the sum skips missing amounts, so a day whose amounts are all missing
# shows a revenue of 0 rather than a missing value — read zeros with care.
daily_revenu = (
    df.groupby("Day", dropna=False)
    .agg(num_orders=("order_id", "size"), revenu=("amount", "sum"))
    .reset_index()
    .sort_values("Day")
)
print(daily_revenu.head(10))

# A line chart is used because it shows the trend over time.
fig = px.line(
    daily_revenu,
    x="Day",
    y="revenu",
    title="Total Order Amount Over Time (Daily)",
)

# Save into the project-level FIGS folder defined in the setup cell. FIGS is
# deliberately not rebound here: a relative Path("reports/figures") would resolve
# against the notebook's working directory and leak into every later cell.
# The file name describes this chart (daily trend), not a per-country breakdown.
save_fig(fig, FIGS / "daily_revenue.png")


          Day  num_orders  revenu
0  2025-12-01           2    20.5
1  2025-12-02           1     0.0
2  2025-12-03           1    25.0
3  2025-12-05           1    50.0
4  2025-12-06           1   15.75
5  2025-12-07           1   200.0
6  2025-12-08           1     0.0
7  2025-12-09           1    30.0
8  2025-12-10           1    70.0
9  2025-12-11           1    65.0


# Insight from total order amount change over time
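- From December 1st to 5th, daily revenue rose overall from 20.5 to 50 SAR, then dipped to 15.75 SAR on December 6th. The 0 on December 2nd comes from that day's only order having a missing amount, not from a real drop in sales.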
- A dramatic spike to 200 SAR occurred on December 7th — likely driven by a large bulk order, a promotional event, or a special sale.
- Revenue dropped to 0 SAR on December 8th, immediately after the high. One order was recorded that day, so the zero most likely reflects a missing amount (a data recording gap) rather than a day without sales.
### Caveat:
- The data is sparse and incomplete for some days (for example Dec 2 and Dec 8), which affects the accuracy of the trend. Always double-check for missing or erroneous entries before drawing conclusions.