In [4]:
import os
import pymongo
import pymssql
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import json
import pandas as pd
from bson import ObjectId
# Load variables from a .env file into the process environment so the
# os.getenv lookup for the Mongo connection string further down can find it.
load_dotenv()

True

In [None]:
# ------------------------ Connect to Botit-Production Database ---------------------- #
# The connection string comes from the environment; it is never hard-coded here.
mongo_connection_string = os.environ.get("botit_mongo_connection_string")

# Fail fast when the variable is unset or empty.
if not mongo_connection_string:
    raise ValueError("MongoDB connection string is missing. Check environment variables.")

mongo_client = pymongo.MongoClient(mongo_connection_string)
mongo_db = mongo_client["botitprod"]
# This message only confirms the variable was found; no explicit ping or query
# is issued in this cell.
print("MongoDB connection string retrieved successfully.")

MongoDB connection string retrieved successfully.


In [15]:
# Handle to the "Orders" collection; the query helpers below read it as a
# module-level global.
orders_collection = mongo_db["Orders"]

In [19]:
def get_revenue(start_date=None, end_date=None, match_filter=None):
    """Sum ``price.total`` over the orders selected by the given filters.

    Args:
        start_date: Inclusive lower bound on ``createdAt``. A falsy value
            (e.g. ``None``) leaves the range open at the start.
        end_date: Exclusive upper bound on ``createdAt``. A falsy value
            (e.g. ``None``) leaves the range open at the end.
        match_filter: Optional ``$match`` document applied after the date
            filter. It may reference ``lastStatus``, which is the ``name`` of
            the last entry in the order's ``status`` array.

    Returns:
        The summed revenue, or 0 when the aggregation returns no document.
    """
    # Each bound is applied on its own, so a one-sided range is honoured
    # instead of being silently ignored.
    created_at_range = {}
    if start_date:
        created_at_range["$gte"] = start_date
    if end_date:
        created_at_range["$lt"] = end_date

    pipeline = []

    # Filter on the stored field first, before any computed field is added.
    if created_at_range:
        pipeline.append({"$match": {"createdAt": created_at_range}})

    # lastStatus is only needed when a caller-supplied filter can reference it;
    # the $project stage below discards it either way.
    if match_filter:
        pipeline.append(
            {"$addFields": {"lastStatus": {"$arrayElemAt": ["$status.name", -1]}}}
        )
        pipeline.append({"$match": match_filter})

    # Only plain non-negative decimal strings are converted; anything else
    # (missing, empty, malformed) counts as 0.
    # NOTE(review): $regexMatch needs a string input - presumably price.total
    # is always stored as a string; confirm against the data.
    numeric_total = {
        "$cond": {
            "if": {
                "$regexMatch": {
                    "input": {"$ifNull": ["$price.total", ""]},
                    "regex": "^[0-9]+(\\.[0-9]+)?$",
                }
            },
            "then": {"$toDouble": "$price.total"},
            "else": 0,
        }
    }

    pipeline += [
        {"$project": {"price": numeric_total}},
        {"$group": {"_id": None, "totalRevenue": {"$sum": "$price"}}},
    ]

    result = list(orders_collection.aggregate(pipeline))
    return result[0]["totalRevenue"] if result else 0

def print_revenue(start_date=None, end_date=None):
    """Print a delivered-versus-all-orders revenue summary.

    Calls ``get_revenue`` twice: once restricted to orders whose last status
    is "Delivered" and once for all orders, then prints both totals, the gap
    between them and each side's share of total revenue.

    Args:
        start_date: Forwarded to ``get_revenue``. It is shown in the headings
            only when ``end_date`` is given too, and must then support
            ``strftime``.
        end_date: Forwarded to ``get_revenue``; same display rule as
            ``start_date``.

    Returns:
        None. The report is written to stdout.
    """
    delivered_revenue = get_revenue(start_date, end_date, {"lastStatus": "Delivered"})
    total_revenue = get_revenue(start_date, end_date)
    undelivered_revenue = total_revenue - delivered_revenue

    if total_revenue:
        undelivered_percentage = (undelivered_revenue / total_revenue) * 100
        delivered_percentage = 100 - undelivered_percentage
    else:
        # With no revenue at all there is no share to attribute: report 0% for
        # both sides rather than a misleading "100% delivered".
        undelivered_percentage = 0
        delivered_percentage = 0

    date_range_str = f" ({start_date.strftime('%d/%m/%Y')} - {end_date.strftime('%d/%m/%Y')})" if start_date and end_date else ""

    print(f"Total Revenue (Delivered Orders){date_range_str}: {delivered_revenue:,.0f} EGP")
    print("—" * 100)
    print(f"Total Revenue (All Orders){date_range_str}: {total_revenue:,.0f} EGP")
    print("—" * 100)
    print(f"Revenue Gap (Undelivered Orders): {undelivered_revenue:,.0f} EGP ({undelivered_percentage:.1f}% of total revenue)")
    print(f"Delivery Success Rate: {delivered_percentage:.1f}% of total revenue comes from delivered orders, indicating that nearly {undelivered_percentage:.1f}% of revenue is tied to undelivered orders.")


In [20]:
# No date bounds: report revenue across every order in the collection.
start_date, end_date = None, None
print_revenue(start_date=start_date, end_date=end_date)

Total Revenue (Delivered Orders): 21,134,611 EGP
————————————————————————————————————————————————————————————————————————————————————————————————————
Total Revenue (All Orders): 31,232,956 EGP
————————————————————————————————————————————————————————————————————————————————————————————————————
Revenue Gap (Undelivered Orders): 10,098,345 EGP (32.3% of total revenue)
Delivery Success Rate: 67.7% of total revenue comes from delivered orders, indicating that nearly 32.3% of revenue is tied to undelivered orders.


In [None]:
def get_order_status_distribution():
    """Count orders by their latest status and compute each status' share.

    The latest status is the ``name`` of the last entry in an order's
    ``status`` array. The counting and the percentage are both computed
    server-side.

    Note:
        ``$setWindowFields`` requires MongoDB 5.0 or newer.

    Returns:
        pandas.DataFrame with columns ``Status``, ``Count`` (int) and
        ``Percentage`` (rounded to 2 decimals), ordered by ``Count``
        descending. The frame is empty, but has the same columns, when the
        aggregation returns nothing.
    """
    pipeline = [
        {
            "$set": {
                "lastStatus": {"$arrayElemAt": ["$status.name", -1]}
            }
        },
        {
            "$group": {
                "_id": "$lastStatus",
                "count": {"$sum": 1}
            }
        },
        {
            # An empty window spans all groups, so every row carries the grand total.
            "$setWindowFields": {
                "output": {
                    "total": {"$sum": "$count", "window": {}}
                }
            }
        },
        {
            "$project": {
                "Status": "$_id",
                "Count": "$count",
                "Percentage": {
                    "$multiply": [{"$divide": ["$count", "$total"]}, 100]
                }
            }
        },
        {
            "$sort": {"Count": -1}
        }
    ]

    rows = list(orders_collection.aggregate(pipeline))

    # No result: selecting the columns from an empty frame would raise
    # KeyError, so return an empty, correctly-typed table instead.
    if not rows:
        return pd.DataFrame(
            {
                "Status": pd.Series(dtype="object"),
                "Count": pd.Series(dtype=int),
                "Percentage": pd.Series(dtype="float64"),
            }
        )

    # Keep only the presentation columns (drops the raw _id) on a fresh copy.
    df = pd.DataFrame(rows)[["Status", "Count", "Percentage"]].copy()
    df["Count"] = df["Count"].fillna(0).astype(int)  # Ensure Count is integer
    df["Percentage"] = df["Percentage"].round(2)  # Round percentage to 2 decimal places

    return df

# Run the status aggregation; the bare name on the last line makes the notebook
# display the resulting table as the cell output.
df = get_order_status_distribution()
df

Unnamed: 0,Status,Count,Percentage
0,Delivered,34584,73.41
1,Customer_Cancelled,7206,15.3
2,Vendor_Cancelled,3101,6.58
3,3PL_Cancelled,834,1.77
4,Pending_Payment,572,1.21
5,Ready_for_Pickup,424,0.9
6,Payment_Failed,132,0.28
7,Payment_Cancelled,90,0.19
8,3PL_Accepted,62,0.13
9,3PL_Pickedup,29,0.06


In [25]:
def get_order_statistics(include_platform_revenue=False):
    """Summarise delivered orders per vendor shopping category.

    Only orders whose last ``status`` entry is named "Delivered" are counted.
    Each order is joined to its vendor through ``_vendor`` -> ``Vendors._id``.
    The ``$unwind`` on the lookup result drops orders with no matching vendor.

    Args:
        include_platform_revenue: When True, also return ``iosRevenue`` and
            ``androidRevenue`` columns (revenue from orders whose
            ``device.platform`` is "ios" / "android"). Defaults to False,
            which yields the original four-column table.

    Returns:
        pandas.DataFrame ordered by ``totalOrders`` descending, with columns
        ``Shopping Category``, ``totalOrders``, ``totalRevenue``, the optional
        per-platform revenue columns, and ``AOV`` (rounded ``totalRevenue``
        divided by ``totalOrders``, itself rounded to 2 decimals). The frame
        is empty, but has the same columns, when nothing matches.
    """
    # Only plain non-negative decimal strings count as a price; anything else is 0.
    numeric_total = {
        "$cond": {
            "if": {
                "$regexMatch": {
                    "input": {"$ifNull": ["$price.total", ""]},
                    "regex": "^[0-9]+(\\.[0-9]+)?$"
                }
            },
            "then": {"$toDouble": "$price.total"},
            "else": 0
        }
    }

    projection = {
        "_id": 0,
        "shoppingCategory": "$vendor.shoppingCategory",
        "price": numeric_total,
    }
    grouping = {
        "_id": "$shoppingCategory",
        "totalOrders": {"$sum": 1},
        "totalRevenue": {"$sum": "$price"},
    }
    columns = ["Shopping Category", "totalOrders", "totalRevenue"]
    revenue_columns = ["totalRevenue"]

    # The per-platform split is aggregated only on request, so the default call
    # does not compute columns it would discard.
    if include_platform_revenue:
        projection["device_platform"] = "$device.platform"
        for platform in ("ios", "android"):
            column = f"{platform}Revenue"
            grouping[column] = {
                "$sum": {
                    "$cond": [{"$eq": ["$device_platform", platform]}, "$price", 0]
                }
            }
            columns.append(column)
            revenue_columns.append(column)

    pipeline = [
        {"$addFields": {"lastStatus": {"$arrayElemAt": ["$status.name", -1]}}},
        {"$match": {"lastStatus": "Delivered"}},
        {
            "$lookup": {
                "from": "Vendors",
                "localField": "_vendor",
                "foreignField": "_id",
                "as": "vendor"
            }
        },
        {"$unwind": "$vendor"},
        {"$project": projection},
        {"$group": grouping},
        {"$sort": {"totalOrders": -1}},
    ]

    rows = list(orders_collection.aggregate(pipeline))

    # No result: selecting the columns from an empty frame would raise
    # KeyError, so return an empty table with the expected columns.
    if not rows:
        return pd.DataFrame(columns=columns + ["AOV"])

    # Rename for clarity, fix the column order, and work on a fresh copy.
    df = pd.DataFrame(rows).rename(columns={"_id": "Shopping Category"})[columns].copy()

    # Convert numeric values to appropriate formats
    df["totalOrders"] = df["totalOrders"].astype(int)
    df[revenue_columns] = df[revenue_columns].round(2)
    # totalOrders is a per-group document count, so it is at least 1 here.
    df["AOV"] = (df["totalRevenue"] / df["totalOrders"]).round(2)
    return df

# Build the per-category summary of delivered orders; the bare name on the last
# line makes the notebook display it as the cell output.
df_orders = get_order_statistics()
df_orders


Unnamed: 0,Shopping Category,totalOrders,totalRevenue,AOV
0,restaurants,12632,3934926.59,311.5
1,beauty,6457,3309751.78,512.58
2,fashion,6266,6088390.86,971.66
3,home_garden,2634,2850805.56,1082.31
4,specialityfood,1473,839830.93,570.15
5,groceries,1380,654307.95,474.14
6,pharmacies,1026,440584.87,429.42
7,kids,777,683672.16,879.89
8,stationary,499,229292.85,459.5
9,sports,413,368557.57,892.39
