# Services per hour: ETL for SERVICE_HOUR_FACT_TABLE

In [24]:
import pandas as pd

## Database connection

In [25]:
from urllib.parse import quote

import yaml
from sqlalchemy import create_engine


def _build_database_url(settings):
    """Assemble a SQLAlchemy connection URL from one section of config.yml.

    Args:
        settings: Mapping with the keys ``drivername``, ``user``, ``password``,
            ``host``, ``port`` and ``database_name``.

    Returns:
        The URL string ``drivername://user:password@host:port/database_name``.

    The user and password are percent-encoded so that reserved characters
    (``@``, ``:``, ``/``, ``%`` ...) inside the credentials cannot be mistaken
    for URL delimiters when the URL is parsed.
    NOTE(review): this assumes config.yml stores the raw credentials, not
    values that were already percent-encoded by hand -- confirm.
    """
    # str() keeps purely numeric YAML values (parsed as int) working.
    user = quote(str(settings["user"]), safe="")
    password = quote(str(settings["password"]), safe="")
    return (
        f"{settings['drivername']}://{user}:{password}"
        f"@{settings['host']}:{settings['port']}/{settings['database_name']}"
    )


# Read the connection settings for both databases from the shared config file
with open("../config.yml", "r") as file:
    config = yaml.safe_load(file)
    config_OLTP = config["OLTP"]
    config_OLAP = config["OLAP"]


url_OLTP = _build_database_url(config_OLTP)
url_OLAP = _build_database_url(config_OLAP)

OLTP_connection = create_engine(url_OLTP)
OLAP_connection = create_engine(url_OLAP)

## Extraction

In [26]:
# Read the five dimension tables in full from the OLAP database
(
    time_dimension,
    courier_dimension,
    customer_dimension,
    office_dimension,
    service_status_dimension,
) = (
    pd.read_sql_table(dimension_table, OLAP_connection)
    for dimension_table in (
        "TIME_DIMENSION",
        "COURIER_DIMENSION",
        "CUSTOMER_DIMENSION",
        "OFFICE_DIMENSION",
        "SERVICE_STATUS_DIMENSION",
    )
)

In [27]:
# Read the OLTP source tables the fact table is built from
services, service_statuses, users_information = [
    pd.read_sql_table(source_table, OLTP_connection)
    for source_table in (
        "mensajeria_servicio",
        "mensajeria_estadosservicio",
        "clientes_usuarioaquitoy",
    )
]

## Transformation

In [28]:
# Keep only the services whose mensajero_id (courier) is not null
has_courier = services["mensajero_id"].notna()
services = services[has_courier]

In [29]:
# Narrow the services frame down to the columns used by the later steps
relevant_columns = [
    "id",
    "fecha_solicitud",
    "hora_solicitud",
    "cliente_id",
    "mensajero_id",
    "usuario_id",
]
services = services.loc[:, relevant_columns]

In [30]:
# Build one timestamp per status row (fecha + hora), then keep, for every
# servicio_id, the row that sorts last by that timestamp
service_statuses["hora_str"] = service_statuses["hora"].astype(str)
status_date = pd.to_datetime(service_statuses["fecha"])
status_offset = pd.to_timedelta(service_statuses["hora_str"])
service_statuses["datetime"] = status_date + status_offset

statuses_by_time = service_statuses.sort_values("datetime")
latest_statuses = statuses_by_time.drop_duplicates("servicio_id", keep="last")

In [31]:
# Give the status columns the English names used in the rest of the notebook
status_column_names = {"estado_id": "status_id", "servicio_id": "service_id"}
latest_statuses = latest_statuses.rename(columns=status_column_names)

In [32]:
# Give the service key columns the names the later merges join on
service_column_names = {
    "id": "service_id",
    "cliente_id": "original_customer_id",
    "mensajero_id": "original_courier_id",
}
services = services.rename(columns=service_column_names)

In [33]:
# Attach each service's latest status_id; the left join keeps services
# that have no status row at all
status_lookup = latest_statuses[["service_id", "status_id"]]
services = pd.merge(services, status_lookup, on="service_id", how="left")

In [34]:
# Combine request date and time into one timestamp, floored to the hour
services["hora_solicitud_str"] = services["hora_solicitud"].astype(str)
request_date = pd.to_datetime(services["fecha_solicitud"])
request_offset = pd.to_timedelta(services["hora_solicitud_str"])
services["request_time"] = (request_date + request_offset).dt.floor("h")

In [35]:
# Parse the dimension's "date" column as datetime so it can be matched
# against the datetime values in request_time
time_dimension = time_dimension.assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [36]:
# Lookup table: datetime from the time dimension -> its time_id
time_mapping = {
    moment: time_key
    for moment, time_key in zip(time_dimension["date"], time_dimension["time_id"])
}

In [37]:
# Translate each hourly request_time into a time_id; values missing from
# the lookup come out as NaN
services = services.assign(time_id=services["request_time"].map(time_mapping))

In [38]:
# Report how many services did not get a time_id from the lookup
unmapped_times = services["time_id"].isnull().sum()
if unmapped_times:
    print(f"Warning: {unmapped_times} records do not have a mapped time_id.")

In [39]:
# Look up the office (sede_id) of the user behind each service; the inner
# join drops services whose usuario_id has no match. The join keys are
# removed once they have served their purpose.
office_by_user = users_information[["id", "sede_id"]]
services = pd.merge(
    services,
    office_by_user,
    left_on="usuario_id",
    right_on="id",
    how="inner",
).drop(columns=["usuario_id", "id"])

In [40]:
# Swap the source sede_id for the warehouse office_id; the inner join drops
# services whose office is missing from the dimension
office_keys = office_dimension[["office_id", "original_office_id"]]
services = pd.merge(
    services,
    office_keys,
    left_on="sede_id",
    right_on="original_office_id",
    how="inner",
).drop(columns=["sede_id", "original_office_id"])

In [41]:
# Count services per (customer, courier, hour, office) combination
grouping_keys = ["original_customer_id", "original_courier_id", "time_id", "office_id"]
services_per_group = services.groupby(grouping_keys).size()
hourly_agg = services_per_group.reset_index(name="total_services")

In [42]:
# Swap original_customer_id for the warehouse customer_id; the inner join
# drops rows whose customer is missing from the dimension
customer_keys = customer_dimension[["customer_id", "original_customer_id"]]
hourly_agg = pd.merge(
    hourly_agg, customer_keys, on="original_customer_id", how="inner"
).drop(columns=["original_customer_id"])

In [43]:
# Swap original_courier_id for the warehouse courier_id; the inner join
# drops rows whose courier is missing from the dimension
courier_keys = courier_dimension[["courier_id", "original_courier_id"]]
hourly_agg = pd.merge(
    hourly_agg, courier_keys, on="original_courier_id", how="inner"
).drop(columns=["original_courier_id"])

In [44]:
# Turn the current row index into the index named "service_hour_id"
hourly_agg = (
    hourly_agg.reset_index()
    .rename(columns={"index": "service_hour_id"})
    .set_index("service_hour_id")
)

## Load

In [45]:
from sqlalchemy import BigInteger


# SQL column types for the four dimension-key columns of the fact table
dtype_mapping = {
    key_column: BigInteger()
    for key_column in ("time_id", "customer_id", "courier_id", "office_id")
}

In [46]:
# Write the fact table to the OLAP database, replacing any existing table of
# the same name; the index is written as a column too
hourly_agg.to_sql(
    name="SERVICE_HOUR_FACT_TABLE",
    con=OLAP_connection,
    if_exists="replace",
    index=True,
    dtype=dtype_mapping,
)

251