# Services (hour)

In [78]:
import pandas as pd

## Database connection

In [79]:
import yaml
from sqlalchemy import create_engine
from sqlalchemy.engine import URL


def _build_url(settings):
    """Build a SQLAlchemy URL from one database section of ``config.yml``.

    ``URL.create`` escapes every component, so credentials containing
    characters such as ``@``, ``:`` or ``/`` no longer corrupt the URL the
    way plain string interpolation does.

    Parameters
    ----------
    settings : dict
        Mapping with the keys ``drivername``, ``user``, ``password``,
        ``host``, ``port`` and ``database_name``.

    Returns
    -------
    sqlalchemy.engine.URL
        URL object accepted directly by ``create_engine``.
    """
    return URL.create(
        drivername=str(settings["drivername"]),
        username=str(settings["user"]),
        password=str(settings["password"]),
        host=str(settings["host"]),
        port=settings["port"],
        database=str(settings["database_name"]),
    )


# The path is relative to the notebook's working directory.
with open("../config.yml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

config_OLTP = config["OLTP"]
config_OLAP = config["OLAP"]

# These are URL objects rather than plain strings; create_engine accepts both.
url_OLTP = _build_url(config_OLTP)
url_OLAP = _build_url(config_OLAP)

OLTP_connection = create_engine(url_OLTP)
OLAP_connection = create_engine(url_OLAP)

## Extraction

In [80]:
# Read every dimension table from the OLAP database, in full.
(
    time_dimension,
    courier_dimension,
    customer_dimension,
    office_dimension,
    service_status_dimension,
) = (
    pd.read_sql_table(table_name, OLAP_connection)
    for table_name in (
        "TIME_DIMENSION",
        "COURIER_DIMENSION",
        "CUSTOMER_DIMENSION",
        "OFFICE_DIMENSION",
        "SERVICE_STATUS_DIMENSION",
    )
)

In [81]:
# Source tables for the fact table, read in full from the OLTP database.
services, service_statuses, users_information = [
    pd.read_sql_table(table_name, OLTP_connection)
    for table_name in (
        "mensajeria_servicio",
        "mensajeria_estadosservicio",
        "clientes_usuarioaquitoy",
    )
]

## Transformation

In [82]:
# Keep only the services whose "mensajero_id" is present.
services = services[services["mensajero_id"].notna()]

In [83]:
# Narrow the frame to the identifiers and the request date/time parts
# that the following cells work with.
services = services.loc[
    :,
    ["id", "fecha_solicitud", "hora_solicitud", "cliente_id", "mensajero_id", "usuario_id"],
]

In [84]:
# Combine the request date and time into one timestamp truncated to the hour,
# then remove the two source columns.
# The drop result is assigned back: previously it was only displayed, so the
# columns stayed in `services`. `.assign` also avoids writing into a frame
# that was produced by slicing.
services = services.assign(
    date=pd.to_datetime(
        services["fecha_solicitud"].astype(str) + " " + services["hora_solicitud"].astype(str)
    ).dt.floor("h")
).drop(columns=["fecha_solicitud", "hora_solicitud"])
services

Unnamed: 0,id,cliente_id,mensajero_id,usuario_id,date
1,35,5,7.0,8,2023-10-26 11:00:00
6,46,4,12.0,22,2023-11-09 18:00:00
7,45,5,12.0,9,2023-11-09 17:00:00
8,47,5,12.0,8,2023-11-11 11:00:00
13,31,5,7.0,173,2023-10-13 17:00:00
...,...,...,...,...,...
28425,28267,11,8.0,121,2024-08-30 10:00:00
28426,28363,11,22.0,193,2024-08-30 16:00:00
28427,28403,5,30.0,11,2024-08-31 07:00:00
28428,28438,11,39.0,298,2024-08-31 10:00:00


In [85]:
# One row per service: the status record with the greatest "fecha".
# A stable sort keeps rows with equal "fecha" in their incoming order, so the
# row picked by keep="last" no longer depends on the sort algorithm's internals.
latest_statuses = (
    service_statuses
    .sort_values("fecha", kind="stable")
    .drop_duplicates(subset="servicio_id", keep="last")
)

In [86]:
# Only the status and the service it belongs to are carried forward.
latest_statuses = latest_statuses.loc[:, ["estado_id", "servicio_id"]]

In [87]:
# Rename the two remaining status columns to English names; "service_id" is
# the join key used when the statuses are merged into `services`.
# The former "id" -> "status_record_id" entry was removed: that column is not
# kept by the preceding column selection, so the entry never renamed anything.
latest_statuses = latest_statuses.rename(
    columns={
        "estado_id": "status_id",
        "servicio_id": "service_id",
    }
)

In [88]:
# Switch to the English column names used by the later joins. "usuario_id" is
# left untouched; it is consumed and dropped by the user lookup further down.
services = services.rename(columns={
    "id": "service_id",
    "cliente_id": "original_customer_id",
    "mensajero_id": "original_courier_id",
})

In [89]:
# Attach the latest status to each service; the left join keeps services
# that have no status record.
services = pd.merge(services, latest_statuses, how="left", on="service_id")

In [90]:
# Look up the "sede_id" of the user referenced by each service.
# Inner join: services whose "usuario_id" matches no user row are dropped.
services = (
    services
    .merge(
        users_information[["id", "sede_id"]],
        how="inner",
        left_on="usuario_id",
        right_on="id",
    )
    .drop(columns=["usuario_id", "id"])
)

In [91]:
# Translate the source "sede_id" into the OFFICE_DIMENSION key "office_id".
# Inner join: rows whose office is missing from the dimension are dropped.
services = (
    services
    .merge(
        office_dimension[["office_id", "original_office_id"]],
        how="inner",
        left_on="sede_id",
        right_on="original_office_id",
    )
    .drop(columns=["sede_id", "original_office_id"])
)

In [92]:
# Replace the hourly timestamp with the matching TIME_DIMENSION key.
# Inner join: hours absent from the time dimension are dropped.
# "status_id" is discarded here as well, so it does not reach the aggregation.
services = (
    services
    .merge(time_dimension[["time_id", "date"]], how="inner", on="date")
    .drop(columns=["date", "status_id"])
)

In [93]:
# Count services per (customer, courier, hour, office) combination.
services = (
    services
    .groupby(["original_customer_id", "original_courier_id", "time_id", "office_id"])
    .size()
    .rename("total_services")
    .reset_index()
)

In [94]:
# Swap the source customer id for the CUSTOMER_DIMENSION key "customer_id".
# Inner join: customers missing from the dimension are dropped.
service_fact = (
    services
    .merge(
        customer_dimension[["customer_id", "original_customer_id"]],
        how="inner",
        on="original_customer_id",
    )
    .drop(columns=["original_customer_id"])
)

In [95]:
# Preview the first rows of the partially built fact table.
service_fact.head(n=5)

Unnamed: 0,original_courier_id,time_id,office_id,total_services,customer_id
0,2.0,233880,15,1,1
1,7.0,195780,15,1,1
2,7.0,197160,15,1,1
3,7.0,201060,15,1,1
4,7.0,202560,15,2,1


In [None]:
# Swap the source courier id for the COURIER_DIMENSION key "courier_id".
# Inner join: couriers missing from the dimension are dropped.
service_fact = (
    service_fact
    .merge(
        courier_dimension[["courier_id", "original_courier_id"]],
        how="inner",
        on="original_courier_id",
    )
    .drop(columns=["original_courier_id"])
)

In [55]:
# Turn the current index into a column, call it "service_hour_id", and make
# that column the index again so the index carries that name.
service_fact = (
    service_fact
    .reset_index()
    .rename(columns={"index": "service_hour_id"})
    .set_index("service_hour_id")
)

## Load

In [56]:
from sqlalchemy import BigInteger


# SQL column types to use for the dimension keys when the table is written.
dtype_mapping = {
    key_column: BigInteger()
    for key_column in ("time_id", "customer_id", "courier_id", "office_id")
}

In [57]:
# Write the fact table to the OLAP database. An existing table with this name
# is replaced, and the index is written as a column.
service_fact.to_sql(
    name="SERVICE_HOUR_FACT_TABLE",
    con=OLAP_connection,
    if_exists="replace",
    index=True,
    dtype=dtype_mapping,
)

251