# Acummulating snapshot

In [1]:
import numpy as np
import pandas as pd

## Database connection

In [2]:
import yaml
from sqlalchemy import create_engine


with open("../config.yml", "r") as file:
	config = yaml.safe_load(file)
	config_OLTP = config["OLTP"]
	config_OLAP = config["OLAP"]


url_OLTP = (f"{config_OLTP['drivername']}://{config_OLTP['user']}:{config_OLTP['password']}"
			f"@{config_OLTP['host']}:{config_OLTP['port']}/{config_OLTP['database_name']}")

url_OLAP = (f"{config_OLAP['drivername']}://{config_OLAP['user']}:{config_OLAP['password']}"
			f"@{config_OLAP['host']}:{config_OLAP['port']}/{config_OLAP['database_name']}")

OLTP_connection = create_engine(url_OLTP)
OLAP_connection = create_engine(url_OLAP)

## Extraction

In [3]:
time_dimension = pd.read_sql_table("TIME_DIMENSION", OLAP_connection)
service_status = pd.read_sql_table("mensajeria_estado", OLTP_connection)
service_statuses = pd.read_sql_table("mensajeria_estadosservicio", OLTP_connection)
# service_statuses = service_statuses.head(1000)

## Transformation

In [4]:
service_statuses.drop(columns=["foto", "observaciones", "es_prueba", "foto_binary"], inplace=True)

In [5]:
service_statuses["hora"] = service_statuses["hora"].astype(str)
service_statuses["date"] = pd.to_datetime(
	service_statuses["fecha"].astype(str) + " " + service_statuses["hora"],
	infer_datetime_format=True,
	errors="coerce"
)

invalid_records_number = service_statuses["date"].isna().sum()
print(f"Invalid records number: {invalid_records_number}")

if invalid_records_number > 0:
	invalid_entries = service_statuses[service_statuses["date"].isna()]
	print("Invalid records: ")
	print(invalid_entries[["fecha", "hora"]])


Invalid records number: 51
Invalid records: 
           fecha             hora
676   2023-12-28  11:20:20.603000
677   2023-12-28  11:20:27.311000
686   2023-10-31  12:02:48.844000
802   2023-12-28  10:50:48.748000
803   2023-12-28  10:51:00.065000
806   2023-12-28  10:53:11.601000
807   2023-12-28  10:53:19.016000
808   2023-12-28  10:53:25.357000
811   2023-12-28  10:59:24.505000
812   2023-12-28  10:59:30.531000
813   2023-12-28  10:59:36.579000
822   2023-12-28  11:20:32.434000
825   2023-12-28  14:01:33.010000
828   2023-12-28  14:15:17.274000
829   2023-12-28  14:15:37.236000
834   2023-12-28  19:09:32.485000
835   2023-12-28  19:10:01.692000
837   2023-12-28  19:14:14.753000
838   2023-12-28  19:14:43.554000
839   2023-12-28  19:23:44.718000
840   2023-12-28  19:23:48.731000
841   2023-12-28  19:24:16.948000
851   2023-12-28  19:45:13.652000
852   2023-12-28  19:45:15.098000
853   2023-12-28  19:45:41.915000
854   2023-12-28  19:45:42.962000
855   2023-12-28  19:46:16.291000
856

  service_statuses["date"] = pd.to_datetime(


In [6]:
service_statuses["date"] = service_statuses["date"].dt.floor("min")

In [7]:
service_statuses.drop(columns=["fecha", "hora"], inplace=True)

In [8]:
service_statuses = pd.merge(service_statuses, service_status[["id", "nombre"]],
								left_on="estado_id", right_on="id", how="left")

In [9]:
service_statuses.drop(columns=["estado_id", "id_y"], inplace=True)
service_statuses.rename(
	columns={
		"id_x": "id",
		"servicio_id": "service_id",
		"nombre": "status"
	}, inplace=True
)

In [10]:
acummulating_snapshot_fact_table = service_statuses[["service_id"]].drop_duplicates().reset_index(drop=True)

In [11]:
date_columns = ["request_time_id", "assignment_time_id", "pickup_time_id", "delivery_time_id", "closure_time_id"]
for col in date_columns:
	acummulating_snapshot_fact_table[col] = pd.NaT

for _, row in service_statuses.iterrows():
	service_id = row["service_id"]
	status = row["status"]
	date = row["date"]

	if status == "Iniciado":
		acummulating_snapshot_fact_table.loc[acummulating_snapshot_fact_table["service_id"] == service_id,
												"request_time_id"] = date
	elif status == "Con mensajero Asignado":
		acummulating_snapshot_fact_table.loc[acummulating_snapshot_fact_table["service_id"] == service_id,
												"assignment_time_id"] = date
	elif status == "Recogido por mensajero":
		acummulating_snapshot_fact_table.loc[acummulating_snapshot_fact_table["service_id"] == service_id,
												"pickup_time_id"] = date
	elif status == "Entregado en destino":
		acummulating_snapshot_fact_table.loc[acummulating_snapshot_fact_table["service_id"] == service_id,
												"delivery_time_id"] = date
	elif status == "Terminado completo":
		acummulating_snapshot_fact_table.loc[acummulating_snapshot_fact_table["service_id"] == service_id,
												"closure_time_id"] = date


In [12]:
def substract_series(series_1, series_2):
	result = series_1 - series_2
	result[pd.isna(series_1) | pd.isna(series_2)] = pd.Timedelta(0)
	return result

acummulating_snapshot_fact_table["request_assignment_time"] = substract_series(acummulating_snapshot_fact_table["assignment_time_id"], acummulating_snapshot_fact_table["request_time_id"])
acummulating_snapshot_fact_table["assignment_pickup_time"] = substract_series(acummulating_snapshot_fact_table["pickup_time_id"], acummulating_snapshot_fact_table["assignment_time_id"])
acummulating_snapshot_fact_table["pickup_delivery_time"] = substract_series(acummulating_snapshot_fact_table["delivery_time_id"], acummulating_snapshot_fact_table["pickup_time_id"])
acummulating_snapshot_fact_table["delivery_closure_time"] = substract_series(acummulating_snapshot_fact_table["closure_time_id"], acummulating_snapshot_fact_table["delivery_time_id"])

In [13]:
acummulating_snapshot_fact_table["request_assignment_time"] = acummulating_snapshot_fact_table["request_assignment_time"].dt.total_seconds()
acummulating_snapshot_fact_table["assignment_pickup_time"] = acummulating_snapshot_fact_table["assignment_pickup_time"].dt.total_seconds()
acummulating_snapshot_fact_table["pickup_delivery_time"] = acummulating_snapshot_fact_table["pickup_delivery_time"].dt.total_seconds()
acummulating_snapshot_fact_table["delivery_closure_time"] = acummulating_snapshot_fact_table["delivery_closure_time"].dt.total_seconds()

In [14]:
time_mapping = dict(zip(time_dimension["date"], time_dimension["time_id"]))

In [15]:
acummulating_snapshot_fact_table["request_time_id"] = acummulating_snapshot_fact_table["request_time_id"].map(time_mapping)
acummulating_snapshot_fact_table["assignment_time_id"] = acummulating_snapshot_fact_table["assignment_time_id"].map(time_mapping)
acummulating_snapshot_fact_table["pickup_time_id"] = acummulating_snapshot_fact_table["pickup_time_id"].map(time_mapping)
acummulating_snapshot_fact_table["delivery_time_id"] = acummulating_snapshot_fact_table["delivery_time_id"].map(time_mapping)
acummulating_snapshot_fact_table["closure_time_id"] = acummulating_snapshot_fact_table["closure_time_id"].map(time_mapping)

In [16]:
acummulating_snapshot_fact_table.reset_index(inplace=True)
acummulating_snapshot_fact_table.rename(columns={ "index": "acummulating_snapshot_id" }, inplace=True)
acummulating_snapshot_fact_table.set_index("acummulating_snapshot_id", inplace=True)

In [17]:
acummulating_snapshot_fact_table.head()

Unnamed: 0_level_0,service_id,request_time_id,assignment_time_id,pickup_time_id,delivery_time_id,closure_time_id,request_assignment_time,assignment_pickup_time,pickup_delivery_time,delivery_closure_time
acummulating_snapshot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,226,191517.0,191547.0,191593.0,191619.0,,1800.0,2760.0,1560.0,0.0
1,79,150580.0,155057.0,155259.0,194084.0,194085.0,268620.0,12120.0,2329500.0,60.0
2,613,203675.0,203682.0,203700.0,203734.0,203746.0,420.0,1080.0,2040.0,720.0
3,376,195652.0,196730.0,196730.0,196730.0,,64680.0,0.0,0.0,0.0
4,7164,290401.0,290470.0,290476.0,290557.0,,4140.0,360.0,4860.0,0.0


## Load

In [18]:
from sqlalchemy import BigInteger


dtype_mapping = {
	"request_time_id": BigInteger(),
	"assignment_time_id": BigInteger(),
	"pickup_time_id": BigInteger(),
	"delivery_time_id": BigInteger(),
	"closure_time_id": BigInteger()
}

In [19]:
acummulating_snapshot_fact_table.to_sql(
	"ACUMMULATING_SNAPSHOT_FACT_TABLE", OLAP_connection, if_exists="replace", index=True, dtype=dtype_mapping
)

430