# Services (daily)

In [218]:
import pandas as pd

## Database connection

In [238]:
import yaml
from sqlalchemy import create_engine


with open("../config.yml", "r") as file:
	config = yaml.safe_load(file)
	config_OLTP = config["OLTP"]
	config_OLAP = config["OLAP"]


url_OLTP = (f"{config_OLTP['drivername']}://{config_OLTP['user']}:{config_OLTP['password']}"
			f"@{config_OLTP['host']}:{config_OLTP['port']}/{config_OLTP['database_name']}")

url_OLAP = (f"{config_OLAP['drivername']}://{config_OLAP['user']}:{config_OLAP['password']}"
			f"@{config_OLAP['host']}:{config_OLAP['port']}/{config_OLAP['database_name']}")

OLTP_connection = create_engine(url_OLTP)
OLAP_connection = create_engine(url_OLAP)

## Extraction

In [239]:
# Load dimensions
time_dimension = pd.read_sql_table("TIME_DIMENSION", OLAP_connection)
courier_dimension = pd.read_sql_table("COURIER_DIMENSION", OLAP_connection)
customer_dimension = pd.read_sql_table("CUSTOMER_DIMENSION", OLAP_connection)
office_dimension = pd.read_sql_table("OFFICE_DIMENSION", OLAP_connection)
service_status_dimension = pd.read_sql_table("SERVICE_STATUS_DIMENSION", OLAP_connection)

In [240]:
# Load fact table data from OLTP
services = pd.read_sql_table("mensajeria_servicio", OLTP_connection)
service_statuses = pd.read_sql_table("mensajeria_estadosservicio", OLTP_connection)

## Transformation

In [241]:
# Ensure `fecha` column is datetime
service_statuses["fecha"] = pd.to_datetime(service_statuses["fecha"])

In [242]:
# Sort by 'fecha' and drop duplicates to get the latest status per 'servicio_id'
latest_status = service_statuses.sort_values("fecha").drop_duplicates(
		"servicio_id", keep="last"
)

In [243]:
# Rename columns
latest_status = latest_status.rename(
		columns={
				"id": "status_record_id",
				"estado_id": "status_id",
				"servicio_id": "service_id",
		}
)

In [244]:
# Rename columns in services for consistency
services = services.rename(
		columns={
				"id": "service_id",
				"cliente_id": "customer_id",
				"mensajero_id": "courier_id",
				"origen_id": "origin_office_id",
				"destino_id": "destination_office_id",
		}
)

In [245]:
null_couriers = services["courier_id"].isna().sum()
print(f"Number of services with no assigned courier: {null_couriers}")

Number of services with no assigned courier: 727


In [247]:
# Ignore services with no assigned courier
services = services.dropna(subset=["courier_id"])

In [248]:
# Merge the latest status with services
services = services.merge(
		latest_status[["service_id", "status_id"]], on="service_id", how="left"
)

In [249]:
# Ensure that 'time_dimension["date"]' is in datetime format
request_date = pd.to_datetime(time_dimension["date"]).dt.normalize()
print(request_date)

0        2023-09-18
1        2023-09-18
2        2023-09-18
3        2023-09-18
4        2023-09-18
            ...    
502556   2024-08-31
502557   2024-08-31
502558   2024-08-31
502559   2024-08-31
502560   2024-09-01
Name: date, Length: 502561, dtype: datetime64[ns]


In [250]:
# Create a mapping from request_date to time_id
date_mapping = dict(zip(request_date, time_dimension["time_id"]))
print(date_mapping)

{Timestamp('2023-09-18 00:00:00'): 1439, Timestamp('2023-09-19 00:00:00'): 2879, Timestamp('2023-09-20 00:00:00'): 4319, Timestamp('2023-09-21 00:00:00'): 5759, Timestamp('2023-09-22 00:00:00'): 7199, Timestamp('2023-09-23 00:00:00'): 8639, Timestamp('2023-09-24 00:00:00'): 10079, Timestamp('2023-09-25 00:00:00'): 11519, Timestamp('2023-09-26 00:00:00'): 12959, Timestamp('2023-09-27 00:00:00'): 14399, Timestamp('2023-09-28 00:00:00'): 15839, Timestamp('2023-09-29 00:00:00'): 17279, Timestamp('2023-09-30 00:00:00'): 18719, Timestamp('2023-10-01 00:00:00'): 20159, Timestamp('2023-10-02 00:00:00'): 21599, Timestamp('2023-10-03 00:00:00'): 23039, Timestamp('2023-10-04 00:00:00'): 24479, Timestamp('2023-10-05 00:00:00'): 25919, Timestamp('2023-10-06 00:00:00'): 27359, Timestamp('2023-10-07 00:00:00'): 28799, Timestamp('2023-10-08 00:00:00'): 30239, Timestamp('2023-10-09 00:00:00'): 31679, Timestamp('2023-10-10 00:00:00'): 33119, Timestamp('2023-10-11 00:00:00'): 34559, Timestamp('2023-10-12

In [254]:
# Map 'fecha_solicitud' to 'date_mapping'
services["fecha_solicitud"] = pd.to_datetime(services["fecha_solicitud"]).dt.normalize()
services["time_id"] = services["fecha_solicitud"].map(date_mapping)
print(services["time_id"])

0         56159
2         76319
6         37439
10        79199
15        86399
          ...  
27695    469439
27696    475199
27697    491039
27699    501119
27702    502559
Name: time_id, Length: 251, dtype: int64


In [255]:
# Check for unmapped 'time_id's
services = services.dropna(subset=["time_id"])
services = services.drop_duplicates("time_id", keep="last")

In [256]:
# Select necessary columns for the fact table
service_fact = services[
		[
				"service_id",
				"customer_id",
				"courier_id",
				"status_id",
				"origin_office_id",
				"destination_office_id",
				"time_id",
		]
]

In [257]:
# Add service_fact_table_id
service_fact = service_fact.reset_index().rename(
		columns={"index": "service_fact_hour_table_id"}
)

In [258]:
service_fact.head(10)

Unnamed: 0,service_fact_hour_table_id,service_id,customer_id,courier_id,status_id,origin_office_id,destination_office_id,time_id
0,0,35,5,7.0,5,236,214,56159
1,2,45,5,12.0,4,242,214,76319
2,6,29,5,7.0,5,256,229,37439
3,10,48,5,12.0,1,239,214,79199
4,15,53,5,7.0,6,15,214,86399
5,18,59,5,7.0,2,258,214,119519
6,20,57,5,7.0,5,15,223,116639
7,24,51,5,7.0,5,237,214,83519
8,25,80,5,7.0,4,254,229,151199
9,28,63,11,3.0,5,26,38,123839


In [259]:
null_couriers = service_fact["courier_id"].isna().sum()
print(f"Number of records with 'courier_id' empty: {null_couriers}")

Number of records with 'courier_id' empty: 0


## Load

In [260]:
# Save the fact table to OLAP
service_fact.to_sql(
		"SERVICE_FACT_DAILY_TABLE", OLAP_connection, if_exists="replace", index=False
)

251