# Services (daily)

In [152]:
import pandas as pd

## Database connection

In [153]:
import yaml
from sqlalchemy import create_engine


with open("../config.yml", "r") as file:
	config = yaml.safe_load(file)
	config_OLTP = config["OLTP"]
	config_OLAP = config["OLAP"]


url_OLTP = (f"{config_OLTP['drivername']}://{config_OLTP['user']}:{config_OLTP['password']}"
			f"@{config_OLTP['host']}:{config_OLTP['port']}/{config_OLTP['database_name']}")

url_OLAP = (f"{config_OLAP['drivername']}://{config_OLAP['user']}:{config_OLAP['password']}"
			f"@{config_OLAP['host']}:{config_OLAP['port']}/{config_OLAP['database_name']}")

OLTP_connection = create_engine(url_OLTP)
OLAP_connection = create_engine(url_OLAP)

## Extraction

In [154]:
# Load dimensions
time_dimension = pd.read_sql_table("TIME_DIMENSION", OLAP_connection)
courier_dimension = pd.read_sql_table("COURIER_DIMENSION", OLAP_connection)
customer_dimension = pd.read_sql_table("CUSTOMER_DIMENSION", OLAP_connection)
office_dimension = pd.read_sql_table("OFFICE_DIMENSION", OLAP_connection)
service_status_dimension = pd.read_sql_table("SERVICE_STATUS_DIMENSION", OLAP_connection)

In [155]:
# Load fact table data from OLTP
services = pd.read_sql_table("mensajeria_servicio", OLTP_connection)
service_statuses = pd.read_sql_table("mensajeria_estadosservicio", OLTP_connection)
users_information = pd.read_sql_table("clientes_usuarioaquitoy", OLTP_connection)

## Transformation

In [156]:
services["fecha_solicitud"] = services["fecha_solicitud"].dt.floor("d").dt.tz_localize(None)
time_dimension["date"] = time_dimension["date"].dt.floor("d").dt.tz_localize(None)

In [157]:
# Ensure `fecha` column is datetime
service_statuses["fecha"] = pd.to_datetime(service_statuses["fecha"])

In [158]:
# Sort by 'fecha' and drop duplicates to get the latest status per 'servicio_id'
latest_status = service_statuses.sort_values("fecha").drop_duplicates(
		"servicio_id", keep="last"
)

In [159]:
# Rename columns
latest_status = latest_status.rename(
		columns={
				"id": "status_record_id",
				"estado_id": "status_id",
				"servicio_id": "service_id",
		}
)

In [160]:
# Rename columns in services for consistency
services = services.rename(
		columns={
				"id": "service_id",
				"cliente_id": "customer_id",
				"mensajero_id": "courier_id",
		}
)

In [161]:
null_couriers = services["courier_id"].isna().sum()
print(f"Number of services with no assigned courier: {null_couriers}")

Number of services with no assigned courier: 727


In [162]:
# Ignore services with no assigned courier
services = services.dropna(subset=["courier_id"])

In [163]:
# Merge the latest status with services
services = services.merge(
		latest_status[["service_id", "status_id"]], on="service_id", how="left"
)

In [164]:
# Ensure that 'time_dimension["date"]' is in datetime format
request_date = pd.to_datetime(time_dimension["date"]).dt.normalize()
print(request_date)

0        2023-09-18
1        2023-09-18
2        2023-09-18
3        2023-09-18
4        2023-09-18
            ...    
502556   2024-08-31
502557   2024-08-31
502558   2024-08-31
502559   2024-08-31
502560   2024-09-01
Name: date, Length: 502561, dtype: datetime64[ns]


In [165]:
# Create a mapping from request_date to time_id
date_mapping = dict(zip(request_date, time_dimension["time_id"]))
print(date_mapping)

{Timestamp('2023-09-18 00:00:00'): 1439, Timestamp('2023-09-19 00:00:00'): 2879, Timestamp('2023-09-20 00:00:00'): 4319, Timestamp('2023-09-21 00:00:00'): 5759, Timestamp('2023-09-22 00:00:00'): 7199, Timestamp('2023-09-23 00:00:00'): 8639, Timestamp('2023-09-24 00:00:00'): 10079, Timestamp('2023-09-25 00:00:00'): 11519, Timestamp('2023-09-26 00:00:00'): 12959, Timestamp('2023-09-27 00:00:00'): 14399, Timestamp('2023-09-28 00:00:00'): 15839, Timestamp('2023-09-29 00:00:00'): 17279, Timestamp('2023-09-30 00:00:00'): 18719, Timestamp('2023-10-01 00:00:00'): 20159, Timestamp('2023-10-02 00:00:00'): 21599, Timestamp('2023-10-03 00:00:00'): 23039, Timestamp('2023-10-04 00:00:00'): 24479, Timestamp('2023-10-05 00:00:00'): 25919, Timestamp('2023-10-06 00:00:00'): 27359, Timestamp('2023-10-07 00:00:00'): 28799, Timestamp('2023-10-08 00:00:00'): 30239, Timestamp('2023-10-09 00:00:00'): 31679, Timestamp('2023-10-10 00:00:00'): 33119, Timestamp('2023-10-11 00:00:00'): 34559, Timestamp('2023-10-12

In [166]:
# Map 'fecha_solicitud' to 'date_mapping'
services["fecha_solicitud"] = pd.to_datetime(services["fecha_solicitud"]).dt.normalize()
services["time_id"] = services["fecha_solicitud"].map(date_mapping)
print(services["time_id"])

0         56159
1         76319
2         76319
3         79199
4         37439
          ...  
27698    501119
27699    501119
27700    502559
27701    502559
27702    502559
Name: time_id, Length: 27703, dtype: int64


In [167]:
# Check for unmapped 'time_id's
services = services.dropna(subset=["time_id"])

In [168]:
services = services.drop_duplicates(subset=["service_id"], keep="last")

In [169]:
services["total_services_per_day"] = services.groupby(["courier_id", "customer_id", "time_id"])["time_id"].transform("count")
print(services.head())

   service_id                                 descripcion nombre_solicitante  \
0          35                            Recogervmx a las           chat_GPT   
1          46  Recoger sangre en remedios para farallones           chat_GPT   
2          45          Recoger mx en toma de mx a las 830           chat_GPT   
3          47                  Recoger mx martes 8:30 a.m           chat_GPT   
4          31                                           M           chat_GPT   

  fecha_solicitud hora_solicitud fecha_deseada hora_deseada nombre_recibe  \
0      2023-10-26       11:18:14    2023-10-26     11:18:14        Gemini   
1      2023-11-09       18:11:22    2023-11-09     18:11:22        Gemini   
2      2023-11-09       17:54:40    2023-11-10     17:54:40        Gemini   
3      2023-11-11       11:31:56    2023-11-14     11:31:56        Gemini   
4      2023-10-13       17:21:26    2023-10-13     17:21:26        Gemini   

  telefono_recibe descripcion_pago  ...  descripcion_mul

In [170]:
services = services.drop_duplicates(subset=['courier_id', 'customer_id', 'time_id'], keep='first')

In [171]:
# Select necessary columns for the fact table
service_fact = services[
		[
			"time_id",
			"customer_id",
			"courier_id",
			"total_services_per_day",
			"usuario_id"
		]
]

In [172]:
service_fact.head()

Unnamed: 0,time_id,customer_id,courier_id,total_services_per_day,usuario_id
0,56159,5,7.0,1,8
1,76319,4,12.0,1,22
2,76319,5,12.0,1,9
3,79199,5,12.0,2,8
4,37439,5,7.0,3,173


In [173]:
null_couriers = service_fact["courier_id"].isna().sum()
print(f"Number of records with 'courier_id' empty: {null_couriers}")

Number of records with 'courier_id' empty: 0


In [174]:
service_fact = service_fact.merge(courier_dimension, left_on="courier_id", right_on="original_courier_id", how="inner")
service_fact.drop(columns=["courier_id_x", "courier_id_y", "courier_city"], inplace=True)
service_fact.rename(columns={ "original_courier_id": "courier_id" }, inplace=True)

In [None]:
service_fact = service_fact.merge(users_information[["id", "sede_id"]], left_on="usuario_id", right_on="id", how="inner")
service_fact.drop(columns=["usuario_id", "id"], inplace=True)

In [None]:
service_fact = service_fact.merge(office_dimension[["office_id", "original_office_id"]], left_on="sede_id", right_on="original_office_id", how="inner")
service_fact.drop(columns=["sede_id", "original_office_id"], inplace=True)

In [185]:
service_fact.head()

Unnamed: 0,time_id,customer_id,total_services_per_day,courier_id,office_id
0,184319,7,1,17,44
1,179999,11,2,24,27
2,182879,7,1,17,44
3,179999,7,1,17,44
4,383039,25,2,24,13


In [186]:
service_fact.reset_index(inplace=True)
service_fact = service_fact.rename(columns={ "index": "service_daily_id" })
service_fact.set_index("service_daily_id", inplace=True)

## Load

In [187]:
from sqlalchemy import BigInteger


dtype_mapping = {
	"time_id": BigInteger(),
	"customer_id": BigInteger(),
	"courier_id": BigInteger(),
	"office_id": BigInteger(),
}

In [188]:
# Save the fact table to OLAP
service_fact.to_sql(
	"SERVICE_DAILY_FACT_TABLE", OLAP_connection, if_exists="replace", index=True, dtype=dtype_mapping
)

53