In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import tensorflow as tf

In [2]:
def _read_keyed_csv(path):
    """Read one CSV export with 'PedidoId' parsed as text and 'Unnamed' columns removed."""
    frame = pd.read_csv(path, dtype={"PedidoId": "string"}, low_memory=False)
    is_named = ~frame.columns.str.startswith("Unnamed")
    return frame.loc[:, is_named]


df_ped = _read_keyed_csv("data/pedidos.csv")
df_prod = _read_keyed_csv("data/produccion.csv")

# Trim surrounding whitespace from the join key in both tables so equal IDs compare equal
for df in (df_ped, df_prod):
    df["PedidoId"] = df["PedidoId"].astype("string").str.strip()

In [3]:
# Diagnostics: is PedidoId a unique key on either side, and how badly would a raw join fan out?
ped_keys = df_ped['PedidoId']
prod_keys = df_prod['PedidoId']
print('PedidoId unique in df_ped:', ped_keys.is_unique)
print('PedidoId unique in df_prod:', prod_keys.is_unique)
print('Duplicate counts -> df_ped:', ped_keys.duplicated().sum(), 'df_prod:', prod_keys.duplicated().sum())

# Rows per key on each side; the inner join keeps only keys present in both tables.
left_counts = df_ped.groupby('PedidoId').size().rename('left_n')
right_counts = df_prod.groupby('PedidoId').size().rename('right_n')
sizes = left_counts.to_frame().join(right_counts, how='inner')

# A many-to-many join emits left_n * right_n rows for each shared key.
sizes['product'] = sizes['left_n'] * sizes['right_n']
print('Top exploding PedidoId (left_n * right_n):')
worst_keys = sizes.sort_values('product', ascending=False).head(10)
display(worst_keys)

# Enforce left one-to-many: keep one row per PedidoId in df_ped, then merge with validation.
# NOTE(review): keep='first' retains only the first row (in file order) for each PedidoId and
# discards the rest. df_ped seems to carry per-line fields (PedidoDetalleId, Talla, Cantidad),
# so confirm that dropping the other lines is intended.
df_ped_unique = df_ped.drop_duplicates('PedidoId', keep='first')

# pandas matches null join keys against each other (unlike SQL), so a df_ped row without a
# PedidoId would be paired with every df_prod row that also lacks one. Exclude null-key
# production rows from the right side; a null-key order stays in the result, unmatched.
df_prod_keyed = df_prod[df_prod['PedidoId'].notna()]
print(f"Collapsed {len(df_ped) - len(df_ped_unique)} duplicate-key rows in df_ped; "
      f"excluded {len(df_prod) - len(df_prod_keyed)} null-key rows from df_prod")

df_unique = pd.merge(
    df_ped_unique,
    df_prod_keyed,
    on='PedidoId',
    how='left',
    suffixes=('_ped', '_prod'),
    validate='one_to_many',
)
# A left merge must carry every left key through, no more and no fewer.
assert df_unique['PedidoId'].nunique(dropna=False) == len(df_ped_unique)

output_path = 'data/join_pedidos_produccion_unique_left.csv'
df_unique.to_csv(output_path, index=False)
print(f"Saved left one-to-many merge to {output_path} (rows={len(df_unique)}, cols={df_unique.shape[1]})")
df_unique.head()


PedidoId unique in df_ped: False
PedidoId unique in df_prod: False
Duplicate counts -> df_ped: 219167 df_prod: 40231
Top exploding PedidoId (left_n * right_n):


Unnamed: 0_level_0,left_n,right_n,product
PedidoId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31995,113,128,14464
31182,100,98,9800
27538,140,68,9520
18209,125,62,7750
12494,90,86,7740
31071,96,70,6720
28669,105,62,6510
12305,96,52,4992
33238,113,44,4972
20960,170,28,4760


Saved left one-to-many merge to data/join_pedidos_produccion_unique_left.csv (rows=72746, cols=47)


Unnamed: 0,PedidoId,NumeroPedido,FechaIngreso,NumeroTulas,OrdenesGeneradas,PedidoDetalleId,Talla,MaterialPedido,Cantidad,Fondo,...,Velocidad,Puntadas,Plastico,MaquinaPlasticoId,TiempoQuitarPlastico,Canutillo,Chenille,Cordon,Presion,Temperatura
0,1,P2,2020-03-13 11:10:40.9153456,1.0,1,1,32,1706,12,Indigo,...,,1017.0,,,,,,,,
1,2,P3,2020-03-13 11:33:30.2220018,1.0,1,2,24,1843,66,Indigo,...,,438.0,,,,,,,,
2,2,P3,2020-03-13 11:33:30.2220018,1.0,1,2,24,1843,66,Indigo,...,,438.0,,,,,,,,
3,3,P4,2020-03-13 11:34:42.0784881,1.0,1,7,UNICA,1138,30,Indigo,...,,,,,,,,,,
4,4,P5,2020-03-13 12:02:58.5231326,1.0,1,8,2XS,7840,2,Chaqueta-Blanca,...,,2000.0,,,,,,,,
