In [1]:
import pandas as pd

In [2]:
# Relative path of the enriched rides CSV read by this notebook.
CSV_PATH = "output/uber_rides_enriched.csv"

In [3]:
# Columns that hold timestamps; pandas parses them as dates while reading.
timestamp_columns = ['start_at', 'end_at', 'arrived_at', 'time_grid']

# Load the CSV into the working frame.
df = pd.read_csv(CSV_PATH, parse_dates=timestamp_columns)

In [4]:
# Print a summary of the loaded frame: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23111 entries, 0 to 23110
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   user_id                 23111 non-null  object        
 1   driver_id               19726 non-null  object        
 2   icon                    23111 non-null  object        
 3   start_type              23111 non-null  object        
 4   start_at                23111 non-null  datetime64[ns]
 5   start_lat               22212 non-null  float64       
 6   start_lon               22212 non-null  float64       
 7   end_at                  22835 non-null  datetime64[ns]
 8   end_lat                 22212 non-null  float64       
 9   end_lon                 22212 non-null  float64       
 10  end_state               23099 non-null  object        
 11  driver_start_lat        18310 non-null  float64       
 12  driver_start_lon        18310 non-null  float6

In [5]:
# Flag rides whose end_state is 'drop off' but whose start latitude is missing.
missing_start_lat = df['start_lat'].isna()
ended_in_drop_off = df['end_state'].eq('drop off')
mask = missing_start_lat & ended_in_drop_off

# Report how many rows are flagged, as a count and as a share of the frame.
print(f"Se eliminan {mask.sum()} ({mask.sum()/len(df):.2%}) filas de un total de {len(df)}")

# Keep only the unflagged rows, as an independent copy.
df = df[~mask].copy()

Se eliminan 533 (2.31%) filas de un total de 23111


In [6]:
# Split the rides by outcome. The comparison is False for rows whose
# end_state is missing, so those rows end up in df_n.
is_drop_off = df['end_state'] == 'drop off'

# Rides that ended in a drop-off. The explicit .copy() makes df_drop an
# independent frame, so later in-place assignments through df_drop.loc[...]
# write to it unambiguously instead of triggering SettingWithCopyWarning.
df_drop = df[is_drop_off].copy()

# All remaining rides, without the columns listed below.
# NOTE(review): presumably these columns are only meaningful for rides that
# reached a drop-off — confirm against the data dictionary.
df_n = df[~is_drop_off].drop(
    columns=[
        'end_lat',
        'end_lon',
        'arrived_at',
        'price_distance',
        'price_duration',
        'distance',
        'duration',
        'cost',
        'cost_distance',
        'cost_duration',
        'driver_score',
        'rider_score',
        'dist_start_to_end_m',
        'time_start_to_end_s',
    ]
)

In [7]:
# Print a summary of the drop-off subset: column dtypes and non-null counts.
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16884 entries, 0 to 23108
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   user_id                 16884 non-null  object        
 1   driver_id               16884 non-null  object        
 2   icon                    16884 non-null  object        
 3   start_type              16884 non-null  object        
 4   start_at                16884 non-null  datetime64[ns]
 5   start_lat               16884 non-null  float64       
 6   start_lon               16884 non-null  float64       
 7   end_at                  16880 non-null  datetime64[ns]
 8   end_lat                 16884 non-null  float64       
 9   end_lon                 16884 non-null  float64       
 10  end_state               16884 non-null  object        
 11  driver_start_lat        16180 non-null  float64       
 12  driver_start_lon        16180 non-null  float64    

In [8]:
# Largest tolerated discrepancy, in percent, between a duration column and the
# same duration recomputed from the timestamps. Rows above it are flagged.
ERR_THRESHOLD_PCT = 50

# Waiting time: the time_driver_to_start_s column vs. arrived_at - start_at,
# with negative gaps clipped to 0.
t_api_wait = df_drop['time_driver_to_start_s']
t_calc_wait = (
    (df_drop['arrived_at'] - df_drop['start_at'])
    .dt.total_seconds()
    .clip(lower=0)
)

# Trip time: the time_start_to_end_s column vs. end_at minus the later of
# start_at and arrived_at. The row-wise max skips missing values, so a row
# without arrived_at falls back to start_at.
t_api_trip = df_drop['time_start_to_end_s']
trip_start = df_drop[['start_at', 'arrived_at']].max(axis=1)
t_calc_trip = (df_drop['end_at'] - trip_start).dt.total_seconds()

# Absolute error as a percentage of the column value.
# A zero denominator yields inf (flagged below) or NaN; NaN, which also
# results from a missing timestamp, never compares greater than the threshold.
err_wait_pct = (t_api_wait - t_calc_wait).abs() / t_api_wait * 100
err_trip_pct = (t_api_trip - t_calc_trip).abs() / t_api_trip * 100

# Rows whose discrepancy exceeds ERR_THRESHOLD_PCT.
mask_wait = err_wait_pct > ERR_THRESHOLD_PCT
mask_trip = err_trip_pct > ERR_THRESHOLD_PCT
mask_any = mask_wait | mask_trip

# Report the counts.
print(f"Valores de espera a reemplazar por API: {mask_wait.sum()}")
print(f"Valores de viaje a reemplazar por API: {mask_trip.sum()}")
print(f"Total de filas a usar dato de API: {mask_any.sum()}")

# Flagged rows, with both error percentages attached for inspection.
df_flag = df_drop.loc[mask_any].copy()
df_flag['err_wait_pct'] = err_wait_pct[mask_any]
df_flag['err_trip_pct'] = err_trip_pct[mask_any]


Valores de espera a reemplazar por API: 10519
Valores de viaje a reemplazar por API: 9034
Total de filas a usar dato de API: 13537


In [9]:
# Blank out the timestamp behind each flagged discrepancy so that a duration
# recomputed from it comes out missing for that row.
# NOTE: this mutates df_drop in place; re-running the cell that builds the
# masks after this one will no longer reproduce the same masks.

# Waiting-time discrepancies: clear 'arrived_at'.
df_drop.loc[mask_wait, 'arrived_at'] = pd.NaT

# Trip-time discrepancies: clear 'end_at'.
df_drop.loc[mask_trip, 'end_at'] = pd.NaT

# Verification: every targeted cell must now be missing.
assert df_drop.loc[mask_wait, 'arrived_at'].isna().all()
assert df_drop.loc[mask_trip, 'end_at'].isna().all()
print(f"Filas modificadas: {mask_any.sum()}")


Filas modificadas: 13537
