In [None]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from statsmodels.api import Logit, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [102]:
CSV_PATH = 'output/uber_rides_enriched.csv'

In [103]:
df = pd.read_csv(
        CSV_PATH,
        parse_dates=['start_at','end_at','arrived_at','time_grid'],
    )

# Localizamos las fechas en la zona horaria de Lima
for col in ['start_at', 'end_at', 'arrived_at', 'time_grid']:
    df[col] = df[col].dt.tz_localize(
        'America/Lima',
        ambiguous='NaT',
        nonexistent='shift_forward'
    )

In [None]:
df.drop(
    columns=[
        'duration',
        'distance'
    ],
    inplace=True
)

In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23111 entries, 0 to 23110
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype                       
---  ------                  --------------  -----                       
 0   user_id                 23111 non-null  object                      
 1   driver_id               19726 non-null  object                      
 2   icon                    23111 non-null  object                      
 3   start_type              23111 non-null  object                      
 4   start_at                23111 non-null  datetime64[ns, America/Lima]
 5   start_lat               22212 non-null  float64                     
 6   start_lon               22212 non-null  float64                     
 7   end_at                  22835 non-null  datetime64[ns, America/Lima]
 8   end_lat                 22212 non-null  float64                     
 9   end_lon                 22212 non-null  float64                     
 10

In [105]:
# Filas con coordenadas nulas y end_state == 'drop off'
mask = (
    df['start_lat'].isna() &
    (df['end_state'] == 'drop off')
)

# Cuántas filas son
print(f"Se eliminan {mask.sum()} ({mask.sum()/len(df):.2%}) filas de un total de {len(df)}")

# Eliminar esas filas
df = df.loc[~mask].copy()

Se eliminan 533 (2.31%) filas de un total de 23111


In [106]:
df_drop = df[
    df.end_state == 'drop off'
].copy()

df_n = df[
    ~(df.end_state == 'drop off')
].drop(
    columns=[
        'end_lat',
        'end_lon',
        'arrived_at',
        'price_distance',
        'price_duration',
        'distance',
        'duration',
        'cost',
        'cost_distance',
        'cost_duration',
        'driver_score',
        'rider_score',
        'dist_start_to_end_m',
        'time_start_to_end_s',
    ]
).copy()

In [107]:
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16884 entries, 0 to 23108
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype                       
---  ------                  --------------  -----                       
 0   user_id                 16884 non-null  object                      
 1   driver_id               16884 non-null  object                      
 2   icon                    16884 non-null  object                      
 3   start_type              16884 non-null  object                      
 4   start_at                16884 non-null  datetime64[ns, America/Lima]
 5   start_lat               16884 non-null  float64                     
 6   start_lon               16884 non-null  float64                     
 7   end_at                  16880 non-null  datetime64[ns, America/Lima]
 8   end_lat                 16884 non-null  float64                     
 9   end_lon                 16884 non-null  float64                     
 10  end

In [108]:
# Series de valores calculados
t_api_wait = df_drop['time_driver_to_start_s']
t_calc_wait = (df_drop['arrived_at'] - df_drop['start_at']).dt.total_seconds().clip(lower=0)

t_api_trip = df_drop['time_start_to_end_s']
t_calc_trip = (
    df_drop['end_at']
    - pd.DataFrame({'1': df_drop['start_at'], '2': df_drop['arrived_at']})
      .max(axis=1)
)
t_calc_trip = t_calc_trip.dt.total_seconds()

# Error porcentual
err_wait_pct = (t_api_wait - t_calc_wait).abs() / t_api_wait * 100
err_trip_pct = (t_api_trip - t_calc_trip).abs() / t_api_trip * 100

# Umbral del 20%
mask_wait = err_wait_pct > 50
mask_trip = err_trip_pct > 50
mask_any  = mask_wait | mask_trip

# Imprimir conteos
print(f"Valores de espera a reemplazar por API: {mask_wait.sum()}")
print(f"Valores de viaje a reemplazar por API: {mask_trip.sum()}")
print(f"Total de filas a usar dato de API: {mask_any.sum()}")

# DataFrame de filas para reemplazo
df_flag = df_drop.loc[mask_any].copy()
df_flag['err_wait_pct'] = err_wait_pct[mask_any]
df_flag['err_trip_pct'] = err_trip_pct[mask_any]


Valores de espera a reemplazar por API: 10519
Valores de viaje a reemplazar por API: 9034
Total de filas a usar dato de API: 13537


In [109]:
# Reemplazar las columnas de fechas por NaT en las filas que superen el umbral
#   Para las discrepancias de espera, anulamos 'arrived_at' (así falla el cálculo de espera).
df_drop.loc[mask_wait, ['arrived_at']] = pd.NaT

#   Para las discrepancias de trayecto, anulamos 'end_at' (así falla el cálculo de duración de viaje).
df_drop.loc[mask_trip, ['end_at']]      = pd.NaT

# Verificación
print(f"Filas modificadas: {mask_any.sum()}")


Filas modificadas: 13537


In [110]:
df_drop['hour'] = df_drop['start_at'].dt.hour
df_drop['weekday'] = df_drop['start_at'].dt.weekday      # 0=Lunes … 6=Domingo
df_drop['weekday_name'] = df_drop['start_at'].dt.day_name()

In [111]:
# --- 1) Crear wait_calc y trip_calc ---
df_drop['wait_calc'] = (
    df_drop['arrived_at'] - df_drop['start_at']
).dt.total_seconds().clip(lower=0)
df_drop['trip_calc'] = (
    df_drop['end_at']
    - pd.DataFrame({'1': df_drop['start_at'], '2': df_drop['arrived_at']})
      .max(axis=1)
).dt.total_seconds()

In [None]:
def select_features(
    df, candidates, target,
    corr_thresh=0.1,
    imp_cum_thresh=0.9,
    vif_thresh=10,
    miss_pval=0.05,
    min_complete_cases=20
):
    print(f"\n--- Selecting features for target '{target}' ---")
    # 1) DataFrame de casos completos para este target
    cols_fc = candidates + [target]
    df_comp = df[cols_fc].dropna()
    print(f"Complete cases for '{target}': {df_comp.shape[0]} rows")
    if df_comp.shape[0] < min_complete_cases:
        print(f"  Not enough complete cases (<{min_complete_cases}); returning all candidates.")
        return candidates.copy()

    # 2) Missingness (Logistic MAR test)
    flag = df[target].isna().astype(int)
    X_miss = df[candidates].fillna(df[candidates].median())
    try:
        logit = Logit(flag, add_constant(X_miss)).fit(disp=False)
        pvals = logit.pvalues.drop('const', errors='ignore')
        miss_preds = pvals[pvals < miss_pval].index.tolist()
        print(f"Predictors of missingness (p < {miss_pval}): {miss_preds}")
    except Exception as e:
        miss_preds = []
        print(f"  Could not run MAR test: {e}")

    # 3) Univariate correlation
    corrs = df_comp.corr()[target].abs().drop(target)
    corr_preds = corrs[corrs >= corr_thresh].index.tolist()
    print(f"Predictors by correlation ≥ {corr_thresh}: {corr_preds}")

    # 4) Initial union
    init_preds = list(dict.fromkeys(miss_preds + corr_preds))
    if not init_preds:
        init_preds = candidates.copy()
        print("  No predictors from MAR/corr; using all candidates.")
    print(f"Initial candidate predictors: {init_preds}")

    # 5) RandomForest importances
    X_rf, y_rf = df_comp[init_preds], df_comp[target]
    rf = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
    rf.fit(X_rf, y_rf)
    imp = pd.Series(rf.feature_importances_, index=init_preds).sort_values(ascending=False)
    cum_imp = imp.cumsum()
    # select features up to cumulative importance threshold
    sel = cum_imp[cum_imp <= imp_cum_thresh].index.tolist()
    remaining = [f for f in imp.index if f not in sel]
    if remaining and (cum_imp.iloc[len(sel)] < imp_cum_thresh):
        sel.append(remaining[0])
    if not sel:
        sel = init_preds.copy()
        print("  No features passed importance threshold; falling back to init_preds.")

    print("RandomForest importances and cumulative sums:")
    for feat, importance in imp.items():
        print(f"  {feat}: {importance:.4f} (cumulative: {cum_imp[feat]:.4f})")
    print(f"Selected by importance (cum ≤ {imp_cum_thresh}): {sel}")
    if remaining:
        print(f"Dropped by importance: {remaining}")

    # 6) VIF filtering, dropping the least-important among high-VIF features
    features = sel.copy()
    if len(features) > 1:
        try:
            def compute_vif_matrix(X, feats):
                arr = X[feats].dropna().values
                return pd.Series(
                    [variance_inflation_factor(arr, i) for i in range(len(feats))],
                    index=feats
                )

            mat = df_comp[features].dropna().values
            if mat.shape[0] > mat.shape[1]:
                print(f"Starting VIF filtering (threshold={vif_thresh}) on: {features}")
                vif_series = compute_vif_matrix(df_comp, features)
                while True:
                    high_vif = vif_series[vif_series > vif_thresh]
                    if high_vif.empty or len(features) == 1:
                        print("  No further VIF > threshold; done.")
                        break

                    # drop the high-VIF feature with the lowest RF importance
                    to_drop = imp.loc[high_vif.index].idxmin()
                    print(f"  Dropping '{to_drop}' (VIF={vif_series[to_drop]:.1f}, lowest imp={imp[to_drop]:.4f})")
                    features.remove(to_drop)

                    mat = df_comp[features].dropna().values
                    if mat.shape[0] <= mat.shape[1]:
                        print("  Insufficient rows for further VIF; stopping.")
                        break
                    vif_series = compute_vif_matrix(df_comp, features)

                print(f"Features after VIF filtering: {features}")
            else:
                print("  Skipping VIF: not enough complete rows vs. features.")
        except Exception as e:
            print(f"  Skipping VIF due to error: {e}")
    else:
        print("  Skipping VIF: need at least 2 features to compute.")

    return features


In [None]:
wait_candidates = [
    'time_driver_to_start_s',      # API’s estimate of driver travel
    'dist_driver_to_start_m',      # distance driver→pickup
    'hour', 'weekday',             # temporal context at request
    't2m', 'tp', 'u10', 'v10'  # weather at request
]

trip_candidates = [
    'time_start_to_end_s',         # API’s estimate of trip
    'dist_start_to_end_m',         # distance pickup→dropoff
    'hour', 'weekday',             # temporal context at pickup
    't2m', 'tp', 'u10', 'v10'  # weather at pickup
]


# --- 4) Ejecutar selección para cada target ---
wait_features = select_features(df_drop, wait_candidates, 'wait_calc')
trip_features = select_features(df_drop, trip_candidates, 'trip_calc')

print("Features selected for wait_calc:")
print(wait_features)
print("\nFeatures selected for trip_calc:")
print(trip_features)



--- Selecting features for target 'wait_calc' ---
Complete cases for 'wait_calc': 304 rows




Predictors of missingness (p < 0.05): ['time_driver_to_start_s', 'dist_driver_to_start_m', 'hour', 'weekday', 't2m', 'tp', 'v10']
Predictors by correlation ≥ 0.1: ['time_driver_to_start_s', 'dist_driver_to_start_m', 'hour', 'weekday', 't2m', 'u10', 'v10']
Initial candidate predictors: ['time_driver_to_start_s', 'dist_driver_to_start_m', 'hour', 'weekday', 't2m', 'tp', 'v10', 'u10']
RandomForest importances and cumulative sums:
  time_driver_to_start_s: 0.6803 (cumulative: 0.6803)
  dist_driver_to_start_m: 0.2095 (cumulative: 0.8898)
  u10: 0.0286 (cumulative: 0.9184)
  t2m: 0.0251 (cumulative: 0.9435)
  v10: 0.0250 (cumulative: 0.9685)
  tp: 0.0176 (cumulative: 0.9862)
  weekday: 0.0083 (cumulative: 0.9945)
  hour: 0.0055 (cumulative: 1.0000)
Selected by importance (cum ≤ 0.9): ['time_driver_to_start_s', 'dist_driver_to_start_m']
Dropped by importance: ['u10', 't2m', 'v10', 'tp', 'weekday', 'hour']
Starting VIF filtering (threshold=10) on: ['time_driver_to_start_s', 'dist_driver_to_sta



Predictors of missingness (p < 0.05): ['time_start_to_end_s', 'dist_start_to_end_m', 'hour', 'weekday', 'tp']
Predictors by correlation ≥ 0.1: ['time_start_to_end_s', 'dist_start_to_end_m', 'hour', 't2m', 'tp', 'u10']
Initial candidate predictors: ['time_start_to_end_s', 'dist_start_to_end_m', 'hour', 'weekday', 'tp', 't2m', 'u10']
RandomForest importances and cumulative sums:
  time_start_to_end_s: 0.3641 (cumulative: 0.3641)
  dist_start_to_end_m: 0.2975 (cumulative: 0.6616)
  u10: 0.1084 (cumulative: 0.7700)
  t2m: 0.0761 (cumulative: 0.8462)
  tp: 0.0664 (cumulative: 0.9125)
  weekday: 0.0505 (cumulative: 0.9630)
  hour: 0.0370 (cumulative: 1.0000)
Selected by importance (cum ≤ 0.9): ['time_start_to_end_s', 'dist_start_to_end_m', 'u10', 't2m']
Dropped by importance: ['tp', 'weekday', 'hour']
Starting VIF filtering (threshold=10) on: ['time_start_to_end_s', 'dist_start_to_end_m', 'u10', 't2m']
  Dropping 't2m' (VIF=10.3, lowest imp=0.0761)
  Dropping 'dist_start_to_end_m' (VIF=15.2,

In [114]:
wait_feats = wait_features + ['wait_calc']
trip_features.remove('u10')
trip_feats = trip_features + ['trip_calc']

# 2) Configure imputers
imp_wait = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1),
    max_iter=10,
    random_state=0
)
imp_trip = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1),
    max_iter=10,
    random_state=0
)

# 3) Fit & transform wait_calc
wait_array = imp_wait.fit_transform(df_drop[wait_feats])
df_drop['wait_calc'] = wait_array[:, wait_feats.index('wait_calc')]

# 4) Fit & transform trip_calc
trip_array = imp_trip.fit_transform(df_drop[trip_feats])
df_drop['trip_calc'] = trip_array[:, trip_feats.index('trip_calc')]

# 5) Reconstruct arrived_at where needed
mask_arr = df_drop['arrived_at'].isna()
df_drop.loc[mask_arr, 'arrived_at'] = (
    df_drop.loc[mask_arr, 'start_at']
    + pd.to_timedelta(df_drop.loc[mask_arr, 'wait_calc'], unit='s')
)

# 6) Reconstruct end_at where needed
trip_start = pd.DataFrame({
    '1': df_drop['start_at'],
    '2': df_drop['arrived_at']
}).max(axis=1)
mask_end = df_drop['end_at'].isna()
df_drop.loc[mask_end, 'end_at'] = (
    trip_start.loc[mask_end]
    + pd.to_timedelta(df_drop.loc[mask_end, 'trip_calc'], unit='s')
)

# 7) Report
print(f"wait_calc was imputed in {mask_arr.sum()} rows")
print(f"trip_calc was imputed in {mask_end.sum()} rows")




wait_calc was imputed in 10520 rows
trip_calc was imputed in 9038 rows


In [115]:
# Series de valores calculados
t_api_wait = df_drop['time_driver_to_start_s']
t_calc_wait = (df_drop['arrived_at'] - df_drop['start_at']).dt.total_seconds().clip(lower=0)

t_api_trip = df_drop['time_start_to_end_s']
t_calc_trip = (
    df_drop['end_at']
    - pd.DataFrame({'1': df_drop['start_at'], '2': df_drop['arrived_at']})
      .max(axis=1)
)
t_calc_trip = t_calc_trip.dt.total_seconds()

# Error porcentual
err_wait_pct = (t_api_wait - t_calc_wait).abs() / t_api_wait * 100
err_trip_pct = (t_api_trip - t_calc_trip).abs() / t_api_trip * 100

# Umbral del 20%
mask_wait = err_wait_pct > 50
mask_trip = err_trip_pct > 50
mask_any  = mask_wait | mask_trip

# Imprimir conteos
print(f"Valores de espera a reemplazar por API: {mask_wait.sum()}")
print(f"Valores de viaje a reemplazar por API: {mask_trip.sum()}")
print(f"Total de filas a usar dato de API: {mask_any.sum()}")

# DataFrame de filas para reemplazo
df_flag = df_drop.loc[mask_any].copy()
df_flag['err_wait_pct'] = err_wait_pct[mask_any]
df_flag['err_trip_pct'] = err_trip_pct[mask_any]


Valores de espera a reemplazar por API: 421
Valores de viaje a reemplazar por API: 4284
Total de filas a usar dato de API: 4602
