In [1]:
import pandas as pd
import tempfile
import os
import altair as alt
import numpy as np
import folium
from folium.plugins import MarkerCluster
import geopandas as gpd
from shapely.geometry import Point
import osmnx as ox
import json
import time
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('output\\uber_rides_enriched.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23111 entries, 0 to 23110
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 23111 non-null  object 
 1   driver_id               19726 non-null  object 
 2   icon                    23111 non-null  object 
 3   start_type              23111 non-null  object 
 4   start_at                23111 non-null  object 
 5   start_lat               22212 non-null  float64
 6   start_lon               22212 non-null  float64
 7   end_at                  22835 non-null  object 
 8   end_lat                 22212 non-null  float64
 9   end_lon                 22212 non-null  float64
 10  end_state               23099 non-null  object 
 11  driver_start_lat        18310 non-null  float64
 12  driver_start_lon        18310 non-null  float64
 13  arrived_at              17716 non-null  object 
 14  price                   22713 non-null

In [24]:
to_date = lambda x : pd.to_datetime(x, dayfirst=True)
for col in ('start_at','end_at','arrived_at') :
     df[col] = df[col].apply(to_date)

In [45]:
# Calcular el máximo y el mínimo entre start_at y arrived_at
max_inicio = df[['start_at', 'arrived_at']].max(axis=1)
min_inicio = df[['start_at', 'arrived_at']].min(axis=1)

# Usar el máximo si es menor a end_at, si no usar el mínimo
inicio_utilizado = np.where(max_inicio < df['end_at'], max_inicio, min_inicio)

# Calcular duración
df['duration_min'] = (df['end_at'] - pd.to_datetime(inicio_utilizado)).dt.total_seconds() / 60


In [46]:
df.loc[df['start_at'] > df['end_at'],'duration_min']=np.nan

In [51]:
def outliers(
    df : pd.DataFrame,
    col : str,
    rango : range = range(0,101),
    intervalo : int = 1000
) :
    '''
    Genera un gráfico interactivo de línea que muestra el valor de cada percentil
    de la columna especificada.

    Parámetros
    ----------
    df : pd.DataFrame
        DataFrame de entrada; se eliminan filas con valores NaN en `start_lat`.
    col : str
        Nombre de la columna de la que calcular los percentiles.
    rango : range, opcional
        Rango de percentiles a evaluar (por defecto 0-100).
    intervalo : int, opcional
        Número de divisiones usadas para calcular los quantiles (por defecto 1000).

    Devuelve
    -------
    alt.Chart
        Gráfico Altair interactivo con percentil (eje x) vs. valor (eje y).
    '''
    df = df.dropna(subset='start_lat')
    resultados = []
    
    for i in rango:
        valor = df[col].quantile(i / intervalo)
        resultados.append(
            {
                'p' : i/(intervalo/100),
                'valor' : valor
            }
        )

    df_res = pd.DataFrame(resultados)

    chart = (
        alt.Chart(df_res, width=700, height=400)
        .mark_line(point=True)
        .encode(
            x=alt.X('p:Q', title='Percentil (p)'),
            y=alt.Y('valor:Q', title='Distancia'),
            tooltip=[
                alt.Tooltip('p:Q', format='.1f', title='p'),
                alt.Tooltip('valor:Q', format='.2f', title='distancia')
            ]
        )
        .interactive()
    )

    return chart

In [47]:
df_drop = df.loc[df['end_state']=='drop off']
df_cancel = df.loc[df['end_state']!='drop off']

In [48]:
df_cancel.drop(columns=['arrived_at','price','price_distance','price_duration','distance','duration','cost','cost_distance','cost_duration','driver_score','rider_score'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cancel.drop(columns=['arrived_at','price','price_distance','price_duration','distance','duration','cost','cost_distance','cost_duration','driver_score','rider_score'],inplace=True)


In [73]:
outliers(df_drop,'duration_min',range(90,99),100)

In [93]:
df_drop['duration_diff']=df_drop['duration_min'] - (df_drop['time_start_to_end_s']/60)
df_drop['duration_diff'].describe()
df_drop.loc[df_drop['duration_diff'].min()==df_drop['duration_diff'],['start_at','arrived_at','end_at','duration_diff','time_start_to_end_s','duration_min','dist_start_to_end_m']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_drop['duration_diff']=df_drop['duration_min'] - (df_drop['time_start_to_end_s']/60)


Unnamed: 0,start_at,arrived_at,end_at,duration_diff,time_start_to_end_s,duration_min,dist_start_to_end_m
5342,2010-05-31 19:10:00,2010-05-31 19:11:00,2010-05-31 19:13:00,-58.733333,3644.0,2.0,24157.0


In [None]:
outliers(df_drop,'duration_diff',range(0,50),100)