# Reto: Análisis de Uso de TransMilenio en Domingos
### *by Juan Pablo Morales*

In [1]:
import pandas as pd

1. Se definen las funciones `optimize_data_frame` y `collect_data` para optimizar y unir la información (data-frames), respectivamente.

In [417]:
# TODO: "Valor" cleanup strips the "$" symbol and surrounding whitespace only;
#       handle thousands separators if the source files turn out to contain them.
def optimize_data_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, memory-optimized copy of a raw validation data-frame.

    The input frame is never modified. The steps, in order, are:
        - Cast the columns listed in `cols_to_categorize` to `"category"`.
        - Drop columns that are completely empty.
        - Parse `Fecha_Transaccion` as datetime and `Valor` as float.
        - Keep only the rows whose `Fecha_Transaccion` calendar day equals `Fecha_Clearing`.
        - Strip the leading "(" from the `Linea` values.

    Args:
        df (pd.DataFrame): raw data frame to be optimized and cleaned. `Fecha_Clearing`
            is compared as-is against the transaction day, so it is expected to already
            be a datetime column holding midnight timestamps.

    Returns:
        pd.DataFrame: optimized copy of the original data-frame.

    Raises:
        KeyError: if any of the expected columns is missing from `df`.
        ValueError: if a `Valor` entry cannot be converted to float after cleanup, or a
            `Fecha_Transaccion` entry does not match the "%Y-%m-%d %H:%M:%S" format.
    """
    # NOTE(review): "Numero_Tarjeta" looks like a per-card hash, i.e. many unique
    # values; "category" may not save memory for it -- confirm before relying on it.
    cols_to_categorize = [
        "Acceso_Estacion",
        "Day_Group_Type",
        "Dispositivo",
        "Emisor",
        "Estacion_Parada",
        "Fase",
        "Hora_Pico_SN",
        "Linea",
        "Nombre_Perfil",
        "Numero_Tarjeta",
        "Operador",
        "Sistema",
        "Tipo_Tarifa",
        "Tipo_Tarjeta"
    ]
    # `astype` returns a new frame, so the caller's `df` stays untouched.
    optimized_df = df.astype({col: "category" for col in cols_to_categorize})

    # Remove useless (completely empty) columns, e.g. ["ID_Vehiculo", "Ruta"]
    optimized_df = optimized_df.dropna(axis="columns", how="all")

    # Set columns for date and numeric values to the right type
    optimized_df["Fecha_Transaccion"] = pd.to_datetime(df["Fecha_Transaccion"], format="%Y-%m-%d %H:%M:%S")

    valor = df["Valor"]
    if not pd.api.types.is_numeric_dtype(valor):
        # Tolerate "$ 2950", "$2950" and padded variants instead of only the exact "$ " prefix.
        valor = valor.str.replace("$", "", regex=False).str.strip()
    optimized_df["Valor"] = valor.astype(float)

    # Remove rows where Fecha_Transaccion doesn't fall on the Fecha_Clearing day.
    # `dt.normalize()` truncates to midnight while staying datetime64, which avoids
    # building one Python `date` object per row.
    same_day = optimized_df["Fecha_Transaccion"].dt.normalize() == optimized_df["Fecha_Clearing"]
    # Explicit copy: the filtered frame is assigned into right below.
    optimized_df = optimized_df.loc[same_day].copy()

    # Remove symbol at the start of "Linea" column values
    optimized_df["Linea"] = optimized_df["Linea"].str.removeprefix("(")

    return optimized_df

In [456]:
def _insert_lookup_column(frame, lookup, key, name_col):
    """Left-join `lookup` on `key` and move `name_col` to sit right after `key`."""
    merged = frame.merge(lookup, how="left", on=key)
    key_pos = merged.columns.get_loc(key)
    merged.insert(key_pos + 1, name_col, merged.pop(name_col))
    return merged


def collect_data(data_list, es_puente=None, lineas_path="challenge/Lineas.csv", emisor_path="challenge/emisor.json"):
    """Group all the daily files into a single, optimized data-frame.

    Every file is read, cleaned with `optimize_data_frame`, tagged with an
    `Es_Puente` flag and stacked. Two lookup columns are then attached:
    `Nombre_linea` (right after `Linea`) and `Nombre_emisor` (right after `Emisor`).

    Args:
        data_list (list): list of paths of csv files from which data is to be collected and grouped
        es_puente (sequence of bool, optional): one flag per entry of `data_list`, stored in
            the `Es_Puente` column of that file's rows. Defaults to `None`, which flags
            every file as `True` (the behavior this function always had).
        lineas_path (str): csv file with the line-name lookup.
        emisor_path (str): json file with the issuer-name lookup; it must yield exactly
            two columns, which are renamed to `Emisor` and `Nombre_emisor`.

    Returns:
        pd.DataFrame: the combined data-frame with a fresh 0..n-1 index.

    Raises:
        ValueError: if `es_puente` is given with a different length than `data_list`,
            or if `data_list` is empty (nothing to concatenate).
    """
    data_list = list(data_list)
    if es_puente is None:
        # NOTE(review): earlier comments in this notebook said only some of the files
        # are "puente" days, yet the code flagged all of them True. The default keeps
        # that behavior; pass `es_puente` explicitly once the right flags are confirmed.
        es_puente = [True] * len(data_list)
    else:
        es_puente = list(es_puente)
        if len(es_puente) != len(data_list):
            raise ValueError("es_puente must have one flag per entry in data_list")

    dfs = []
    for data, flag in zip(data_list, es_puente):
        print(data)
        current_df = pd.read_csv(data, parse_dates=["Fecha_Clearing"], date_format="%Y-%m-%d")
        optimized_df = optimize_data_frame(current_df)
        optimized_df["Es_Puente"] = bool(flag)
        optimized_df["Es_Puente"] = optimized_df["Es_Puente"].astype("category")
        dfs.append(optimized_df)

    # Perform an "union all" operation in all the df's
    final_df = pd.concat(dfs)
    # No-op when every flag is equal; restores "category" when mixed flags made
    # the per-file categories differ.
    final_df["Es_Puente"] = final_df["Es_Puente"].astype("category")

    # Add "Nombre_linea" col.
    # `melt` turns every csv column into a (column name, cell value) row; the first
    # melted row is discarded. Presumably that row is not a real line -- verify
    # against the contents of Lineas.csv.
    lineas_info = pd.read_csv(lineas_path).melt(var_name="Nombre_linea").drop(0).reset_index(drop=True)
    lineas_info = lineas_info.rename(columns={"value":"Linea"})
    # Both columns carry surrounding whitespace and single quotes in the raw file.
    lineas_info["Nombre_linea"] = lineas_info["Nombre_linea"].str.strip().str.strip("'")
    lineas_info["Linea"] = lineas_info["Linea"].str.strip().str.strip("'")
    final_df = _insert_lookup_column(final_df, lineas_info, "Linea", "Nombre_linea")

    # Add "Nombre_emisor" col
    emisor_info = pd.read_json(emisor_path)
    emisor_info.columns = ["Emisor", "Nombre_emisor"]
    final_df = _insert_lookup_column(final_df, emisor_info, "Emisor", "Nombre_emisor")

    return final_df

    

In [271]:
# Load the first validation file on its own to inspect the raw schema.
day1 = pd.read_csv(
    "challenge/validacion20241020",
    parse_dates=["Fecha_Clearing"],
    date_format="%Y-%m-%d",
)

In [296]:
# Summarize the raw frame (column dtypes and non-null counts) before any optimization.
day1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638602 entries, 0 to 638601
Data columns (total 22 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Acceso_Estacion             638602 non-null  object        
 1   Day_Group_Type              638602 non-null  object        
 2   Dispositivo                 638602 non-null  int64         
 3   Emisor                      638602 non-null  int64         
 4   Estacion_Parada             638602 non-null  object        
 5   Fase                        638602 non-null  object        
 6   Fecha_Clearing              638602 non-null  datetime64[ns]
 7   Fecha_Transaccion           638602 non-null  object        
 8   Hora_Pico_SN                638602 non-null  object        
 9   ID_Vehiculo                 0 non-null       float64       
 10  Linea                       638602 non-null  object        
 11  Nombre_Perfil               638602 non-

In [457]:
# Paths of the four daily validation files; collect_data reads each one and
# returns them combined into a single data-frame.
data_list = [
    "challenge/validacion20241020",
    "challenge/validacion20241027",
    "challenge/validacion20241103",
    "challenge/validacion20241110",
]

op_df = collect_data(data_list)
op_df.info()


challenge/validacion20241020
challenge/validacion20241027
challenge/validacion20241103
challenge/validacion20241110
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2573679 entries, 0 to 2573678
Data columns (total 22 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   Acceso_Estacion             object        
 1   Day_Group_Type              object        
 2   Dispositivo                 int64         
 3   Emisor                      int64         
 4   Nombre_emisor               object        
 5   Estacion_Parada             object        
 6   Fase                        category      
 7   Fecha_Clearing              datetime64[ns]
 8   Fecha_Transaccion           datetime64[ns]
 9   Hora_Pico_SN                object        
 10  Linea                       object        
 11  Nombre_linea                object        
 12  Nombre_Perfil               category      
 13  Numero_Tarjeta              object        
 14

In [460]:
# Sanity check: the combined frame should hold 2573679 rows (the RangeIndex size reported by op_df.info()).
op_df.head()

Unnamed: 0,Acceso_Estacion,Day_Group_Type,Dispositivo,Emisor,Nombre_emisor,Estacion_Parada,Fase,Fecha_Clearing,Fecha_Transaccion,Hora_Pico_SN,...,Nombre_Perfil,Numero_Tarjeta,Operador,Saldo_Despues_Transaccion,Saldo_Previo_a_Transaccion,Sistema,Tipo_Tarifa,Tipo_Tarjeta,Valor,Es_Puente
0,(LA) ALIMENTACION CASTILLA Y BIBLIOTECA TINTAL,Dia 2,10000784,3101000,Bogota Card(Citizen),(05100) Banderas P. Central,Fase 3,2024-10-20,2024-10-20 03:50:43,Peak Time,...,(002) Adulto Mayor,d970ef6f244d5c52ed0319c87dc31e2c0eea23ed151fc7...,(201) Trunk agency,1730.0,4680.0,TRONCAL,10,tullave Plus,2950.0,True
1,(01) Acceso Peatonal Norte,Dia 2,10001263,3101000,Bogota Card(Citizen),(07010) Bosa,Fase 3,2024-10-20,2024-10-20 03:53:34,Peak Time,...,(001) Anonymous,a432a8fb2c4ed0eee674c0d3ae6758e27f02d71db785ff...,(201) Trunk agency,14750.0,17700.0,TRONCAL,10,tullave Básica,2950.0,True
2,(01) Acceso Peatonal Norte,Dia 2,10001264,3101000,Bogota Card(Citizen),(07010) Bosa,Fase 3,2024-10-20,2024-10-20 03:53:57,Peak Time,...,(001) Anonymous,82e95c47d12e418f3d661df39905cbce80ef9e21804c53...,(201) Trunk agency,250.0,3200.0,TRONCAL,10,tullave Básica,2950.0,True
3,(BA) BATERIA VAGON ORIENTE PARQUE EL TUNAL,Dia 2,10000528,3101000,Bogota Card(Citizen),(08001) Parque el Tunal,Fase 3,2024-10-20,2024-10-20 03:54:20,Peak Time,...,(001) Anonymous,f72f11a243e1821860d4525f6ba295de6f21fe783fa7e2...,(201) Trunk agency,68000.0,70950.0,TRONCAL,10,tullave Básica,2950.0,True
4,(23) Acceso Peatonal,Dia 2,40000160,3101000,Bogota Card(Citizen),(05000) Portal Américas,Fase 3,2024-10-20,2024-10-20 03:54:47,Peak Time,...,(002) Adulto Mayor,ff62cbf07e71e84f49e9f6af4671f60efb77c1735229ad...,(201) Trunk agency,54230.0,56730.0,TRONCAL,10,tullave Plus,2500.0,True
