### Instalar scikit-learn

In [1]:
pip install scikit-learn




In [2]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

## Leer el CSV

In [3]:
df = pd.read_csv('cleaned_combined_data.csv')

## Filtrar los registros con "Desconocido" en start_station_name

In [4]:
df_desconocidos = df[df['start_station_name'] == 'Desconocido']

## Extraer las coordenadas de las estaciones conocidas

In [5]:
stations_df = df[df['start_station_name'] != 'Desconocido']

## Crear un modelo de estaciones vecinas más cercanas

In [6]:
knn = NearestNeighbors(n_neighbors=1, metric='haversine')

## Entrenar el modelo con las coordenadas de las estaciones conocidas

In [7]:
coordinates = stations_df[['start_lat', 'start_lng']].to_numpy()
coordinates = coordinates * (3.141592653589793 / 180)  # Convertir a radianes
knn.fit(coordinates)

## Buscar la estación más cercana para cada registro con 'Desconocido'

In [8]:
desconocido_coords = df_desconocidos[['start_lat', 'start_lng']].to_numpy()
desconocido_coords = desconocido_coords * (3.141592653589793 / 180)  # Convertir a radianes

## Predecir la estación más cercana


In [9]:
distances, indices = knn.kneighbors(desconocido_coords)

## Asignar los resultados a las estaciones más cercanas

In [10]:
df_desconocidos['imputed_station_name'] = stations_df.iloc[indices.flatten()]['start_station_name'].values
df_desconocidos['imputed_station_id'] = stations_df.iloc[indices.flatten()]['start_station_id'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_desconocidos['imputed_station_name'] = stations_df.iloc[indices.flatten()]['start_station_name'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_desconocidos['imputed_station_id'] = stations_df.iloc[indices.flatten()]['start_station_id'].values


## Reemplazar los valores en le DataFrame original

In [11]:
df.loc[df['start_station_name'] == 'Desconocido', 'start_station_name'] = df_desconocidos['imputed_station_name']
df.loc[df['start_station_name'] == 'Desconocido', 'start_station_id'] = df_desconocidos['imputed_station_id']

In [12]:
# Verificar los cambios
print(df.head())

            ride_id  rideable_type           started_at             ended_at  \
0  4EAD8F1AD547356B  electric_bike  2023-11-30 21:50:05  2023-11-30 22:13:27   
1  6322270563BF5470  electric_bike  2023-11-03 09:44:02  2023-11-03 10:17:15   
2  B37BDE091ECA38E0  electric_bike  2023-11-30 11:39:44  2023-11-30 11:40:08   
3  CF0CA5DD26E4F90E   classic_bike  2023-11-08 10:01:45  2023-11-08 10:27:05   
4  EB8381AA641348DB   classic_bike  2023-11-03 16:20:25  2023-11-03 16:54:25   

       start_station_name start_station_id               end_station_name  \
0         Millennium Park            13008  Pine Grove Ave & Waveland Ave   
1  Broadway & Sheridan Rd            13323         Broadway & Sheridan Rd   
2   State St & Pearson St     TA1307000061          State St & Pearson St   
3     Theater on the Lake     TA1308000001            Theater on the Lake   
4     Theater on the Lake     TA1308000001            Theater on the Lake   

  end_station_id  start_lat  start_lng    end_lat    end

# Repetir el proceso para las columnas de estaciones finales

In [13]:
# Filtrar los registros donde "end_station_name" es "Desconocido"
df_end_desconocidos = df[df['end_station_name'] == 'Desconocido']

# Verificar los primeros registros para asegurarnos de que los datos son correctos
print(df_end_desconocidos[['ride_id', 'end_station_name', 'end_station_id', 'end_lat', 'end_lng']].head())


                 ride_id end_station_name end_station_id  end_lat  end_lng
134605  1BD2042C0D74B448      Desconocido    Desconocido    41.95   -87.66
134606  DA97B63616B4C411      Desconocido    Desconocido    41.94   -87.67
134607  90028890571F74FE      Desconocido    Desconocido    41.94   -87.70
134608  6A7F34404A485C6F      Desconocido    Desconocido    41.90   -87.63
134609  CB5758FC1EBB52B5      Desconocido    Desconocido    41.93   -87.67


In [14]:
# Filtrar las estaciones conocidas donde end_station_name no es "Desconocido"
stations_end_df = df[df['end_station_name'] != 'Desconocido']

# Verificar las primeras filas
print(stations_end_df[['end_station_name', 'end_lat', 'end_lng']].head())


                end_station_name    end_lat    end_lng
0  Pine Grove Ave & Waveland Ave  41.949473 -87.646453
1         Broadway & Sheridan Rd  41.952833 -87.649993
2          State St & Pearson St  41.897448 -87.628722
3            Theater on the Lake  41.926277 -87.630834
4            Theater on the Lake  41.926277 -87.630834


In [15]:
from sklearn.neighbors import NearestNeighbors

# Crear un modelo de vecinos más cercanos (KNN)
knn_end = NearestNeighbors(n_neighbors=1, metric='haversine')


In [16]:
# Entrenar el modelo con las coordenadas de las estaciones conocidas
end_coordinates = stations_end_df[['end_lat', 'end_lng']].to_numpy()
end_coordinates = end_coordinates * (3.141592653589793 / 180)  # Convertir a radianes
knn_end.fit(end_coordinates)

In [17]:
# Buscar la estación más cercana para cada registro con "Desconocido" en end_station_name
desconocido_end_coords = df_end_desconocidos[['end_lat', 'end_lng']].to_numpy()
desconocido_end_coords = desconocido_end_coords * (3.141592653589793 / 180)  # Convertir a radianes

In [18]:
# Predecir la estación más cercana
distances_end, indices_end = knn_end.kneighbors(desconocido_end_coords)

In [19]:
# Asignar los resultados a las estaciones más cercanas
df_end_desconocidos['imputed_end_station_name'] = stations_end_df.iloc[indices_end.flatten()]['end_station_name'].values
df_end_desconocidos['imputed_end_station_id'] = stations_end_df.iloc[indices_end.flatten()]['end_station_id'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_end_desconocidos['imputed_end_station_name'] = stations_end_df.iloc[indices_end.flatten()]['end_station_name'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_end_desconocidos['imputed_end_station_id'] = stations_end_df.iloc[indices_end.flatten()]['end_station_id'].values


In [20]:
# Verificar las primeras filas con las estaciones imputadas
print(df_end_desconocidos[['ride_id', 'imputed_end_station_name', 'imputed_end_station_id']].head())

                 ride_id      imputed_end_station_name imputed_end_station_id
134605  1BD2042C0D74B448           Clark St & Grace St           TA1307000127
134606  DA97B63616B4C411      Lincoln Ave & Melrose St           TA1309000042
134607  90028890571F74FE     Elston Ave & Henderson St                    433
134608  6A7F34404A485C6F   Dearborn Pkwy & Delaware Pl           TA1307000128
134609  CB5758FC1EBB52B5  Ashland Ave & Wrightwood Ave                  13296


In [21]:
# Reemplazar los valores en el DataFrame original
df.loc[df['end_station_name'] == 'Desconocido', 'end_station_name'] = df_end_desconocidos['imputed_end_station_name']
df.loc[df['end_station_name'] == 'Desconocido', 'end_station_id'] = df_end_desconocidos['imputed_end_station_id']

# Verificar que los cambios se hayan realizado correctamente
print(df.head())

            ride_id  rideable_type           started_at             ended_at  \
0  4EAD8F1AD547356B  electric_bike  2023-11-30 21:50:05  2023-11-30 22:13:27   
1  6322270563BF5470  electric_bike  2023-11-03 09:44:02  2023-11-03 10:17:15   
2  B37BDE091ECA38E0  electric_bike  2023-11-30 11:39:44  2023-11-30 11:40:08   
3  CF0CA5DD26E4F90E   classic_bike  2023-11-08 10:01:45  2023-11-08 10:27:05   
4  EB8381AA641348DB   classic_bike  2023-11-03 16:20:25  2023-11-03 16:54:25   

       start_station_name start_station_id               end_station_name  \
0         Millennium Park            13008  Pine Grove Ave & Waveland Ave   
1  Broadway & Sheridan Rd            13323         Broadway & Sheridan Rd   
2   State St & Pearson St     TA1307000061          State St & Pearson St   
3     Theater on the Lake     TA1308000001            Theater on the Lake   
4     Theater on the Lake     TA1308000001            Theater on the Lake   

  end_station_id  start_lat  start_lng    end_lat    end

In [24]:
# Contar registros con "Desconocido" en cada columna relevante
desconocido_counts = {
    "start_station_name": (df['start_station_name'] == 'Desconocido').sum(),
    "start_station_id": (df['start_station_id'] == 'Desconocido').sum(),
    "end_station_name": (df['end_station_name'] == 'Desconocido').sum(),
    "end_station_id": (df['end_station_id'] == 'Desconocido').sum()
}

# Mostrar el resultado
print("Registros con 'Desconocido' por columna:")
for column, count in desconocido_counts.items():
    print(f"{column}: {count}")

# Total de registros con "Desconocido" en cualquier columna
total_desconocido = sum(desconocido_counts.values())
print(f"Total de registros con 'Desconocido' en cualquier columna: {total_desconocido}")


Registros con 'Desconocido' por columna:
start_station_name: 0
start_station_id: 336
end_station_name: 0
end_station_id: 0
Total de registros con 'Desconocido' en cualquier columna: 336


In [23]:
# Crear un diccionario de mapeo para start_station_name y start_station_id
station_name_to_id = stations_end_df.set_index('end_station_name')['end_station_id'].to_dict()

# Actualizar start_station_id y end_station_id en base a los nombres de las estaciones
df['start_station_id'] = df['start_station_name'].map(station_name_to_id).fillna(df['start_station_id'])
df['end_station_id'] = df['end_station_name'].map(station_name_to_id).fillna(df['end_station_id'])

# Verificar los resultados nuevamente
print("Registros con 'Desconocido' después de la imputación:")
print(f"start_station_id: {(df['start_station_id'] == 'Desconocido').sum()}")
print(f"end_station_id: {(df['end_station_id'] == 'Desconocido').sum()}")


Registros con 'Desconocido' después de la imputación:
start_station_id: 336
end_station_id: 0


### Calcular el porcentaje que representan los 336 registros "Desconocidos"

In [25]:
# Total de registros en la columna start_station_id
total_registros = len(df)

# Registros con "Desconocido" en start_station_id
desconocidos_start_station_id = (df['start_station_id'] == 'Desconocido').sum()

# Calcular el porcentaje
porcentaje_desconocidos = (desconocidos_start_station_id / total_registros) * 100

# Mostrar el resultado
print(f"Total de registros: {total_registros}")
print(f"Registros con 'Desconocido' en start_station_id: {desconocidos_start_station_id}")
print(f"Porcentaje de 'Desconocido': {porcentaje_desconocidos:.2f}%")

Total de registros: 4559898
Registros con 'Desconocido' en start_station_id: 336
Porcentaje de 'Desconocido': 0.01%


Debido a que el porcentaje es tan bajo (0.01%), se toma la decision de excluir del analisis estos registros, ya que no tienen un impacto real en el posterior analisis

In [26]:
# Excluir registros con "Desconocido" en start_station_id
df_cleaned = df[df['start_station_id'] != 'Desconocido']

# Verificar el total de registros después de la exclusión
print(f"Total de registros después de la exclusión: {len(df_cleaned)}")

Total de registros después de la exclusión: 4559562


In [28]:
df_cleaned

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length,day_of_week,time_of_day
0,4EAD8F1AD547356B,electric_bike,2023-11-30 21:50:05,2023-11-30 22:13:27,Millennium Park,13008,Pine Grove Ave & Waveland Ave,TA1307000150,41.881101,-87.624082,41.949473,-87.646453,member,23.366667,Thursday,Evening
1,6322270563BF5470,electric_bike,2023-11-03 09:44:02,2023-11-03 10:17:15,Broadway & Sheridan Rd,13323,Broadway & Sheridan Rd,13323,41.952868,-87.650035,41.952833,-87.649993,member,33.216667,Friday,Morning
2,B37BDE091ECA38E0,electric_bike,2023-11-30 11:39:44,2023-11-30 11:40:08,State St & Pearson St,TA1307000061,State St & Pearson St,TA1307000061,41.897533,-87.628694,41.897448,-87.628722,member,0.400000,Thursday,Morning
3,CF0CA5DD26E4F90E,classic_bike,2023-11-08 10:01:45,2023-11-08 10:27:05,Theater on the Lake,TA1308000001,Theater on the Lake,TA1308000001,41.926277,-87.630834,41.926277,-87.630834,member,25.333333,Wednesday,Morning
4,EB8381AA641348DB,classic_bike,2023-11-03 16:20:25,2023-11-03 16:54:25,Theater on the Lake,TA1308000001,Theater on the Lake,TA1308000001,41.926277,-87.630834,41.926277,-87.630834,member,34.000000,Friday,Afternoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4559893,9FC228393F5FFEBF,electric_bike,2024-05-21 16:02:09,2024-05-21 16:06:58,Clarendon Ave & Leland Ave,TA1307000119,Clarendon Ave & Junior Ter,13389,41.967860,-87.650089,41.960000,-87.650000,member,4.816667,Tuesday,Afternoon
4559894,B939C8F50D7BAB8F,electric_bike,2024-05-28 05:30:46,2024-05-28 05:35:51,Wabash Ave & Roosevelt Rd,TA1305000002,Canal St & Taylor St,15550,41.867148,-87.625999,41.870000,-87.640000,member,5.083333,Tuesday,Night
4559895,53138A05B0B81C07,electric_bike,2024-05-21 18:05:12,2024-05-21 18:14:05,DuSable Lake Shore Dr & Belmont Ave,TA1309000049,Clark St & Armitage Ave,13146,41.940771,-87.639185,41.920000,-87.630000,member,8.883333,Tuesday,Evening
4559896,05D362D8910B1234,electric_bike,2024-05-17 14:47:31,2024-05-17 14:58:45,Green St & Washington Blvd,13053,Larrabee St & Oak St,KA1504000116,41.883253,-87.648854,41.900000,-87.640000,member,11.233333,Friday,Afternoon


## Guardar el CSV

In [29]:
# Guardar el DataFrame actualizado en un nuevo archivo CSV
df_cleaned.to_csv('cleaned_combined_data_imputed.csv', index=False)