In [3]:
import pandas as pd
import numpy as np
import ast
from math import radians, cos, sin, asin, sqrt



# Read the CSV file
df = pd.read_csv('../datasets/tripdata_2022_05_05.csv')
df.head()

Unnamed: 0,id,trip_minutes,geolocation_unlock,unlock_date,geolocation_lock,lock_date,station_unlock,unlock_station_name,station_lock,lock_station_name
0,283770,17.58,"{'type': 'Point', 'coordinates': [-3.6967169, ...",2022-05-05T00:00:17,"{'type': 'Point', 'coordinates': [-3.6691838, ...",2022-05-05T00:17:52,130.0,122 - Santa Engracia 14,76.0,72 - Sainz de Baranda
1,283771,23.13,"{'type': 'Point', 'coordinates': [-3.66096, 40...",2022-05-05T00:00:23,"{'type': 'Point', 'coordinates': [-3.7026735, ...",2022-05-05T00:23:31,248.0,240 - Avenida Brasilia,218.0,210 - Tres Cruces
2,283772,5.85,"{'type': 'Point', 'coordinates': [-3.7028265, ...",2022-05-05T00:00:28,"{'type': 'Point', 'coordinates': [-3.7088337, ...",2022-05-05T00:06:19,51.0,47 - Embajadores 1,43.0,39 - Plaza de la Cebada
3,283773,12.25,"{'type': 'Point', 'coordinates': [-3.7149166, ...",2022-05-05T00:00:55,"{'type': 'Point', 'coordinates': [-3.728318, 4...",2022-05-05T00:13:10,207.0,199 - Fernando el CatÃ³lico,224.0,216 - Puerta del Ãngel
4,283774,23.25,"{'type': 'Point', 'coordinates': [-3.665936242...",2022-05-05T00:00:59,"{'type': 'Point', 'coordinates': [-3.7061931, ...",2022-05-05T00:24:14,236.0,227 - Concordia,13.0,12 - San Hermenegildo


In [4]:
# Drop unwanted columns
columns_to_drop = [
    'fecha',
    'idBike',
    'fleet',
    'locktype',
    'unlocktype',
    'address_lock',
    'address_unlock',
    'dock_unlock',
    'dock_lock'
]

df_cleaned = df.drop(columns=columns_to_drop)

# Preview the result
df_cleaned.to_csv('../datasets/tripdata_2022_05_05.csv', index=False)

KeyError: "['fecha', 'idBike', 'fleet', 'locktype', 'unlocktype', 'address_lock', 'address_unlock', 'dock_unlock', 'dock_lock'] not found in axis"

In [5]:
#Creating all_stations.csv
def extract_coordinates(geo_str):
    try:
        geo_dict = ast.literal_eval(geo_str)
        lon, lat = geo_dict['coordinates']
        return pd.Series([lat, lon])
    except:
        return pd.Series([None, None])

# Extract lat/lon
df[['lat', 'lon']] = df['geolocation_unlock'].apply(extract_coordinates)

# Select station ID, name, and coordinates
station_df = df[['station_unlock', 'unlock_station_name', 'lat', 'lon']]

# Drop duplicates so we only have one row per station
station_df = station_df.drop_duplicates()

# Save to CSV
station_df.to_csv('../datasets/all_stations.csv', index=False)
station_df.head()

Unnamed: 0,station_unlock,unlock_station_name,lat,lon
0,130.0,122 - Santa Engracia 14,40.429199,-3.696717
1,248.0,240 - Avenida Brasilia,40.43666,-3.66096
2,51.0,47 - Embajadores 1,40.404785,-3.702827
3,207.0,199 - Fernando el CatÃ³lico,40.434361,-3.714917
4,236.0,227 - Concordia,40.39383,-3.665936


In [6]:
# Creating all_trips.csv
# Haversine formula to calculate distance in km
def haversine(lon1, lat1, lon2, lat2):
    if pd.isnull([lon1, lat1, lon2, lat2]).any():
        return np.nan
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371  # Radius of Earth in kilometers
    return c * r

# Load main trip dataset (assuming it's still in df)
# and station data
stations = pd.read_csv('../datasets/all_stations.csv')

# Creating all_trips.csv
trips_df = df[['id', 'unlock_date', 'lock_date', 'station_unlock', 'station_lock', 'trip_minutes']]

# Remove rows with missing critical data (e.g., trips with no end time or duration)
trips_df = trips_df.dropna(subset=['unlock_date', 'lock_date', 'trip_minutes'])

# Merge unlock station coordinates
trips_df = trips_df.merge(
    stations.rename(columns={
        'station_unlock': 'station_unlock',
        'lat': 'unlock_lat',
        'lon': 'unlock_lon'
    })[['station_unlock', 'unlock_lat', 'unlock_lon']],
    on='station_unlock',
    how='left'
)

# Merge lock station coordinates
trips_df = trips_df.merge(
    stations.rename(columns={
        'station_unlock': 'station_lock',
        'lat': 'lock_lat',
        'lon': 'lock_lon'
    })[['station_lock', 'lock_lat', 'lock_lon']],
    on='station_lock',
    how='left'
)

# Calculate distance
trips_df['distance_km'] = trips_df.apply(
    lambda row: haversine(row['unlock_lon'], row['unlock_lat'], row['lock_lon'], row['lock_lat']),
    axis=1
)

# Optional: round distances
trips_df['distance_km'] = trips_df['distance_km'].round(3)

# Save to CSV
trips_df.to_csv('../datasets/all_trips.csv', index=False)

# Preview
trips_df.head()

Unnamed: 0,id,unlock_date,lock_date,station_unlock,station_lock,trip_minutes,unlock_lat,unlock_lon,lock_lat,lock_lon,distance_km
0,283770,2022-05-05T00:00:17,2022-05-05T00:17:52,130.0,76.0,17.58,40.429199,-3.696717,40.415741,-3.669184,2.77
1,283771,2022-05-05T00:00:23,2022-05-05T00:23:31,248.0,218.0,23.13,40.43666,-3.66096,40.419674,-3.702673,4.004
2,283771,2022-05-05T00:00:23,2022-05-05T00:23:31,248.0,218.0,23.13,40.47435,-3.68797,40.419674,-3.702673,6.206
3,283772,2022-05-05T00:00:28,2022-05-05T00:06:19,51.0,43.0,5.85,40.404785,-3.702827,40.411274,-3.708834,0.883
4,283773,2022-05-05T00:00:55,2022-05-05T00:13:10,207.0,224.0,12.25,40.434361,-3.714917,40.413764,-3.728318,2.556


In [8]:
# Order by station_id and removing missing values
# Load the station data
df = pd.read_csv('../datasets/all_stations.csv')

# Sort by station_unlock in ascending order
df_sorted = df.sort_values(by='station_unlock')

# Drop any rows with missing values
df_cleaned = df_sorted.dropna()

# Save back to CSV (optional)
df_cleaned.to_csv('../datasets/all_stations.csv', index=False)

# Preview result
df_cleaned.head()

Unnamed: 0,station_unlock,unlock_station_name,lat,lon
113,1.0,1a - Puerta del Sol A,40.417214,-3.701834
154,2.0,1b - Puerta del Sol B,40.417313,-3.701603
146,3.0,2 - Miguel Moya,40.420589,-3.705842
200,4.0,3 - Plaza Conde Suchil,40.430294,-3.706917
195,5.0,4 - MalasaÃ±a,40.428552,-3.702587
