# Get the line of stations from Renfe data

In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import sys

sys.path.append('..')

In [4]:
import datetime

from src.robin.scraping.renfe.entities import RenfeScraper

# Build the scraper on top of the local stations catalogue, then list every
# station it exposes as "<id>: <name>" (one per line).
stations_csv = '../data/renfe/renfe_stations.csv'
scraper = RenfeScraper(stations_csv_path=stations_csv)

available = scraper.available_stations
for station_id in available:
    station_name = available[station_id]
    print(f'{station_id}: {station_name}')

The chromedriver version (116.0.5845.96) detected in PATH at /usr/local/bin/chromedriver might not be compatible with the detected chrome version (117.0.5938.132); currently, chromedriver 117.0.5938.92 is recommended for chrome 117.*, so it is advised to delete the driver in PATH and retry


?: Estaciones de Origen
31412: A Coruña
94707: Abrantes
60911: Alicante / Alacant
60600: Albacete
06008: Alcantarilla-Los Romanos
60400: Alcázar de San Juan
55020: Algeciras
56312: Almería
99003: Altet Bus
99115: Aguadulce Bus
87912: Aix En Provence
99114: Andorra-Bus
ANTEQ: Antequera (TODAS)
87814: Avignon
10400: Avila
37606: Badajoz
BARCE: Barcelona (TODAS)
87078: Beziers
65318: Benicassim
BILBA: Bilbao (TODAS)
54400: Bobadilla
11014: Burgos Rosa Manzano
35400: Cáceres
51405: Cádiz
70600: Calatayud
50417: Campus Rabanales
61307: Cartagena
65300: Castellón /Castelló
37200: Ciudad Real
50500: Córdoba
CUENC: Cuenca (TODAS)
92201: Denia-Bus
60905: Elda-Petrer
03410: Elche AV/Elx AV
94428: Entroncamento
92157: Estepona Bus
21010: Ferrol
79309: Figueres
79333: Figueres Bus
04307: Figueres Vilafant
69110: Gandía
GIJON: Gijón
79300: Girona
05000: Granada
GUADA: Guadalajara (TODAS)
43019: Huelva
74200: Huesca
IRUN-: Irun-Hendaya (TODAS)
80100: Pamplona/Iruña
99103: Jaca-Bus
03100: Jaén
64100:

In [5]:
# Display the scraper's stations table (stop_id, stop_name, renfe_id, stop_lat, stop_lon)
scraper.stations_df

Unnamed: 0,stop_id,stop_name,renfe_id,stop_lat,stop_lon
0,00000,Unknown,00000,0.000000,0.000000
1,31412,A Corunya,31412,43.352761,-8.409755
2,60911,Alicante/alacant,60911,38.344450,-0.495053
3,60600,Albacete-Los Llanos,60600,38.999384,-1.848450
4,60400,Alcazar de San Juan,60400,39.395628,-3.205744
...,...,...,...,...,...
100,87173,Montpellier Saint Roch,87173,,
101,87302,Nimes,87302,,
102,87814,Avignon TGV,87814,,
103,87912,Aix En Provence TGV,87912,,


In [13]:
# Station codes used in this notebook:
#   Madrid-Puerta de Atocha: 60000
#   Huesca: 74200
#   Barcelona: 71801
origin_id = scraper.get_renfe_station_id('60000')
destination_id = scraper.get_renfe_station_id('74200')

# Travel date to scrape (year, month, day)
date = datetime.date(2023, 10, 11)

# scrape_trips returns two frames: one row per trip, and one row per stop of each service
df_trips, df_stops = scraper.scrape_trips(
    origin_id=origin_id,
    destination_id=destination_id,
    init_date=date,
)

print(df_trips.head())

Date:  2023-10-11
Search url:  https://horarios.renfe.com/HIRRenfeWeb/buscar.do?O=MADRI&D=74200&AF=2023&MF=10&DF=11&SF=3&ID=s
  trip_id train_type                                           schedule  \
0   03393        AVE  {'60000': (0, 0), '70200': (22, 23), '70600': ...   

            departure  duration                                  price  
0 2023-10-11 19:05:00       137  {'Turist': 75.6, 'TuristaPlu': 100.5}  
  trip_id train_type                                           schedule  \
0   03393        AVE  {'60000': (0, 0), '70200': (22, 23), '70600': ...   

            departure  duration                                  price  \
0 2023-10-11 19:05:00       137  {'Turist': 75.6, 'TuristaPlu': 100.5}   

               service_id  
0  03393_11-10-2023-19.05  


In [14]:
# Preview the first rows of the per-stop table returned by scrape_trips
df_stops.head()

Unnamed: 0,service_id,stop_id,arrival,departure
0,03393_11-10-2023-19.05,60000,0,0
1,03393_11-10-2023-19.05,70200,22,23
2,03393_11-10-2023-19.05,70600,60,61
3,03393_11-10-2023-19.05,4040,85,88
4,03393_11-10-2023-19.05,78200,120,121


In [15]:
# Persist the scraped stops table as CSV, leaving out the DataFrame index column
stops_csv_path = 'stops_MAD_HSC_11-10-2023.csv'
df_stops.to_csv(stops_csv_path, index=False)

In [16]:
# Map every service_id to its schedule: {stop_id: (arrival, departure)}.
# Stops keep the row order they have inside each service group; if a stop_id
# repeats within a service, the last row wins.
result_dict = {
    service_id: dict(
        zip(stops['stop_id'], zip(stops['arrival'], stops['departure']))
    )
    for service_id, stops in df_stops.groupby("service_id")
}

# print(result_dict)

In [17]:
# Distinct stop sequences across all services (one tuple of stop_ids per
# calling pattern). dict.fromkeys de-duplicates while keeping first-seen order;
# a set would iterate in an order that changes between kernel sessions (string
# hashes are randomised per process), which made the downstream line
# construction non-reproducible.
trips = list(dict.fromkeys(tuple(service.keys()) for service in result_dict.values()))
# print(trips)

In [18]:
def build_line_stations(trips):
    """Merge several stop sequences into a single ordered list of stations.

    The longest trip seeds the line. Every other trip then contributes the
    stations the line does not contain yet: a missing final station is
    appended at the end of the line, any other missing station is inserted
    immediately before the station that follows it in its trip.

    Args:
        trips: Iterable-of-sequences of station ids (e.g. a list of tuples).
            It is not modified.

    Returns:
        A new list with the merged station ids; empty if ``trips`` is empty.
    """
    if not trips:
        return []

    # Seed the line with the longest trip (first one wins on ties)
    line = list(max(trips, key=len))

    for trip in trips:
        last = len(trip) - 1
        # Walk each trip backwards: the successor of a missing station is then
        # always already in the line, even when several consecutive stations
        # are missing (walking forwards raised ValueError in that case).
        for i in range(last, -1, -1):
            station = trip[i]
            if station in line:
                continue
            if i == last:
                # Missing final station goes to the end of the line
                line.append(station)
            else:
                # Otherwise place it right before the next station of its trip
                line.insert(line.index(trip[i + 1]), station)
    return line


# `trips` is left untouched so this cell can be re-run with the same result
line_stations = build_line_stations(trips)

print(line_stations)

['60000', '70200', '70600', '04040', '78200', '74200']


In [11]:
# Lookup table stop_id -> stop_name taken from the scraper's stations table
catalogue = scraper.stations_df
mapped_names = dict(zip(catalogue['stop_id'], catalogue['stop_name']))

# Translate the line to station names; ids missing from the table become None
line_stations_names = [mapped_names.get(stop_id) for stop_id in line_stations]

print(line_stations_names)

['Madrid-Puerta de Atocha', 'Guadalajara', 'Calatayud', 'Zaragoza-Delicias', 'Tarragona', 'Lleida', 'Barcelona-Sants', 'Girona', 'Figueres Vilafant', 'Perpignan', 'Narbonne', 'Beziers', 'Montpellier Saint Roch', 'Nimes', 'Avignon TGV', 'Aix En Provence TGV', 'Marseille St Charles']


In [None]:
# NOTE(review): scratch cell, nothing is assigned here.
# Station names of the Madrid - Huesca line:
# ['Madrid-Puerta de Atocha', 'Guadalajara', 'Calatayud', 'Zaragoza-Delicias', 'Tardienta', 'Huesca']

# NOTE(review): looks like a shared trunk up to Zaragoza-Delicias followed by
# two alternative branches (towards Figueres Vilafant and towards Huesca) - confirm.
['Madrid-Puerta de Atocha', 'Guadalajara', 'Calatayud', 'Zaragoza-Delicias', [['Tarragona', 'Lleida', 'Barcelona-Sants', 'Girona', 'Figueres Vilafant'], ['Tardienta', 'Huesca']]]

In [None]:
# NOTE(review): scratch cell - appears to be the stop_id counterpart of the
# trunk + two branches structure written with station names; confirm.
# Evaluates to a tuple (trunk list, list of branch lists) and is not assigned.
['60000', '70200', '70600', '04040'], [['71500', '78400', '71801', '79300', '04307'], ['78200', '74200']]

In [19]:
# Work on an independent (deep) copy of the scraper's stations table
df_stations = scraper.stations_df.copy(deep=True)

# Show the first five rows
print(df_stations.head(5))

  stop_id            stop_name renfe_id   stop_lat  stop_lon
0   00000              Unknown    00000   0.000000  0.000000
1   31412            A Corunya    31412  43.352761 -8.409755
2   60911     Alicante/alacant    60911  38.344450 -0.495053
3   60600  Albacete-Los Llanos    60600  38.999384 -1.848450
4   60400  Alcazar de San Juan    60400  39.395628 -3.205744


In [22]:
# stop_ids of the stations that make up the north-east corridor
estaciones_noreste = ['60000', '70200', '70600', '04040', '71500', '78400', '71801', '79300', '04307', '78200', '74200']

# Keep only the corridor stations, drop the redundant renfe_id column and
# renumber the rows. Written as a single chain that builds a new frame:
#   - no inplace mutation of a filtered slice (avoids SettingWithCopyWarning)
#   - errors='ignore' makes the cell safe to re-run after renfe_id is gone
df_stations = (
    df_stations
    .loc[df_stations['stop_id'].isin(estaciones_noreste)]
    .drop(columns=['renfe_id'], errors='ignore')
    .reset_index(drop=True)
)
print(df_stations)

   stop_id                stop_name   stop_lat  stop_lon
0    70600                Calatayud  41.346692 -1.638680
1    04307        Figueres Vilafant  42.264771  2.943547
2    79300                   Girona  41.979303  2.817006
3    74200                   Huesca  42.133594 -0.409745
4    78400                   Lleida  41.620696  0.632669
5    60000  Madrid-Puerta de Atocha  40.406442 -3.690886
6    71500                Tarragona  41.111624  1.253214
7    04040        Zaragoza-Delicias  41.658649 -0.911615
8    71801          Barcelona-Sants  41.379220  2.140624
9    70200              Guadalajara  40.644103 -3.182230
10   78200                Tardienta  41.975751 -0.538314


In [23]:
# save dataframe to csv
df_stations.to_csv('estaciones_corredor_noreste.csv', index=False)