In [1]:
import pandas as pd
import requests
import datetime
import shutil
import errno
import os

savepath = "datasets/"

In [2]:
# Renfe Horarios de alta velocidad, larga distancia y media distancia
url = 'https://ssl.renfe.com/gtransit/Fichero_AV_LD/google_transit.zip'

flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY

# Download zip file
req = requests.get(url)

# Get file name from url
filename = url.split('/')[-1]

if not os.path.exists(savepath):
   os.makedirs(savepath)

try:
    file_handle = os.open(savepath+filename, flags)
except OSError as e:
    if e.errno == errno.EEXIST:  # Failed as the file already exists.
        print('File already exists!')
    else:  # Something unexpected went wrong so reraise the exception.
        raise
else:  # No exception, so the file must have been created successfully.
    # Writing the file to the local file system
    with open(savepath+filename, 'wb') as output_file:
        output_file.write(req.content)

    # Unzip
    shutil.unpack_archive(savepath+filename, savepath)

    print('File downloaded successfully!')

File already exists!


In [3]:
agency = pd.read_csv(savepath+"/agency.txt", sep=",")

agency.head()

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone
0,1071,RENFE OPERADORA,http://www.renfe.com,Europe/Madrid,ES,902320320


In [4]:
calendar_dates = pd.read_csv(savepath+"/calendar_dates.txt", sep=",")

calendar_dates.head()

Unnamed: 0,service_id,date,exception_type
0,2022-12-012022-12-10001901,20221204,2
1,2022-12-112023-03-12001901,20221211,2
2,2022-12-112023-03-12001901,20221218,2
3,2022-12-112023-03-12001901,20221225,2
4,2022-12-112023-03-12001901,20230101,2


In [5]:
# Parse date column
calendar_dates.date = calendar_dates.date.apply(lambda d : datetime.datetime.strptime(str(d), '%Y%m%d'))

In [6]:
calendar_dates.head()

Unnamed: 0,service_id,date,exception_type
0,2022-12-012022-12-10001901,2022-12-04,2
1,2022-12-112023-03-12001901,2022-12-11,2
2,2022-12-112023-03-12001901,2022-12-18,2
3,2022-12-112023-03-12001901,2022-12-25,2
4,2022-12-112023-03-12001901,2023-01-01,2


In [7]:
res = []

# Iterate directory
for filename in os.listdir(savepath):
    # check only text files
    if filename.endswith('.txt'):
        res.append(filename)
print(res)

['agency.txt', 'calendar_dates.txt', 'stop_times.txt', 'trips.txt', 'stops.txt', 'calendar.txt', 'routes.txt']


In [2]:
renfe_schedules = {}

savepath = "datasets/"

for filename in os.listdir(savepath):
    if filename.endswith('.txt'):
        renfe_schedules[filename.split(".")[0]] = pd.read_csv(savepath+filename, delimiter=',')

In [3]:
renfe_schedules.keys()

dict_keys(['agency', 'calendar_dates', 'stop_times', 'trips', 'stops', 'calendar', 'routes'])

In [10]:
renfe_schedules['calendar_dates']

Unnamed: 0,service_id,date,exception_type
0,2022-12-012022-12-10001901,20221204,2
1,2022-12-112023-03-12001901,20221211,2
2,2022-12-112023-03-12001901,20221218,2
3,2022-12-112023-03-12001901,20221225,2
4,2022-12-112023-03-12001901,20230101,2
...,...,...,...
49942,2022-12-012022-12-10385232,20221205,2
49943,2022-12-012022-12-10385232,20221206,2
49944,2022-12-012022-12-10385232,20221207,2
49945,2022-12-012022-12-10385232,20221208,2


In [11]:
# Parse date of service to format: Day/Month/Year

parse_date = lambda d : datetime.datetime.strptime(str(d), '%Y%m%d').date().strftime("%d/%m/%Y")

renfe_schedules['calendar_dates'].date = renfe_schedules['calendar_dates'].date.apply(parse_date)

renfe_schedules['calendar_dates']

Unnamed: 0,service_id,date,exception_type
0,2022-12-012022-12-10001901,04/12/2022,2
1,2022-12-112023-03-12001901,11/12/2022,2
2,2022-12-112023-03-12001901,18/12/2022,2
3,2022-12-112023-03-12001901,25/12/2022,2
4,2022-12-112023-03-12001901,01/01/2023,2
...,...,...,...
49942,2022-12-012022-12-10385232,05/12/2022,2
49943,2022-12-012022-12-10385232,06/12/2022,2
49944,2022-12-012022-12-10385232,07/12/2022,2
49945,2022-12-012022-12-10385232,08/12/2022,2


In [12]:
renfe_schedules['routes']

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,7180161307GL026,1071,Intercity,,,2,,F2F5F5,...
1,7180161200GL026,1071,Intercity,,,2,,F2F5F5,...
2,1700037606GL026,1071,Intercity,,,2,,F2F5F5,...
3,3760617000GL026,1071,Intercity,,,2,,F2F5F5,...
4,1700037606GL023,1071,ALVIA,,,2,,F2F5F5,...
...,...,...,...,...,...,...,...,...,...
598,8111081100VRM,1071,MD,,,2,,F2F5F5,...
599,8110081110VRM,1071,MD,,,2,,F2F5F5,...
600,1510022308VRM,1071,MD,,,2,,F2F5F5,...
601,1510022100VRM,1071,MD,,,2,,F2F5F5,...


In [13]:
x = renfe_schedules['routes'].route_short_name.values

x = set(x)

x

{'ALVIA',
 'AVANT',
 'AVE',
 'AVE-TGV',
 'AVLO',
 'EUROMED',
 'Intercity',
 'MD',
 'REG.EXP.',
 'REGIONAL',
 'RODALIES',
 'TORRE ORO'}

In [14]:
# Servicios alta velocidad y larga distancia en territorio español
s = {'ALVIA', 'AVE', 'AVLO'}

renfe_schedules['routes'] = renfe_schedules['routes'][renfe_schedules['routes']['route_short_name'].isin(s)]

renfe_schedules['routes']

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
4,1700037606GL023,1071,ALVIA,,,2,,F2F5F5,...
5,3760617000GL023,1071,ALVIA,,,2,,F2F5F5,...
16,1320071801GL023,1071,ALVIA,,,2,,F2F5F5,...
17,0404071801GL023,1071,ALVIA,,,2,,F2F5F5,...
18,1320011200GL023,1071,ALVIA,,,2,,F2F5F5,...
...,...,...,...,...,...,...,...,...,...
231,0430760000LC112,1071,AVLO,,,2,,F2F5F5,...
232,7180160000LC112,1071,AVLO,,,2,,F2F5F5,...
233,6000004307LC112,1071,AVLO,,,2,,F2F5F5,...
234,6000003216LC112,1071,AVLO,,,2,,F2F5F5,...


In [15]:
# Filter Route id contains GL (ALVIA), AV (AVE) or LC (AVLO)

renfe_schedules['trips']

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible
0,7180161307GL026,2022-12-012022-12-10001651,0016512022-12-01,,165,,,,1
1,7180161307GL026,2022-12-112022-12-19001651,0016512022-12-11,,165,,,,2
2,7180161307GL026,2022-12-202023-01-29001651,0016512022-12-20,,165,,,,2
3,7180161200GL026,2022-12-012022-12-10001653,0016532022-12-01,,165,,,,2
4,7180161200GL026,2022-12-112022-12-19001653,0016532022-12-11,,165,,,,2
...,...,...,...,...,...,...,...,...,...
15623,6000003216AV006,2022-12-242022-12-24950701,9507012022-12-24,,95070,,,,2
15624,6000003216AV006,2022-12-252023-02-05950701,9507012022-12-25,,95070,,,,2
15625,6000003216AV006,2023-02-062023-02-06950701,9507012023-02-06,,95070,,,,2
15626,6000003216AV006,2023-02-072023-02-07950701,9507012023-02-07,,95070,,,,2


In [16]:
s = {'GL', 'AV', 'LC'}

# Service id es único
# Route ids se repiten

renfe_schedules['trips'] = renfe_schedules['trips'][renfe_schedules['trips']['route_id'].str.contains('|'.join(s))]

renfe_schedules['trips']

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible
0,7180161307GL026,2022-12-012022-12-10001651,0016512022-12-01,,165,,,,1
1,7180161307GL026,2022-12-112022-12-19001651,0016512022-12-11,,165,,,,2
2,7180161307GL026,2022-12-202023-01-29001651,0016512022-12-20,,165,,,,2
3,7180161200GL026,2022-12-012022-12-10001653,0016532022-12-01,,165,,,,2
4,7180161200GL026,2022-12-112022-12-19001653,0016532022-12-11,,165,,,,2
...,...,...,...,...,...,...,...,...,...
15623,6000003216AV006,2022-12-242022-12-24950701,9507012022-12-24,,95070,,,,2
15624,6000003216AV006,2022-12-252023-02-05950701,9507012022-12-25,,95070,,,,2
15625,6000003216AV006,2023-02-062023-02-06950701,9507012023-02-06,,95070,,,,2
15626,6000003216AV006,2023-02-072023-02-07950701,9507012023-02-07,,95070,,,,2


In [17]:
x = renfe_schedules['trips'].route_id.values

x = len(set(x))

x

292

In [7]:
os.chdir("..")
from src.robin.offer.entities import *

s = Station(0, "MAD", "Madrid")

df = renfe_schedules['stops']

df['stop_id'] = df['stop_id'].astype(int)
df['stop_lat'] = df['stop_lat'].astype(float)
df['stop_lon'] = df['stop_lon'].astype(float)

stopsD = dict(zip(df.stop_name, zip(df.stop_id, zip(df.stop_lat, df.stop_lon))))

stations = [Station(stopsD[s][0], str(s), str(str(s)[:3].upper()), stopsD[s][1]) for s in stopsD]

print(stations[1])

[61307,Cartagena,CAR,(37.604967, -0.975122)]


In [20]:
x = renfe_schedules['trips'].route_id.values

x = len(set(x))

print(x)
print(len(renfe_schedules['trips'].route_id.values))
y = sum([1 for v in renfe_schedules['trips'].route_id.values if len(v) == 15])
print(y)

292
11935
11929


In [9]:
pairs = []
for v in renfe_schedules['trips'].route_id.values:
    if len(v) == 15:
        try:
            org = int(v[:5])
            dest = int(v[5:10])
        except:
            continue
        pairs.append((org, dest))

pairs

[(71801, 61307),
 (71801, 61307),
 (71801, 61307),
 (71801, 61200),
 (71801, 61200),
 (71801, 61200),
 (17000, 37606),
 (17000, 37606),
 (17000, 37606),
 (17000, 37606),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (17000, 37606),
 (17000, 37606),
 (17000, 37606),
 (17000, 37606),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (37606, 17000),
 (61307, 71801),
 (61307, 71801),
 (61307, 71801),
 (61200, 71801),
 (61200, 71801),
 (61200, 71801),
 (17000, 56312),
 (17000, 56312),
 (17000, 50300),
 (17000, 50300),
 (56312, 17000),
 (56312, 17000),
 (50300, 17000),
 (50300, 17000),
 (11200, 13200),
 (11200, 13200),
 (11511, 11208),
 (11511, 11208),
 (11208, 11511),
 (11208, 11511),
 (13200, 11200),
 (13200, 11200),
 (13200, 71801

In [10]:
renfe_schedules['trips'].route_id.values

array(['7180161307GL026', '7180161307GL026', '7180161307GL026', ...,
       '6000003216AV006', '6000003216AV006', '6000003216AV006'],
      dtype=object)

In [11]:
pairsStation = []

for p in pairs:
    keys = [s.id for s in stations]
    if p[0] in keys and p[1] in keys:
        o_index = keys.index(p[0])
        d_index = keys.index(p[1])
        pairsStation.append((stations[o_index], stations[d_index]))

In [12]:
for ps in pairsStation[:5]:
    for s in ps:
        print(s, end= ", ")
    print()

print(len(pairsStation))

[71801,Barcelona-Sants,BAR,(41.37922, 2.140624)], [61307,Cartagena,CAR,(37.604967, -0.975122)], 
[71801,Barcelona-Sants,BAR,(41.37922, 2.140624)], [61307,Cartagena,CAR,(37.604967, -0.975122)], 
[71801,Barcelona-Sants,BAR,(41.37922, 2.140624)], [61307,Cartagena,CAR,(37.604967, -0.975122)], 
[71801,Barcelona-Sants,BAR,(41.37922, 2.140624)], [61200,Murcia,MUR,(37.97465, -1.1299555)], 
[71801,Barcelona-Sants,BAR,(41.37922, 2.140624)], [61200,Murcia,MUR,(37.97465, -1.1299555)], 
11929


In [13]:
setPairs = set(pairsStation)

In [14]:
print(len(setPairs))

237


In [15]:
for sp in list(setPairs)[:5]:
    for s in sp:
        print(s, end=", ")
    print()

[10600,Valladolid,VAL,(41.642167, -4.726986)], [11014,Burgos-Rosa de Lima,BUR,(42.371197, -3.666308)], 
[17000,Madrid-Chamartin,MAD,(40.4720993, -3.6824687)], [60911,Alicante/alacant,ALI,(38.34445, -0.4950527)], 
[11208,Vitoria/gasteiz,VIT,(42.841517, -2.672601)], [11511,San Sebastian/donostia,SAN,(43.317669, -1.976704)], 
[4040,Zaragoza-Delicias,ZAR,(41.658649, -0.911615)], [70600,Calatayud,CAL,(41.346692, -1.63868)], 
[65312,Vinaros,VIN,(40.471682, 0.455727)], [3216,Valencia Joaquin Sorolla,VAL,(39.459051, -0.382923)], 


In [16]:
station_ids = set(pairs)

In [19]:
lines = s = {'GL', 'AV', 'LC'}

# Service id es único
# Route ids se repiten

renfe_schedules['trips'] = renfe_schedules['trips'][renfe_schedules['trips']['route_id'].str.contains('|'.join(s))]

[(51405, 60000), (71801, 13200), (81100, 60000), (2003, 78400), (31412, 21010)]

In [20]:
c = (((1, 2), (2, 3), (3, 4)), ((1, 2), (2, 7), (7, 9)), ((1, 2), (2, 3), (3, 4)))

set(c)

{((1, 2), (2, 3), (3, 4)), ((1, 2), (2, 7), (7, 9))}