In [1]:
import pandas as pd
import requests
import datetime
import shutil
import errno
import os

savepath = "datasets/"

In [2]:
# Renfe Horarios de alta velocidad, larga distancia y media distancia
url = 'https://ssl.renfe.com/gtransit/Fichero_AV_LD/google_transit.zip'

flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY

# Download zip file
req = requests.get(url)

# Get file name from url
filename = url.split('/')[-1]

if not os.path.exists(savepath):
   os.makedirs(savepath)

try:
    file_handle = os.open(savepath+filename, flags)
except OSError as e:
    if e.errno == errno.EEXIST:  # Failed as the file already exists.
        print('File already exists!')
    else:  # Something unexpected went wrong so reraise the exception.
        raise
else:  # No exception, so the file must have been created successfully.
    # Writing the file to the local file system
    with open(savepath+filename, 'wb') as output_file:
        output_file.write(req.content)

    # Unzip
    shutil.unpack_archive(savepath+filename, savepath)

    print('File downloaded successfully!')

File already exists!


In [3]:
agency = pd.read_csv(savepath+"/agency.txt", sep=",")

agency.head()

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone
0,1071,RENFE OPERADORA,http://www.renfe.com,Europe/Madrid,ES,902320320


In [4]:
calendar_dates = pd.read_csv(savepath+"/calendar_dates.txt", sep=",")

calendar_dates.head()

Unnamed: 0,service_id,date,exception_type
0,2022-11-302022-12-10001901,20221204,2
1,2022-12-112023-03-12001901,20221211,2
2,2022-12-112023-03-12001901,20221218,2
3,2022-12-112023-03-12001901,20221225,2
4,2022-12-112023-03-12001901,20230101,2


In [5]:
# Parse date column
calendar_dates.date = calendar_dates.date.apply(lambda d : datetime.datetime.strptime(str(d), '%Y%m%d'))

In [6]:
calendar_dates.head()

Unnamed: 0,service_id,date,exception_type
0,2022-11-302022-12-10001901,2022-12-04,2
1,2022-12-112023-03-12001901,2022-12-11,2
2,2022-12-112023-03-12001901,2022-12-18,2
3,2022-12-112023-03-12001901,2022-12-25,2
4,2022-12-112023-03-12001901,2023-01-01,2


In [7]:
res = []

# Iterate directory
for filename in os.listdir(savepath):
    # check only text files
    if filename.endswith('.txt'):
        res.append(filename)
print(res)

['agency.txt', 'calendar_dates.txt', 'stop_times.txt', 'trips.txt', 'stops.txt', 'calendar.txt', 'routes.txt']


In [8]:
renfe_schedules = {}

savepath = "datasets/"

for filename in os.listdir(savepath):
    if filename.endswith('.txt'):
        renfe_schedules[filename.split(".")[0]] = pd.read_csv(savepath+filename, delimiter=',')

In [9]:
renfe_schedules.keys()

dict_keys(['agency', 'calendar_dates', 'stop_times', 'trips', 'stops', 'calendar', 'routes'])

In [10]:
renfe_schedules['calendar_dates']

Unnamed: 0,service_id,date,exception_type
0,2022-11-302022-12-10001901,20221204,2
1,2022-12-112023-03-12001901,20221211,2
2,2022-12-112023-03-12001901,20221218,2
3,2022-12-112023-03-12001901,20221225,2
4,2022-12-112023-03-12001901,20230101,2
...,...,...,...
50122,2022-11-302022-12-10385232,20221205,2
50123,2022-11-302022-12-10385232,20221206,2
50124,2022-11-302022-12-10385232,20221207,2
50125,2022-11-302022-12-10385232,20221208,2


In [11]:
# Parse date of service to format: Day/Month/Year

parse_date = lambda d : datetime.datetime.strptime(str(d), '%Y%m%d').date().strftime("%d/%m/%Y")

renfe_schedules['calendar_dates'].date = renfe_schedules['calendar_dates'].date.apply(parse_date)

renfe_schedules['calendar_dates']

Unnamed: 0,service_id,date,exception_type
0,2022-11-302022-12-10001901,04/12/2022,2
1,2022-12-112023-03-12001901,11/12/2022,2
2,2022-12-112023-03-12001901,18/12/2022,2
3,2022-12-112023-03-12001901,25/12/2022,2
4,2022-12-112023-03-12001901,01/01/2023,2
...,...,...,...
50122,2022-11-302022-12-10385232,05/12/2022,2
50123,2022-11-302022-12-10385232,06/12/2022,2
50124,2022-11-302022-12-10385232,07/12/2022,2
50125,2022-11-302022-12-10385232,08/12/2022,2


In [12]:
renfe_schedules['routes']

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,7180161307GL026,1071,Intercity,,,2,,F2F5F5,...
1,7180161200GL026,1071,Intercity,,,2,,F2F5F5,...
2,1700037606GL026,1071,Intercity,,,2,,F2F5F5,...
3,3760617000GL026,1071,Intercity,,,2,,F2F5F5,...
4,1700037606GL023,1071,ALVIA,,,2,,F2F5F5,...
...,...,...,...,...,...,...,...,...,...
598,8111081100VRM,1071,MD,,,2,,F2F5F5,...
599,8110081110VRM,1071,MD,,,2,,F2F5F5,...
600,1510022308VRM,1071,MD,,,2,,F2F5F5,...
601,1510022100VRM,1071,MD,,,2,,F2F5F5,...


In [13]:
x = renfe_schedules['routes'].route_short_name.values

x = set(x)

x

{'ALVIA',
 'AVANT',
 'AVE',
 'AVE-TGV',
 'AVLO',
 'EUROMED',
 'Intercity',
 'MD',
 'REG.EXP.',
 'REGIONAL',
 'RODALIES',
 'TORRE ORO'}

In [14]:
# Servicios alta velocidad y larga distancia en territorio español
s = {'ALVIA', 'AVE', 'AVLO'}

renfe_schedules['routes'] = renfe_schedules['routes'][renfe_schedules['routes']['route_short_name'].isin(s)]

renfe_schedules['routes']

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
4,1700037606GL023,1071,ALVIA,,,2,,F2F5F5,...
5,3760617000GL023,1071,ALVIA,,,2,,F2F5F5,...
16,1320071801GL023,1071,ALVIA,,,2,,F2F5F5,...
17,0404071801GL023,1071,ALVIA,,,2,,F2F5F5,...
18,1320011200GL023,1071,ALVIA,,,2,,F2F5F5,...
...,...,...,...,...,...,...,...,...,...
231,0430760000LC112,1071,AVLO,,,2,,F2F5F5,...
232,7180160000LC112,1071,AVLO,,,2,,F2F5F5,...
233,6000004307LC112,1071,AVLO,,,2,,F2F5F5,...
234,6000003216LC112,1071,AVLO,,,2,,F2F5F5,...


In [15]:
# Filter Route id contains GL (ALVIA), AV (AVE) or LC (AVLO)

renfe_schedules['trips']

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible
0,7180161307GL026,2022-11-302022-12-10001651,0016512022-11-30,,165,,,,1
1,7180161200GL026,2022-11-302022-12-10001653,0016532022-11-30,,165,,,,2
2,1700037606GL026,2022-11-302022-12-10001901,0019012022-11-30,,190,,,,1
3,1700037606GL026,2022-12-112023-03-12001901,0019012022-12-11,,190,,,,1
4,1700037606GL026,2022-11-302022-12-10001902,0019022022-11-30,,190,,,,2
...,...,...,...,...,...,...,...,...,...
15724,6000003216AV006,2022-12-242022-12-24950701,9507012022-12-24,,95070,,,,2
15725,6000003216AV006,2022-12-252023-02-05950701,9507012022-12-25,,95070,,,,2
15726,6000003216AV006,2023-02-062023-02-06950701,9507012023-02-06,,95070,,,,2
15727,6000003216AV006,2023-02-072023-02-07950701,9507012023-02-07,,95070,,,,2


In [16]:
s = {'GL', 'AV', 'LC'}

# Service id es único
# Toute ids se repiten

renfe_schedules['trips'] = renfe_schedules['trips'][renfe_schedules['trips']['route_id'].str.contains('|'.join(s))]

renfe_schedules['trips']

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible
0,7180161307GL026,2022-11-302022-12-10001651,0016512022-11-30,,165,,,,1
1,7180161200GL026,2022-11-302022-12-10001653,0016532022-11-30,,165,,,,2
2,1700037606GL026,2022-11-302022-12-10001901,0019012022-11-30,,190,,,,1
3,1700037606GL026,2022-12-112023-03-12001901,0019012022-12-11,,190,,,,1
4,1700037606GL026,2022-11-302022-12-10001902,0019022022-11-30,,190,,,,2
...,...,...,...,...,...,...,...,...,...
15724,6000003216AV006,2022-12-242022-12-24950701,9507012022-12-24,,95070,,,,2
15725,6000003216AV006,2022-12-252023-02-05950701,9507012022-12-25,,95070,,,,2
15726,6000003216AV006,2023-02-062023-02-06950701,9507012023-02-06,,95070,,,,2
15727,6000003216AV006,2023-02-072023-02-07950701,9507012023-02-07,,95070,,,,2


In [17]:
x = renfe_schedules['trips'].route_id.values

x = len(set(x))

x

292