## Train schedules data from Digitraffic  (in JSON)

Data from Digitraffic's [https://rata.digitraffic.fi](https://rata.digitraffic.fi)-interface. License [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/)


In [1]:
import json
import requests
import pandas as pd
import numpy as np
import datetime as dt

# The function for parsing timestamps
def parsetime(time):
    getdate = time.split('T')[0]
    gettime = time.split('T')[1].split('.')[0]
    return(pd.to_datetime(getdate+" "+gettime))

### Parsing the JSON

In [2]:
# Function to parse a train's data
def get_train_json(traindate, trainnro, realtime, schedtime, stations, types, delays, category, printit):
    request = 'https://rata.digitraffic.fi/api/v1/trains/'+traindate+'/'+trainnro
    data = requests.get(request)
    json = data.json()

    # Get the train category: cargo/test drive/long distance etc
    cat = json[0]['trainCategory']

    # Iterate through the json items in 'timeTableRows':
    for i in range(len(json[0]['timeTableRows'])):
        # get type: Arrival or Departure
        types.append(json[0]['timeTableRows'][i]['type'])
        # get scheduled time
        sched = parsetime(json[0]['timeTableRows'][i]['scheduledTime']) 
        
        # Get actual time, if exists...
        if 'actualTime' in json[0]['timeTableRows'][i]:                      
            arrdep_time = parsetime(json[0]['timeTableRows'][i]['actualTime']) 
        elif 'liveEstimateTime' in json[0]['timeTableRows'][i]:  #...otherwise get estimate
            arrdep_time = parsetime(json[0]['timeTableRows'][i]['liveEstimateTime'])
        else:
            print(trainnro, " cancelled")
            return 0 # if time not found, then the train was cancelled.
       
        # get station name
        stations.append(json[0]['timeTableRows'][i]['stationShortCode'])

        # print info, if printit = 1
        if printit:
            print(types[i] +" time:" , arrdep_time)
            print("Scheduled time:" , sched)
            print("Station: ",json[0]['timeTableRows'][i]['stationShortCode'])

        # late / early calculated
        ero = abs(sched - arrdep_time) 

        if (arrdep_time < sched):
            ero = -ero.seconds
            if printit:
                print("Early: ", ero)
        else:
            ero = ero.seconds
            if printit:
                print("Late: ",ero)

        # append times, schedules, delays and category to lists
        realtime.append(arrdep_time)
        schedtime.append(sched)
        delays.append(ero)
        category.append(cat)

### Get a train's arrivals and departures by it's number on a certain date

In [3]:
traindate = '2018-01-01'
trainnro  = '88'

realtime = []  # Actual times of arrivals 
schedtime = [] # Scheduled arrival times
stations = []  # Stations visited
types = []     # Departure or arrival
delays = []    # Contains information, of the trains being late or early (in seconds)
category = []  # Train's category: cargo/long distance/locomotive only, etc..

# Use the 'get_train_json' -function to get the train schedule data
get_train_json(traindate, trainnro, realtime, schedtime, stations, types, delays, category, printit=0)

print("\nTrain number", trainnro)
if len(stations) > 0: # If stations = 0, then the train was cancelled. Otherwise compose a dataframe.
    print("Number of arrivals/departures on the route: ", len(stations))
    print("Mean punctuality of arrivals/departures:  ", np.mean(delays))
    print("\nFinal destination (delay in seconds): ", delays[-1])
else:
    print("Cancelled")


Train number 88
Number of arrivals/departures on the route:  114
Mean punctuality of arrivals/departures:   248.385964912

Final destination (delay in seconds):  62


### Get the ID numbers of trains on a certain route

In [4]:
def trains_on_route(traindate, dep_station, arr_station):
    templist = []
    # Get the trains (train numbers) from Jyväskylä to Tampere
    request = 'https://rata.digitraffic.fi/api/v1/live-trains/station/'+dep_station+'/'+arr_station+'/?departure_date='+traindate
    data = requests.get(request)
    jy_tpe_trains = data.json()

    # List all trains on this route (JY-TPE)
    print("\nTrains on the route in", traindate ,": ",end="\t")
    for i in range(0,len(jy_tpe_trains)): 
        print(jy_tpe_trains[i]['trainNumber'], end="\t")
        templist.append(jy_tpe_trains[i]['trainNumber'])
        
    return templist


# Fetch trains by date, point of departure and target
traindate = '2018-01-01'
dep_station = 'JY'
arr_station = 'TPE'
trainslist = []
trainslist = trains_on_route(traindate, dep_station, arr_station)


Trains on the route in 2018-01-01 : 	142	144	86	146	88	148	152	4042	54042	

### Collect all the trains on a route the same date

In [5]:
def collect_train_data(traindate, trainslist):
    temptrains = pd.DataFrame()
    # Iterate through the 'trainslist' created earlier:
    for trainnro in trainslist:
        realtime = []  # Actual times of arrivals 
        schedtime = [] # Scheduled arrival times
        stations = []  # Stations visited
        types = []     # Departure or arrival
        delays = []    # Contains information on the trains being late or early (in seconds)
        category = []  # Train's category: cargo/long distance/locomotive only, etc..

        # Use the 'get_train_json' -function to get the train schedule data
        get_train_json(traindate, str(trainnro), realtime, schedtime, stations, types, delays, category, printit=0)

        if len(stations) > 0:
            #print(trainnro, " added", end="\t")
            trains = pd.DataFrame(realtime, columns=['Timestamp'])
            trains['Train ID'] = trainnro
            trains['Scheduled'] = schedtime
            trains['Scheduled'] = trains['Scheduled'].dt.time
            trains['Actual time'] = trains['Timestamp'].dt.time # get only time from date
            #trains['Date'] = trains['Date'].dt.date        # get only date from date
            trains['Delay'] = delays
            trains['Station'] = stations
            trains['Dep/Arr'] = types
            trains['Category'] = category

            #print(trainnro, schedtime, realtime)
            # Concatenate all queries to one dataframe: 'alltrains'
            temptrains = pd.concat([temptrains,trains], ignore_index=True)
        #else:
        #    print(trainnro, " cancelled", end="\t")
    return temptrains


# Create dataframe to hold all trains' data
alltrains = pd.DataFrame()
# Set a date to collect from
traindate = '2018-01-01'
alltrains = collect_train_data(traindate, trainslist) # trainslist was created earlier...
alltrains.sample(5)
#(trains['Real times'][0]).time()


4042  cancelled


Unnamed: 0,Timestamp,Train ID,Scheduled,Actual time,Delay,Station,Dep/Arr,Category
78,2018-01-01 11:16:16,142,11:12:00,11:16:16,256,ARP,DEPARTURE,Long-distance
760,2018-01-01 18:56:31,152,18:55:00,18:56:31,91,HKS,DEPARTURE,Long-distance
838,2018-01-01 23:01:24,152,22:01:00,23:01:24,3624,JK,DEPARTURE,Long-distance
616,2018-01-01 16:30:37,148,16:29:00,16:30:37,97,VKI,DEPARTURE,Long-distance
623,2018-01-01 17:07:47,148,17:06:31,17:07:47,76,RHL,ARRIVAL,Long-distance


### Trains from Jyväskylä to Tampere

In [6]:
# We want to examine the route from Jyväskylä to Tampere, so let's pick up the data between those stations:

# Create 'route' dataframe selecting the trains with JY as DEPARTURE station and trains with TPE as ARRIVAL station
route = alltrains.loc[((alltrains['Station'] == 'JY') & (alltrains['Dep/Arr'] == 'DEPARTURE')
                      | (alltrains['Station'] == 'TPE') & (alltrains['Dep/Arr'] == 'ARRIVAL'))]

# Get the trains' numbers on that route
#numbers = route.loc[(route['Station'] == 'JY') & (route['Dep/Arr'] == 'DEPARTURE'), 'Train ID']
#JY_TPE = JY_TPE.loc[JY_TPE['Train ID'].isin(list(numbers))]

route = route.sort_values('Train ID')
route.head()

Unnamed: 0,Timestamp,Train ID,Scheduled,Actual time,Delay,Station,Dep/Arr,Category
256,2018-01-01 13:18:12,86,13:18:00,13:18:12,12,JY,DEPARTURE,Long-distance
279,2018-01-01 14:51:34,86,14:50:00,14:51:34,94,TPE,ARRIVAL,Long-distance
498,2018-01-01 16:12:34,88,16:13:00,16:12:34,-26,JY,DEPARTURE,Long-distance
521,2018-01-01 17:52:30,88,17:44:00,17:52:30,510,TPE,ARRIVAL,Long-distance
14,2018-01-01 08:14:50,142,08:15:00,08:14:50,-10,JY,DEPARTURE,Long-distance


In [7]:
# Get only arrival times to Tampere
#route = route.loc[route['Dep/Arr'] == "ARRIVAL"]
#route['rounded time'] = route['Timestamp'].dt.round('60min')  # round up the timestamp to next hour
#route

---

## Weather data from the Finnish Meteorological Insititute  (in XML)
https://en.ilmatieteenlaitos.fi/open-data 

(MIT license)

---

http://ilmatieteenlaitos.fi/latauspalvelun-pikaohje / http://ilmatieteenlaitos.fi/tallennetut-kyselyt / http://www.finwx.net/forum/index.php?topic=3583.80


### Fetch the XML data with API-key and save to file

In [8]:
# Registration needed for the API-key...
infile = open('../../ilmat.api', 'r')
APIKEY = str(infile.readline().splitlines()[0])

# TESTING
#wdata = requests.get("http://data.fmi.fi/fmi-apikey/"+APIKEY+"/wfs?request=getFeature&storedquery_id=fmi::observations::weather::timevaluepair&place=Tampere&place=Pirkkala", stream=True)
#with open("testi.xml", 'wb') as fd:                           # WeatherSymbol3
#    for chunk in wdata.iter_content(chunk_size=128):
#        fd.write(chunk)


# The function to fetch weather data, timestep 1h. Print_success=1 -> print out request status
def get_weather_xml(starttime, endtime, place, savefile, print_success):
    we  = 'http://data.fmi.fi/fmi-apikey/'+APIKEY+'/wfs?request=getFeature&storedquery_id='
    ath = 'fmi::observations::weather::timevaluepair&parameters=temperature,r_1h,snow_aws'
    er = '&place='+place+'&timestep=60&starttime='+starttime+'&endtime='+endtime+'&'
    #print(we+ath+er)
    # Request data
    wdata = requests.get(we+ath+er, stream=True)

    # Save XML file
    with open(savefile, 'wb') as fd:
        for chunk in wdata.iter_content(chunk_size=128):
            fd.write(chunk)

    # Print out confirmation, that the request succeeded
    if print_success: 
        print(wdata.headers.get('content-type'), " / ", wdata.reason, " / Status: ", wdata.status_code)


# An example of fetching weather data for a certain period, and save using get_weather_xml -function
starttime = '2018-03-01T00:00:00Z'
endtime   = '2018-03-01T23:59:59Z' # no more data can be requested at once
place     = 'Jyväskylä'
savefile  = 'data/w_jkl.xml'

get_weather_xml(starttime, endtime, place, savefile, 1)

text/xml; charset=UTF-8  /  OK  / Status:  200


### Parse XML using the Minidom -library

In [9]:
#from xml.etree import ElementTree as ET # Another library to parse XML, perhaps more versatile than minidom
from xml.dom.minidom import parse
import xml.dom.minidom


def parse_weather_xml(times, temps, rains, snow, savefile):
    # Parse the earlier created XML with minidom parser
    DOMTree = xml.dom.minidom.parse(savefile)   
    # Get elements from parsed XML
    doc = DOMTree.documentElement

    # Get the nodes (items 0-2 = 'feature') that include 'om:result' and 'wml2:MeasurementTimeseries' tag inside it
    for feat in range(3):
        temprainsnow = doc.getElementsByTagName("om:result")[feat].getElementsByTagName('wml2:MeasurementTimeseries')
        for elements in temprainsnow:
            nodes = elements.childNodes
            # iterate through all childnodes
            for node in nodes:
                if node.nodeType == node.ELEMENT_NODE:  # if element type is "element", not "text"
                    # iterate through "time" -elements
                    nodestime = node.getElementsByTagName('wml2:time') # get element by tag "time"
                    for nodetime in nodestime:
                        if nodetime.nodeType == nodetime.ELEMENT_NODE:
                            if feat == 0:
                                #print(nodetime.firstChild.nodeValue)
                                times.append(parsetime(nodetime.firstChild.data)) # vain ekassa kierroksessa?


                    # iterate through "value" -elements
                    nodesvalue = node.getElementsByTagName('wml2:value') # get element by tag "value"(temp&rain&snow)
                    for nodevalue in nodesvalue:
                        if nodevalue.nodeType == nodevalue.ELEMENT_NODE:
                            # get the data from rounds of 'temprain' to correct lists
                            if feat == 0:
                                temps.append(nodevalue.firstChild.data)
                            elif feat == 1:
                                rains.append(nodevalue.firstChild.data)
                            else:
                                snow.append(nodevalue.firstChild.data)                        
                            #print(feat, "  " ,nodevalue.firstChild.nodeValue)

### Get weather XML, parse, compose to DataFrame

In [10]:
# Get weather XML and then parse ...
starttime = '2018-03-01T00:00:00Z'
endtime   = '2018-03-01T23:59:59Z' # no more data can be requested at once
place     = 'Härmälä' # Measuring point in Tampere, that also has snow depth and rain
savefile  = 'data/w_tre.xml'
get_weather_xml(starttime, endtime, place, savefile, 1)


# Create lists to contain the data
times = []
temps = []  #obs-obs-1-1-temperature
rains = []  #obs-obs-1-1-r_1h
snow  = []  #obs-1-1-snow_aws
parse_weather_xml(times, temps, rains, snow, savefile)


# Function to compose a weather dataframe
def compose_weather_dataframe(weatherdata, times, temps, rains, snow, place):
    weatherdata = pd.DataFrame(times, columns=['Timestamp'])
    weatherdata['Temperature'] = pd.to_numeric(temps, errors='coerce') # Convert to numeric, 
    weatherdata['Rain'] = pd.to_numeric(rains, errors='coerce')        # possible NaN values will remain
    weatherdata['Snow depth'] = pd.to_numeric(snow, errors='coerce')
    weatherdata['Place'] = place 
    weatherdata.fillna(method='ffill', inplace=True) # Forward fill = take the preceding value of the current column
    return weatherdata


# Compose dataframe by function    
weatherdata = compose_weather_dataframe("weatherdata", times, temps, rains, snow, place)
weatherdata.head()

text/xml; charset=UTF-8  /  OK  / Status:  200


Unnamed: 0,Timestamp,Temperature,Rain,Snow depth,Place
0,2018-03-01 00:00:00,-24.0,0.0,44.0,Härmälä
1,2018-03-01 01:00:00,-24.1,0.0,44.0,Härmälä
2,2018-03-01 02:00:00,-24.4,0.0,44.0,Härmälä
3,2018-03-01 03:00:00,-25.1,0.0,44.0,Härmälä
4,2018-03-01 04:00:00,-25.9,0.0,44.0,Härmälä


---

## Combining the weather and trains data

### Gather weather data from a longer period of time

In [11]:
# This function request weather data one day at a time, and then concatenates the data together
def get_weather_long(periodstart, periodend, place, savefile):
    # Get weather XML day by day, append requested data to 'weather_long'
    weather_long = pd.DataFrame() #if not isinstance(weather_long, type(None)): # if dataframe does not exist...
    day = periodstart
    while day < periodend:
        starttime = day
        # Parse date and add 24 hours (-1 sec), then convert back to required format (add T and Z)
        endtime = str(parsetime(day)+dt.timedelta(hours=23, minutes=59, seconds=59)).replace(' ', 'T', 1)+'Z'

        # Get data
        get_weather_xml(starttime, endtime, place, savefile, 0) # 0 = don't print status

        # The next day
        day = str(parsetime(day)+dt.timedelta(days=1)).replace(' ', 'T', 1)+'Z'

        # Create lists to contain the data
        times = []  #timestamps
        temps = []  #obs-obs-1-1-temperature
        rains = []  #obs-obs-1-1-r_1h
        snow  = []  #obs-1-1-snow_aws
        parse_weather_xml(times, temps, rains, snow, savefile)
        # Compose the data
        weatherdata = compose_weather_dataframe("weatherdata", times, temps, rains, snow, place)
        # Append daily data to 'weather_long' dataframe
        weather_long = weather_long.append(weatherdata)
    return weather_long


# Measuring point
#place     = 'Härmälä'

# Gather weather data for the period
#weather_long = get_weather_long(periodstart, periodend, place, "tempfile.xml")
#weather_long.sample(5)


# Convert datetime.time to string
def tim(row):
    return str(row)[7:15]

### Get train data from the same period

In [12]:
# Set the timeframe for our research
periodstart = '2018-03-29T00:00:00Z'
periodend   = '2018-03-31T23:59:59Z'

# Set departure and arrival stations
dep_station = 'JY'
arr_station = 'HKI'

# Set weather measurement locations (matching stations)
dep_place     = 'Jyväskylä'
arr_place     = 'Helsinki'

# Get days count through periodstart to periodend
days_count = (parsetime(periodend)+dt.timedelta(days=1)-parsetime(periodstart)).days
list_of_dates = []

# Get first date
day = parsetime(periodstart)

# Get all the dates in range to the list 'list_of_dates'
for i in range(days_count):
    list_of_dates.append(str(day)[0:10]) # append next day list (only 10 first chars = date)
    day = day+dt.timedelta(hours=24)     # add 24 hours


alltrains = pd.DataFrame() # the dataframe to hold all train data
trains = pd.DataFrame()    # temporary dataframe for trains on a certain day
deps = pd.DataFrame()      # temporary dataframe of departures

# Collect trains' data through the timeframe
for date in list_of_dates:
    # Get the list of trains on the date
    trainslist = trains_on_route(date, dep_station, arr_station) 
    
    # Get data about the trains listed on the date
    trains = collect_train_data(date, trainslist) 
    
    # Select trains departured from JY
    deps = trains.loc[(trains['Station'] == dep_station) & (trains['Dep/Arr'] == 'DEPARTURE')]
    
    # Select only the trains arriving to TPE
    trains = trains.loc[(trains['Station'] == arr_station) & (trains['Dep/Arr'] == 'ARRIVAL')]

    # Add delay at departure to the 'trains' dataframe
    trains['Delay at Dep.'] = deps['Delay'].values
    
    # Count trip time for each train
    temp = pd.merge(trains, deps, on=['Train ID'], how='inner').sort_values('Train ID')
    trains = trains.sort_values('Train ID')
    trains.reset_index(inplace=True, drop=True)
    trains['Trip time'] = temp['Timestamp_x'] - temp['Timestamp_y']
    
    # Get scheduled trip time
    temp['tempp'] = pd.to_datetime(temp['Scheduled_x'].apply(lambda x: "2017-01-01 " +str(x))
                                  ) - pd.to_datetime(temp['Scheduled_y'].apply(lambda x: "2017-01-01 " +str(x)))
    
    trains['Sched. trip time'] = temp['tempp'].apply(tim)
    
    
    # Compose dataframes to one big dataframe
    alltrains = pd.concat([alltrains, trains])
    
alltrains.reset_index(inplace=True, drop=True)
#print(trains.loc[trains['Train ID'] == 80])   


Trains on the route in 2018-03-29 : 	150	140	142	144	86	146	88	148	
Trains on the route in 2018-03-30 : 	150	140	142	144	86	146	148	
Trains on the route in 2018-03-31 : 	150	140	142	144	84	146	148	

### Combine weather with trains' arrival times

In [13]:
# These functions select weather data matching trains' arrival times and 12 hours in advance
def coldest(row): # The coldest temperature measured in past 12h from the timestamp
    row = weatherdata.loc[(weatherdata['Timestamp'] <= row) & 
                    (weatherdata['Timestamp'] > row - dt.timedelta(hours=12)), 'Temperature'].min()    
    return row
    
def rainmax(row): # The most rain/1h measured in past 12h from the timestamp
    row = weatherdata.loc[(weatherdata['Timestamp'] <= row) & 
                    (weatherdata['Timestamp'] > row - dt.timedelta(hours=12)), 'Rain'].max()
    return row
        
def raincount(row): # The amount of rain measured in past 12h from the timestamp
    row = weatherdata.loc[(weatherdata['Timestamp'] <= row) & 
                    (weatherdata['Timestamp'] > row - dt.timedelta(hours=12)), 'Rain'].sum()
    return row

def snowchange(row): # The highest depth of snow in past 12h from the timestamp (pitäisikö olla muutos?)
    row = weatherdata.loc[(weatherdata['Timestamp'] <= row) & 
                    (weatherdata['Timestamp'] > row - dt.timedelta(hours=12)), 'Snow depth'].max()# - weatherdata.loc[
    #(weatherdata['Timestamp'] <= row) & (weatherdata['Timestamp'] > row - dt.timedelta(hours=12)), 'Snow depth'].min()  
    return row

In [14]:
# Get weather from the ARRIVAL station
savefile  = 'data/w_hki.xml' # saves only temporarily, in this case
weatherdata = get_weather_long(periodstart, periodend, arr_place, savefile)

alltrains['12h Coldest Arrival'] = alltrains['Timestamp'].apply(coldest)
alltrains['12h Rain max Arrival'] = alltrains['Timestamp'].apply(rainmax)
alltrains['12h Rain total Arrival'] = alltrains['Timestamp'].apply(raincount)
alltrains['12h Snow depth Arrival'] = alltrains['Timestamp'].apply(snowchange)


# Get weather from the DEPARTURE station
savefile  = 'data/w_jkl.xml' # saves only temporarily, in this case
weatherdata = weatherdata.append(get_weather_long(periodstart, periodend, dep_place, savefile))

alltrains['12h Coldest Dep'] = alltrains['Timestamp'].apply(coldest)
alltrains['12h Rain max Dep'] = alltrains['Timestamp'].apply(rainmax)
alltrains['12h Rain total Dep'] = alltrains['Timestamp'].apply(raincount)
alltrains['12h Snow depth Dep'] = alltrains['Timestamp'].apply(snowchange)

# Convert timestamp to string
alltrains['Trip time'] = alltrains['Trip time'].apply(tim)


alltrains.loc[alltrains['Station'] == arr_station].head(5)

Unnamed: 0,Timestamp,Train ID,Scheduled,Actual time,Delay,Station,Dep/Arr,Category,Delay at Dep.,Trip time,Sched. trip time,12h Coldest Arrival,12h Rain max Arrival,12h Rain total Arrival,12h Snow depth Arrival,12h Coldest Dep,12h Rain max Dep,12h Rain total Dep,12h Snow depth Dep
0,2018-03-29 15:45:18,86,15:44:00,15:45:18,78,HKI,ARRIVAL,Long-distance,26,03:51:57,03:53:00,-5.9,0.0,0.0,13.0,-17.8,0.0,0.0,75.0
1,2018-03-29 18:40:24,88,18:39:00,18:40:24,84,HKI,ARRIVAL,Long-distance,-10,03:49:55,03:49:00,-1.6,0.0,0.0,13.0,-6.1,0.0,0.0,75.0
2,2018-03-29 08:10:28,140,08:09:00,08:10:28,88,HKI,ARRIVAL,Long-distance,33,03:53:20,03:54:00,-5.9,0.0,0.0,13.0,-17.8,0.0,0.0,75.0
3,2018-03-29 11:08:45,142,11:09:00,11:08:45,-15,HKI,ARRIVAL,Long-distance,25,03:51:33,03:56:00,-5.9,0.0,0.0,13.0,-17.8,0.0,0.0,75.0
4,2018-03-29 14:10:53,144,14:09:00,14:10:53,113,HKI,ARRIVAL,Long-distance,380,03:26:52,03:26:00,-5.9,0.0,0.0,13.0,-17.8,0.0,0.0,75.0


In [418]:
# Save the dataframe to CSV
print(len(alltrains))
alltrains.to_csv('data/JY-HKI-dec17-mar18.csv')

856
