In [None]:
import requests
import csv
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [110]:
api_link="https://rata.digitraffic.fi/api/v1/trains/{DATE}/{TRAIN_NUMBER}"

# **Task 1**

In [111]:
def fetch_train_data(date,train_number):
    url = api_link.format(DATE=date,TRAIN_NUMBER=train_number)
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data for {date} (Status Code: {response.status_code})")
        return None

In [112]:
fetch_train_data("2024-11-02", 4) # test the fuction  on train_number=4 and date=2-nov-2024

[{'trainNumber': 4,
  'departureDate': '2024-11-02',
  'operatorUICCode': 10,
  'operatorShortCode': 'vr',
  'trainType': 'IC',
  'trainCategory': 'Long-distance',
  'commuterLineID': '',
  'runningCurrently': False,
  'cancelled': False,
  'version': 289661200119,
  'timetableType': 'REGULAR',
  'timetableAcceptanceDate': '2024-05-08T10:55:08.000Z',
  'timeTableRows': [{'stationShortCode': 'JNS',
    'stationUICCode': 460,
    'countryCode': 'FI',
    'type': 'DEPARTURE',
    'trainStopping': True,
    'commercialStop': True,
    'commercialTrack': '1',
    'cancelled': False,
    'scheduledTime': '2024-11-02T04:00:00.000Z',
    'actualTime': '2024-11-02T04:01:17.000Z',
    'differenceInMinutes': 1,
    'causes': [],
    'trainReady': {'source': 'KUPLA',
     'accepted': True,
     'timestamp': '2024-11-02T03:57:19.000Z'}},
   {'stationShortCode': 'PLT',
    'stationUICCode': 1070,
    'countryCode': 'FI',
    'type': 'ARRIVAL',
    'trainStopping': False,
    'commercialTrack': '',
 

In [113]:
def monthly_data_train(train_number, start_date, end_date):
    all_train_data = []
    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime("%Y-%m-%d")
        print(f"Fetching data for {date_str}...")
        daily_data = fetch_train_data(date_str,train_number)
        if daily_data:
            all_train_data.extend(daily_data)
        current_date += timedelta(days=1)
    return all_train_data

In [114]:
data=monthly_data_train(4,datetime(2024, 11, 1), datetime(2024, 11, 30))   # fetching data of train=4 from 2024-11-01 to 2024-11-30 which is requirement of given task

Fetching data for 2024-11-01...
Fetching data for 2024-11-02...
Fetching data for 2024-11-03...
Fetching data for 2024-11-04...
Fetching data for 2024-11-05...
Fetching data for 2024-11-06...
Fetching data for 2024-11-07...
Fetching data for 2024-11-08...
Fetching data for 2024-11-09...
Fetching data for 2024-11-10...
Fetching data for 2024-11-11...
Fetching data for 2024-11-12...
Fetching data for 2024-11-13...
Fetching data for 2024-11-14...
Fetching data for 2024-11-15...
Fetching data for 2024-11-16...
Fetching data for 2024-11-17...
Fetching data for 2024-11-18...
Fetching data for 2024-11-19...
Fetching data for 2024-11-20...
Fetching data for 2024-11-21...
Fetching data for 2024-11-22...
Fetching data for 2024-11-23...
Fetching data for 2024-11-24...
Fetching data for 2024-11-25...
Fetching data for 2024-11-26...
Fetching data for 2024-11-27...
Fetching data for 2024-11-28...
Fetching data for 2024-11-29...
Fetching data for 2024-11-30...


In [116]:
def normalize_data_train(json_data):
    records = []
    for train in json_data:
        train_number = train.get("trainNumber")
        departure_date = train.get("departureDate")
        for stop in train.get("timeTableRows", []):
            record = {
                "train_number": train_number,
                "departure_date": departure_date,
                "station": stop.get("stationShortCode"),
                "type": stop.get("type"),
                "scheduled_time": stop.get("scheduledTime"),
                "actual_time": stop.get("actualTime"),
                "difference_in_minutes": stop.get("differenceInMinutes"),
            }
            records.append(record)
    return pd.DataFrame(records)

In [117]:
df=normalize_data_train(data)  # normalize the data of given task

In [118]:
df.head()

Unnamed: 0,train_number,departure_date,station,type,scheduled_time,actual_time,difference_in_minutes
0,4,2024-11-01,JNS,DEPARTURE,2024-11-01T04:00:00.000Z,2024-11-01T04:01:07.000Z,1
1,4,2024-11-01,PLT,ARRIVAL,2024-11-01T04:00:34.000Z,2024-11-01T04:02:12.000Z,2
2,4,2024-11-01,PLT,DEPARTURE,2024-11-01T04:00:34.000Z,2024-11-01T04:02:12.000Z,2
3,4,2024-11-01,SUL,ARRIVAL,2024-11-01T04:01:14.000Z,2024-11-01T04:03:34.000Z,2
4,4,2024-11-01,SUL,DEPARTURE,2024-11-01T04:01:14.000Z,2024-11-01T04:03:34.000Z,2


In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3928 entries, 0 to 3927
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   train_number           3928 non-null   int64 
 1   departure_date         3928 non-null   object
 2   station                3928 non-null   object
 3   type                   3928 non-null   object
 4   scheduled_time         3928 non-null   object
 5   actual_time            3704 non-null   object
 6   difference_in_minutes  3928 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 214.9+ KB


In [120]:
output_csv = "normalized_train_data_november_2024.csv"
df.to_csv(output_csv, index=False)

# **Task 2**

In [122]:
df['actual_time'] = pd.to_datetime(df['actual_time'])
final_destination_times = df.groupby(['train_number', 'departure_date'])['actual_time'].last()
average_arrival_time = final_destination_times.mean()
print(f"The average actual arrival time at the final destination in November 2024 is: {average_arrival_time}")

The average actual arrival time at the final destination in November 2024 is: 2024-11-15 20:46:46.166666752+00:00
