The goal of this exercise was to access real-time train information to create a schedule display for SEPTA's Suburban Station. The API access documentation can be found [here](http://www3.septa.org/hackathon/).

In [1]:
import re, requests, csv
from pprint import pprint
from datetime import datetime as dt

def get_current_schedule(station_code='30th', timeout=10):
    """Download the current schedule CSV for one SEPTA station.

    Args:
        station_code: Station identifier inserted into the schedule URL
            (for example ``"30th"`` or ``"ss"``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(rows, access_time)`` tuple. ``rows`` is the CSV response parsed
        into a list of lists of strings; ``access_time`` is the local
        ``datetime`` captured just before the request was sent.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    schedule_url = "http://www3.septa.org/ccstations/" + station_code + "/sched_data.csv"
    access_time = dt.now()

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(schedule_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as schedule data.
    response.raise_for_status()

    lines = response.text.strip().split("\n")
    return list(csv.reader(lines)), access_time

# Fetch the board for station code "ss" and preview its first three and last two rows.
schedule, access_time = get_current_schedule("ss")
(access_time, schedule[:3], "...", schedule[-2:])

(datetime.datetime(2021, 11, 12, 12, 19, 41, 487538),
 [["EMG=' No Emg Message"],
  ['R4S=12:25',
   'Airport',
   '3B',
   'ON TIME',
   'LOCAL                    ',
   '433   ',
   '<_NEXT_MSG>12:55',
   'Airport',
   '3B',
   'ON TIME',
   'LOCAL                    ',
   '8435  ',
   '<_NEXT_MSG>01:25',
   'Airport',
   '3B',
   'ON TIME',
   'LOCAL                    ',
   '437   ',
   '<_NEXT_MSG>01:55',
   'Airport',
   '3B',
   'ON TIME',
   'LOCAL                    ',
   '8439  ',
   ''],
  ['R4N=01:05',
   'Warminster',
   '2A',
   'ON TIME',
   'LOCAL                    ',
   '432   ',
   '<_NEXT_MSG>02:05',
   'Warminster',
   '2A',
   'ON TIME',
   'LOCAL                    ',
   '436   ',
   '<_NEXT_MSG>03:05',
   'Warminster',
   '2A',
   'ON TIME',
   'LOCAL                    ',
   '440   ',
   '<_NEXT_MSG>04:05',
   'Warminster',
   '2A',
   'ON TIME',
   'LOCAL                    ',
   '444   ',
   '']],
 '...',
 [['SERVICE=Effective Sunday September 5 new schedules 

As you can see, the data is not organized with one column per type of value; there is, for example, no single column that holds _all_ of the timestamps. Instead, each row packs several trains side by side.

Next, I pre-process the data into a three-column format, as a list (rows) of lists (columns). I extract three pieces of information for each train: its scheduled arrival time, destination, and its lateness/timeliness status. 

In [2]:
def get_trains(schedule):
    """Flatten the raw schedule rows into one record per train.

    Rows 1 through 15 of ``schedule`` are read (row 0 is skipped). Each row
    is treated as up to four consecutive groups of six fields, one group per
    train. Within a group, the last five characters of field 0 are the time,
    field 1 is the destination and field 3 is the status.

    Args:
        schedule: Parsed CSV rows, a list of lists of strings.

    Returns:
        A list of ``[time, destination, status]`` lists, in row order and
        then in group order within each row.
    """
    fields_per_train = 6
    trains_per_row = 4

    trains = []
    for row in schedule[1:16]:
        for start in range(0, fields_per_train * trains_per_row, fields_per_train):
            fields = row[start:start + fields_per_train]
            # Data error seen in the API feed: a SEPTA notice about limited
            # service for Suburban Station produced a list with only two
            # values. Groups that short are not trains, so skip them.
            if len(fields) > 3:
                trains.append([fields[0][-5:], fields[1], fields[3]])

    return trains

# Reduce the raw rows to [time, destination, status] records and preview the first ten.
trains = get_trains(schedule=schedule)
trains[0:10]

[['12:25', 'Airport', 'ON TIME'],
 ['12:55', 'Airport', 'ON TIME'],
 ['01:25', 'Airport', 'ON TIME'],
 ['01:55', 'Airport', 'ON TIME'],
 ['01:05', 'Warminster', 'ON TIME'],
 ['02:05', 'Warminster', 'ON TIME'],
 ['03:05', 'Warminster', 'ON TIME'],
 ['04:05', 'Warminster', 'ON TIME'],
 ['12:35', 'Wilmington', ' 3 LATE'],
 ['01:09', '30th St', ' 1 LATE']]

The times are in 12-hour format without an AM/PM marker, which is an issue that will need correcting later on.

Next I parse the timestamp column using the `dateutil.parser` module-function. I then output the three values as a new list and sort according to arrival time.

In [3]:
from dateutil import parser as dateparser
# Date part (YYYY-MM-DD) of the moment the schedule was fetched.
today = access_time.date().isoformat()

def parse_times(trains):
    """Convert each train's time string to a ``datetime`` and sort by it.

    The module-level ``today`` date string and a space are prepended to the
    time string, and the result is parsed with ``dateparser.parse``.

    Args:
        trains: Iterable of records whose items 0, 1 and 2 are the time
            string, the destination and the status.

    Returns:
        A new list of ``[datetime, destination, status]`` lists, sorted in
        ascending order of the parsed time.
    """
    parsed = [
        [dateparser.parse(today + ' ' + record[0]), record[1], record[2]]
        for record in trains
    ]
    parsed.sort(key=lambda entry: entry[0])
    return parsed

# Parse the raw 12-hour time strings as they are and preview the first ten results.
datetime_parsed_trains = parse_times(trains=trains)
datetime_parsed_trains[0:10]

[[datetime.datetime(2021, 11, 12, 1, 5), 'Warminster', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 1, 9), '30th St', ' 1 LATE'],
 [datetime.datetime(2021, 11, 12, 1, 9), 'Doylestown', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 1, 21), 'West Trenton', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 1, 25), 'Airport', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 1, 28), '30th St', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 1, 35), 'Wilmington', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 1, 35), 'Fox Chase', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 1, 40), 'Trenton', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 1, 47), 'Temple U', 'ON TIME']]

As mentioned before, the arrival times lack AM and PM information, which is problematic for `dateutil.parser`: it treats the 12-hour time strings as if they were 24-hour time strings.

To solve this problem, I will 'fix' the original timestamps and use the `datetime` module to infer AM/PM information. I will use the current system time and the fact that the schedule information only contains trains arriving in the next few hours to fix the AM/PM problem.

In [4]:
def fix_times(trains, access_time):
    """Append the inferred AM/PM marker to each train's 12-hour time string.

    A 12-hour clock time matches two moments per day, twelve hours apart.
    For each train, the marker chosen is the one that places the time inside
    the 12-hour window running from two hours before ``access_time`` to ten
    hours after it. Exactly one of the two candidates falls in that window.

    The returned strings carry no date, so a caller that attaches a date has
    to handle times that fall on the other side of midnight itself.

    Args:
        trains: Iterable of records whose item 0 is an ``"HH:MM"`` string in
            12-hour format and whose items 1 and 2 are the destination and
            the status.
        access_time: ``datetime`` at which the schedule was fetched.

    Returns:
        A new list of ``[time + "AM" or "PM", destination, status]`` lists in
        the same order as ``trains``.

    Raises:
        ValueError: If the hour or minute part of a time string is not a
            number.
    """
    minutes_per_day = 24 * 60
    half_day = 12 * 60
    # Presumably a delayed train can still be listed under a scheduled time
    # that has already passed, so times slightly before "now" must keep the
    # current half-day instead of flipping to the next one.
    look_back = 2 * 60
    now = access_time.hour * 60 + access_time.minute

    trains_24_hour = []
    for train in trains:
        clock = train[0]
        # On a 12-hour clock "12:xx" is hour 0 of its half-day.
        am_minutes = (int(clock[0:2]) % 12) * 60 + int(clock[3:5])
        # Distance from the start of the window to the AM candidate, wrapped
        # around midnight. If the AM candidate is outside, the PM one is in.
        offset = (am_minutes - now + look_back) % minutes_per_day
        suffix = "AM" if offset < half_day else "PM"
        trains_24_hour.append([clock + suffix, train[1], train[2]])

    return trains_24_hour

# Add the AM/PM markers first, then parse and sort; preview the first ten results.
trains_with_meridiem = fix_times(trains, access_time)
datetime_parsed_trains_24_hour = parse_times(trains_with_meridiem)
datetime_parsed_trains_24_hour[0:10]

[[datetime.datetime(2021, 11, 12, 12, 21), 'West Trenton', ' 2 LATE'],
 [datetime.datetime(2021, 11, 12, 12, 21), 'West Trenton', ' 2 LATE'],
 [datetime.datetime(2021, 11, 12, 12, 25), 'Airport', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 12, 25), 'Airport', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 12, 28), '30th St', ' 7 LATE'],
 [datetime.datetime(2021, 11, 12, 12, 28), '30th St', ' 7 LATE'],
 [datetime.datetime(2021, 11, 12, 12, 35), 'Wilmington', ' 3 LATE'],
 [datetime.datetime(2021, 11, 12, 12, 35), 'Trenton', ' 2 LATE'],
 [datetime.datetime(2021, 11, 12, 12, 35), 'Fox Chase', 'ON TIME'],
 [datetime.datetime(2021, 11, 12, 12, 35), 'Fox Chase', 'ON TIME']]

Finally, I create hourly log files with the train information in my `data/trains` folder, and name each file with its time stamp.

In [123]:
import os
def save_schedule(datetime_parsed_trains_24_hour, out_dir="data/trains"):
    """Write the trains to hourly log files.

    One file is written per distinct date and hour found in the arrival
    times, named ``YYYY-MM-DD-HH.txt``. Each line of a file has the form
    ``"HH:MM, destination, status"``, in the order the trains were given.
    A file that already exists for one of those hours is overwritten.

    Args:
        datetime_parsed_trains_24_hour: Iterable of records whose item 0 is
            the arrival ``datetime`` and whose items 1 and 2 are the
            destination and the status strings.
        out_dir: Directory that receives the log files; it is created if it
            does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Group the output lines by target file so each file is opened only once.
    lines_by_file = {}
    for train in datetime_parsed_trains_24_hour:
        arrival = train[0]
        # The file name comes from the train's own date and hour.
        filename = arrival.strftime("%Y-%m-%d-%H") + ".txt"
        line = arrival.strftime("%H:%M") + ', ' + train[1] + ', ' + train[2] + '\n'
        lines_by_file.setdefault(filename, []).append(line)

    for filename, lines in lines_by_file.items():
        with open(os.path.join(out_dir, filename), "w") as file_handle:
            file_handle.writelines(lines)
    
save_schedule(datetime_parsed_trains_24_hour)

# List the log files whose names carry the date of the first train in the
# schedule. The date is escaped and the regex part is a raw string, so the
# "\d" and "\." reach the regex engine intact and the dot matches literally.
log_name_pattern = re.compile(
    re.escape(datetime_parsed_trains_24_hour[0][0].strftime("%Y-%m-%d-")) + r"\d\d\.txt"
)
[x for x in os.listdir("data/trains/") if log_name_pattern.search(x)]