# Notes

 - Do we know what timezone the files use? For example, if we have a West Coast train line, are the times within it in PDT, or are they standardized to UTC?
 - Are we happy with the structure of the dictionaries?
    ```
     date: timestamp
     train: {
       <name>: str,
       <number>: {
           <schedule_array> <-- Do we want to find a better structure
    ```
 - Given there are 'schedule' days that seem to add +1 or more days to the date - do we just want to include that in the timestamps vs. having an entirely separate column/element for them? If something has an actual departure time, can we assume that the schedule day applies to the actual time as well?
 - Do we want to take the train header line info (if present) and add that to our comments? If so - where?
 - Why are there so many missing 'arrival' times? What happens during those? Do we want to infer arrival to the next station based on departure or estimate time for travel?

# Setup functions

In [60]:
from datetime import datetime, timedelta

In [42]:
def read_file(file_path):
    '''
    Reads the text file at file_path and returns its contents as a list of lines
    (line endings are kept, as produced by readlines()).

    If the file does not exist, an error message string is returned instead of a list,
    so callers need to check the type of the result.
    '''
    try:
        handle = open(file_path, 'r')
    except FileNotFoundError:
        # Report the missing file as a string rather than raising
        return f"Error: File not found: {file_path}"

    with handle:
        return handle.readlines()

In [37]:
def process_lines(lines):
    '''
    Takes in file lines and returns the train name, the table index line and the relevant schedule lines.

    Lines starting with "* THIS TRAIN" are skipped; the first line that does not start with
    that prefix is treated as the train name line. Relative to that line:
      - the line 9 positions later is returned as the table index line
      - every line from 10 positions later onwards is returned as the schedule lines

    Returns a (train_name, table_index_line, schedule_lines) tuple, or None if there is no
    train name line at all (empty input, or only "* THIS TRAIN" header lines).
    Raises IndexError if fewer than 9 lines follow the train name line.
    '''
    for idx, line in enumerate(lines):
        if line.startswith("* THIS TRAIN"):
            # Skip annoying header lines if present and only get the train line info
            continue

        train_name_line = line.replace("* ", "").strip()
        # Based on the structure it should always be this line - but if not we can search for the VVV line
        table_index_line = lines[idx+9]
        # Skip the next lines after the train name to ignore the glossary which is not needed as we know the structure
        schedule_lines = lines[idx+10:]
        # The first non-header line gives us everything we need, so stop here
        return train_name_line, table_index_line, schedule_lines

    # No train name line was found
    return None

In [33]:
def determin_table_indexing(table_index_line):
    '''
    Locates every 'V' marker in the table index line, which is expected to look similar to:
           "* V    V  V     V  V     V     V     V"
    and returns the zero-based positions of those markers, left to right.
    '''
    marker_positions = []
    position = table_index_line.find('V')
    while position != -1:
        marker_positions.append(position)
        # Continue searching just past the marker we found
        position = table_index_line.find('V', position + 1)
    return marker_positions
    

In [58]:
def timestamp_conversion(time_str, date_time, add_day = None):
    '''
    Converts a time string (e.g., '915A' or '1230P') to an epoch timestamp using the provided date,
    adjusting for AM/PM and optionally adding days if specified.

    time_str:  hour digits, two minute digits and a trailing 'A' or 'P'; None or '' means "no time"
    date_time: datetime supplying the calendar date; its time-of-day is overwritten
    add_day:   optional number of days (int or numeric string) to add to the result

    Returns the timestamp as a float, or None if the input time string is empty.
    Raises ValueError if the hour or minute part is not numeric or is out of range.

    NOTE: when date_time is naive (no tzinfo), datetime.timestamp() interprets it in the
    local timezone of the machine running this code.
    '''
    if not time_str:
        return None

    hour = int(time_str[:-3])
    minute = int(time_str[-3:-1])
    ampm = time_str[-1]

    # Convert the 12-hour clock to 24-hour: 12xxA is just after midnight, 12xxP stays at noon
    if ampm == 'P' and hour != 12:
        hour += 12
    elif ampm == 'A' and hour == 12:
        hour = 0

    dt_new = date_time.replace(hour=hour, minute=minute, second=0, microsecond=0)
    # Shift the date by the requested number of days, if any
    if add_day:
        dt_new += timedelta(days=int(add_day))
    return dt_new.timestamp()

In [68]:
def parse_schedule_info(schedule_line, indices, train_date_time):
    '''
    Parses a single schedule line using the provided field indices and train date,
    extracting relevant information into a dictionary with station codes,
    arrival/departure times (converted to epoch), and any additional comments.

    schedule_line:   one fixed-width line of the schedule table
    indices:         list of column positions at which the line is cut into fields
    train_date_time: datetime of the train's date, used as the base for all times

    The line is cut at 0, at each index and at its end; each piece is stripped, and a piece
    that is empty or just '*' becomes None. The piece before the first index is not used.

    Returns a dict of the parsed fields. "comments" is always a string; it is '' when the
    line has no comment text.
    Raises IndexError if fewer than 7 indices are supplied.
    '''
    slice_points = [0] + indices + [len(schedule_line)]

    elements = []
    for start, end in zip(slice_points, slice_points[1:]):
        field = schedule_line[start:end].strip()
        if not field or field == '*':
            elements.append(None)
        else:
            elements.append(field)

    # Blank fields are stored as None, which str.join cannot handle, so leave them out of the comments
    comments = ' '.join(part for part in elements[8:] if part is not None)

    # Extract fields based on positions
    # print(elements) # debug
    result = {
        "station_code": elements[1],
        "scheduled_arrival_day": elements[2],
        "schedule_arrival_time": timestamp_conversion(elements[3], train_date_time, elements[2]), # TODO - we can add the extra day to the schedule if possible?
        "schedule_departure_day": elements[4],
        "schedule_departure_time": timestamp_conversion(elements[5], train_date_time),
        "actual_arrival_time": timestamp_conversion(elements[6], train_date_time), # TODO - can we add schedule arrival day as well? How do we know it's true?
        "actual_departure_time": timestamp_conversion(elements[7], train_date_time), # TODO - can we add schedule departure day as well? How do we know it's true?
        "comments": comments  # Join the rest as comments
    }
    return result

--------

# Run example files

In [69]:
file_path = "20_20230129.txt"
# file_path = "28_20240119.txt"

# File names follow the pattern "<train number>_<YYYYMMDD>.txt"
file_lines = read_file(file_path)
metadata = file_path.split("_")
train_num = metadata[0] # TODO - we could just do all of this in our read file function and return to make this more concise
# Keep only the text before the extension. str.strip(".txt") would strip any of the
# characters '.', 't', 'x' from both ends rather than removing the ".txt" suffix.
train_date = metadata[1].split(".")[0] # TODO - what are the timezones for these files? Can we account for that somehow?
train_date_time = datetime.strptime(train_date, "%Y%m%d")
# The train name lives inside the file and is used as a dictionary key below,
# so it has to be read from the file lines before train_dict can be built.
train_name = process_lines(file_lines)[0]
train_dict = {
    "date": train_date_time.timestamp(),
    "train": {
        train_name: {train_num: []},
    },
}

In [70]:
# Split the file into its header info and schedule rows, then work out the column layout
train_name, table_index_line, schedule = process_lines(file_lines)
table_index = determin_table_indexing(table_index_line)
if not isinstance(schedule, list):
    # Anything other than a list of lines means the file was not read properly
    print('here')
    print(schedule) # Print obj to debug #TODO have error message and handle gracefully
else:
    # Parse each schedule row and store it under this train's name and number
    for schedule_row in schedule:
        parsed_stop = parse_schedule_info(schedule_row, table_index, train_date_time)
        train_dict["train"][train_name][train_num].append(parsed_stop)


In [73]:
train_dict

{'date': 1674968400.0,
 'train': {'Crescent': {'20': [{'station_code': 'NOL',
     'scheduled_arrival_day': None,
     'schedule_arrival_time': None,
     'schedule_departure_day': '1',
     'schedule_departure_time': 1675001700.0,
     'actual_arrival_time': None,
     'actual_departure_time': 1675001700.0,
     'comments': 'Departed:  On time.'},
    {'station_code': 'SDL',
     'scheduled_arrival_day': None,
     'schedule_arrival_time': None,
     'schedule_departure_day': '1',
     'schedule_departure_time': 1675005120.0,
     'actual_arrival_time': None,
     'actual_departure_time': 1675006380.0,
     'comments': 'Departed:  21 minutes late.'},
    {'station_code': 'PIC',
     'scheduled_arrival_day': None,
     'schedule_arrival_time': None,
     'schedule_departure_day': '1',
     'schedule_departure_time': 1675006620.0,
     'actual_arrival_time': None,
     'actual_departure_time': 1675008300.0,
     'comments': 'Departed:  28 minutes late.'},
    {'station_code': 'HBG',
   