In [18]:
import requests
import xml.etree.ElementTree as ET
import csv
from datetime import datetime, timedelta
import os
from dateutil.tz import tzlocal

# API endpoint template; "{}" is replaced with the requested day string.
url_template = "https://webservices.iso-ne.com/api/v1.1/genfuelmix/day/{}"

# SECURITY: credentials should not be committed to source control.
# Set ISO_NE_USERNAME / ISO_NE_PASSWORD in the environment to override them.
# The literals below remain only as a backward-compatible fallback and should
# be rotated and removed once the environment variables are in place.
username = os.environ.get('ISO_NE_USERNAME', 'alean@bu.edu')
password = os.environ.get('ISO_NE_PASSWORD', 'Mq75eg8pxTBCEKY')

# CSV file that accumulates one row per timestamp.
output_filename = 'genfuelmix_aggregatedyear.csv'

# Fuel categories to track; their order defines the CSV column order
# after the leading 'BeginDate' column.
fuel_categories = [
    'Coal', 'Hydro', 'Natural Gas', 'Nuclear', 'Oil', 'Other',
    'Landfill Gas', 'Refuse', 'Solar', 'Wind', 'Wood',
]

# Function to get the last timestamp from the existing CSV file
def get_last_updated_date(filename):
    """Return the timestamp stored in the last data row of the CSV.

    The first column of each row is expected to hold a 'BeginDate' string
    such as ``2024-10-24 19:08:01-04:00`` (seconds resolution plus a UTC
    offset). Rows are assumed to be appended in chronological order, so the
    last non-blank row is taken as the most recent one.

    Args:
        filename: Path of the CSV file to inspect.

    Returns:
        A timezone-aware ``datetime`` parsed from the last non-blank row, or
        ``None`` when the file is missing, contains no data rows, or the
        value cannot be parsed.
    """
    if not os.path.exists(filename):
        return None  # Nothing has been written yet

    last_row = None
    with open(filename, mode='r', newline='') as file:
        reader = csv.reader(file)
        # The default keeps a zero-byte file from raising StopIteration.
        next(reader, None)  # Skip the header
        # Stream the rows instead of materialising the whole file, and ignore
        # blank lines so a trailing newline does not hide the real last row.
        for row in reader:
            if row:
                last_row = row

    if last_row is None:
        print("The CSV file is empty or has no valid rows.")
        return None

    print(f"Last row in the CSV: {last_row}")  # Debugging output

    last_date = last_row[0].strip()  # The 'BeginDate' column
    if not last_date:
        print("The last row does not contain a valid BeginDate.")
        return None

    try:
        # Seconds resolution with a UTC offset, e.g. '...19:08:01-04:00'.
        return datetime.strptime(last_date, '%Y-%m-%d %H:%M:%S%z')
    except ValueError as e:
        print(f"Error parsing date: {e}")
        return None


# Function to check if a timestamp exists in the CSV
def timestamp_exists(filename, timestamp):
    """Report whether ``timestamp`` appears in the first column of the CSV.

    Args:
        filename: Path of the CSV file to search.
        timestamp: Exact string to compare against each row's first column.

    Returns:
        ``True`` if any data row (the header is skipped) starts with
        ``timestamp``; ``False`` otherwise, including when the file does not
        exist or is empty.
    """
    # A file that has not been created yet cannot contain the timestamp.
    if not os.path.exists(filename):
        return False

    with open(filename, mode='r', newline='') as file:
        reader = csv.reader(file)
        # The default keeps a zero-byte file from raising StopIteration.
        next(reader, None)  # Skip header
        return any(row and row[0] == timestamp for row in reader)

# Function to get data from the API for a specific date
def get_fuelmix_data_for_date(date):
    """Fetch the raw fuel-mix response body for one day from the API.

    Args:
        date: Day string substituted into ``url_template``.

    Returns:
        The response body as ``bytes`` when the server answers with HTTP 200;
        ``None`` when the request fails or any other status is returned (a
        message is printed in both failure cases).
    """
    url = url_template.format(date)
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection, which would hang the whole aggregation run.
        response = requests.get(url, auth=(username, password), timeout=30)
    except requests.RequestException as exc:
        # Treat transport errors like a bad status: report and skip this day
        # rather than aborting and discarding data already collected.
        print(f"Failed to retrieve data for {date}: {exc}")
        return None

    if response.status_code == 200:
        return response.content

    print(f"Failed to retrieve data for {date}: {response.status_code}")
    return None

# Function to parse the XML response and organize data by timestamp
def parse_fuelmix_data(xml_data):
    """Turn a fuel-mix XML document into one CSV-ready row per timestamp.

    Every ``GenFuelMix`` element directly under the root contributes its
    ``GenMw`` value to the running total of its ``FuelCategory`` for its
    ``BeginDate``. Categories that are not listed in ``fuel_categories`` are
    ignored.

    Args:
        xml_data: XML text or bytes as returned by the API.

    Returns:
        A list of rows ``[timestamp, total_1, ..., total_n]`` where the
        timestamp is formatted as ``YYYY-MM-DD HH:MM:SS±HH:MM`` and the totals
        follow the order of ``fuel_categories``. Rows appear in the order
        their timestamps were first encountered.
    """
    ns = {'ns': 'http://WEBSERV.iso-ne.com'}
    tree_root = ET.fromstring(xml_data)

    # Maps formatted timestamp -> {fuel category: accumulated MW}
    totals_by_stamp = {}

    for record in tree_root.findall('ns:GenFuelMix', ns):
        # Source format is 'YYYY-MM-DDTHH:MM:SS.SSS±HH:MM'; swap the 'T' for a
        # space so it can be parsed, then drop the fractional seconds.
        raw_stamp = record.find('ns:BeginDate', ns).text.replace('T', ' ')
        moment = datetime.strptime(raw_stamp, '%Y-%m-%d %H:%M:%S.%f%z')
        compact = moment.strftime('%Y-%m-%d %H:%M:%S%z')
        # strftime emits the offset as ±HHMM; re-insert the colon.
        stamp = f"{compact[:-2]}:{compact[-2:]}"

        megawatts = float(record.find('ns:GenMw', ns).text)
        fuel = record.find('ns:FuelCategory', ns).text

        # Start every new timestamp with a zero for each tracked category.
        if stamp not in totals_by_stamp:
            totals_by_stamp[stamp] = dict.fromkeys(fuel_categories, 0.0)

        totals = totals_by_stamp[stamp]
        if fuel in totals:
            totals[fuel] += megawatts

    # Flatten into rows: timestamp first, then totals in column order.
    return [
        [stamp] + [totals.get(name, 0.0) for name in fuel_categories]
        for stamp, totals in totals_by_stamp.items()
    ]

# Function to collect the timestamps (first column) already present in the CSV
def load_existing_timestamps(filename):
    """Collect the first-column value of every data row in the CSV.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``set`` of the first-column strings of all non-blank rows after the
        header. The set is empty when the file does not exist, is empty, or
        holds only a header.
    """
    if not os.path.exists(filename):
        return set()

    with open(filename, mode='r', newline='') as file:
        reader = csv.reader(file)
        # The default keeps a zero-byte file from raising StopIteration.
        next(reader, None)  # Skip header
        return {row[0] for row in reader if row}

# Function to write aggregated rows to the CSV, skipping timestamps already in the file
def write_to_csv(data, filename, append=False):
    """Write fuel-mix rows to the CSV, never repeating a timestamp.

    Args:
        data: Iterable of rows whose first element is the timestamp string
            and whose remaining elements follow ``fuel_categories``.
        filename: Path of the CSV file to write.
        append: When ``True``, add rows to the existing file and skip any
            timestamp it already contains. When ``False``, replace the file
            with a header plus the given rows.

    Returns:
        None. The file is created or modified as a side effect.
    """
    has_content = os.path.exists(filename) and os.path.getsize(filename) > 0
    appending_to_data = append and has_content

    # Existing timestamps only matter when the old rows are kept. In overwrite
    # mode the file is truncated, so filtering against it would silently drop
    # rows that are about to be erased from disk.
    if appending_to_data:
        seen_timestamps = load_existing_timestamps(filename)
    else:
        seen_timestamps = set()

    # A header is needed for a fresh file, an overwritten file, and an
    # existing-but-empty file that is being appended to.
    write_header = not appending_to_data

    with open(filename, mode='a' if append else 'w', newline='') as file:
        writer = csv.writer(file)

        if write_header:
            writer.writerow(['BeginDate'] + fuel_categories)

        for row in data:
            if row[0] in seen_timestamps:
                continue  # Avoid double writes
            writer.writerow(row)
            # Remember it so a repeat later in the same batch is skipped too.
            seen_timestamps.add(row[0])
# Main logic to iterate over date range and aggregate data
def main():
    """Fetch every day not yet fully recorded and append it to the CSV.

    The run resumes from the calendar day of the last row in
    ``output_filename`` (or from 2024-10-01 when there is no usable previous
    data) and continues through today's local date. Each day is requested
    from the API, parsed into per-timestamp rows, and all rows are appended
    to the CSV in one write at the end.
    """
    last_updated_date = get_last_updated_date(output_filename)

    if last_updated_date:
        # Resume on the last recorded day itself, not the day after: that day
        # is usually only partially stored (the previous run ended mid-day).
        # write_to_csv skips timestamps that are already in the file, so
        # re-fetching it cannot create duplicates.
        current_day = last_updated_date.date()
    else:
        # No previous data: start from the default date.
        current_day = datetime.strptime('20241001', '%Y%m%d').date()

    # Today's date in the local timezone.
    final_day = datetime.now(tz=tzlocal()).date()

    one_day = timedelta(days=1)
    aggregated_data = []

    # Iterate over calendar dates rather than datetimes so that a carried-over
    # time of day can never cause the final day to be skipped.
    while current_day <= final_day:
        # Plain YYYYMMDD day identifier for the day-level endpoint.
        date_str = current_day.strftime('%Y%m%d')
        xml_data = get_fuelmix_data_for_date(date_str)

        if xml_data:
            # Add all rows for this day
            aggregated_data.extend(parse_fuelmix_data(xml_data))

        current_day += one_day

    # Append the new data to the existing CSV
    write_to_csv(aggregated_data, output_filename, append=True)
    print(f"Data aggregation complete. Output written to {output_filename}")

# Run the aggregation only when this file is executed directly.
if __name__ == "__main__":
    main()
    # Debugging aid: call get_last_updated_date(filename=output_filename)
    # here instead of main() to inspect the last stored timestamp.



Last row in the CSV: ['2024-10-24 19:08:01-04:00', '0.0', '860.0', '9983.0', '1231.0', '0.0', '9.0', '20.0', '320.0', '15.0', '772.0', '81.0']
Data aggregation complete. Output written to genfuelmix_aggregatedyear.csv
