In [None]:
import os
import pandas as pd
import zipfile
import tempfile

In [None]:
def unzip(zip_file):
    """Extract ``zip_file`` into a newly created temporary directory.

    Parameters
    ----------
    zip_file : str or path-like
        Path of the zip archive to extract.

    Returns
    -------
    tempfile.TemporaryDirectory
        Handle for the directory holding the extracted files; its path is
        available as ``.name``. The caller owns the handle and should call
        ``.cleanup()`` (or use it as a context manager) when finished.

    Raises
    ------
    zipfile.BadZipFile, OSError
        Propagated unchanged when the archive cannot be opened or extracted.
    """
    temp_directory = tempfile.TemporaryDirectory()  # use a temporary directory for file unzipping
    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(temp_directory.name)
    except BaseException:
        # Do not leave a half-populated directory behind when extraction fails.
        temp_directory.cleanup()
        raise
    return temp_directory

In [None]:
# Output location: path of the CSV that the final export cell writes `data` to.
# Replace the placeholder with a real file path before running.
data_csv_path = r"[insert your filepath]"

# Input location: path of the downloaded EQR zip archive (placeholder, replace
# before running). The archive is extracted once here; the returned
# TemporaryDirectory handle is kept in a variable so the extracted seller zips
# remain available to the processing cell via `main_temp_directory.name`.
main_zip = r'[insert filepath of downloaded eqr file]'
main_temp_directory = unzip(main_zip)

# Result table, initialised empty; the seller-zip processing cell fills it
# with the filtered transaction rows.
data = pd.DataFrame()

In [None]:
def check_for_empty_values(df, seller_zip):
    """Report whether any of the key transaction columns contain missing values.

    The columns inspected are ``seller_company_name``, ``product_name``,
    ``point_of_delivery_balancing_authority`` and ``increment_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction table to inspect; must contain the four columns above.
    seller_zip : str
        Name of the seller archive the table came from, used in the message
        and as the return value.

    Returns
    -------
    str or None
        ``seller_zip`` when at least one inspected cell is missing (truthy, so
        the caller's ``if invalid_zip:`` check triggers), otherwise ``None``.

    Raises
    ------
    KeyError
        If any of the inspected columns is absent from ``df``.
    """
    required_columns = [
        'seller_company_name',
        'product_name',
        'point_of_delivery_balancing_authority',
        'increment_name',
    ]
    # DataFrame.isna and DataFrame.isnull are aliases, so one check covers
    # both None and NaN; a separate "NA" branch could never be reached.
    if df[required_columns].isna().any().any():
        print(f"Null values in: {seller_zip}")
        return seller_zip
    return None

In [None]:
# Walk every seller archive extracted from the main EQR download, read each
# "*transactions.csv" file it contains, keep the CAPACITY rows delivered to
# CISO with a monthly/yearly increment, and combine the matches into `data`.
file_count = 0        # number of transaction CSVs encountered
filtered_frames = []  # per-file matches; concatenated once after the loop

for seller_zip in os.listdir(main_temp_directory.name):  # iterate through seller zip files
    if not seller_zip.endswith(('.zip', '.ZIP')):
        continue
    seller_zip_path = os.path.join(main_temp_directory.name, seller_zip)

    # Unzip the seller archive into a secondary temp directory (the original
    # author noted this step takes roughly 4 minutes over the whole download).
    try:
        seller_temp_directory = unzip(seller_zip_path)
    except zipfile.BadZipFile as e:
        # Report and skip a corrupt archive, matching the per-file handling below.
        print(f"Error processing {seller_zip}: {e}")
        continue

    # Using the handle as a context manager deletes the seller's extracted
    # files as soon as they have been read, instead of keeping them around.
    with seller_temp_directory as seller_dir:
        for transaction_file in os.listdir(seller_dir):  # iterate over unzipped transaction files
            if not transaction_file.endswith(('transactions.csv', 'transactions.CSV')):
                continue
            transaction_filepath = os.path.join(seller_dir, transaction_file)
            file_count += 1

            try:
                transaction_df = pd.read_csv(transaction_filepath, encoding='cp1252', low_memory=False)

                # Skip files whose key columns contain missing values.
                invalid_zip = check_for_empty_values(transaction_df, seller_zip)
                if invalid_zip:
                    print(f"Invalid data in {invalid_zip}")
                    continue

                filtered_data = transaction_df[
                    (transaction_df['product_name'] == 'CAPACITY') &
                    (transaction_df['increment_name'].isin(['M', 'Y'])) &
                    (transaction_df['point_of_delivery_balancing_authority'] == 'CISO')
                ]

                # Boolean indexing always yields a DataFrame, so only emptiness
                # needs checking; the index is rebuilt by the final concat.
                if len(filtered_data) > 0:
                    filtered_frames.append(filtered_data)

            except Exception as e:
                # Best effort: report the problem and move on to the next file.
                print(f"Error processing {seller_zip}: {e}")

# Build the result in a single concat (avoids re-copying the growing table on
# every file) and from scratch, so re-running this cell does not duplicate rows.
data = pd.concat(filtered_frames, ignore_index=True) if filtered_frames else pd.DataFrame()

columns_to_delete = ['ferc_tariff_reference', 'contract_service_agreement', 'transaction_unique_identifier', 'exchange_brokerage_service', 'product_name', 'total_transmission_charge']
data = data.drop(columns=columns_to_delete, errors='ignore')

In [None]:
# Print the number of collected rows, then render the table as the cell output.
print(data.shape[0])
data

In [None]:
# Write the combined table to the configured output path, omitting the index column.
data.to_csv(path_or_buf=data_csv_path, index=False)