In [5]:
import pandas as pd
import glob
import os

print("Starting the combination process...")

# 1. Define your paths
source_data_path = '../data/electricity/'
output_folder = '../data/electricity/processed/'
output_parquet_path = os.path.join(output_folder, 'demanddata_combined.parquet')

# --- Define your dtypes ---
data_types = {
    'SETTLEMENT_PERIOD': 'category',
    'ND': 'category',
    'TSD': 'float32',
    'ENGLAND_WALES_DEMAND': 'float32',
    'EMBEDDED_WIND_GENERATION': 'float32',
    'EMBEDDED_SOLAR_GENERATION': 'float32',
    'EMBEDDED_WIND_CAPACITY': 'float32',
    'EMBEDDED_SOLAR_CAPACITY': 'float32',
    # Add any other columns you want to specify...
}
parse_dates = ['SETTLEMENT_DATE']

# Create the output directory if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"Created directory: {output_folder}")

# 2. Find all the CSV files
csv_pattern = os.path.join(source_data_path, "demanddata_*.csv")
all_csv_files = glob.glob(csv_pattern)
all_csv_files.sort()

if not all_csv_files:
    print(f"❌ Error: Found 0 CSV files. Check your path: {csv_pattern}")
else:
    print(f"Found {len(all_csv_files)} CSV files to combine.")

    # 3. Read and collect all DataFrames
    dfs_list = []
    total_csv_rows = 0  # <--- NEW: Initialize row counter
    
    for csv_file in all_csv_files:
        print(f"Reading {csv_file}...")
        try:
            df = pd.read_csv(
                csv_file,
                dtype=data_types,
                parse_dates=parse_dates,
                low_memory=False 
            )
            
            # --- NEW: Add rows of this file to the total count ---
            current_rows = len(df)
            total_csv_rows += current_rows
            print(f"    ...found {current_rows} rows.")
            
            dfs_list.append(df)
        except Exception as e:
            print(f"Error reading {csv_file}: {e}")

    # 4. Concatenate all DataFrames
    if dfs_list:
        print("Concatenating all DataFrames...")
        combined_df = pd.concat(dfs_list, ignore_index=True)
        
        # --- NEW: Validation Step ---
        print("\n--- VALIDATION ---")
        print(f"Total rows read from all CSVs: {total_csv_rows}")
        print(f"Total rows in combined DataFrame: {len(combined_df)}")
        
        if total_csv_rows == len(combined_df):
            print("✅ Success: Row counts match!")
        else:
            print("❌ Mismatch: Row counts do NOT match! Check your files.")
        print("--------------------\n")

        print("DataFrame info after combining:")
        combined_df.info(memory_usage='deep')

        # 5. Save the combined DataFrame to a single Parquet file
        print(f"Saving combined data to {output_parquet_path}...")
        combined_df.to_parquet(
            output_parquet_path,
            engine='pyarrow',
            compression='snappy'
        )
        print("✅ Successfully created combined Parquet file!")
    else:
        print("No data was loaded to concatenate.")

Starting the combination process...
Created directory: ../data/electricity/processed/
Found 25 CSV files to combine.
Reading ../data/electricity/demanddata_2001.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2002.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2003.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2004.csv...
    ...found 17568 rows.
Reading ../data/electricity/demanddata_2005.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2006.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2007.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2008.csv...
    ...found 17568 rows.
Reading ../data/electricity/demanddata_2009.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2010.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2011.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2012.csv

  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


    ...found 17520 rows.
Reading ../data/electricity/demanddata_2015.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2016.csv...
    ...found 17568 rows.
Reading ../data/electricity/demanddata_2017.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2018.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2019.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2020.csv...
    ...found 17568 rows.
Reading ../data/electricity/demanddata_2021.csv...


  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


    ...found 17520 rows.
Reading ../data/electricity/demanddata_2022.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2023.csv...
    ...found 17520 rows.
Reading ../data/electricity/demanddata_2024.csv...
    ...found 17568 rows.
Reading ../data/electricity/demanddata_2025.csv...
    ...found 13822 rows.
Concatenating all DataFrames...


  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(



--- VALIDATION ---
Total rows read from all CSVs: 434590
Total rows in combined DataFrame: 434590
✅ Success: Row counts match!
--------------------

DataFrame info after combining:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434590 entries, 0 to 434589
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   SETTLEMENT_DATE            434590 non-null  datetime64[ns]
 1   SETTLEMENT_PERIOD          434590 non-null  object        
 2   ND                         434590 non-null  object        
 3   TSD                        364462 non-null  float32       
 4   ENGLAND_WALES_DEMAND       434590 non-null  float32       
 5   EMBEDDED_WIND_GENERATION   329422 non-null  float32       
 6   EMBEDDED_WIND_CAPACITY     329422 non-null  float32       
 7   EMBEDDED_SOLAR_GENERATION  294334 non-null  float32       
 8   EMBEDDED_SOLAR_CAPACITY    294334 non-null  float32       
 9 