In [2]:
import re
import camelot
import os
import csv
import pandas as pd
from pathlib import Path



In [None]:
# Input/output locations for the TotalEnergies "Pixel Next Vast" tariff cards.
pdf_folder = '../../../tariefkaarten/TOTALENERGIES/PIXEL NEXT VAST'

# Path.glob yields entries in arbitrary (filesystem-dependent) order. Sorting makes
# the processing order, and therefore the row order of the resulting table,
# reproducible across machines and runs.
pdf_files = sorted(str(file) for file in Path(pdf_folder).glob("*.pdf"))

csv_file = "pixel_next_vast.csv"
# True when a CSV from an earlier run is already present in the working directory.
file_exists = os.path.isfile(csv_file)

In [4]:
# Column layout of the tariff table, in the order the columns are written out.
header = [
    # keys
    "DateKey",
    "ContractKey",
    # *Fixed columns
    "SingleMeterFixed",
    "DualMeterDayFixed",
    "DualMeterNightFixed",
    "ExclusiveNightMeterFixed",
    # *Variable* columns (meter factor / balancing cost pairs)
    "SingleMeterVariableMeterFactor",
    "SingleMeterVariableBalancingCost",
    "DualMeterDayVariableMeterFactor",
    "DualMeterDayVariableBalancingCost",
    "DualMeterNightVariableMeterFactor",
    "DualMeterNightVariableBalancingCost",
    "ExclusiveNightMeterVariableMeterFactor",
    "ExclusiveNightMeterVariableBalancingCost",
    # Dynamic* columns
    "DynamicMeterCost",
    "DynamicBalancingCost",
    # *Injection* columns (meter factor / balancing cost pairs)
    "SingleMeterInjectionMeterFactor",
    "SingleMeterInjectionBalancingCost",
    "DualMeterDayInjectionMeterFactor",
    "DualMeterDayInjectionBalancingCost",
    "DualMeterNightInjectionMeterFactor",
    "DualMeterNightInjectionBalancingCost",
    # remaining columns
    "AdministrativeCosts",
    "GreenElectricity",
    "WKK",
]

# Empty frame with the full column set; rows are added by the cells below.
df_tepnv = pd.DataFrame(columns=header)

df_tepnv.head()

Unnamed: 0,DateKey,ContractKey,SingleMeterFixed,DualMeterDayFixed,DualMeterNightFixed,ExclusiveNightMeterFixed,SingleMeterVariableMeterFactor,SingleMeterVariableBalancingCost,DualMeterDayVariableMeterFactor,DualMeterDayVariableBalancingCost,...,DynamicBalancingCost,SingleMeterInjectionMeterFactor,SingleMeterInjectionBalancingCost,DualMeterDayInjectionMeterFactor,DualMeterDayInjectionBalancingCost,DualMeterNightInjectionMeterFactor,DualMeterNightInjectionBalancingCost,AdministrativeCosts,GreenElectricity,WKK


In [5]:
# Extract one tariff row per PDF and add the rows to df_tepnv.
#
# Rows are collected in a list and added to the frame in a single step after the
# loop, instead of re-concatenating the whole frame once per file.
extracted_rows = []

for file_path in pdf_files:
    print(f"Processing {file_path}")

    # The tariff month is read from a "YYYY-MM" fragment in the file name.
    match = re.search(r'(\d{4})-(\d{2})', os.path.basename(file_path))
    if match is None:
        # Without a date there is no numeric DateKey; a placeholder value would
        # fail the integer cast applied to DateKey before export, so skip the file.
        print(f"Skipping {file_path}: no YYYY-MM date found in file name")
        continue
    date_key = f"{match.group(1)}{match.group(2)}01"

    try:
        tables = camelot.read_pdf(file_path, pages='1')

        # Handle different PDF formats based on the number of tables found on page 1
        if len(tables) >= 3:  # Original format (pre-August 2024)
            verbruik = tables[0].df
            bijdrageWkk = tables[1].df
            vastevergoeding = tables[2].df

            # Cell [3, 1] may hold the day and night prices together, separated
            # by either a literal backslash-n sequence or a real newline.
            day_night_cell = verbruik.at[3, 1]
            separator = '\\n' if '\\n' in day_night_cell else '\n'
            day_night_values = day_night_cell.split(separator)

            day_fixed = day_night_values[0]
            # When the cell holds a single value, the night price is read from column 2.
            night_fixed = day_night_values[1] if len(day_night_values) > 1 else verbruik.at[3, 2]

            new_row = {
                "DateKey": date_key,
                "ContractKey": "13",
                "SingleMeterFixed": verbruik.at[3, 0],
                "DualMeterDayFixed": day_fixed,
                "DualMeterNightFixed": night_fixed,
                "ExclusiveNightMeterFixed": verbruik.at[3, 2],
                "GreenElectricity": bijdrageWkk.at[1, 0],
                "AdministrativeCosts": vastevergoeding.at[1, 0],
            }

        elif len(tables) == 1:  # New format (August 2024 onwards)
            # Logging this for verification
            print(f"Processing newer format PDF for {date_key}")

            # TODO: extraction for the single-table layout is not implemented;
            # these files are skipped until their structure is determined.
            print(f"Skipping file with new format: {file_path}")
            continue

        else:
            print(f"Unexpected number of tables ({len(tables)}) in {file_path}")
            continue

        # Columns that were not extracted are stored as empty strings.
        extracted_rows.append({col: new_row.get(col, '') for col in df_tepnv.columns})
        print(f"Successfully processed {file_path}")
    except Exception as e:
        # Best effort: one unreadable or unexpected PDF must not abort the batch.
        print(f"Error processing {file_path}: {str(e)}")

if extracted_rows:
    new_rows_df = pd.DataFrame(extracted_rows, columns=df_tepnv.columns)
    if df_tepnv.empty:
        # Nothing to append to; this also avoids concatenating with an empty
        # frame, which recent pandas versions warn about.
        df_tepnv = new_rows_df
    else:
        df_tepnv = pd.concat([df_tepnv, new_rows_df], ignore_index=True)

Processing ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-08.pdf
Successfully processed ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-08.pdf
Processing ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-09.pdf
Successfully processed ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-09.pdf
Processing ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-10.pdf
Successfully processed ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-10.pdf
Processing ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-11.pdf
Successfully processed ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-11.pdf
Processing ..\..\..\tariefkaarten\TOTALENERGIES\PIXEL NEXT VAST\totalenergies-pixel-next-vast-2023-12.pd

In [None]:
# Hard-coded tariff rows for August 2024 through March 2025.
# Each tuple is:
# (DateKey, SingleMeterFixed, DualMeterDayFixed, DualMeterNightFixed,
#  ExclusiveNightMeterFixed, injection meter factor)
# The injection meter factor is the same for the single, dual-day and dual-night
# meters in every month, so it is stored once per row.
_manual_tariffs = [
    (20240801, 14.30, 15.80, 12.95, 13.04, -2.25),  # August 2024 (exclusive night: using meter excl. night value)
    (20240901, 14.59, 16.24, 13.12, 13.24, -3.72),  # September 2024
    (20241001, 13.56, 15.04, 12.23, 12.32, -1.96),  # October 2024
    (20241101, 14.03, 15.57, 12.66, 12.77, -2.28),  # November 2024
    (20241201, 15.22, 16.74, 13.85, 13.99, -3.47),  # December 2024
    # NOTE(review): the only positive injection factor in this series --
    # possibly a sign typo; confirm against the January 2025 tariff card.
    (20250101, 15.48, 16.88, 14.22, 14.41, 3.33),   # January 2025
    (20250201, 16.16, 17.84, 14.65, 14.44, -2.41),  # February 2025
    (20250301, 15.45, 17.49, 13.61, 13.50, -2.86),  # March 2025
]

# One dict per month; ContractKey, GreenElectricity and AdministrativeCosts are
# identical for all hard-coded months.
new_data = [
    {
        "DateKey": date_key,
        "ContractKey": 13,
        "SingleMeterFixed": single_fixed,
        "DualMeterDayFixed": dual_day_fixed,
        "DualMeterNightFixed": dual_night_fixed,
        "ExclusiveNightMeterFixed": exclusive_night_fixed,
        "SingleMeterInjectionMeterFactor": injection_factor,
        "DualMeterDayInjectionMeterFactor": injection_factor,
        "DualMeterNightInjectionMeterFactor": injection_factor,
        "GreenElectricity": 1.58,
        "AdministrativeCosts": 160.00,
    }
    for (
        date_key,
        single_fixed,
        dual_day_fixed,
        dual_night_fixed,
        exclusive_night_fixed,
        injection_factor,
    ) in _manual_tariffs
]

# Append all rows in one concat using the public API. Columns absent from
# new_data are filled with NaN, and DateKey/ContractKey keep their integer
# values (appending dict by dict would upcast them to float).
df_tepnv = pd.concat([df_tepnv, pd.DataFrame(new_data)], ignore_index=True)

# Display the updated dataframe
df_tepnv.head(20)

Unnamed: 0,DateKey,ContractKey,SingleMeterFixed,DualMeterDayFixed,DualMeterNightFixed,ExclusiveNightMeterFixed,SingleMeterVariableMeterFactor,SingleMeterVariableBalancingCost,DualMeterDayVariableMeterFactor,DualMeterDayVariableBalancingCost,...,DynamicBalancingCost,SingleMeterInjectionMeterFactor,SingleMeterInjectionBalancingCost,DualMeterDayInjectionMeterFactor,DualMeterDayInjectionBalancingCost,DualMeterNightInjectionMeterFactor,DualMeterNightInjectionBalancingCost,AdministrativeCosts,GreenElectricity,WKK
0,20230801.0,13.0,211744.0,244093.0,181346.0,187788.0,,,,,...,,,,,,,,1650.0,22190.0,
1,20230901.0,13.0,215357.0,245411.0,186874.0,188803.0,,,,,...,,,,,,,,1750.0,22190.0,
2,20231001.0,13.0,197159.0,223403.0,172494.0,172883.0,,,,,...,,,,,,,,1500.0,22190.0,
3,20231101.0,13.0,12962.0,142948.0,117058.0,119516.0,,,,,...,,,,,,,,900.0,22238.0,
4,20231201.0,13.0,179024.0,202859.0,156493.0,157696.0,,,,,...,,,,,,,,1500.0,22238.0,
5,20240101.0,13.0,164575.0,189493.0,14216.0,142035.0,,,,,...,,,,,,,,1500.0,15638.0,
6,20240201.0,13.0,138597.0,157204.0,121869.0,123135.0,,,,,...,,,,,,,,1600.0,15831.0,
7,20240301.0,13.0,116804.0,132487.0,102748.0,103472.0,,,,,...,,,,,,,,1600.0,15831.0,
8,20240401.0,13.0,131952.0,147568.0,117936.0,119597.0,,,,,...,,,,,,,,1600.0,15831.0,
9,20240501.0,13.0,1320.0,1476.0,1179.0,1196.0,,,,,...,,,,,,,,16000.0,158.0,


In [None]:
from decimal import Decimal, InvalidOperation

# Cast to the data types and text format expected when loading into the database.

# A plain decimal number: optional minus sign, digits, and at most one '.' or ','
# as the decimal separator (e.g. "21,1744", "-2.25", "160").
_DECIMAL_TEXT = re.compile(r'-?(?:\d+[.,]?\d*|[.,]\d+)')


def _format_decimal(value):
    """Format a cell as a fixed-point string with six decimals.

    Accepts numbers and numeric text that uses either '.' or ',' as the decimal
    separator. Surrounding whitespace is ignored so that padded values are not
    discarded.

    Returns:
        The value formatted as '{:.6f}', or '' when the value is missing or is
        not a plain decimal number.
    """
    if pd.isna(value):
        return ''
    text = str(value).strip()
    if _DECIMAL_TEXT.fullmatch(text) is None:
        return ''
    return '{:.6f}'.format(float(text.replace(',', '.')))


df_tepnv['DateKey'] = df_tepnv['DateKey'].astype(int)
df_tepnv['ContractKey'] = df_tepnv['ContractKey'].astype(int)

# Every column except the two integer keys is written as a six-decimal string.
for col in df_tepnv.columns:
    if col not in ['DateKey', 'ContractKey']:
        df_tepnv[col] = df_tepnv[col].apply(_format_decimal)

csv_path = 'pixel_next_vast.csv'
df_tepnv.to_csv(
    csv_path,
    index=False,
    sep=',',
    decimal='.',
    quoting=csv.QUOTE_NONE,  # no quoting: formatted values contain no separators
    escapechar='\\',
    date_format='%Y%m%d'
)

print(f"CSV file created at: {csv_path}")
df_tepnv.head(20)


CSV file created at: total_energies_pixel_next_vast.csv


Unnamed: 0,DateKey,ContractKey,SingleMeterFixed,DualMeterDayFixed,DualMeterNightFixed,ExclusiveNightMeterFixed,SingleMeterVariableMeterFactor,SingleMeterVariableBalancingCost,DualMeterDayVariableMeterFactor,DualMeterDayVariableBalancingCost,...,DynamicBalancingCost,SingleMeterInjectionMeterFactor,SingleMeterInjectionBalancingCost,DualMeterDayInjectionMeterFactor,DualMeterDayInjectionBalancingCost,DualMeterNightInjectionMeterFactor,DualMeterNightInjectionBalancingCost,AdministrativeCosts,GreenElectricity,WKK
0,20230801,13,21.1744,24.4093,18.1346,18.7788,,,,,...,,,,,,,,165.0,2.219,
1,20230901,13,21.5357,24.5411,18.6874,18.8803,,,,,...,,,,,,,,175.0,2.219,
2,20231001,13,19.7159,22.3403,17.2494,17.2883,,,,,...,,,,,,,,150.0,2.219,
3,20231101,13,12.962,14.2948,11.7058,11.9516,,,,,...,,,,,,,,90.0,2.2238,
4,20231201,13,17.9024,20.2859,15.6493,15.7696,,,,,...,,,,,,,,150.0,2.2238,
5,20240101,13,16.4575,18.9493,14.216,14.2035,,,,,...,,,,,,,,150.0,1.5638,
6,20240201,13,13.8597,15.7204,12.1869,12.3135,,,,,...,,,,,,,,160.0,1.5831,
7,20240301,13,11.6804,13.2487,10.2748,10.3472,,,,,...,,,,,,,,160.0,1.5831,
8,20240401,13,13.1952,14.7568,11.7936,11.9597,,,,,...,,,,,,,,160.0,1.5831,
9,20240501,13,13.2,14.76,11.79,11.96,,,,,...,,,,,,,,160.0,1.58,
