### (Short) EDA of bids data and Upload to DB

- download units and permits
- extract Anlagen EEG from open mastr dump
- units table: How do the MaStR numbers start? Do any unit numbers start with "A", as they do in the earlier years of the bid data?
- can all bid units with a Register_Anlagennr be linked to units of the units table?
- can any columns be omitted? Following database design rules, administrative/locational data should not be kept in the bid data if it can be reached via a foreign key to the units data
- if they can be linked, delete that data from the bids table, since it will not be updated there

In [None]:
### Download, Inspect and Upload Permit Data
import pandas as pd
import psycopg2
import os
import numpy as np
from dotenv import load_dotenv
from pandas.api.types import is_datetime64_any_dtype
from datetime import datetime
import time
import random
import re
import pickle
import sqlalchemy
from open_mastr import Mastr

In [None]:
### Get units data

# Load environment variables from the .env file
load_dotenv()

# Get connection parameters from environment variables
dbname = os.getenv("DB_NAME")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
ssl_cert_path = os.getenv("SSL_CERT_PATH")

# Establish the connection object.
# The parameters are passed as keyword arguments instead of being formatted
# into one DSN string: a value containing spaces, quotes or backslashes
# (e.g. a password or the certificate path) would otherwise have to be
# escaped by hand.
conn = psycopg2.connect(
    dbname=dbname,
    user=user,
    password=password,
    host=host,
    port=port,
    sslmode="require",
    sslrootcert=ssl_cert_path,
)

# Read the complete units table into a DataFrame
sql_select = "SELECT * FROM public.wind_extended;"
df_wind = pd.read_sql(sql_select, conn)

In [None]:
### Get bid data
# Unpickle the cleaned bid table. pickle can execute code while loading, so
# this must only ever be pointed at a trusted, locally produced file.
bids_pickle_path = "../data/mastr_bids/bids_cleaned_2017_2023.pkl"
with open(bids_pickle_path, "rb") as pkl_file:
    df_bids_all = pickle.load(pkl_file)

In [None]:
def find_unique_starting_letters(x):
    """Return the distinct leading letter prefixes of the values in a Series.

    The prefix of a value is its longest run of ASCII letters at the very
    start (e.g. "SEE" for "SEE123"). Missing values and values that do not
    start with a letter contribute nothing.

    Args:
        x: pandas Series of identifiers; missing values are allowed.

    Returns:
        List of prefixes in order of first appearance.
    """
    # Missing values become "" before the cast, so they cannot turn into the
    # string "nan"; the cast keeps non-string entries from raising.
    as_text = x.fillna("").astype(str)
    prefixes = as_text.str.extract(r"^([A-Za-z]+)", expand=False).dropna()
    # dict.fromkeys de-duplicates while preserving order and avoids calling
    # pd.unique on a plain list, which pandas has deprecated.
    return list(dict.fromkeys(prefixes))

In [None]:
### How do the mastr numbers in df_wind and Register_Anlagennr in df_bids_all start?
# All columns of the units table whose name contains "mastr_nummer"
cols_nr = [name for name in df_wind.columns if "mastr_nummer" in name]

### Unique leading letter prefixes per mastr_nummer column
cols_nr_start = {
    name: find_unique_starting_letters(df_wind[name]) for name in cols_nr
}

# Long format: one (column, prefix) tuple per unique prefix of each column
rows_list = [
    (name, prefix)
    for name, prefixes in cols_nr_start.items()
    for prefix in prefixes
]

pd.DataFrame(rows_list, columns=["mastr-column", "starting-letters"])

In [None]:
# Unique leading letter prefixes of the unit identifiers in the bid data
find_unique_starting_letters(df_bids_all["Register_Anlagennr"])

### Discrepancy in df_bids identifiers

- No foreign-key number in wind_extended starts with only "A"
- Try: Can they be linked to the eeg_anlagen table?
- Try: When the leading "A" in df_bids.Register_Anlagennr is replaced by "SEE", can it be linked to the units table?

In [None]:
# Try: when the leading "A" of df_bids.Register_Anlagennr is replaced by "SEE"
# (or "EEG"), can the identifier be linked to the units table?

bid_id_cols = ['Name des Bieters',
       'Landkreis', 'Postleitzahl', 'Gemeinde', 'Gemarkung',
       'Flur / Flurstück', 'Register_Anlagennr', 'Gebotsdatum', 'Zuschlags-Nr']

# .str is an accessor that applies plain string methods element-wise on a Series
ind = df_bids_all["Register_Anlagennr"].fillna("").str.startswith("A")
# Explicit copy, so the new columns below are added to an independent frame
# and not to a slice of df_bids_all.
df_bids_a = df_bids_all.loc[ind, bid_id_cols].copy()

register_nr = df_bids_a["Register_Anlagennr"]

# regex=True is spelled out: recent pandas treats the pattern as a literal by
# default, so "^.{3}" would never match. "^A" limits the swap to the leading
# letter instead of every "A" in the identifier.
# Leading A -> SEE
df_bids_a["A_to_SEE"] = register_nr.str.replace(r"^A", "SEE", regex=True)
# First three positions -> SEE
df_bids_a["pos_3_to_SEE"] = register_nr.str.replace(r"^.{3}", "SEE", regex=True)

# Leading A -> EEG
df_bids_a["A_to_EEG"] = register_nr.str.replace(r"^A", "EEG", regex=True)
# First three positions -> EEG
df_bids_a["pos_3_to_EEG"] = register_nr.str.slice_replace(0, 3, "EEG")

In [None]:
# For every unit of wind_extended: does its number occur among the rewritten
# bid identifiers? Series.isin builds the lookup once per column, where the
# former row loop rebuilt four Python lists for every single row.
unit_nr = df_wind["einheit_mastr_nummer"].reset_index(drop=True)
eeg_nr = df_wind["eeg_mastr_nummer"].reset_index(drop=True)

df_test = pd.DataFrame({
    "einheit_mastr_nr": unit_nr,
    "A_to_SEE": unit_nr.isin(df_bids_a["A_to_SEE"]),
    "pos_3_to_SEE": unit_nr.isin(df_bids_a["pos_3_to_SEE"]),
    "A_to_EEG": eeg_nr.isin(df_bids_a["A_to_EEG"]),
    "pos_3_to_EEG": eeg_nr.isin(df_bids_a["pos_3_to_EEG"]),
})

In [None]:
# Number of wind_extended units matched by each rewritten identifier variant
for col in ["A_to_SEE", "pos_3_to_SEE", "A_to_EEG", "pos_3_to_EEG"]:
    print(col, df_test[col].sum())

In [None]:
### Can the register number at least be found?
see_id_cols = ['Name des Bieters',
       'Landkreis', 'Postleitzahl', 'Gemeinde', 'Gemarkung',
       'Flur / Flurstück', 'Register_Anlagennr', 'Gebotsdatum', 'Zuschlags-Nr']

# .str is an accessor that applies plain string methods element-wise on a Series
ind = df_bids_all["Register_Anlagennr"].fillna("").str.startswith("SEE")
df_bids_see = df_bids_all.loc[ind, see_id_cols].copy()

# One vectorised membership test instead of rebuilding a Python list of all
# bid identifiers for every unit row.
unit_nr = df_wind["einheit_mastr_nummer"].reset_index(drop=True)
df_test = pd.DataFrame({
    "einheit_mastr_nr": unit_nr,
    "Register_Anlagennr": unit_nr.isin(df_bids_see["Register_Anlagennr"]),
})
# matched units, bid rows starting with SEE, all bid rows
print(df_test["Register_Anlagennr"].sum(), len(df_bids_see), len(df_bids_all))

### 2854 unit_mastr_nrs from wind_extended can be found in the 2868 mastr_nrs starting with SEE from df_bids_see of all 4418 bid-units

In [None]:
### Retrieve the Anlagen table
### Data was already downloaded with open_mastr into a local SQLite DB
db = Mastr()
# NOTE: this rebinds `conn`; from here on it is the open_mastr engine, not the
# psycopg2 connection created in the units cell.
conn = db.engine # Connection engine

# Names of all tables in the local SQLite database
tables = pd.read_sql_query('SELECT name from sqlite_master where type= "table";', conn)
# Complete wind_eeg table as a DataFrame
df_eeg = pd.read_sql_table("wind_eeg", con=conn)

In [None]:
# All column names of the eeg table
df_eeg.columns
### Interesting columns with a possible link to the Anlagennr in the bids:

df_eeg[['Zuschlagsnummer', 'VerknuepfteEinheit', 'AnlagenschluesselEeg',
       'AnlagenkennzifferAnlagenregister',
       'AnlagenkennzifferAnlagenregister_nv']]

### AnlagenkennzifferAnlagenregister looks like the most promising candidate

In [None]:
# Prints: number of "A" bid identifiers found in AnlagenkennzifferAnlagenregister, number of "A" bid rows
print(df_bids_a["Register_Anlagennr"].isin(df_eeg["AnlagenkennzifferAnlagenregister"]).sum(), len(df_bids_a))
# 529 units in the bids_a table can be linked to the Anlagen table via the Anlagennr starting with "A"

In [None]:
### How many rows can be linked to the eeg table via the Zuschlagsnr?
len(df_bids_all) # 4418 units
### How do they relate to the Zuschlagsdatum in df_bids_all?

### 1. Can all bids be linked via the Zuschlagsnr? Do all have a Zuschlagsnr?
df_bids_all.info() # Zuschlags-Nr 4418 non-null - all have a Zuschlagsnr

# Bid rows whose unmodified Zuschlags-Nr occurs in the eeg table
df_bids_all["Zuschlags-Nr"].isin(df_eeg["Zuschlagsnummer"]).sum() ### 1392 / 4418

len(df_eeg.Zuschlagsnummer.unique().tolist()) # 1220 unique Zuschlagsnr in df_eeg

len(df_bids_all["Zuschlags-Nr"].unique().tolist()) # 2059 unique Zuschlagsnr in df_bids_all

In [None]:
# First character of the identifier when it is an ASCII letter (e.g. "A" or
# "S"); missing where the identifier is absent or starts with something else.
df_bids_all["A_SEE_none"] = df_bids_all["Register_Anlagennr"].str.extract(r"^([A-Za-z]).*")

In [None]:
# Distinct string lengths of the bid numbers in the bid data
df_bids_all["Zuschlags-Nr"].apply(lambda x: len(x)).unique() # All Zuschlagsnr of len 11
# String length of every bid number in the eeg table (missing counted as 0)
len_zsnr = df_eeg["Zuschlagsnummer"].fillna("").apply(lambda x: len(x))

In [None]:
# Known layouts of a Zuschlagsnummer and the label reported for each of them
pattern_1 = r'^(WIN\d{2}-\d{1,2}-\d{3})$'
scheme_1 = "1_WIN%Y-%M-nr"
pattern_2 = r'^(WIN\d{2}-\d{1,2}/\d{3})$'
scheme_2 = "2_WIN%Y-%M/nr"

def retrieve_nr_scheme(item):
    """Return the label of the numbering scheme that a bid number follows.

    Args:
        item: bid number as a string; "" stands for a missing number.

    Returns:
        "0 -" for an empty string, scheme_1 or scheme_2 when the matching
        pattern applies, and "3_X" for everything else.
    """
    if len(item) < 1:
        return "0 -"

    # The patterns are tried in order; the first hit decides the label
    for pattern, scheme in ((pattern_1, scheme_1), (pattern_2, scheme_2)):
        if re.match(pattern, item) is not None:
            return scheme

    return "3_X"

# Which numbering schemes occur in each table?
# (The former bare df_bids_all.copy() discarded its result and was removed;
# the bid-table result is now reduced to its unique values and printed,
# because only the last expression of a cell is displayed.)
print(df_bids_all["Zuschlags-Nr"].fillna("").apply(retrieve_nr_scheme).unique()) # '1_WIN%Y-%M-nr', '2_WIN%Y-%M/nr'
df_eeg["Zuschlagsnummer"].fillna("").apply(retrieve_nr_scheme).unique() # '0 -', '2_WIN%Y-%M/nr', '3_X'

# Mismatch in how the Zuschlagsnr are structured
# Replace - with / in Zuschlagsnr in df_bids_all -> then compare merging



In [None]:
# Harmonise the bid numbers with the eeg table: the "-" in front of the final
# three digits becomes "/" (e.g. WIN17-1-001 -> WIN17-1/001)
df_bids_all["Zuschlagsnummer"] = df_bids_all["Zuschlags-Nr"].map(
    lambda nr: re.sub(r"-(\d{3})$", r"/\1", nr)
)

# One boolean flag per way a bid row could be linked to the eeg table:
# (flag column, column in the bids, column in the eeg table)
link_checks = (
    ("ZS_nr_in_eeg", "Zuschlagsnummer", "Zuschlagsnummer"),
    ("Anl_nr_in_eeg", "Register_Anlagennr", "AnlagenkennzifferAnlagenregister"),
    ("Mastr_nr_in_eeg", "Register_Anlagennr", "VerknuepfteEinheit"),
)
for flag_col, bid_col, eeg_col in link_checks:
    df_bids_all[flag_col] = df_bids_all[bid_col].isin(df_eeg[eeg_col])

# Rows without a leading letter get the marker "x"
df_bids_all["A_SEE_none"] = df_bids_all["A_SEE_none"].fillna("x")

In [None]:
### Overview (by Gebotsdatum)

# Per group: number of units, number of Zuschlagsnr, Zuschlagsnr found, SEE numbers found, A numbers found,
# Zuschlagsnr and SEE/A found, Zuschlagsnr found but not SEE/A, SEE/A found but not Zuschlagsnr
# Groups are keyed by (bid date, identifier type)
group_gebdatum_df_bids_all = df_bids_all.groupby(["Gebotsdatum", "A_SEE_none"])

In [None]:
# Column holding the "unit number found in the eeg table" flag for each
# identifier type. Groups of any other type (e.g. "x") have no unit number to
# look up, so their three combined counters stay 0.
unit_flag_by_type = {"S": "Mastr_nr_in_eeg", "A": "Anl_nr_in_eeg"}

summary_rows = []

for name, df_group in group_gebdatum_df_bids_all:
    bid_date, unit_type = name

    zs_found = df_group["ZS_nr_in_eeg"]
    if unit_type in unit_flag_by_type:
        unit_found = df_group[unit_flag_by_type[unit_type]]
    else:
        unit_found = pd.Series(False, index=df_group.index)

    # Boolean masks replace the former per-row loop with `== True` checks
    summary_rows.append({
        "Gebotsdatum": bid_date,
        "n_units": len(df_group),
        "n_zsnr": len(df_group["Zuschlagsnummer"].unique()),
        "type_unit_nr": unit_type,
        "n_zsnr_in_eeg": len(df_group.loc[zs_found, "Zuschlagsnummer"].unique()),
        "n_mastr_in_eeg": df_group["Mastr_nr_in_eeg"].sum(),
        "n_anl_nr_in_eeg": df_group["Anl_nr_in_eeg"].sum(),
        # bid number and unit number both found
        "n_zsnr_unit_nr_in_eeg": int((zs_found & unit_found).sum()),
        # only the bid number found
        "n_only_zsnr_in_eeg": int((zs_found & ~unit_found).sum()),
        # only the unit number found
        "n_only_unit_nr_in_eeg": int((~zs_found & unit_found).sum()),
    })

# Build the frame once instead of concatenating one single-row frame per group
df_bids_sum = pd.DataFrame(summary_rows)


In [None]:
# Summary ordered by identifier type, then by bid date
df_bids_sum.sort_values(by=["type_unit_nr", "Gebotsdatum"])

In [None]:
# Column dtypes and non-null counts of the eeg table
df_eeg.info()

In [None]:
### Learnings from df_bids_sum
# Rarely can all bid numbers of a bid date be linked to the eeg table
# Very poor ratio of linked bid numbers to all bid numbers for the earliest bids (2017) and the latest bids (2022-2023)
# The ratio cannot be improved by using the mastr or anlagen number

### Summary Zuschlagsdaten
print(len(df_bids_all))                                                             # 4418 Bids extracted 
print(len(df_bids_all["Zuschlagsnummer"].unique()))                                 # 2059 Unique bid nrs 
print(len(df_bids_all[df_bids_all["ZS_nr_in_eeg"]]["Zuschlagsnummer"].unique()))    # 1197 Bid nrs in eeg 
print(df_bids_all["ZS_nr_in_eeg"].sum())                                            # 2381 Units linked to eeg by bid nr

print(df_bids_all["Anl_nr_in_eeg"].sum())                                           # 1208 Units linked to eeg by anlagen_nr
print(df_bids_all["Mastr_nr_in_eeg"].sum())                                         # 1605 Units linked to eeg by mastr_nr

### All entries of df_eeg have an Inbetriebnahmedatum -> explains why units from the latest bid dates cannot be linked (~530 of 862 bid nrs not linked)
### 856 units from anlagen_eeg were built later than 2017-05-01 and cannot be linked via the Zuschlagsnr to df_bids_all. Quite comparable.
### Only 70 of those have a Zuschlagsnr, and many of these follow a scheme (BK6...) that does not match the scheme of the Zuschlagsnr in df_bids_all (WIN%Y)

In [None]:
# What do the bid numbers that were not found look like? -> unique values sorted by date
# Bid rows whose harmonised number does not occur in the eeg table
df_bids_not_eeg = df_bids_all[~df_bids_all["ZS_nr_in_eeg"]]
df_bids_not_eeg = df_bids_not_eeg[["Gebotsdatum", "Zuschlagsnummer", "Zuschlags-Nr"]].drop_duplicates().sort_values("Gebotsdatum")

In [None]:
# eeg rows whose bid number does not occur among the harmonised bid numbers
df_eeg_not_bids = df_eeg[~df_eeg["Zuschlagsnummer"].isin(df_bids_all["Zuschlagsnummer"])]
# Keep only units commissioned after 2017-05-01
later_date = pd.to_datetime("2017-05-01")
df_eeg_not_bids = df_eeg_not_bids[df_eeg_not_bids["EegInbetriebnahmedatum"] > later_date]
# df_eeg_not_bids["Inbetriebmonat"] = df_eeg_not_bids["EegInbetriebnahmedatum"].dt.strftime("%Y-%m")
# Unique (commissioning date, bid number) pairs in chronological order
df_eeg_not_bids = df_eeg_not_bids[["EegInbetriebnahmedatum", "Zuschlagsnummer"]].drop_duplicates().sort_values("EegInbetriebnahmedatum")

### Create and upload the Anlagen_EEG table from MaStR to the goal:100 database

In [None]:
### Follow the scheme of the mastr insert/create

# Meldedatum, Netzbetreiberzuordnungen, DatenQuelle, DatumDownload -> hold either no info or are removed
cols_remove = ["Meldedatum", "Netzbetreiberzuordnungen", "DatenQuelle", "DatumDownload"]

# Every remaining column, in its original order
columns_eeg = [name for name in df_eeg.columns if name not in cols_remove]
df_eeg = df_eeg[columns_eeg]

In [None]:
### Set schema_name and table_name
schema_name = "public"
table_name = "wind_eeg"

### Get the pandas dtypes of the columns to upload
pd_types = [df_eeg[col].dtype for col in columns_eeg]

# dictionary mapping the data types: Postgres data type -> pandas dtype string
# NOTE(review): dtypes without an entry here (e.g. integer columns) have no
# Postgres type to map to - confirm that df_eeg contains none.
map_types = {'bool': 'bool', 
                 'float8': 'float64', 
                 'date' : '<M8[ns]', 
                 'varchar':'O'}

# set the primary and foreign key columns
pk_column = "EegMastrNummer"
fk_column = "VerknuepfteEinheit"
fk_table = "wind_extended"

In [None]:
### function to turn CamelCase to snake_case
def change_case(str):
    """Convert a CamelCase or free-text column label to a snake_case name.

    Every uppercase character is lowercased and prefixed with "_"; afterwards
    each run of characters that is not a letter or digit (spaces, "-", "/",
    repeated "_", ...) collapses into a single "_". This keeps labels such as
    "Flur / Flurstück" usable as unquoted SQL column names.

    Args:
        str: the column label. The parameter keeps its original name for
            backward compatibility, so the builtin of the same name is
            shadowed inside this function.

    Returns:
        The snake_case name, e.g. "EegMastrNummer" -> "eeg_mastr_nummer".
    """
    label = str
    # Mark word starts; the "_" created in front of a leading capital is dropped
    snake = "".join(
        "_" + char.lower() if char.isupper() else char for char in label
    ).lstrip("_")
    # [\W_]+ covers every separator, including runs longer than two
    # underscores that a single replace("__", "_") pass would leave behind.
    return re.sub(r"[\W_]+", "_", snake)
    
def dtype_sqltype(str, map_dict):
    """Look up the SQL type that a mapping assigns to a dtype.

    Args:
        str: the dtype to look up (shadows the builtin of the same name).
        map_dict: mapping of SQL type name -> dtype string.

    Returns:
        The first key of map_dict whose value compares equal to the dtype,
        or None when no value matches.
    """
    for sql_type, dtype_name in map_dict.items():
        if dtype_name == str:
            return sql_type
    return None

In [None]:
# "name type" definition for every column except the two key columns, which
# receive their constraints separately below. Names are converted to
# snake_case; the SQL type is looked up from the pandas dtype.
sql_columns = ", \n ".join(
    f"{change_case(col)} {dtype_sqltype(df_eeg[col].dtype, map_types)}"
    for col in columns_eeg
    if col not in (pk_column, fk_column)
)

# Primary key definition
pk_sql_type = dtype_sqltype(df_eeg[pk_column].dtype, map_types)
sql_pk = f"{change_case(pk_column)} {pk_sql_type} PRIMARY KEY"

# Foreign key definition: stored under the name of the referenced column
fk_sql_type = dtype_sqltype(df_eeg[fk_column].dtype, map_types)
sql_fk = f"einheit_mastr_nummer {fk_sql_type} REFERENCES {schema_name}.{fk_table}(einheit_mastr_nummer)"

In [None]:
### Assemble the DDL statements for the eeg table
# geo-columns are left out -> added later on
# the primary key definition comes first, then the foreign key, then all other columns
sql_drop = f"DROP TABLE IF EXISTS {schema_name}.{table_name};"

sql_create = f"""
CREATE TABLE {schema_name}.{table_name} (
{sql_pk},
{sql_fk},
{sql_columns} 
);""" 

In [None]:
# Re-establish the postgres/supabase connection
# Load environment variables from the .env file
load_dotenv()

# Get connection parameters from environment variables
dbname = os.getenv("DB_NAME")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
ssl_cert_path = os.getenv("SSL_CERT_PATH")

# Establish the connection object.
# Keyword arguments are used instead of one formatted DSN string: a value
# containing spaces, quotes or backslashes (e.g. a password or the
# certificate path) would otherwise have to be escaped by hand.
conn = psycopg2.connect(
    dbname=dbname,
    user=user,
    password=password,
    host=host,
    port=port,
    sslmode="require",
    sslrootcert=ssl_cert_path,
)

In [None]:
print(conn)

# `with conn` commits when the block succeeds and rolls back when it raises,
# so a failing CREATE no longer leaves the connection inside an aborted
# transaction; the cursor context manager closes the cursor either way.
# The connection itself stays open for the following cells.
with conn, conn.cursor() as conn_cursor:
    # drop table if it already exists
    conn_cursor.execute(sql_drop)

    # table creation
    conn_cursor.execute(sql_create)

In [None]:
### Helper Function for one preprocessed row without geodata 
### to generate column names and values

def row_data_to_sql(row_data, columns_data):
    """Build the column names and SQL literals for one row of data.

    Columns whose value is missing are left out of both lists, so the
    resulting INSERT only names columns that actually carry a value.

    Args:
        row_data: DataFrame holding one row; only the first row is read.
        columns_data: iterable of the column labels to process.

    Returns:
        Tuple (columns_sql, values_sql) of two equally long lists: the
        snake_case column names and the matching values rendered as SQL
        literals.
    """
    columns_sql = []
    values_sql = []

    for col in columns_data:
        val = row_data[col].values[0]

        # Skip empty cells entirely
        if pd.isna(val):
            continue

        columns_sql.append(change_case(col))

        if isinstance(val, str):
            # Standard SQL string literal with embedded single quotes doubled.
            # The former $$...$$ quoting broke for text containing "$$" or
            # ending in "$". Assumes standard_conforming_strings is on (the
            # PostgreSQL default), so backslashes are taken literally.
            escaped = val.replace("'", "''")
            values_sql.append(f"'{escaped}'")
        elif is_datetime64_any_dtype(val):
            # Dates are written as a quoted day-resolution string
            val = np.datetime_as_string(val, unit="D")
            values_sql.append(f"'{val}'")
        else:
            # Numbers and booleans are written without quotes
            values_sql.append(str(val))

    return columns_sql, values_sql

### helper function to construct the INSERT query for one row 
#   from the columns_sql and values_sql lists

def join_insert_sql(columns_sql, values_sql):
    """Assemble the INSERT statement for one row.

    The target table is taken from the module-level names schema_name and
    table_name at call time.

    Args:
        columns_sql: list of column names.
        values_sql: list of SQL literals, parallel to columns_sql.

    Returns:
        The INSERT statement as a string.
    """
    separator = ", \n "
    column_list = separator.join(columns_sql)
    value_list = separator.join(values_sql)

    return f"""INSERT INTO {schema_name}.{table_name} (
        {column_list} )
    VALUES (
        {value_list}
        );
    """

### Function for all rows
def df_to_sql_insert(df_upload, conn_db):
    """Insert all rows of a DataFrame through one database transaction.

    One INSERT statement is generated per row (missing values are omitted),
    all statements are sent in a single execute call and committed together.
    On any error the message is printed and the transaction is rolled back;
    the exception is not re-raised, so a calling batch loop continues.

    Args:
        df_upload: DataFrame whose column labels map to the table's columns.
        conn_db: open database connection; it is left open.
    """
    ### column names where each name corresponds to one value
    columns_data = df_upload.columns

    ### List to hold all INSERT statements
    inserts_all = []

    for pos in range(len(df_upload)):
        # Double brackets keep the row as a one-row DataFrame
        row_data = df_upload.iloc[[pos], :]

        columns_sql, values_sql = row_data_to_sql(row_data, columns_data)
        if not columns_sql:
            # A row without any value would yield an invalid "INSERT ... ( )"
            continue
        inserts_all.append(join_insert_sql(columns_sql, values_sql))

    if not inserts_all:
        # Nothing to send; executing an empty query would raise
        return

    inserts_all_sql = " \n ".join(inserts_all)

    try:
        # The cursor is closed by the context manager, also on errors
        with conn_db.cursor() as cur:
            cur.execute(inserts_all_sql)

        # Commit on the connection that was passed in, not on a global one
        conn_db.commit()

    except Exception as e:
        # Best effort: report the problem and undo the partial batch
        print(f"Error: {e}")
        conn_db.rollback()

In [None]:
### Insert data for the eeg table
batch_size = 1000  # Set the desired batch size

# Rename the fk column once, on a renamed copy. The former in-place rename ran
# on every batch slice of df_eeg, which pandas flags as a write to a slice.
df_eeg_upload = df_eeg.rename(columns={"VerknuepfteEinheit": "einheit_mastr_nummer"})
total_rows = len(df_eeg_upload)

for i in range(0, total_rows, batch_size):
    # Positional slice: rows i .. i + batch_size - 1
    df_batch = df_eeg_upload.iloc[i:i + batch_size]
    df_to_sql_insert(df_batch, conn)

### Zuschlagsnr

- a table bids must be created to link wind_eeg and bids:
- In both tables zuschlagsnr is neither NOT NULL nor unique - can appear multiple times
- More extensive in df_bids_all -> thus create it from df_bids_all

In [None]:
### Check whether the columns ['Name des Bieters', 'Gebots-Nr', 'Gebotsdatum', 'Zuschlagsdatum'],
### which should hold only one value per Zuschlagsnummer, really do so
col_names = ['Name des Bieters', 'Gebots-Nr', 'Gebotsdatum',
       'Zuschlagsdatum']

df_groups = df_bids_all.groupby("Zuschlagsnummer")

# Number of distinct values per bid number and column, computed in a single
# aggregation instead of concatenating one single-row frame per group.
df_zsnr_sum = (
    df_groups[col_names]
    .agg(lambda values: len(values.unique()))
    .reset_index()
    .rename(columns={"Zuschlagsnummer": "Zuschlagsnr"})
)

# Summary: how many different counts occur per column (1 = same count for every bid number)
df_zsnr_sum[col_names].apply(lambda x: len(x.unique()), axis=0)

# for all zuschlagsnr there is really only one value in the respective column -> 
# these can be included in the link table

In [None]:
### Create link table
cols_link = ['Zuschlagsnummer', 'Name des Bieters', 'Gebots-Nr', 'Gebotsdatum',
       'Zuschlagsdatum']

# One row per distinct combination of the link columns
df_link = df_bids_all[cols_link].drop_duplicates(ignore_index=True)
df_link = df_link.rename(columns={"Gebots-Nr": "Gebotsnummer"})

### Set schema_name and table_name
schema_name = "public"
table_name = "link_wind_bids"

# set the primary key column
pk_column = "Zuschlagsnummer"

# "name type" definition for every column except the primary key.
# Names are converted to snake_case; constraints are left out for now.
sql_columns = ", \n ".join(
    f"{change_case(col)} {dtype_sqltype(df_link[col].dtype, map_types)}"
    for col in df_link.columns
    if col != pk_column
)

sql_pk = f"{change_case(pk_column)} {dtype_sqltype(df_link[pk_column].dtype, map_types)} PRIMARY KEY"

### DDL statements: the primary key column comes first
sql_drop = f"DROP TABLE IF EXISTS {schema_name}.{table_name};"

sql_create = f"""
CREATE TABLE {schema_name}.{table_name} (
{sql_pk},
{sql_columns}
);"""

print(conn)

# `with conn` commits on success and rolls back when DROP or CREATE raises,
# so the connection is never left inside an aborted transaction; the cursor
# context manager closes the cursor either way.
with conn, conn.cursor() as conn_cursor:
    # drop table if it already exists
    conn_cursor.execute(sql_drop)

    # table creation
    conn_cursor.execute(sql_create)

In [None]:
# Upload all rows of the link table (target: the table_name set in the cell above)
df_to_sql_insert(df_link, conn)

### Clean, create and upload the wind_bid_units table
- remove columns already present in the link table
- add foreign keys in the eeg table and wind_bid_units

In [None]:
### One row where the Flur number ended up in the Anlagennr
# NOTE(review): `ind` is not defined in this cell. It is whatever an earlier
# cell last bound to that name (a boolean mask or a row label left over from
# an iterrows loop) - confirm that it selects the intended row before running.
# .loc writes into df_bids_all directly. The former chained form
# df[col][ind] = ... assigned through an intermediate Series, which pandas
# warns about and which leaves the frame unchanged under copy-on-write.
df_bids_all.loc[ind, "Register_Anlagennr"] = 'A3144210207303'
df_bids_all.loc[ind, "Flur / Flurstück"] = "Flur 7:" + " " + df_bids_all.loc[ind, "Flur / Flurstück"]

In [None]:
cols_keep = ['Zuschlagsnummer','Bundesland',
 'Landkreis',
 'Postleitzahl',
 'Gemeinde',
 'Gemarkung',
 'Flur / Flurstück',
 'Register_Anlagennr']

# Explicit copy, so the columns added below belong to df_bids_upload itself
# and are not written to a slice of df_bids_all.
df_bids_upload = df_bids_all[cols_keep].copy()

# Split the identifier by type: numbers starting with "A" go to
# AnlagenkennzifferAnlagenregister, every other present number is treated as
# a unit number. Rows without an identifier stay empty in both columns.
reg_anl_nr = df_bids_upload["Register_Anlagennr"]
has_nr = reg_anl_nr.notna()
starts_with_a = reg_anl_nr.str.startswith("A", na=False)

df_bids_upload["einheit_mastr_nummer"] = reg_anl_nr.where(has_nr & ~starts_with_a)
df_bids_upload["AnlagenkennzifferAnlagenregister"] = reg_anl_nr.where(has_nr & starts_with_a)

df_bids_upload = df_bids_upload.drop(columns=["Register_Anlagennr"])

In [None]:
# Columns that will be uploaded to the bid-units table
df_bids_upload.columns

In [None]:
### Create the bid-units table
# df_bids_upload.rename(columns={"Flur / Flurstück": "FlurFlurstück"}, inplace=True)

### Set schema_name and table_name
schema_name = "public"
table_name = "wind_bids_units"

# set the foreign key column and the table it references
fk_column = "Zuschlagsnummer"
fk_table = "link_wind_bids"

# "name type" definition for every column except the foreign key.
# Names are converted to snake_case; constraints are left out for now.
sql_columns = ", \n ".join(
    f"{change_case(col)} {dtype_sqltype(df_bids_upload[col].dtype, map_types)}"
    for col in df_bids_upload.columns
    if col != fk_column
)

# The SQL type is taken from the frame that is actually uploaded
# (previously it was read from df_eeg, an unrelated table).
fk_sql_type = dtype_sqltype(df_bids_upload[fk_column].dtype, map_types)
sql_fk = f"{change_case(fk_column)} {fk_sql_type} REFERENCES {schema_name}.{fk_table}({change_case(fk_column)})"

### DDL statements: generated id as primary key, then the foreign key
sql_drop = f"DROP TABLE IF EXISTS {schema_name}.{table_name};"

sql_create = f"""
CREATE TABLE {schema_name}.{table_name} (
    id bigint generated by default as identity PRIMARY KEY,
    {sql_fk},
    {sql_columns}
);"""

print(conn)
print(sql_create)

# `with conn` commits on success and rolls back when DROP or CREATE raises;
# the cursor context manager closes the cursor either way.
with conn, conn.cursor() as conn_cursor:
    # drop table if it already exists
    conn_cursor.execute(sql_drop)

    # table creation
    conn_cursor.execute(sql_create)

In [None]:
# Visual check of the frame before uploading
df_bids_upload

In [None]:
### Insert the bid-unit rows in batches
batch_size = 1000  # Set the desired batch size
total_rows = len(df_bids_upload)

for start in range(0, total_rows, batch_size):
    # Positional slice of at most batch_size rows
    stop = start + batch_size
    df_batch = df_bids_upload[start:stop]
    df_to_sql_insert(df_batch, conn)

In [16]:
### Check whether any zuschlagsnummer from wind_eeg is not present in link_wind_bids and insert those into link_wind_bids
### One time directly in sql, otherwise beforehand with python, before wind_eeg is updated -> check if value is present
schema_name = "public"
table_name = "wind_eeg"
fk_column = "zuschlagsnummer"
fk_table = "link_wind_bids"

sql_select = f"""SELECT DISTINCT {fk_column}
FROM {schema_name}.{table_name}
WHERE {fk_column} IS NOT NULL
  AND {fk_column} NOT IN (SELECT {fk_column} FROM {schema_name}.{fk_table});"""
### And insert values for gebotsdatum and zuschlagsdatum if WIN\d{2}-\d{1,2}/ are present and have a date

### Retrieve the values; the cursor is closed by the context manager
with conn.cursor() as cur:
    cur.execute(sql_select)
    list_missing_bid_nr = [row[0] for row in cur.fetchall()]

### The following are present in link_wind_bids but were misspelled in wind_eeg
#['Win21-3/210', 'Win21-3/212', 'Inn20-1/064', 'WIN00-6/797']
# Idea: first WlN to WIN, then try the rest automatically later
# Numbers written with a lowercase "l" instead of a capital "I"
typo_bids = [val for val in list_missing_bid_nr if re.match(r"^WlN", val)]

# Parallel lists: misspelled value and its correction at the same position
list_error_bids = typo_bids + ['Win21-3/210', 'Win21-3/212']
list_correct_bids = [val.replace("l", "I") for val in typo_bids] + ['WIN21-3/210', 'WIN21-3/212']

df_error_bids = pd.DataFrame({"correct":list_correct_bids, "error":list_error_bids})
# Change these values in wind_eeg
# add the rest of the values as empty rows in link_wind_bids

# The values are passed as query parameters, so the driver does the quoting;
# only the identifiers are formatted into the statement.
sql_update = f"""
    UPDATE {schema_name}.{table_name}
    SET {fk_column} = %s
    WHERE {fk_column} = %s;
    """

# `with conn` commits all updates together or rolls them back on an error
with conn, conn.cursor() as cur:
    for row in df_error_bids.itertuples(index=False):
        # Replace the misspelled value with the corrected one in wind_eeg
        cur.execute(sql_update, (row.correct, row.error))

In [20]:
# Keep only the missing bid numbers that were not handled as spelling errors
list_missing_bid_nr = [val for val in list_missing_bid_nr if val not in list_error_bids]

In [32]:
# The remaining Zuschlagsnr are not within link_wind_bids (wind_bids_units)
# To be able to make wind_eeg.zuschlagsnr a fk-column, list_missing_bid_nr must be written into link_wind_bids
# To make it as complete as possible, at least add a date where possible.
# The date is queried from similar bid numbers of the same bid date

# The LIKE pattern is passed as a query parameter; only the identifiers are
# formatted into the statement.
sql_dates = f"""SELECT DISTINCT zuschlagsdatum, gebotsdatum
        FROM {schema_name}.{fk_table}
        WHERE {fk_column} LIKE %s;"""

date_rows = []

# The cursor is closed by the context manager once all lookups are done
with conn.cursor() as cur:
    for val in list_missing_bid_nr:
        # First seven characters, e.g. "WIN17-2"
        prefix = val[0:7]
        zuschlagsdatum = None
        gebotsdatum = None

        # Raw string, so "\d" reaches the regex engine unchanged
        if re.match(r"^WIN[12]\d-\d", prefix):
            params = (prefix + "%",)

            # Show the statement exactly as it is sent to the server
            print(cur.mogrify(sql_dates, params).decode())
            cur.execute(sql_dates, params)
            res = cur.fetchall()
            print(res)

            # Use the first result row, if there is one
            if len(res) != 0:
                zuschlagsdatum, gebotsdatum = res[0]

        date_rows.append({
            "zuschlagsnummer": val,
            "zuschlagsdatum": zuschlagsdatum,
            "gebotsdatum": gebotsdatum,
        })

# Build the frame once instead of concatenating one single-row frame per value
df_dates = pd.DataFrame(
    date_rows, columns=["zuschlagsnummer", "zuschlagsdatum", "gebotsdatum"]
)

print(df_dates)
        

SELECT DISTINCT zuschlagsdatum, gebotsdatum
        FROM public.link_wind_bids
        WHERE zuschlagsnummer LIKE 'WIN17-2%';
[('2017-08-22', '2017-08-01')]
SELECT DISTINCT zuschlagsdatum, gebotsdatum
        FROM public.link_wind_bids
        WHERE zuschlagsnummer LIKE 'WIN23-3%';
[]
SELECT DISTINCT zuschlagsdatum, gebotsdatum
        FROM public.link_wind_bids
        WHERE zuschlagsnummer LIKE 'WIN17-2%';
[('2017-08-22', '2017-08-01')]
SELECT DISTINCT zuschlagsdatum, gebotsdatum
        FROM public.link_wind_bids
        WHERE zuschlagsnummer LIKE 'WIN17-2%';
[('2017-08-22', '2017-08-01')]
SELECT DISTINCT zuschlagsdatum, gebotsdatum
        FROM public.link_wind_bids
        WHERE zuschlagsnummer LIKE 'WIN17-2%';
[('2017-08-22', '2017-08-01')]
SELECT DISTINCT zuschlagsdatum, gebotsdatum
        FROM public.link_wind_bids
        WHERE zuschlagsnummer LIKE 'WIN17-2%';
[('2017-08-22', '2017-08-01')]
SELECT DISTINCT zuschlagsdatum, gebotsdatum
        FROM public.link_wind_bids
       

In [None]:
### Check schema and links!
### Add foreign key wind_eeg.zuschlagsnummer -> link_wind_bids.zuschlagsnummer

schema_name = "public"
table_name = "wind_eeg"
fk_column = "zuschlagsnummer"
fk_table = "link_wind_bids"

sql_foreign_key = f"""
    ALTER TABLE {schema_name}.{table_name}
    ADD CONSTRAINT foreign_key_{fk_column} 
    FOREIGN KEY ({fk_column})
    REFERENCES {schema_name}.{fk_table} ({fk_column});
    """

try:
    # The cursor is created inside the `with`, so no cleanup code can touch
    # an unbound name if creating it fails; it is closed automatically.
    with conn.cursor() as cur:
        cur.execute(sql_foreign_key)

    # Commit the transaction
    conn.commit()

except Exception as e:
    # Report the problem and undo the failed statement
    print(f"Error: {e}")
    conn.rollback()

    # Error: insert or update on table "wind_eeg" violates foreign key constraint "foreign_key_zuschlagsnummer"
    # DETAIL:  Key (zuschlagsnummer)=(WIN17-2/183) is not present in table "link_wind_bids".