1: DimDate Table

In [None]:
%pip install --upgrade sqlalchemy
%pip install --upgrade pyodbc

In [None]:
import pyodbc
print(pyodbc.drivers())

In [2]:
import re
import pandas as pd
from sqlalchemy import create_engine

In [None]:
# Gegevens voor de verbinding
server = "xxx"  # Servernaam of IP-adres van je SQL Server
database = "DEP1_DWH"  # Naam van je database

# Maak de verbindingsstring met Windows Authenticatie (Integrated Security)
engine = create_engine("mssql+pyodbc://@{}/{}?driver=ODBC+Driver+17+for+SQL+Server".format(server, database))

In [4]:
# Maak een range van datums
date_list = pd.date_range(start="01-01-2010", end="31-12-2025", freq='D')

months_translation = {
    'January': 'Januari', 'February': 'Februari', 'March': 'Maart', 'April': 'April',
    'May': 'Mei', 'June': 'Juni', 'July': 'Juli', 'August': 'Augustus', 
    'September': 'September', 'October': 'Oktober', 'November': 'November', 'December': 'December'
}

days_translation = {
    'Monday': 'Maandag', 'Tuesday': 'Dinsdag', 'Wednesday': 'Woensdag', 'Thursday': 'Donderdag',
    'Friday': 'Vrijdag', 'Saturday': 'Zaterdag', 'Sunday': 'Zondag'
}

dim_date_df = pd.DataFrame({
    'DateKey': date_list.strftime('%Y%m%d').astype(int),  # YYYYMMDD als key
    'FullDate': date_list.date,  # Volledige datum
    'MonthNameDutch': date_list.strftime('%B').map(months_translation),  # Maandnaam (kan vertaald worden)
    'MonthNameEN': date_list.strftime('%B'),  # Maandnaam in Engels
    'DayNameDutch': date_list.strftime('%A').map(days_translation),  # Dagnaam in Nederlands
    'DayNameEN': date_list.strftime('%A'),  # Dagnaam in Engels
    'QuarterName': 'Q' + date_list.quarter.astype(str),  # Kwartaal als 'Q1', 'Q2', ...
    'QuarterNumber': date_list.quarter  # Kwartaalnummer (1-4)
})

# Schrijf naar SQL Server
dim_date_df.to_sql('DimDate', con=engine, if_exists='append', index=False)

80

In [5]:
dim_date_df.head()

Unnamed: 0,DateKey,FullDate,MonthNameDutch,MonthNameEN,DayNameDutch,DayNameEN,QuarterName,QuarterNumber
0,20100101,2010-01-01,Januari,January,Vrijdag,Friday,Q1,1
1,20100102,2010-01-02,Januari,January,Zaterdag,Saturday,Q1,1
2,20100103,2010-01-03,Januari,January,Zondag,Sunday,Q1,1
3,20100104,2010-01-04,Januari,January,Maandag,Monday,Q1,1
4,20100105,2010-01-05,Januari,January,Dinsdag,Tuesday,Q1,1


2: DimTime Table

In [6]:
def generate_dim_time():
    time_data = []

    for hour in range(0, 24):
        for minute in range(0, 60):
            am_pm = 'AM' if hour < 12 else 'PM'
            hour_12 = hour if 1 <= hour <= 12 else (12 if hour == 0 or hour == 24 else hour - 12)
            time_key = f"{hour:02}{minute:02}"
            full_time = f"{hour:02}:{minute:02}:00"
            
            time_data.append({
                "TimeKey": time_key,
                "Hour": hour_12,
                "Minutes": minute,
                "FullTime": full_time,
                "TimeAM_PM": am_pm
            })
    
    return pd.DataFrame(time_data)

# Data genereren
dim_time_df = generate_dim_time()

# Data naar SQL Server schrijven
dim_time_df.to_sql("DimTime", con=engine, if_exists="append", index=False)

183

In [7]:
dim_time_df.head()

Unnamed: 0,TimeKey,Hour,Minutes,FullTime,TimeAM_PM
0,0,12,0,00:00:00,AM
1,1,12,1,00:01:00,AM
2,2,12,2,00:02:00,AM
3,3,12,3,00:03:00,AM
4,4,12,4,00:04:00,AM


3: DimWeatherStation Table

In [8]:
# Lees de CSV voor weerstations
weather_station_df = pd.read_csv('../data/input/aws_station.csv')

# Verwerk de kolommen
weather_station_df.rename(columns={
    "code": "WeatherStationID",
    "name": "WeatherStationName",
    "altitude": "Altitude",
    "the_geom": "Coordinates"
}, inplace=True)

# Functie om Latitude en Longitude te extraheren uit 'the_geom' kolom
def extract_lat_lon(geom):
    match = re.search(r"POINT \(([\d\.-]+) ([\d\.-]+)\)", geom)
    if match:
        lon, lat = match.groups()
        return float(lat), float(lon)
    return None, None

# Latitude en Longitude kolommen toevoegen
weather_station_df["Latitude"], weather_station_df["Longitude"] = zip(*weather_station_df["Coordinates"].apply(extract_lat_lon))

# Onnodige kolom verwijderen
weather_station_df.drop(columns=["Coordinates"], inplace=True)
weather_station_df = weather_station_df.drop(['FID', 'date_begin', 'date_end'], axis = 1)

# Data naar SQL Server schrijven
weather_station_df.to_sql("DimWeatherStation", con=engine, if_exists="append", index=False)

8

In [9]:
weather_station_df.head()

Unnamed: 0,WeatherStationID,WeatherStationName,Altitude,Latitude,Longitude
0,6472,HUMAIN,295.3,5.255,50.194
1,6438,STABROEK,4.0,4.364,51.325
2,6464,RETIE,21.5,5.027,51.221
3,6447,UCCLE,100.6,4.358,50.797
4,6434,MELLE,15.0,3.816,50.98


4: FactWeather Table

In [10]:
# Lees de CSV voor weerdata
weather_data_df = pd.read_csv('../data/input/aws_1day.csv')

weather_data_df = weather_data_df.drop(['FID', 'the_geom', 'qc_flags'], axis = 1)
weather_data_df = weather_data_df.merge(weather_station_df, how='inner', left_on="code", right_on='WeatherStationID')
weather_data_df = weather_data_df.drop(['WeatherStationName', 'Latitude', 'Longitude', 'Altitude', 'WeatherStationID'], axis = 1)
weather_data_df['DateKey'] = weather_data_df['timestamp'].str[0:4] + weather_data_df['timestamp'].str[5:7] + weather_data_df['timestamp'].str[8:10]
weather_data_df['Time'] = weather_data_df['timestamp'].str[-8:]
weather_data_df = weather_data_df.merge(dim_time_df, how='inner', left_on="Time", right_on='FullTime')
weather_data_df = weather_data_df.drop(['timestamp', 'Hour', 'Minutes', 'FullTime', 'TimeAM_PM'], axis = 1)

weather_data_df = weather_data_df.rename(columns={"precip": "PrecipQuantity","temp_avg": "TempAvg","temp_max": "TempMax","temp_min": "TempMin",
                                                  "temp_grass": "TempGrassPt100Avg","temp_soil_avg": "TempSoilAvg","temp_soil_5cm": "TempSoilAvg5cm",
                                                  "temp_soil_10cm": "TempSoilAvg10cm","temp_soil_20cm": "TempSoilAvg20cm",
                                                  "temp_soil_50cm": "TempSoilAvg50cm","wind_speed_10m": "WindSpeed10m",
                                                  "wind_speed_avg_30m": "WindSpeedAvg30m","wind_gust_speed": "WindGustsSpeed",
                                                  "humidity_avg": "HumidityRelShelterAvg","pressure": "Pressure","sun_duration": "SunDuration",
                                                  "short_wave": "ShortWaveFromSkyAvg","sun_intensity": "SunIntAvg"})

weather_data_df = weather_data_df.reindex(columns=["DateKey", "TimeKey", "WeatherStationKey", "PrecipQuantity", "TempAvg", "TempMax", "TempMin",
                                                    "TempGrassPt100Avg", "TempSoilAvg", "TempSoilAvg5cm", "TempSoilAvg10cm", 
                                                    "TempSoilAvg20cm", "TempSoilAvg50cm", "WindSpeed10m", "WindSpeedAvg30m", 
                                                    "WindGustsSpeed", "HumidityRelShelterAvg", "Pressure", "SunDuration", "ShortWaveFromSkyAvg", 
                                                    "SunIntAvg"])

# Data naar SQL Server schrijven
weather_data_df.to_sql("FactWeather", con=engine, if_exists="append", index=False)

53

In [11]:
weather_data_df.head()

Unnamed: 0,DateKey,TimeKey,WeatherStationKey,PrecipQuantity,TempAvg,TempMax,TempMin,TempGrassPt100Avg,TempSoilAvg,TempSoilAvg5cm,...,TempSoilAvg20cm,TempSoilAvg50cm,WindSpeed10m,WindSpeedAvg30m,WindGustsSpeed,HumidityRelShelterAvg,Pressure,SunDuration,ShortWaveFromSkyAvg,SunIntAvg
0,20201014,0,,,11.29,12.8,10.0,,,,...,,,7.74,,,,1017.57,82.33,,
1,20201014,0,,,8.94,12.23,5.15,,8.9,,...,,,2.6,,,,1017.32,11.75,,
2,20201014,0,,,9.06,12.33,5.91,,8.88,,...,,,3.0,,,,1018.37,33.08,,
3,20201016,0,,,9.01,10.72,5.62,,9.42,,...,,,,2.67,,,1010.36,,,
4,20201017,0,,,7.97,13.09,4.0,,9.96,,...,,,0.46,,,,1019.12,219.67,,


5: FactBelpex Table

In [None]:
# Lees de CSV voor BELPEX
belpex_df = pd.read_csv('../data/input/BelpexFilter.csv')

#TODO
belpex_df['DateKey'] = belpex_df['Date'].str[5:9] + belpex_df['Date'].str[5:7] + belpex_df['Date'].str[8:10]
belpex_df['Time'] = belpex_df['Date'].str[-8:]
belpex_df = belpex_df.merge(dim_time_df, how='inner', left_on="Time", right_on='FullTime')
belpex_df = belpex_df.drop(['Hour', 'Minutes', 'FullTime', 'TimeAM_PM', 'Date'], axis = 1)

# Schrijf naar SQL Server
belpex_df.to_sql('FactBelpex', con=engine, if_exists='append', index=False)


6: Verbruikersdata

- Verwerk de verbruikersdata naar het juiste formaat.
- Gebruik SQL Server's bulk-insert om grote hoeveelheden data efficiënt te verwerken.

7: Overige Tabellen

- Voor de overige tabellen, volg dezelfde logica:
    - Lees de CSV’s.
    - Voeg de benodigde foreign keys toe.
    - Schrijf de data weg naar de juiste tabellen via bulk-insert of andere batch methoden.

Algemeen:
Voor alle bulk-insert taken moet je zorgen voor een efficiënte schrijfmethode naar SQL Server, bijvoorbeeld:

- to_sql() in combinatie met een SQLAlchemy engine.
- Bulk-insert via pyodbc of tools zoals bcp.
- Gebruik maken van BULK INSERT in SQL Server voor het snel inladen van grote datasets.