# Code Documentation of Group2 | Datalake and Datawarehouse

## Lambda Functions for API

### group2_currency_api

In [None]:
import json
import pandas as pd
import datetime as dt
import requests
import psycopg2
import os
import boto3
from psycopg2.extras import execute_values

# Get Environment Variables

# Connection settings and credentials are read from the Lambda environment;
# a variable that is not set resolves to None.
endpoint = os.environ.get('ENDPOINT')
db_name = os.environ.get('DB_NAME')
username = os.environ.get('USERNAME')
password = os.environ.get('PASSWORD')
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID_INTERNAL')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY_INTERNAL')
bucket_name = os.environ.get('BUCKET_NAME')



def upload_dataframe_to_db(df, table_name, conn):
    """Create ``table_name`` if it is missing and append all rows of ``df``.

    The index of ``df`` is written as a ``timestamp`` column. Column names are
    lower-cased and spaces/hyphens are replaced by underscores. Column types
    are picked by name: ``timestamp`` -> timestamp, ``unit``/``country`` ->
    text, everything else -> float.

    Args:
        df: DataFrame to store; it is copied, never modified.
        table_name: Target table. It is interpolated into the SQL text, so it
            must come from trusted code, never from user input.
        conn: Open database connection providing ``cursor()``. No commit is
            issued here.
    """
    # Work on a copy so the caller keeps its original index and column names.
    frame = df.copy()
    frame.index.name = 'timestamp'
    frame.reset_index(inplace=True)
    frame.columns = [col.lower().replace(" ", "_").replace("-", "_") for col in frame.columns]

    column_defs = ', '.join(
        f"{col} {'timestamp' if col == 'timestamp' else ('text' if col in ['unit', 'country'] else 'float')}"
        for col in frame.columns
    )
    create_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({column_defs});"

    placeholders = ', '.join(['%s'] * len(frame.columns))
    insert_query = f"INSERT INTO {table_name} ({', '.join(frame.columns)}) VALUES ({placeholders})"

    # NaN/NaT cannot be bound as SQL values; send them as NULL instead.
    values = [tuple(None if pd.isna(x) else x for x in row) for row in frame.values]

    cur = conn.cursor()
    try:
        cur.execute(create_query)
        cur.executemany(insert_query, values)
    finally:
        # Release the cursor even when one of the statements fails.
        cur.close()

def upload_dataframe_to_bucket(df, foldername, s3, bucket_name):
    """Store ``df`` as a JSON array of records in the given S3 bucket.

    The object key is ``<foldername>/<foldername>_<YYYY-MM-DD>.json`` using
    today's date, so a second upload on the same day overwrites the first.

    Args:
        df: DataFrame to export; its index is included as a column.
        foldername: Key prefix and file name stem.
        s3: Client object providing ``put_object`` (boto3 S3 client).
        bucket_name: Name of the target bucket.
    """
    today = str(dt.datetime.today().date())
    key = f"{foldername}/{foldername}_{today}.json"  # Key = path in the bucket

    # to_json() already returns a complete JSON document. Running it through
    # json.dumps() again would store one quoted string instead of an array.
    payload = df.reset_index().to_json(orient="records", date_format="iso")

    s3.put_object(
        Bucket=bucket_name,
        Key=key,
        Body=payload.encode("utf-8"),
        ContentType='application/json'
    )

def fetch_cbet_data(date: str, timeout: float = 30) -> dict:
    """Request CBET data for Switzerland from the energy-charts API.

    Args:
        date: Start date passed to the API as the ``start`` parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the function indefinitely.

    Returns:
        The decoded JSON body on HTTP 200, otherwise an empty dict.

    Raises:
        requests.RequestException: On network failures, including timeouts.
    """
    response = requests.get(
        "https://api.energy-charts.info/cbet",
        params={"country": "ch", "start": date},
        headers={'accept': 'application/json'},
        timeout=timeout,
    )
    if response.status_code == 200:
        return response.json()

    print(f"Error during requesting CBET-Data for {date}: {response.status_code}")
    return {}


def extract_cbet_data(json_data: dict) -> pd.DataFrame:
    """Convert a CBET API payload into a long-format DataFrame.

    Args:
        json_data: Payload with ``unix_seconds`` (list of epoch seconds) and
            ``countries`` (list of ``{"name": ..., "data": [...]}``). An empty
            dict, as returned by a failed fetch, is accepted.

    Returns:
        DataFrame with the columns ``timestamp`` (UTC), ``value`` and
        ``country``; one row per country and timestamp. It is empty when the
        payload contains no countries.

    Raises:
        ValueError: If a country's series length differs from the number of
            timestamps.
    """
    idx = pd.to_datetime(json_data.get("unix_seconds") or [], unit="s", utc=True)
    idx.name = "timestamp"

    frames = []
    for country in json_data.get("countries", []):
        values = country.get("data", [])
        if len(values) != len(idx):
            # Fail loudly instead of silently misaligning values and timestamps.
            raise ValueError(
                f"Length mismatch for {country.get('name')}: "
                f"{len(values)} values vs {len(idx)} timestamps"
            )
        frames.append(
            pd.DataFrame({"value": values, "country": country.get("name")}, index=idx)
        )

    if not frames:
        # pd.concat([]) raises; hand back an empty frame with the usual columns.
        return pd.DataFrame(columns=["timestamp", "value", "country"])

    return pd.concat(frames).reset_index()

def insert_cbet_data_to_db(df_cbet, table_name, conn):
    """Create the CBET table if needed and bulk-insert the given rows.

    Missing values are replaced by 0 before the insert. Insert errors are
    printed and rolled back instead of being raised; the cursor is closed in
    either case.

    Args:
        df_cbet: DataFrame with the columns ``timestamp``, ``country``, ``value``.
        table_name: Target table; interpolated into the SQL text.
        conn: Open database connection.
    """
    def _sql_type(column):
        # Column types are chosen purely by column name.
        if column == 'timestamp':
            return 'timestamp'
        if column in ['unit', 'country']:
            return 'text'
        return 'float'

    cursor = conn.cursor()
    frame = df_cbet.fillna(0)

    ddl_columns = ', '.join(f"{column} {_sql_type(column)}" for column in frame.columns)
    cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_name} ({ddl_columns});")

    insert_sql = f"""
    INSERT INTO {table_name} (timestamp, country, value)
    VALUES %s
    """
    rows = []
    for _, record in frame.iterrows():
        rows.append((record['timestamp'], record['country'], record['value']))

    try:
        execute_values(cursor, insert_sql, rows)
        conn.commit()
        print("CBET-Data successfully implemented.")
    except Exception as err:
        print(f"Error in implementing CBET-Data: {err}")
        conn.rollback()
    finally:
        cursor.close()

def lambda_handler(event, context):
    """Load yesterday's Swiss power, price and CBET data into the database and S3.

    The three energy-charts endpoints are queried for the previous day, the
    responses are reshaped into DataFrames, appended to their tables and a
    JSON copy of each frame is written to the bucket.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict: ``statusCode`` 200 after a successful import, 500 when
        connecting or uploading failed.
    """
    date = dt.datetime.today().date()
    start_date = str(date - dt.timedelta(days=1))

    api_result_power = requests.get(f'https://api.energy-charts.info/public_power?country=ch&start={start_date}')
    api_result_prices = requests.get(f'https://api.energy-charts.info/price?bzn=CH&start={start_date}')

    api_response_power = api_result_power.json()
    api_response_prices = api_result_prices.json()

    cbet_json = fetch_cbet_data(start_date)

    # One column per production type, one row per timestamp.
    api_df_power = pd.DataFrame(api_response_power["production_types"])
    api_df_power_t = pd.DataFrame(api_df_power['data'].tolist()).T
    api_df_power_t.columns = api_df_power['name']
    api_df_power_t.index = pd.to_datetime(api_response_power["unix_seconds"], unit='s', utc=True)

    api_df_price = pd.DataFrame(api_response_prices["price"])
    api_df_price['Unit'] = api_response_prices["unit"]
    api_df_price.columns = ['Price', 'Unit']
    api_df_price.index = pd.to_datetime(api_response_prices["unix_seconds"], unit='s', utc=True)

    api_df_cbet = extract_cbet_data(cbet_json)

    try:
        print("Connecting to DB & Bucket...")
        conn = psycopg2.connect(
            host=endpoint,
            dbname=db_name,
            user=username,
            password=password
        )
        try:
            conn.set_session(autocommit=True)
            print("Connection to DB successful.")

            s3 = boto3.client(
                's3',
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key
            )
            print("Connection to Bucket successful.")

            # Upload DataFrames
            upload_dataframe_to_db(api_df_power_t, "tbl_energy_production_data", conn)
            upload_dataframe_to_db(api_df_price, "tbl_energy_price_data", conn)
            insert_cbet_data_to_db(api_df_cbet, "tbl_energy_cbet_data", conn)

            upload_dataframe_to_bucket(api_df_power_t, "energy_production", s3, bucket_name)
            upload_dataframe_to_bucket(api_df_price, "energy_price", s3, bucket_name)
            upload_dataframe_to_bucket(api_df_cbet, "energy_cbet", s3, bucket_name)
        finally:
            # Close the connection on failure as well, not only on success.
            conn.close()
        print("Data uploaded and connection closed.")

    except Exception as e:
        print("Error:", e)
        # Report the failure instead of claiming a successful import.
        return {
            'statusCode': 500,
            'body': json.dumps(f"Import failed: {e}")
        }

    return {
        'statusCode': 200,
        'body': json.dumps("Inport succesfully")
    }


### group2_energy_api

In [None]:
import json
import pandas as pd
import datetime as dt
import requests
import psycopg2
import os
import boto3
from psycopg2.extras import execute_values

# Get Environment Variables

# Database and AWS settings are supplied through the Lambda environment;
# variables that are not set resolve to None.
endpoint = os.environ.get('ENDPOINT')
db_name = os.environ.get('DB_NAME')
username = os.environ.get('USERNAME')
password = os.environ.get('PASSWORD')
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID_INTERNAL')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY_INTERNAL')
bucket_name = os.environ.get('BUCKET_NAME')



def upload_dataframe_to_db(df, table_name, conn):
    """Create ``table_name`` if it is missing and append all rows of ``df``.

    The index of ``df`` becomes a ``timestamp`` column. Column names are
    lower-cased and spaces/hyphens are replaced by underscores. Column types
    are picked by name: ``timestamp`` -> timestamp, ``unit`` -> text,
    everything else -> float.

    Args:
        df: DataFrame to store; it is copied, never modified.
        table_name: Target table. It is interpolated into the SQL text, so it
            must come from trusted code, never from user input.
        conn: Open database connection providing ``cursor()``. No commit is
            issued here.
    """
    # Copy first: the caller's frame must keep its index and column names.
    frame = df.copy()
    frame.index.name = 'timestamp'
    frame.reset_index(inplace=True)
    frame.columns = [col.lower().replace(" ", "_").replace("-", "_") for col in frame.columns]

    column_defs = ', '.join(
        f"{col} {'timestamp' if col == 'timestamp' else ('text' if col == 'unit' else 'float')}"
        for col in frame.columns
    )
    create_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({column_defs});"

    placeholders = ', '.join(['%s'] * len(frame.columns))
    insert_query = f"INSERT INTO {table_name} ({', '.join(frame.columns)}) VALUES ({placeholders})"

    # NaN/NaT cannot be bound as SQL values; send them as NULL instead.
    values = [tuple(None if pd.isna(x) else x for x in row) for row in frame.values]

    cur = conn.cursor()
    try:
        cur.execute(create_query)
        cur.executemany(insert_query, values)
    finally:
        # Release the cursor even when one of the statements fails.
        cur.close()

def upload_dataframe_to_bucket(df, foldername, s3, bucket_name):
    """Write ``df`` to S3 as a JSON array of records.

    The object key is ``<foldername>/<foldername>_<YYYY-MM-DD>.json`` built
    from today's date; a later upload on the same day replaces the object.

    Args:
        df: DataFrame to export; its index is included as a column.
        foldername: Key prefix and file name stem.
        s3: Client object providing ``put_object`` (boto3 S3 client).
        bucket_name: Name of the target bucket.
    """
    today = str(dt.datetime.today().date())
    key = f"{foldername}/{foldername}_{today}.json"  # Key = path in the bucket

    # The string returned by to_json() is the final document. Encoding it a
    # second time with json.dumps() would wrap the whole array in quotes.
    payload = df.reset_index().to_json(orient="records", date_format="iso")

    s3.put_object(
        Bucket=bucket_name,
        Key=key,
        Body=payload.encode("utf-8"),
        ContentType='application/json'
    )

def fetch_cbet_data(date: str, timeout: float = 30) -> dict:
    """Request CBET data for Switzerland from the energy-charts API.

    Args:
        date: Start date passed to the API as the ``start`` parameter.
        timeout: Seconds to wait for the server; without a limit a stalled
            connection would block until the Lambda itself times out.

    Returns:
        The decoded JSON body on HTTP 200, otherwise an empty dict.

    Raises:
        requests.RequestException: On network failures, including timeouts.
    """
    query = {"country": "ch", "start": date}
    response = requests.get(
        "https://api.energy-charts.info/cbet",
        params=query,
        headers={'accept': 'application/json'},
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Error during requesting CBET-Data for {date}: {response.status_code}")
        return {}
    return response.json()


def extract_cbet_data(json_data: dict) -> pd.DataFrame:
    """Convert a CBET API payload into a long-format DataFrame.

    Args:
        json_data: Payload with ``unix_seconds`` (list of epoch seconds) and
            ``countries`` (list of ``{"name": ..., "data": [...]}``). An empty
            dict, as returned by a failed fetch, is accepted.

    Returns:
        DataFrame with the columns ``timestamp`` (naive datetime in UTC),
        ``country`` and ``value``; empty when the payload holds no data.
        Timestamps and values are paired with ``zip``, so the longer of the
        two lists is truncated.
    """
    utc = dt.timezone.utc
    timestamps = json_data.get('unix_seconds', [])

    records = [
        {
            # Same naive-UTC value the deprecated utcfromtimestamp() produced.
            'timestamp': dt.datetime.fromtimestamp(ts, tz=utc).replace(tzinfo=None),
            'country': country.get('name'),
            'value': value,
        }
        for country in json_data.get('countries', [])
        for ts, value in zip(timestamps, country.get('data', []))
    ]

    # Build the frame once; keep the column layout even without any records.
    return pd.DataFrame(records, columns=['timestamp', 'country', 'value'])

def insert_cbet_data_to_db(df_cbet, table_name, conn):
    """Create the CBET table if needed and bulk-insert the given rows.

    Missing values are replaced by 0 before the insert. Insert errors are
    printed and rolled back rather than raised; the cursor is always closed.

    Args:
        df_cbet: DataFrame with the columns ``timestamp``, ``country``, ``value``.
        table_name: Target table; interpolated into the SQL text.
        conn: Open database connection.
    """
    def _sql_type(column):
        # Column types are chosen purely by column name.
        if column == 'timestamp':
            return 'timestamp'
        if column == 'country':
            return 'text'
        return 'float'

    cursor = conn.cursor()
    frame = df_cbet.fillna(0)

    ddl_columns = ', '.join(f"{column} {_sql_type(column)}" for column in frame.columns)
    cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_name} ({ddl_columns});")

    insert_sql = f"""
    INSERT INTO {table_name} (timestamp, country, value)
    VALUES %s
    """
    rows = []
    for _, record in frame.iterrows():
        rows.append((record['timestamp'], record['country'], record['value']))

    try:
        execute_values(cursor, insert_sql, rows)
        conn.commit()
        print("CBET-Data successfully implemented.")
    except Exception as err:
        print(f"Error in implementing CBET-Data: {err}")
        conn.rollback()
    finally:
        cursor.close()

def lambda_handler(event, context):
    """Load yesterday's Swiss power, price and CBET data into the database and S3.

    The three energy-charts endpoints are queried for the previous day, the
    responses are reshaped into DataFrames, appended to their tables and a
    JSON copy of each frame is written to the bucket.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict: ``statusCode`` 200 after a successful import, 500 when
        connecting or uploading failed.
    """
    date = dt.datetime.today().date()
    start_date = str(date - dt.timedelta(days=1))

    api_result_power = requests.get(f'https://api.energy-charts.info/public_power?country=ch&start={start_date}')
    api_result_prices = requests.get(f'https://api.energy-charts.info/price?bzn=CH&start={start_date}')

    api_response_power = api_result_power.json()
    api_response_prices = api_result_prices.json()

    cbet_json = fetch_cbet_data(start_date)

    # One column per production type, one row per timestamp.
    api_df_power = pd.DataFrame(api_response_power["production_types"])
    api_df_power_t = pd.DataFrame(api_df_power['data'].tolist()).T
    api_df_power_t.columns = api_df_power['name']
    api_df_power_t.index = pd.to_datetime(api_response_power["unix_seconds"], unit='s')

    api_df_price = pd.DataFrame(api_response_prices["price"])
    api_df_price['Unit'] = api_response_prices["unit"]
    api_df_price.columns = ['Price', 'Unit']
    api_df_price.index = pd.to_datetime(api_response_prices["unix_seconds"], unit='s')

    api_df_cbet = extract_cbet_data(cbet_json)

    try:
        print("Connecting to DB & Bucket...")
        conn = psycopg2.connect(
            host=endpoint,
            dbname=db_name,
            user=username,
            password=password
        )
        try:
            conn.set_session(autocommit=True)
            print("Connection to DB successful.")

            s3 = boto3.client(
                's3',
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key
            )
            print("Connection to Bucket successful.")

            # Upload DataFrames
            upload_dataframe_to_db(api_df_power_t, "tbl_energy_production_data", conn)
            upload_dataframe_to_db(api_df_price, "tbl_energy_price_data", conn)
            insert_cbet_data_to_db(api_df_cbet, "tbl_energy_cbet_data", conn)

            upload_dataframe_to_bucket(api_df_power_t, "energy_production", s3, bucket_name)
            upload_dataframe_to_bucket(api_df_price, "energy_price", s3, bucket_name)
            upload_dataframe_to_bucket(api_df_cbet, "energy_cbet", s3, bucket_name)
        finally:
            # Close the connection on failure as well, not only on success.
            conn.close()
        print("Data uploaded and connection closed.")

    except Exception as e:
        print("Error:", e)
        # Report the failure instead of claiming a successful import.
        return {
            'statusCode': 500,
            'body': json.dumps(f"Import failed: {e}")
        }

    return {
        'statusCode': 200,
        'body': json.dumps("Inport succesfully")
    }


### group2_weather_api

In [None]:
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import psycopg2
from psycopg2.extras import execute_values
import boto3
import os
# Env Variable
# Raw settings from the Lambda environment (None when a variable is unset).
var_apikey = os.environ.get("var_apikey")
var_database = os.environ.get("var_database")
var_database2 = os.environ.get("var_database2")
var_host = os.environ.get("var_host")
var_host2 = os.environ.get("var_host2")
var_password = os.environ.get("var_password")
var_password2 = os.environ.get("var_password2")
var_port = os.environ.get("var_port")
var_user = os.environ.get("var_user")
var_bucketname = os.environ.get("var_bucketname")
var_aws_access_key_id = os.environ.get("var_aws_access_key_id")
var_aws_secret_access_key = os.environ.get("var_aws_secret_access_key")

# === PARAMETER ============================================
DAYS_BACK = 1  # number of days to load, counted backwards from yesterday

# Configuration for the bucket and the two databases. Both databases share
# the same port and user; host, database name and password differ.
BUCKET_NAME = var_bucketname
DB_CONFIG = dict(
    host=var_host,
    port=var_port,
    database=var_database,
    user=var_user,
    password=var_password,
)
DB_CONFIG2 = dict(
    host=var_host2,
    port=var_port,
    database=var_database2,
    user=var_user,
    password=var_password2,
)
API_KEY = var_apikey
AWS_ACCESS_KEY_ID = var_aws_access_key_id
AWS_SECRET_ACCESS_KEY = var_aws_secret_access_key

# === FUNCTIONS ==============================================
#Connecting to database
def connect_to_db():
    """Open and return a new psycopg2 connection built from ``DB_CONFIG``.

    The caller is responsible for closing the returned connection.
    """
    return psycopg2.connect(**DB_CONFIG)
def connect_to_db2():
    """Open and return a new psycopg2 connection built from ``DB_CONFIG2``.

    The caller is responsible for closing the returned connection.
    """
    return psycopg2.connect(**DB_CONFIG2)

#Reading in location data from dim_locations
def get_city_data():
    """Return the ``city``, ``lat`` and ``long`` columns of ``dim_locations``.

    A connection to the second database is opened for the query and closed
    again before returning, also when the query fails.
    """
    connection = connect_to_db2()
    try:
        return pd.read_sql_query("SELECT city, lat, long FROM dim_locations;", connection)
    finally:
        connection.close()

#Getting data from API
def fetch_weather_data(city, lat, lon, start, end, api_key, timeout=30):
    """Request weather history for one location from OpenWeatherMap.

    Args:
        city: Location name, used only in the error message.
        lat: Latitude sent to the API.
        lon: Longitude sent to the API.
        start: Start of the requested period, sent as the ``start`` parameter.
        end: End of the requested period, sent as the ``end`` parameter.
        api_key: API key sent as ``appid``.
        timeout: Seconds to wait for the server, so that one stalled request
            cannot block the whole run.

    Returns:
        The decoded JSON body, or ``None`` when the status code is not 200.

    Raises:
        requests.RequestException: On network failures, including timeouts.
    """
    url = f"https://history.openweathermap.org/data/2.5/history/city?lat={lat}&lon={lon}&type=hour&start={start}&end={end}&appid={api_key}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Error on {city}: {response.status_code}")
        return None
    return response.json()

#Loading data into AWS bucket
def upload_to_s3(s3_client, city, date_str, data):
    """Store one raw API response as a UTF-8 JSON object in ``BUCKET_NAME``.

    The object key is ``weather_data/weatherdata_<city lower-cased>_<date_str>.json``.
    """
    object_key = "weather_data/weatherdata_{}_{}.json".format(city.lower(), date_str)
    # ensure_ascii=False keeps non-ASCII characters readable in the stored file.
    payload = json.dumps(data, ensure_ascii=False).encode('utf-8')
    s3_client.put_object(
        Bucket=BUCKET_NAME,
        Key=object_key,
        Body=payload,
        ContentType='application/json',
    )
#Extracting data from received json of API

def extract_weather_data(city, weather_json):
    """Flatten the entries under ``list`` of an API response into row dicts.

    Args:
        city: Name stored in the ``City`` field of every row.
        weather_json: Decoded API response; a missing ``list`` key yields
            an empty result.

    Returns:
        list[dict]: One dictionary per entry. Missing ``wind.speed`` and
        ``rain.1h`` default to 0.0, missing ``clouds.all`` to 0.
    """
    def _flatten(entry):
        # ``dt`` is an epoch timestamp; it is stored as a naive UTC datetime.
        observed_at = datetime.utcfromtimestamp(entry["dt"])
        main = entry["main"]
        wind = entry.get("wind", {})
        condition = entry["weather"][0]
        rain_last_hour = entry.get("rain", {}).get("1h", 0.0)
        cloud_cover = entry.get("clouds", {}).get("all", 0)
        return {
            'City': city,
            'datetime': observed_at,
            'temp': main['temp'],
            'pressure': main['pressure'],
            'humidity': main['humidity'],
            'temp_min': main['temp_min'],
            'temp_max': main['temp_max'],
            'wind_speed': wind.get('speed', 0.0),
            'weather_main': condition['main'],
            'weather_description': condition['description'],
            'rain_1h': rain_last_hour,
            'clouds_all': cloud_cover,
        }

    return [_flatten(entry) for entry in weather_json.get("list", [])]

#writing data into datalake database
def insert_weather_data(conn, df_weather):
    """Bulk-insert the weather rows into ``tbl_weather_data``.

    Missing values are replaced by 0 before the insert. Insert errors are
    printed and rolled back rather than raised; the cursor is always closed.

    Args:
        conn: Open database connection.
        df_weather: DataFrame with the fields produced by
            ``extract_weather_data``.
    """
    # DataFrame fields in the order of the INSERT column list below.
    fields = [
        'City', 'datetime', 'temp', 'pressure', 'humidity',
        'temp_min', 'temp_max', 'wind_speed',
        'weather_main', 'weather_description', 'rain_1h', 'clouds_all',
    ]

    cursor = conn.cursor()
    filled = df_weather.fillna(0)
    rows = [
        tuple(record[field] for field in fields)
        for _, record in filled.iterrows()
    ]
    insert_sql = """
    INSERT INTO tbl_weather_data (
        city, datetime, temp, pressure, humidity,
        temp_min, temp_max, wind_speed,
        weather_main, weather_description, rain_1h, clouds_all
    ) VALUES %s
    """
    try:
        execute_values(cursor, insert_sql, rows)
        conn.commit()
        print("Data successfully inserted.")
    except Exception as err:
        print(f"Error during inserting data: {err}")
        conn.rollback()
    finally:
        cursor.close()

# === MAIN LAMBDA HANDLER ====================================
def lambda_handler(event, context):
    """Fetch weather history for every location and store it in S3 and the database.

    For each of the last ``DAYS_BACK`` days (ending yesterday) and each row
    returned by ``get_city_data``, the raw API response is uploaded to the
    bucket and its entries are collected. All collected rows are then
    inserted into the database in one batch.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda runtime context (unused).
    """
    df_coords = get_city_data()

    # Opening bucket connection
    s3 = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )
    end_date = datetime.utcnow() - timedelta(days=1)
    all_data = []

    # One pass per requested day (normally just yesterday; raise DAYS_BACK
    # to backfill historical data).
    for days_ago in range(DAYS_BACK):
        target_day = end_date - timedelta(days=days_ago)
        # NOTE(review): these datetimes are naive, so .timestamp() interprets
        # them in the runtime's local time zone - confirm that this is UTC.
        start = int(datetime(target_day.year, target_day.month, target_day.day, 0, 0).timestamp())
        end = int(datetime(target_day.year, target_day.month, target_day.day, 23, 59).timestamp())
        date_str = target_day.strftime('%Y%m%d')

        # One request per location from dim_locations.
        for _, row in df_coords.iterrows():
            city = row["city"]
            lat = row["lat"]
            lon = row["long"]

            weather_json = fetch_weather_data(city, lat, lon, start, end, API_KEY)
            if not weather_json:
                # Failed request: skip this location, keep the others.
                continue

            upload_to_s3(s3, city, date_str, weather_json)
            all_data.extend(extract_weather_data(city, weather_json))

    df_weather = pd.DataFrame(all_data)
    if df_weather.empty:
        print("No weather data available.")
        return

    # Inserting data into datalake database
    conn = None  # must exist for the finally block when connecting fails
    try:
        conn = connect_to_db()
        insert_weather_data(conn, df_weather)
    except Exception as e:
        print(f"Connection error: {e}")
    finally:
        if conn is not None:
            conn.close()



### group2_weather_sun_api

In [None]:
import json
import pandas as pd
import datetime as dt
import requests
import psycopg2
import os
import boto3
from psycopg2.extras import execute_values
from zoneinfo import ZoneInfo
from typing import Optional, Dict, Any

# Load environment variables for DB and AWS credentials
# Every lookup yields None when the variable is not set in the environment.
endpoint: Optional[str] = os.environ.get('ENDPOINT')
db_name: Optional[str] = os.environ.get('DB_NAME')
username: Optional[str] = os.environ.get('USERNAME')
password: Optional[str] = os.environ.get('PASSWORD')
aws_access_key_id: Optional[str] = os.environ.get('AWS_ACCESS_KEY_ID_INTERNAL')
aws_secret_access_key: Optional[str] = os.environ.get('AWS_SECRET_ACCESS_KEY_INTERNAL')
bucket_name: Optional[str] = os.environ.get('BUCKET_NAME')

# Connection settings for the secondary database; it uses the same
# USERNAME variable as the primary connection.
DB_CONFIG2: Dict[str, Optional[str]] = dict(
    host=os.environ.get("var_host2"),
    port=os.environ.get("var_port"),
    database=os.environ.get("var_database2"),
    user=username,
    password=os.environ.get("var_password2"),
)

# Upload a DataFrame to a PostgreSQL table with upsert logic
def upload_dataframe_to_db(df: pd.DataFrame, table_name: str, conn: psycopg2.extensions.connection) -> None:
    """Upsert the sun data in ``df`` into ``table_name``.

    The table is created on first use with ``(city, date)`` as primary key.
    Rows whose key already exists are overwritten with the new values.

    Args:
        df: DataFrame containing at least the eleven columns listed in
            ``cols`` below.
        table_name: Target table; interpolated into the SQL text, so it must
            come from trusted code.
        conn: Open psycopg2 connection. No commit is issued here.
    """
    # SQL to create table if it does not exist
    CREATE_TABLE_SQL = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        city         text          NOT NULL,
        date         date          NOT NULL,
        sunrise      timestamp     NOT NULL,
        sunset       timestamp     NOT NULL,
        first_light  timestamp     NOT NULL,
        last_light   timestamp     NOT NULL,
        dawn         timestamp     NOT NULL,
        dusk         timestamp     NOT NULL,
        solar_noon   timestamp     NOT NULL,
        golden_hour  timestamp     NOT NULL,
        day_length   interval      NOT NULL,
        PRIMARY KEY (city, date)
    );
    """

    # UPSERT statement using ON CONFLICT
    INSERT_SQL = f"""
    INSERT INTO {table_name} (
        city, date, sunrise, sunset, first_light, last_light,
        dawn, dusk, solar_noon, golden_hour, day_length
    ) VALUES %s
    ON CONFLICT (city, date) DO UPDATE SET
        sunrise      = EXCLUDED.sunrise,
        sunset       = EXCLUDED.sunset,
        first_light  = EXCLUDED.first_light,
        last_light   = EXCLUDED.last_light,
        dawn         = EXCLUDED.dawn,
        dusk         = EXCLUDED.dusk,
        solar_noon   = EXCLUDED.solar_noon,
        golden_hour  = EXCLUDED.golden_hour,
        day_length   = EXCLUDED.day_length;
    """

    cols = [
        "city", "date", "sunrise", "sunset", "first_light", "last_light",
        "dawn", "dusk", "solar_noon", "golden_hour", "day_length"
    ]

    cur = conn.cursor()
    try:
        cur.execute(CREATE_TABLE_SQL)
        # Restrict to the table's columns and replace missing values by None.
        rows = df[cols].where(pd.notnull(df), None).to_numpy().tolist()
        execute_values(cur, INSERT_SQL, rows, page_size=500)
    finally:
        # Release the cursor even when a statement fails.
        cur.close()

# Upload DataFrame to S3 bucket as JSON
def upload_dataframe_to_bucket(df: pd.DataFrame, foldername: str, s3: Any, bucket_name: str) -> None:
    """Write ``df`` to S3 as a JSON array of records.

    The object key is ``<foldername>/<foldername>_<YYYY-MM-DD>.json`` built
    from today's date; a later upload on the same day replaces the object.

    Args:
        df: DataFrame to export; its index is included as a column.
        foldername: Key prefix and file name stem.
        s3: Client object providing ``put_object`` (boto3 S3 client).
        bucket_name: Name of the target bucket.
    """
    today = str(dt.datetime.today().date())
    key = f"{foldername}/{foldername}_{today}.json"  # S3 key/path

    # to_json() already produces the final JSON text. A second json.dumps()
    # pass would store the whole document as one escaped string.
    payload = df.reset_index().to_json(orient="records", date_format="iso")

    s3.put_object(
        Bucket=bucket_name,
        Key=key,
        Body=payload.encode("utf-8"),
        ContentType='application/json'
    )

# Connect to secondary PostgreSQL DB
def connect_to_db2() -> psycopg2.extensions.connection:
    """Open and return a new psycopg2 connection built from ``DB_CONFIG2``.

    The caller is responsible for closing the returned connection.
    """
    return psycopg2.connect(**DB_CONFIG2)

# Get city name and coordinates from `dim_locations`
def get_city_data() -> pd.DataFrame:
    """Return the ``city``, ``lat`` and ``long`` columns of ``dim_locations``.

    The secondary database connection is opened for this query only and is
    closed again before returning, also when the query fails.
    """
    connection = connect_to_db2()
    try:
        return pd.read_sql_query("SELECT city, lat, long FROM dim_locations;", connection)
    finally:
        connection.close()

# Fetch sunrise/sunset JSON data for one city
def fetch_sun_data(city: str, lat: float, lng: float, start: str, timeout: float = 30) -> Optional[Dict[str, Any]]:
    """Request sunrise/sunset data for one location and date.

    Args:
        city: Location name, used only in the error message.
        lat: Latitude sent to the API.
        lng: Longitude sent to the API.
        start: Date sent to the API as the ``date`` parameter.
        timeout: Seconds to wait for the server, so that one stalled request
            cannot block the whole run.

    Returns:
        The decoded JSON body, or ``None`` when the status code is not 200.

    Raises:
        requests.RequestException: On network failures, including timeouts.
    """
    url = f"https://api.sunrisesunset.io/json?lat={lat}&lng={lng}&date={start}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Error on {city}: {response.status_code}")
        return None
    return response.json()

# Normalize JSON API result into a single-row DataFrame
def extract_sun_data(city: str, sun_json: Dict[str, Any]) -> pd.DataFrame:
    """Turn the ``results`` object of an API response into a DataFrame.

    The ``date`` field is parsed into a datetime column and a ``city``
    column holding the given name is appended.
    """
    flattened = pd.json_normalize(sun_json["results"])
    return flattened.assign(
        date=lambda frame: pd.to_datetime(frame["date"]),
        city=city,
    )

# Lambda entry point for AWS execution
def lambda_handler(event: dict, context: Any) -> Dict[str, Any]:
    """Load yesterday's sun data for every location into the database and S3.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict: ``statusCode`` 200 after a successful upload or when no data
        was available, 500 when connecting or uploading failed.
    """
    # Step 1: Retrieve city data (name + coordinates)
    df_coords = get_city_data()

    # Step 2: Request yesterday's sun data for each city. The date is fixed
    # once so that all cities are queried for the same day.
    target_date = str(dt.datetime.now().date() - dt.timedelta(days=1))
    frames = []
    for _, row in df_coords.iterrows():
        city = row["city"]
        sun_json = fetch_sun_data(city, row["lat"], row["long"], start=target_date)
        if sun_json:
            frames.append(extract_sun_data(city, sun_json))

    if not frames:
        # Without any rows there is no "date" column to convert below.
        print("No sun data available.")
        return {
            'statusCode': 200,
            'body': json.dumps('No sun data available.')
        }

    all_sun_data = pd.concat(frames, ignore_index=True)

    # Step 3: Combine date and clock time into timestamps labelled as UTC.
    # NOTE(review): the values are only tagged as UTC, not converted -
    # confirm that the API really reports these times in UTC.
    time_cols = [
        "sunrise", "sunset", "first_light", "last_light",
        "dawn", "dusk", "solar_noon", "golden_hour"
    ]
    for col in time_cols:
        all_sun_data[col] = pd.to_datetime(
            all_sun_data["date"].dt.strftime("%Y-%m-%d") + " " + all_sun_data[col],
            format="%Y-%m-%d %I:%M:%S %p"
        ).dt.tz_localize(ZoneInfo("UTC"))

    # Step 4: Convert duration string to timedelta
    all_sun_data["day_length"] = pd.to_timedelta(all_sun_data["day_length"])

    # Step 5: Connect to PostgreSQL and S3
    conn = None
    try:
        print("Connecting to DB & Bucket...")
        conn = psycopg2.connect(
            host=endpoint,
            dbname=db_name,
            user=username,
            password=password
        )
        conn.set_session(autocommit=True)
        print("Connection to DB successful.")

        s3 = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        )
        print("Connection to Bucket successful.")

        # Step 6: Upload to PostgreSQL
        upload_dataframe_to_db(all_sun_data, "tbl_weather_sun_data", conn)
        print("Data uploaded to DB successfully.")

        # Step 7: Upload to S3 bucket
        upload_dataframe_to_bucket(all_sun_data, "weather_sun", s3, bucket_name)
        print("Data uploaded to Bucket successfully.")

    except Exception as e:
        print("Error:", e)
        # Report the failure instead of returning the success response.
        return {
            'statusCode': 500,
            'body': json.dumps(f"Import failed: {e}")
        }
    finally:
        # The connection was previously never closed.
        if conn is not None:
            conn.close()

    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }


### group2_currency_api

In [None]:
import json
import requests
import psycopg2
import boto3
import os
from datetime import datetime

# Load sensitive configuration values from environment variables.
# This keeps credentials and connection details out of the codebase.
# All six settings are mandatory: a missing variable raises KeyError while
# the module is being loaded.
(
    ENDPOINT,
    DB_NAME,
    USERNAME,
    PASSWORD,
    S3_BUCKET,
    API_KEY,
) = (
    os.environ[name]
    for name in ('ENDPOINT', 'DB_NAME', 'USERNAME', 'PASSWORD', 'S3_BUCKET', 'API_KEY')
)

def fetch_currency_data(source_currency="CHF", target_currencies="EUR", timeout=30):
    """Fetch live exchange rates from the apilayer currency API.

    Args:
        source_currency: Currency code sent as ``source``.
        target_currencies: Value sent as ``currencies``.
        timeout: Seconds to wait for the server, so that a stalled request
            cannot block until the Lambda itself times out.

    Returns:
        dict: The decoded JSON body on HTTP 200, otherwise
        ``{"error": <message>}``.

    Raises:
        requests.RequestException: On network failures, including timeouts.
    """
    # Ensure API key is available
    if not API_KEY:
        return {"error": "API key not found"}

    # Build API request URL
    base_url = "https://api.apilayer.com/currency_data/"
    endpoint = "live"
    api_url = f"{base_url}{endpoint}?source={source_currency}&currencies={target_currencies}"
    headers = {"apikey": API_KEY}

    response = requests.get(api_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return {"error": response.text}
    return response.json()

def store_currency_data_in_rds(currency_data):
    """Store live exchange rates in the ``tbl_currency_data`` table.

    The table is created on first use. Every entry of
    ``currency_data["quotes"]`` becomes one row, stamped with the current
    local time of the runtime; rows whose key already exists are skipped.
    Database errors are printed, not raised.

    Args:
        currency_data: Decoded API response; ``quotes`` maps a six-letter
            pair such as ``CHFEUR`` to its rate.
    """
    conn = None
    try:
        print("Connecting to DB...")

        # Establish DB connection using psycopg2
        conn = psycopg2.connect(
            host=ENDPOINT,
            dbname=DB_NAME,
            user=USERNAME,
            password=PASSWORD
        )
        conn.set_session(autocommit=True)
        print("DB connection successful.")

        # Define the table schema and columns
        table_name = "tbl_currency_data"
        columns = {
            "timestamp": "TIMESTAMP NOT NULL",
            "source_currency": "VARCHAR(10)",
            "target_currency": "VARCHAR(10)",
            "exchange_rate": "FLOAT"
        }

        # Create table dynamically if it doesn't exist
        create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ("
        create_table_query += ", ".join([f"{col} {datatype}" for col, datatype in columns.items()])
        create_table_query += ", PRIMARY KEY (timestamp, source_currency, target_currency));"

        # Prepare INSERT statement with conflict handling
        insert_query = f"""
        INSERT INTO {table_name} ({', '.join(columns.keys())})
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (timestamp, source_currency, target_currency) DO NOTHING;
        """

        # Get current timestamp in milliseconds (trimmed to 3 decimals)
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

        cur = conn.cursor()
        try:
            cur.execute(create_table_query)

            # The first three letters of a pair are the source currency,
            # the rest is the target currency.
            for pair, rate in currency_data.get("quotes", {}).items():
                source_currency, target_currency = pair[:3], pair[3:]
                cur.execute(
                    insert_query,
                    (now, source_currency, target_currency, rate)
                )
        finally:
            cur.close()

    except psycopg2.Error as e:
        print("Database error:", e)
    finally:
        # Close the connection on the error path too.
        if conn is not None:
            conn.close()

def convert_unix_to_iso(timestamp):
    """Convert a Unix timestamp to an ISO 8601 string in UTC.

    The result carries no offset suffix, e.g. ``1970-01-01T00:00:00``.
    """
    # Local import: the file only imports ``datetime`` from this module.
    from datetime import timezone

    # utcfromtimestamp() is deprecated since Python 3.12. Convert with an
    # explicit UTC zone and drop it again to keep the same output format.
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None).isoformat()

def preprocess_currency_data(data):
    """Replace the Unix ``timestamp`` of ``data`` by its ISO 8601 string.

    The dictionary is changed in place and also returned. Data without a
    ``timestamp`` key is returned untouched.
    """
    if "timestamp" not in data:
        return data

    iso_value = convert_unix_to_iso(data["timestamp"])
    data["timestamp"] = iso_value
    return data

def store_json_to_s3(currency_data):
    """Upload the currency data as indented JSON to ``S3_BUCKET``.

    The object key is ``currency_data/currency_data_<YYYY-MM-DD>.json`` using
    today's date. The timestamp of ``currency_data`` is converted to ISO
    format by ``preprocess_currency_data`` before the upload.
    """
    client = boto3.client('s3')

    # One object per day: the key is derived from today's date.
    folder = "currency_data"
    object_key = "{0}/{0}_{1}.json".format(folder, datetime.today().date())

    body = json.dumps(preprocess_currency_data(currency_data), indent=2)

    client.put_object(
        Bucket=S3_BUCKET,
        Key=object_key,
        Body=body,
        ContentType='application/json',
    )

def lambda_handler(event, context):
    """AWS Lambda entry point: fetch live rates and store them in RDS and S3.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict: ``statusCode`` 400 with the error message when the fetch
        failed, otherwise 200 with the stored rates and the S3 location.
        Database errors are only printed by ``store_currency_data_in_rds``
        and therefore do not change the response.
    """
    # Step 1: Fetch exchange rate data from external API
    currency_data = fetch_currency_data()

    # Step 2: Return early if fetch failed
    if "error" in currency_data:
        return {
            "statusCode": 400,
            "body": json.dumps({"error": currency_data["error"]}, indent=2)
        }

    # Step 3: Store data in PostgreSQL RDS
    store_currency_data_in_rds(currency_data)

    # Step 4: Store JSON data in S3
    store_json_to_s3(currency_data)

    # Step 5: Return success response. The path must name the object that
    # store_json_to_s3 writes: currency_data/currency_data_<date>.json.
    s3_key = f"currency_data/currency_data_{datetime.today().strftime('%Y-%m-%d')}.json"
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": "Live exchange rates stored successfully in RDS and S3.",
            "s3_path": f"s3://{S3_BUCKET}/{s3_key}",
            "currency_rates": currency_data
        }, indent=2)
    }

### group2_historical_data_currency

In [None]:
# ===================================================================================
# AWS Lambda Function: Historical Currency Data Pipeline
# Description: Fetches historical exchange rates from an external API and stores
#              the data in an RDS PostgreSQL database and in an S3 bucket as JSON.
# Technologies: AWS Lambda, S3, RDS (PostgreSQL), apilayer API
# Author: Joshua Hügli
# ===================================================================================

import json
import requests
import psycopg2
import boto3
import os
from datetime import datetime

# ------------------------------------------------------------------------------
# Environment Configuration
# Load sensitive configuration values from Lambda environment variables.
# os.environ[...] raises KeyError while the module is being imported if a
# variable is not set, so every name below must be configured for the
# function to load at all.
# ------------------------------------------------------------------------------

ENDPOINT = os.environ['ENDPOINT']       # RDS database host
DB_NAME = os.environ['DB_NAME']         # Database name
USERNAME = os.environ['USERNAME']       # Database username
PASSWORD = os.environ['PASSWORD']       # Database password
S3_BUCKET = os.environ['S3_BUCKET']     # S3 bucket name for JSON export
API_KEY = os.environ['API_KEY']         # API key for currency data provider


# ------------------------------------------------------------------------------
# Fetch Historical Currency Data
# ------------------------------------------------------------------------------

def fetch_historical_currency_data(start_date="2024-01-01", end_date="2025-04-29", source_currency="CHF", target_currencies="EUR"):
    """
    Fetch historical exchange rate data from the apilayer API.

    Args:
        start_date (str): Start date in YYYY-MM-DD format.
        end_date (str): End date in YYYY-MM-DD format.
        source_currency (str): The base currency (default: CHF).
        target_currencies (str): Comma-separated target currencies (default: EUR).

    Returns:
        dict: The decoded JSON response on HTTP 200. In every failure case
        (empty API key, network error or timeout, non-200 status, body that is
        not valid JSON) a dict of the form ``{"error": <message>}`` is returned
        instead of raising, so callers can keep testing for the "error" key.
    """
    if not API_KEY:
        return {"error": "API key not found"}

    url = "https://api.apilayer.com/currency_data/timeframe"
    # Let requests build and percent-encode the query string.
    params = {
        "start_date": start_date,
        "end_date": end_date,
        "source": source_currency,
        "currencies": target_currencies,
    }
    headers = {"apikey": API_KEY}

    try:
        # Without a timeout a stalled connection would block until the Lambda
        # itself is killed.
        response = requests.get(url, headers=headers, params=params, timeout=30)
    except requests.RequestException as exc:
        return {"error": f"Request to currency API failed: {exc}"}

    if response.status_code != 200:
        return {"error": response.text}

    try:
        return response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        return {"error": "Currency API returned a non-JSON response"}


# ------------------------------------------------------------------------------
# Store Historical Currency Data in RDS
# ------------------------------------------------------------------------------

def store_historical_currency_data_in_rds(currency_data):
    """
    Store historical exchange rates in a PostgreSQL RDS instance.

    Creates ``tbl_currency_data`` if it does not exist and inserts one row per
    date and currency pair found under ``currency_data["quotes"]``. Rows whose
    (timestamp, source_currency, target_currency) key already exists are
    skipped via ``ON CONFLICT ... DO NOTHING``.

    Database errors are printed and swallowed (best effort); the connection is
    closed in every case.

    Args:
        currency_data (dict): JSON object with exchange rates from API.
    """
    # Define schema
    table_name = "tbl_currency_data"
    columns = {
        "timestamp": "TIMESTAMP NOT NULL",
        "source_currency": "VARCHAR(10)",
        "target_currency": "VARCHAR(10)",
        "exchange_rate": "FLOAT"
    }

    column_defs = ", ".join(f"{col} {datatype}" for col, datatype in columns.items())
    create_table_query = (
        f"CREATE TABLE IF NOT EXISTS {table_name} ("
        f"{column_defs}"
        ", PRIMARY KEY (timestamp, source_currency, target_currency));"
    )

    insert_query = f"""
        INSERT INTO {table_name} ({', '.join(columns.keys())})
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (timestamp, source_currency, target_currency) DO NOTHING;
        """

    # Flatten {"YYYY-MM-DD": {"CHFEUR": rate, ...}, ...} into insertable rows.
    # The pair key is split into its two 3-letter currency codes.
    rows = []
    for date_str, rates in currency_data.get("quotes", {}).items():
        timestamp = datetime.strptime(date_str, "%Y-%m-%d")
        for pair, rate in rates.items():
            rows.append((timestamp, pair[:3], pair[3:], rate))

    conn = None
    try:
        print("Connecting to DB...")
        conn = psycopg2.connect(
            host=ENDPOINT,
            dbname=DB_NAME,
            user=USERNAME,
            password=PASSWORD
        )
        conn.set_session(autocommit=True)
        print("DB connection successful.")

        with conn.cursor() as cur:
            cur.execute(create_table_query)
            if rows:
                cur.executemany(insert_query, rows)

    except psycopg2.Error as e:
        print("Database error:", e)

    finally:
        # Previously the connection was only closed on the success path and
        # leaked whenever a statement raised.
        if conn is not None:
            conn.close()

# ------------------------------------------------------------------------------
# Store JSON Data in S3
# ------------------------------------------------------------------------------

def store_json_to_s3(currency_data, start_date, end_date):
    """
    Upload the historical currency payload to S3 as a pretty-printed JSON object.

    The object is written to the bucket named by ``S3_BUCKET`` under
    ``currency_data/currency_data_<start_date>_to_<end_date>.json``.

    Args:
        currency_data (dict): JSON data from the API.
        start_date (str): Start date of the data range (used in the key).
        end_date (str): End date of the data range (used in the key).
    """
    client = boto3.client('s3')

    prefix = "currency_data"
    object_key = f"{prefix}/currency_data_{start_date}_to_{end_date}.json"
    payload = json.dumps(currency_data, indent=2)

    client.put_object(
        Bucket=S3_BUCKET,
        Key=object_key,
        Body=payload,
        ContentType='application/json'
    )


# ------------------------------------------------------------------------------
# AWS Lambda Entry Point
# ------------------------------------------------------------------------------

def lambda_handler(event, context):
    """
    Lambda entry point: fetch historical exchange rates and store them.

    The date range is taken from the event; when a bound is missing the
    hard-coded defaults below are used. The fetched data is written to RDS and
    to S3.

    Args:
        event (dict): Event payload with optional 'start_date' and 'end_date'.
        context (LambdaContext): Runtime information (not used).

    Returns:
        dict: HTTP-style response. 400 with the error message if the fetch
        failed, otherwise 200 with a message, the S3 path and the fetched data.
    """
    # Fall back to the default range when the event does not specify one
    start_date = event.get("start_date", "2025-05-22")
    end_date = event.get("end_date", "2025-05-23")

    currency_data = fetch_historical_currency_data(start_date=start_date, end_date=end_date)

    if "error" in currency_data:
        error_body = {"error": currency_data["error"]}
        return {"statusCode": 400, "body": json.dumps(error_body, indent=2)}

    store_historical_currency_data_in_rds(currency_data)
    store_json_to_s3(currency_data, start_date, end_date)

    success_body = {
        "message": "Historical exchange rates stored successfully in RDS and S3.",
        "s3_path": f"s3://{S3_BUCKET}/currency_data/currency_data_{start_date}_to_{end_date}.json",
        "currency_rates": currency_data
    }
    return {"statusCode": 200, "body": json.dumps(success_body, indent=2)}


## Lambda Functions for Data Processing in Datawarehouse

### group2_dwh_fact_energy_production

In [None]:
import requests
import pandas as pd
import json
from datetime import date, datetime, timedelta
import psycopg2
from psycopg2.extras import execute_values
import boto3
import os

# Connection settings for both databases, read from environment variables.
# Unset variables yield None (os.getenv does not raise).
# DB_CONFIG is used for the data lake connection, DB_CONFIG2 for the data
# warehouse; the second set reuses "var_port" and "var_user" from the first.
DB_CONFIG = dict(
    host=os.getenv("var_host"),
    port=os.getenv("var_port"),
    database=os.getenv("var_database"),
    user=os.getenv("var_user"),
    password=os.getenv("var_password"),
)
DB_CONFIG2 = dict(
    host=os.getenv("var_host2"),
    port=os.getenv("var_port"),
    database=os.getenv("var_database2"),
    user=os.getenv("var_user"),
    password=os.getenv("var_password2"),
)
def lambda_handler(event, context):
    """Load one day of energy production data from the data lake into the warehouse.

    Reads the rows of ``tbl_energy_production_data`` whose timestamp falls on
    the day two days before today, attaches the matching ``dim_time.time_id``
    and the ``country_id`` of Switzerland, and appends every row whose
    ``time_id`` is not yet present to ``fact_energy_production``.

    Args:
        event: Lambda event payload (not used).
        context: Lambda runtime context (not used).

    Returns:
        dict: ``statusCode`` 200 with a message saying whether rows were
        inserted, or ``statusCode`` 500 with the error text if anything failed.
    """
    conn1 = None
    conn2 = None
    try:
        # Establish database connections (conn1: data lake, conn2: data warehouse)
        conn1 = psycopg2.connect(**DB_CONFIG)
        conn2 = psycopg2.connect(**DB_CONFIG2)
        cur2 = conn2.cursor()

        # Load window: the full day two days before today, [start_date, end_date)
        start_date = date.today() - timedelta(days=2)
        end_date = start_date + timedelta(days=1)

        # Query data from the data lake
        query_energy = f"""
            SELECT *
            FROM tbl_energy_production_data
            WHERE timestamp >= '{start_date}'::date
            AND timestamp < '{end_date}'::date
        """
        df_energy = pd.read_sql_query(query_energy, conn1)

        # Load dimension tables from the data warehouse
        df_countries = pd.read_sql_query("SELECT * FROM dim_countries", conn2)
        df_time = pd.read_sql_query("SELECT * FROM dim_time", conn2)

        # Merge with time dimension to get time_id
        df_merged = df_energy.merge(
            df_time[['time_id', 'timestamp_utc']],
            left_on='timestamp',
            right_on='timestamp_utc',
            how='left'
        )

        # Select and rename relevant columns
        df_fact = df_merged[[
            'time_id', 'timestamp', 'nuclear', 'solar', 'wind_onshore', 'load',
            'residual_load', 'hydro_run_of_river',
            'hydro_water_reservoir', 'hydro_pumped_storage'
        ]].copy()
        df_fact.rename(columns={
            'nuclear': 'nuclear_output',
            'solar': 'solar_output',
            'wind_onshore': 'wind_output',
            'hydro_run_of_river': 'hydro_run_of_river_output',
            'hydro_water_reservoir': 'hydro_water_reservoir_output',
            'hydro_pumped_storage': 'hydro_pumped_storage_output'
        }, inplace=True)

        # Add country_id for Switzerland (raises IndexError, handled below,
        # if dim_countries has no row with iso_code 'CH')
        switzerland_id = df_countries.loc[
            df_countries['iso_code'] == 'CH', 'country_id'
        ].values[0]
        df_fact.insert(
            loc=df_fact.columns.get_loc('time_id') + 1,
            column='country_id',
            value=switzerland_id
        )

        # Fetch existing time_ids from the target table to avoid duplicates
        cur2.execute("SELECT time_id FROM fact_energy_production")
        existing_time_ids = {row[0] for row in cur2.fetchall()}

        # Keep only records with time_ids not already in the table
        df_fact = df_fact[~df_fact['time_id'].isin(existing_time_ids)]

        # If no new data to insert, exit early
        if df_fact.empty:
            return {
                'statusCode': 200,
                'body': json.dumps('No new records to insert.')
            }

        # Get the current maximum production_id to generate unique keys
        cur2.execute("SELECT COALESCE(MAX(production_id), 0) FROM fact_energy_production")
        max_production_id = cur2.fetchone()[0]

        # Add production_id as a running index starting from the max + 1
        df_fact.insert(0, 'production_id', range(max_production_id + 1, max_production_id + 1 + len(df_fact)))

        # Prepare columns and values for insertion
        columns = [
            "production_id", "time_id", "country_id", "timestamp", "solar_output",
            "wind_output", "load", "residual_load", 'hydro_run_of_river_output',
            'hydro_water_reservoir_output', 'hydro_pumped_storage_output', 'nuclear_output'
        ]
        values = [tuple(x) for x in df_fact[columns].to_numpy()]

        # Create SQL INSERT statement
        insert_sql = f"""
        INSERT INTO fact_energy_production ({', '.join(columns)})
        VALUES %s
        """

        # Execute batch insert using execute_values for efficiency
        execute_values(cur2, insert_sql, values)

        conn2.commit()
        return {
            'statusCode': 200,
            'body': json.dumps('New records successfully inserted.')
        }
    except Exception as e:
        return {
            'statusCode': 500,
            'body': json.dumps(f'Error during processing: {str(e)}')
        }
    finally:
        # Close whatever was opened on every path (success, early return,
        # error); previously an exception left both connections open.
        for conn in (conn1, conn2):
            if conn is not None:
                conn.close()

### group2_dwh_fact_energy_trade

In [None]:
# STEP 1: Connect to Data Sources
def connect_to_db(endpoint: str, db_name: str, username: str, password: str) -> psycopg2.extensions.connection:
    """
    Open a PostgreSQL connection with autocommit enabled.

    Args:
        endpoint: Database host.
        db_name: Name of the database to connect to.
        username: Login user.
        password: Login password.

    Returns:
        The open psycopg2 connection, switched to autocommit mode.
    """
    credentials = {
        "host": endpoint,
        "dbname": db_name,
        "user": username,
        "password": password,
    }
    connection = psycopg2.connect(**credentials)
    connection.set_session(autocommit=True)
    return connection

# STEP 2: Utility for Extracting Data from DB
def get_data_from_db(
    conn: psycopg2.extensions.connection,
    table_name: str,
    expression: str = '*',
    condition: str = ''
) -> pd.DataFrame:
    """
    Run ``SELECT <expression> FROM <table_name> [WHERE <condition>];`` and
    return the result as a pandas DataFrame.

    The statement is assembled by plain string interpolation and printed
    before execution, so all three text arguments must come from trusted code.

    Args:
        conn: Open database connection.
        table_name: Table to read from.
        expression: Select list (default: all columns).
        condition: Optional WHERE clause body; omitted when empty.
    """
    where_clause = f" WHERE {condition}" if condition != '' else ''
    query = f"SELECT {expression} FROM {table_name}" + where_clause + ';'
    print("Query Performed: " + query)
    return pd.read_sql_query(query, conn)

# STEP 4: Calculate Sunshine Percentages by Hour
def calculate_sunshine_percentage(df_sun: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate, per city and day, the share of each clock hour that lies
    between sunrise and sunset.

    Args:
        df_sun: One row per city and date with the columns ``city``, ``date``,
            ``sunrise`` and ``sunset``. Sunrise/sunset may be ``"HH:MM:SS"``
            strings or objects exposing ``hour``/``minute``/``second``.

    Returns:
        Long-format DataFrame with the columns ``city``, ``sunshine_pct_hour``
        (0-100) and ``datetime`` (date plus hour offset); 24 rows per input
        row. An empty input yields an empty result.
    """

    def time_to_float(t) -> float:
        # Convert a time string or time-like object to fractional hours
        if isinstance(t, str):
            h, m, s = map(int, t.split(':'))
        else:
            h, m, s = t.hour, t.minute, t.second
        return h + m/60 + s/3600

    df = df_sun.copy()
    sunrise = df['sunrise'].apply(time_to_float).astype(float)
    sunset = df['sunset'].apply(time_to_float).astype(float)

    # Overlap of [sunrise, sunset] with [hour, hour + 1], computed for all rows
    # at once. A negative overlap means the hour lies entirely outside
    # daylight and is clipped to 0. This replaces a row-wise DataFrame.apply,
    # which was slow and failed on an empty frame.
    for hour in range(24):
        overlap = sunset.clip(upper=hour + 1) - sunrise.clip(lower=hour)
        df[f'sunshine_pct_hour_{hour}'] = overlap.clip(lower=0.0) * 100

    # Reshape into long format with datetime
    df = df.melt(
        id_vars=['city', 'date'],
        value_vars=[f'sunshine_pct_hour_{hour}' for hour in range(24)],
        var_name='hour',
        value_name='sunshine_pct_hour'
    )
    # Recover the hour number from the melted column name
    df['hour'] = df['hour'].str.extract(r'(\d+)', expand=False).astype(int)
    df['datetime'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    df = df.drop(['date', 'hour'], axis=1)
    return df

# STEP 9: Upload Final Data to Warehouse
def upload_to_db(conn: psycopg2.extensions.connection, table_name: str, df: pd.DataFrame) -> None:
    """
    Upload a DataFrame of weather facts to a database table.

    Creates the table (with the fact_weather column layout) if it does not
    exist, then inserts every row of ``df``.

    Args:
        conn: Open database connection.
        table_name: Target table name; interpolated into the SQL as-is.
        df: Rows to insert. The columns must be, in this order,
            weather_id, time_id, location_id, wind_speed, sunshine_minutes.
    """
    with conn.cursor() as cur:
        # The location dimension table is named dim_locations (plural); the
        # foreign key previously referenced a non-existent dim_location.
        cur.execute(f'''
            CREATE TABLE IF NOT EXISTS {table_name} (
                weather_id       INTEGER PRIMARY KEY,
                time_id          INTEGER REFERENCES dim_time(time_id),
                location_id      INTEGER REFERENCES dim_locations(location_id),
                wind_speed       FLOAT,
                sunshine_minutes FLOAT
            );
        ''')
        conn.commit()

    print(f"Table '{table_name}' created successfully.")

    insert_query = f'''
        INSERT INTO {table_name} (weather_id, time_id, location_id, wind_speed, sunshine_minutes)
        VALUES (%s, %s, %s, %s, %s)
    '''
    # Cast to object first: on numeric columns where(..., None) leaves NaN in
    # place, so missing values would not be sent as SQL NULL. The cast also
    # keeps integer ids as ints instead of upcasting the whole row to float.
    data_tuples = df.astype(object).where(pd.notnull(df), None).values.tolist()

    with conn.cursor() as cur:
        cur.executemany(insert_query, data_tuples)
        conn.commit()

    print(f"Table '{table_name}' created and data uploaded successfully.")

# Main ETL Handler
def lambda_handler(event=None, context=None):
    """
    Main ETL handler: load new weather rows from the data lake into fact_weather.

    Reads the latest time_id/weather_id already stored in the warehouse,
    downloads newer weather and sunshine data from the data lake, derives
    ``sunshine_minutes`` from cloud cover and daylight share, maps the rows to
    ``dim_locations``/``dim_time`` keys and uploads them.

    Args:
        event: Lambda event payload (not used). Optional so the handler can
            still be called without arguments.
        context: Lambda runtime context (not used).

    Returns:
        None. Errors are printed and swallowed.
    """
    dl_conn = None
    wh_conn = None
    try:
        # STEP 1: Connect to Data Lake and Warehouse
        print("Connecting to Datalake ")
        dl_conn = connect_to_db(dl_endpoint, dl_db_name, username, dl_password)
        print("Connection to Datalake successful.")

        print("Connecting to WareHouse ")
        wh_conn = connect_to_db(wh_endpoint, wh_db_name, username, wh_password)
        print("Connection to WareHouse successful.")

        # STEP 2: Identify Last Stored Data in Warehouse
        df_init_fact = get_data_from_db(wh_conn, 'fact_weather', 'max(time_id),max(weather_id)')
        df = get_data_from_db(wh_conn, 'dim_time', 'timestamp_utc', f'time_id = {df_init_fact.iloc[0,0]}')
        print("Last timestamp in warehouse: ", df.iloc[0,0])

        # STEP 3: Extract New Data from Data Lake
        df_sun = get_data_from_db(conn=dl_conn, table_name='tbl_weather_sun_data', condition= f"date >= '{df.iloc[0,0].date()}'")
        # STEP 4: Calculate Sunshine Percentages by Hour
        df_sun = calculate_sunshine_percentage(df_sun)
        print("Sunshine data downloaded.")

        df_weather = get_data_from_db(conn=dl_conn, table_name='tbl_weather_data', condition= f"datetime > '{df.iloc[0,0]}'")
        print("Weather data downloaded.")

        if df_weather.empty:
            print("No new weather data available.")
            return None
        else:
            print("New weather data available.")

        # STEP 5: Merge Weather and Sunshine
        df_ws = df_weather.merge(df_sun, on=['city', 'datetime'], how='inner')
        # (cloud-free share in %) * (daylight share of the hour in %) / 10000 * 60 minutes
        df_ws['sunshine_minutes'] = (100 - df_ws['clouds_all']) * df_ws['sunshine_pct_hour'] / 10000 * 60

        # STEP 6: Clean and Filter Data
        df_ws = df_ws.drop(['id','pressure','humidity','temp_min','temp_max','temp','weather_main','weather_description','rain_1h','clouds_all','sunshine_pct_hour'], axis=1)
        print("Data successfully processed.")

        # STEP 7a: Enrich with Location Info
        df_city = get_data_from_db(wh_conn, 'dim_locations')
        df_city_s = df_city[['location_id', 'city']]
        df_joined_wl = df_ws.merge(df_city_s, on=['city'], how='outer')
        df_joined_wl = df_joined_wl.drop(['city'], axis=1)

        # STEP 7b: Enrich with Time Info
        df_time = get_data_from_db(wh_conn, 'dim_time')
        # rename() returns a new frame, avoiding assignment to the columns of
        # a slice of df_time.
        df_time_s = df_time[['time_id', 'timestamp_utc']].rename(columns={'timestamp_utc': 'datetime'})
        df_joined_wld = df_joined_wl.merge(df_time_s, on=['datetime'], how='inner')
        df_joined_wld = df_joined_wld.drop(['datetime'], axis=1)

        # STEP 8: Assign Identifiers (continue after the current max weather_id)
        df_joined_wld['weather_id'] = df_joined_wld.index + df_init_fact.iloc[0,1] + 1

        # STEP 9: Load to Warehouse
        upload_to_db(wh_conn, 'fact_weather', df_joined_wld[['weather_id', 'time_id', 'location_id', 'wind_speed', 'sunshine_minutes']])

    except Exception as e:
        print("Error:", e)
        return None

    finally:
        # The connections were previously never closed.
        for conn in (dl_conn, wh_conn):
            if conn is not None:
                conn.close()

    return

### group2_dwh_fact_weather

In [None]:

def connect_to_db(endpoint: str, db_name: str, username: str, password: str) -> psycopg2.extensions.connection:
    """
    Open a PostgreSQL connection with autocommit enabled.

    Args:
        endpoint: Database host.
        db_name: Name of the database to connect to.
        username: Login user.
        password: Login password.

    Returns:
        The open psycopg2 connection, switched to autocommit mode.
    """
    credentials = {
        "host": endpoint,
        "dbname": db_name,
        "user": username,
        "password": password,
    }
    connection = psycopg2.connect(**credentials)
    connection.set_session(autocommit=True)
    return connection

def get_data_from_db(
    conn: psycopg2.extensions.connection,
    table_name: str,
    expression: str = '*',
    condition: str = ''
) -> pd.DataFrame:
    """
    Run ``SELECT <expression> FROM <table_name> [WHERE <condition>];`` and
    return the result as a pandas DataFrame.

    The statement is assembled by plain string interpolation and printed
    before execution, so all three text arguments must come from trusted code.

    Args:
        conn: Open database connection.
        table_name: Table to read from.
        expression: Select list (default: all columns).
        condition: Optional WHERE clause body; omitted when empty.
    """
    where_clause = f" WHERE {condition}" if condition != '' else ''
    query = f"SELECT {expression} FROM {table_name}" + where_clause + ';'
    print("Query Performed: " + query)
    return pd.read_sql_query(query, conn)

def calculate_sunshine_percentage(df_sun: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate, per city and day, the share of each clock hour that lies
    between sunrise and sunset.

    Args:
        df_sun: One row per city and date with the columns ``city``, ``date``,
            ``sunrise`` and ``sunset``. Sunrise/sunset may be ``"HH:MM:SS"``
            strings or objects exposing ``hour``/``minute``/``second``.

    Returns:
        Long-format DataFrame with the columns ``city``, ``sunshine_pct_hour``
        (0-100) and ``datetime`` (date plus hour offset); 24 rows per input
        row. An empty input yields an empty result.
    """

    def time_to_float(t) -> float:
        # Convert a time string or time-like object to fractional hours
        if isinstance(t, str):
            h, m, s = map(int, t.split(':'))
        else:
            h, m, s = t.hour, t.minute, t.second
        return h + m/60 + s/3600

    df = df_sun.copy()
    sunrise = df['sunrise'].apply(time_to_float).astype(float)
    sunset = df['sunset'].apply(time_to_float).astype(float)

    # Overlap of [sunrise, sunset] with [hour, hour + 1], computed for all rows
    # at once. A negative overlap means the hour lies entirely outside
    # daylight and is clipped to 0. This replaces a row-wise DataFrame.apply,
    # which was slow and failed on an empty frame.
    for hour in range(24):
        overlap = sunset.clip(upper=hour + 1) - sunrise.clip(lower=hour)
        df[f'sunshine_pct_hour_{hour}'] = overlap.clip(lower=0.0) * 100

    # Melt the DataFrame to have columns: city, date, hour, sunshine_pct_hour
    df = df.melt(
        id_vars=['city', 'date'],
        value_vars=[f'sunshine_pct_hour_{hour}' for hour in range(24)],
        var_name='hour',
        value_name='sunshine_pct_hour'
    )
    # Extract the hour as integer from the column name
    df['hour'] = df['hour'].str.extract(r'(\d+)', expand=False).astype(int)
    df['datetime'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    df = df.drop(['date', 'hour'], axis=1)
    return df

def upload_to_db(conn: psycopg2.extensions.connection, table_name: str, df: pd.DataFrame) -> None:
    """
    Upload a DataFrame of weather facts to a database table.

    Creates the table (with the fact_weather column layout) if it does not
    exist, then inserts every row of ``df``.

    Args:
        conn: Open database connection.
        table_name: Target table name; interpolated into the SQL as-is.
        df: Rows to insert. The columns must be, in this order,
            weather_id, time_id, location_id, wind_speed, sunshine_minutes.
    """
    with conn.cursor() as cur:
        # Create table if it does not exist.
        # The location dimension table is named dim_locations (plural); the
        # foreign key previously referenced a non-existent dim_location.
        cur.execute(f'''
            CREATE TABLE IF NOT EXISTS {table_name} (
                weather_id       INTEGER PRIMARY KEY,
                time_id          INTEGER REFERENCES dim_time(time_id),
                location_id      INTEGER REFERENCES dim_locations(location_id),
                wind_speed       FLOAT,
                sunshine_minutes FLOAT
            );
        ''')
        conn.commit()

    print(f"Table '{table_name}' created successfully.")

    insert_query = f'''
        INSERT INTO {table_name} (weather_id, time_id, location_id, wind_speed, sunshine_minutes)
        VALUES (%s, %s, %s, %s, %s)
    '''
    # Cast to object first: on numeric columns where(..., None) leaves NaN in
    # place, so missing values would not be sent as SQL NULL. The cast also
    # keeps integer ids as ints instead of upcasting the whole row to float.
    data_tuples = df.astype(object).where(pd.notnull(df), None).values.tolist()

    # Insert all rows in one executemany call
    with conn.cursor() as cur:
        cur.executemany(insert_query, data_tuples)
        conn.commit()

    print(f"Table '{table_name}' created and data uploaded successfully.")

def lambda_handler(event=None, context=None):
    """
    Main ETL handler: load new weather rows from the data lake into fact_weather.

    Reads the latest time_id/weather_id already stored in the warehouse,
    downloads newer weather and sunshine data from the data lake, derives
    ``sunshine_minutes`` from cloud cover and daylight share, maps the rows to
    ``dim_locations``/``dim_time`` keys and uploads them.

    Args:
        event: Lambda event payload (not used). Optional so the handler can
            still be called without arguments, as the notebook does.
        context: Lambda runtime context (not used).

    Returns:
        The uploaded DataFrame, or None when there was no new weather data or
        an error occurred (errors are printed and swallowed).
    """
    dl_conn = None
    wh_conn = None
    try:
        print("Connecting to Datalake ")
        dl_conn = connect_to_db(dl_endpoint, dl_db_name, username, dl_password)
        print("Connection to Datalake successful.")

        # Connect to warehouse
        print("Connecting to WareHouse ")
        wh_conn = connect_to_db(wh_endpoint, wh_db_name, username, wh_password)
        print("Connection to WareHouse successful.")

        # Get the current situation in the warehouse
        df_init_fact = get_data_from_db(wh_conn, 'fact_weather', 'max(time_id),max(weather_id)')
        df = get_data_from_db(wh_conn, 'dim_time', 'timestamp_utc', f'time_id = {df_init_fact.iloc[0,0]}')
        print("Last timestamp in warehouse: ", df.iloc[0,0])

        # Download sunshine data from datalake
        df_sun = get_data_from_db(conn=dl_conn, table_name='tbl_weather_sun_data', condition= f"date >= '{df.iloc[0,0].date()}'")
        df_sun = calculate_sunshine_percentage(df_sun)
        print("Sunshine data downloaded.")

        # Download weather data from datalake
        df_weather = get_data_from_db(conn=dl_conn, table_name='tbl_weather_data', condition= f"datetime > '{df.iloc[0,0]}'")
        print("Weather data downloaded.")

        if df_weather.empty:
            print("No new weather data available.")
            return None
        else:
            print("New weather data available.")

        # Calculate the sunshine duration in minutes:
        # (cloud-free share in %) * (daylight share of the hour in %) / 10000 * 60 minutes
        df_ws = df_weather.merge(df_sun, on=['city', 'datetime'], how='inner')
        df_ws['sunshine_minutes'] = (100 - df_ws['clouds_all']) * df_ws['sunshine_pct_hour'] / 10000 * 60

        # Clean up the DataFrame
        df_ws = df_ws.drop(['id','pressure','humidity','temp_min','temp_max','temp','weather_main','weather_description','rain_1h','clouds_all','sunshine_pct_hour'], axis=1)
        print("Data successfully processed.")

        # Join with city data
        df_city = get_data_from_db(wh_conn, 'dim_locations')
        df_city_s = df_city[['location_id', 'city']]
        df_joined_wl = df_ws.merge(df_city_s, on=['city'], how='outer')
        df_joined_wl = df_joined_wl.drop(['city'], axis=1)

        # Join with time data
        df_time = get_data_from_db(wh_conn, 'dim_time')
        # rename() returns a new frame, avoiding assignment to the columns of
        # a slice of df_time.
        df_time_s = df_time[['time_id', 'timestamp_utc']].rename(columns={'timestamp_utc': 'datetime'})
        df_joined_wld = df_joined_wl.merge(df_time_s, on=['datetime'], how='inner')
        df_joined_wld = df_joined_wld.drop(['datetime'], axis=1)
        # Continue numbering after the current max weather_id
        df_joined_wld['weather_id'] = df_joined_wld.index + df_init_fact.iloc[0,1] + 1

        # Upload data to database
        upload_to_db(wh_conn, 'fact_weather', df_joined_wld[['weather_id', 'time_id', 'location_id', 'wind_speed', 'sunshine_minutes']])

    except Exception as e:
        print("Error:", e)
        return None

    finally:
        # The connections were previously never closed.
        for conn in (dl_conn, wh_conn):
            if conn is not None:
                conn.close()

    return df_joined_wld

# Run the ETL once when this cell is executed. df receives the handler's
# return value: the uploaded rows, or None if the run failed or found no new
# weather data.
df = lambda_handler()

## SQL Queries for DataLake and Datawarehouse tables

### DataLake tables

tbl_currency_data

In [None]:
-- Data lake table for exchange rates: one row per timestamp and currency pair.
-- The composite primary key is what the currency Lambdas rely on for
-- ON CONFLICT ... DO NOTHING de-duplication.
CREATE TABLE public.tbl_currency_data (
	"timestamp" timestamp NOT NULL,
	source_currency varchar(10) NOT NULL,
	target_currency varchar(10) NOT NULL,
	exchange_rate float8 NULL,
	CONSTRAINT tbl_currency_data_pkey PRIMARY KEY ("timestamp", source_currency, target_currency)
);

tbl_energy_cbet_data

In [None]:
-- Data lake table holding one value per timestamp and country.
-- No primary key or NOT NULL constraint is declared, so duplicates are possible.
-- NOTE(review): "cbet" presumably stands for cross-border electricity trading - confirm.
CREATE TABLE public.tbl_energy_cbet_data (
	"timestamp" timestamp NULL,
	country text NULL,
	value float8 NULL
);

tbl_energy_price_data

In [None]:
-- Data lake table holding one price per timestamp, with the unit stored as text.
-- No primary key or NOT NULL constraint is declared, so duplicates are possible.
CREATE TABLE public.tbl_energy_price_data (
	"timestamp" timestamp NULL,
	price float8 NULL,
	unit text NULL
);

tbl_energy_production_data

In [None]:
-- Data lake table with one row of production/load figures per timestamp.
-- Source of the group2_dwh_fact_energy_production Lambda, which reads
-- nuclear, solar, wind_onshore, load, residual_load and the three hydro columns.
-- No primary key is declared.
CREATE TABLE public.tbl_energy_production_data (
	"timestamp" timestamp NULL,
	cross_border_electricity_trading float8 NULL,
	nuclear float8 NULL,
	hydro_run_of_river float8 NULL,
	hydro_water_reservoir float8 NULL,
	hydro_pumped_storage float8 NULL,
	"others" float8 NULL,
	wind_onshore float8 NULL,
	solar float8 NULL,
	"load" float8 NULL,
	residual_load float8 NULL,
	renewable_share_of_load float8 NULL,
	renewable_share_of_generation float8 NULL
);

tbl_weather_data

In [None]:
-- Data lake table with one weather observation per row, keyed by a serial id.
-- The weather ETL joins these rows on (city, datetime) and keeps only
-- wind_speed plus a sunshine value derived from clouds_all.
CREATE TABLE public.tbl_weather_data (
	id serial4 NOT NULL,
	city varchar(100) NULL,
	datetime timestamp NULL,
	"temp" float8 NULL,
	pressure int4 NULL,
	humidity int4 NULL,
	temp_min float8 NULL,
	temp_max float8 NULL,
	wind_speed float8 NULL,
	weather_main varchar(50) NULL,
	weather_description text NULL,
	rain_1h float8 NULL,
	clouds_all float8 NULL,
	CONSTRAINT tbl_weather_data_pkey PRIMARY KEY (id)
);

### Datawarehouse tables

dim_countries

In [None]:
-- Country dimension: English and German name plus ISO code per country.
CREATE TABLE dim_countries (
    country_id SERIAL PRIMARY KEY,
    country_name_en VARCHAR(100),
    country_name_de VARCHAR(100),
    iso_code VARCHAR(5)
);

-- Seed rows; country_id is assigned by the serial default.
-- The energy production Lambda looks up the row with iso_code 'CH'.
INSERT INTO dim_countries (country_name_en, country_name_de, iso_code)
VALUES
    ('Germany',      'Deutschland',   'DE'),
    ('France',       'Frankreich',    'FR'),
    ('Austria',      'Österreich',    'AT'),
    ('Italy',        'Italien',       'IT'),
    ('Liechtenstein','Liechtenstein', 'LI'),
    ('Switzerland',    'Schweiz',     'CH');

SyntaxError: invalid syntax (4177214843.py, line 1)

dim_currency

In [None]:
-- Currency dimension: surrogate key plus 3-letter currency code.
CREATE TABLE IF NOT EXISTS public.dim_currency (
	currency_id serial4 NOT NULL,
	currency_code varchar(3) NOT NULL,
	CONSTRAINT dim_currency_pkey PRIMARY KEY (currency_id)
);

-- Seed rows with explicit ids.
-- NOTE(review): explicit values bypass the serial default, so the sequence is
-- not advanced; a later INSERT that omits currency_id would presumably start
-- at 1 and hit the primary key - confirm and sync the sequence if needed.
INSERT INTO dim_currency (currency_id, currency_code) VALUES
(1, 'CHF'),
(2, 'EUR');

dim_locations

In [None]:
-- Location dimension: one row per city with its coordinates.
-- The weather ETL joins on the city name to obtain location_id.
CREATE TABLE dim_locations (
    location_id SERIAL PRIMARY KEY,
    country VARCHAR(100) DEFAULT 'Switzerland',
    city VARCHAR(100) NOT NULL,
    lat DOUBLE PRECISION NOT NULL,
    long DOUBLE PRECISION NOT NULL
);

-- Insert data (country is omitted and falls back to its default 'Switzerland')
INSERT INTO dim_locations (city, lat, long)
VALUES
    ('Aarau', 47.392715, 8.044445),
    ('Baden', 47.473683, 8.308682),
    ('Basel', 47.558108, 7.587826),
    ('Bern', 46.948474, 7.452175),
    ('Chur', 46.854747, 9.526490),
    ('Frauenfeld', 47.556191, 8.896335),
    ('Genf', 46.201756, 6.146601),
    ('Lausanne', 46.521827, 6.632702),
    ('Lugano', 46.005010, 8.952028),
    ('Luzern', 47.050545, 8.305468),
    ('Neuenburg', 46.989583, 6.929264),
    ('Schaffhausen', 47.696049, 8.634513),
    ('Sion', 46.231175, 7.358879),
    ('Solothurn', 47.208135, 7.538405),
    ('St. Gallen', 47.425059, 9.376588),
    ('Winterthur', 47.499172, 8.729150),
    ('Zug', 47.167990, 8.517365),
    ('Zürich', 47.374449, 8.541042);

dim_time

In [None]:
-- Time dimension at hourly grain. The CHECK constraints restrict day_of_week
-- and month to three-letter English abbreviations, hour to 0-23 and year to
-- 1900 or later.
CREATE TABLE public.dim_time (
	time_id serial4 NOT NULL,
	timestamp_utc timestamp NOT NULL,
	"date" date NOT NULL,
	"hour" int4 NULL,
	day_of_week varchar(3) NULL,
	"month" varchar(3) NULL,
	"year" int4 NULL,
	CONSTRAINT dim_time_day_of_week_check CHECK (((day_of_week)::text = ANY ((ARRAY['Mon'::character varying, 'Tue'::character varying, 'Wed'::character varying, 'Thu'::character varying, 'Fri'::character varying, 'Sat'::character varying, 'Sun'::character varying])::text[]))),
	CONSTRAINT dim_time_hour_check CHECK (((hour >= 0) AND (hour <= 23))),
	CONSTRAINT dim_time_month_check CHECK (((month)::text = ANY ((ARRAY['Jan'::character varying, 'Feb'::character varying, 'Mar'::character varying, 'Apr'::character varying, 'May'::character varying, 'Jun'::character varying, 'Jul'::character varying, 'Aug'::character varying, 'Sep'::character varying, 'Oct'::character varying, 'Nov'::character varying, 'Dec'::character varying])::text[]))),
	CONSTRAINT dim_time_pkey PRIMARY KEY (time_id),
	CONSTRAINT dim_time_year_check CHECK ((year >= 1900))
);

-- Populate one row per hour from 2024-01-01 00:00 through 2025-06-30 23:00
-- (inclusive), generated with a recursive CTE. Timestamps outside this range
-- have no time_id, so fact loads that join on timestamp_utc will not match
-- them until the range is extended.
WITH RECURSIVE time_gen AS (
    SELECT 
        TIMESTAMP '2024-01-01 00:00:00' AS ts
    UNION ALL
    SELECT 
        ts + INTERVAL '1 hour'
    FROM time_gen
    WHERE ts + INTERVAL '1 hour' <= TIMESTAMP '2025-06-30 23:00:00'
)
INSERT INTO dim_time (
    timestamp_utc,
    date,
    hour,
    day_of_week,
    month,
    year
)
SELECT
    ts AS timestamp_utc,
    ts::date AS date,
    EXTRACT(HOUR FROM ts)::int AS hour,
    TO_CHAR(ts, 'Dy') AS day_of_week,
    TO_CHAR(ts, 'Mon') AS month,
    EXTRACT(YEAR FROM ts)::int AS year
FROM time_gen;

fact_energy_production

In [None]:
-- Fact table for energy production, one row per time_id and country.
-- The loading Lambda supplies production_id explicitly (current max + 1
-- onwards) rather than relying on the serial default.
CREATE TABLE fact_energy_production (
    production_id SERIAL PRIMARY KEY,
    time_id INTEGER NOT NULL,
    country_id INTEGER NOT NULL,
    timestamp TIMESTAMP NOT NULL,
    solar_output FLOAT,
    wind_output FLOAT,
    load FLOAT,
    residual_load FLOAT,
    hydro_run_of_river_output FLOAT,
    hydro_water_reservoir_output FLOAT,
    hydro_pumped_storage_output FLOAT,
    nuclear_output FLOAT
);

-- Foreign keys to the time and country dimensions
ALTER TABLE fact_energy_production
ADD CONSTRAINT fk_time
FOREIGN KEY (time_id) REFERENCES dim_time(time_id);

ALTER TABLE fact_energy_production
ADD CONSTRAINT fk_country
FOREIGN KEY (country_id) REFERENCES dim_countries(country_id);

fact_weather

In [None]:
-- Fact table for weather, one row per time_id and location.
-- weather_id is a plain INTEGER key: the weather ETL assigns it as
-- (current max + 1) onwards.
-- The location foreign key targets dim_locations (plural), which is the name
-- the dimension table is created under; it previously pointed at a
-- non-existent dim_location.
CREATE TABLE IF NOT EXISTS fact_weather (
    weather_id       INTEGER PRIMARY KEY,
    time_id          INTEGER REFERENCES dim_time(time_id),
    location_id      INTEGER REFERENCES dim_locations(location_id),
    temperature      FLOAT,
    wind_speed       FLOAT,
    sunshine_minutes FLOAT
);

create keys in fact_energy_trade

In [None]:
-- Link fact_energy_trade to the time dimension.
-- Assumes fact_energy_trade already exists; its CREATE TABLE statement is not
-- part of this section.
ALTER TABLE fact_energy_trade
ADD CONSTRAINT fk_time_id
FOREIGN KEY (time_id)
REFERENCES dim_time(time_id);

View for persona 1 dashboard

In [None]:
-- View for the persona 1 dashboard (production data).
--
-- Reshapes fact_energy_production from wide to long format: every production
-- row is emitted six times, once per output type, with the type label in
-- "type" and that type's measurement in "value" (NULL when not recorded).
-- total_output is the sum of the six output columns of the source row, with
-- NULLs counted as 0; load and residual_load are not part of it.
--
-- A single scan with a LATERAL VALUES list replaces six UNION ALL branches,
-- so the total_output expression is written only once.
CREATE OR REPLACE VIEW vw_energy_production_long AS
SELECT
  p.production_id,
  p.time_id,
  p."timestamp",
  src."type",
  src."value",
  COALESCE(p.solar_output, 0)
    + COALESCE(p.wind_output, 0)
    + COALESCE(p.hydro_run_of_river_output, 0)
    + COALESCE(p.hydro_water_reservoir_output, 0)
    + COALESCE(p.hydro_pumped_storage_output, 0)
    + COALESCE(p.nuclear_output, 0) AS total_output
FROM fact_energy_production AS p
CROSS JOIN LATERAL (
  VALUES
    -- The explicit ::text on the first row pins the label column type.
    ('Solar Output'::text,           p.solar_output),
    ('Wind Output',                  p.wind_output),
    ('Hydro Run-of-River Output',    p.hydro_run_of_river_output),
    ('Hydro Water Reservoir Output', p.hydro_water_reservoir_output),
    ('Hydro Pumped Storage Output',  p.hydro_pumped_storage_output),
    ('Nuclear Output',               p.nuclear_output)
) AS src("type", "value");


-- View for the persona 1 dashboard (trading data).
--
-- One row per (time_id, neighbor_country_id) with net traded energy and value.
-- Sign convention: rows whose direction is 'export' count positive, every
-- other direction counts negative.
--
-- Output columns:
--   net_energy_gw / net_value_eur / net_value_chf : signed sums per group
--   exchange_rate                 : max(exchange_rate) within the group
--   total_net_energy_gw_per_time  : signed energy summed over ALL trade rows
--                                   sharing the time_id (across countries)
--   fx_difference_percent         : (net_value_chf / net_value_eur - 1) * 100,
--                                   NULL when net_value_eur is 0
CREATE OR REPLACE VIEW public.vw_energy_trade_net_per_country_hourly AS
WITH signed AS (
    -- Apply the export/import sign once so later steps can use plain sums.
    SELECT
        ft.time_id,
        dt.timestamp_utc,
        ft.neighbor_country_id,
        ft.exchange_rate,
        CASE WHEN ft.direction = 'export'::text
             THEN ft.energy_value_gw ELSE - ft.energy_value_gw
        END AS signed_energy_gw,
        CASE WHEN ft.direction = 'export'::text
             THEN ft.value_eur ELSE - ft.value_eur
        END AS signed_value_eur,
        CASE WHEN ft.direction = 'export'::text
             THEN ft.value_chf ELSE - ft.value_chf
        END AS signed_value_chf
    FROM fact_energy_trade ft
    JOIN dim_time dt ON ft.time_id = dt.time_id
),
base AS (
    -- The window sum runs over the raw trade rows of each time_id, so the
    -- total is taken before grouping by country.
    SELECT
        s.time_id,
        s.timestamp_utc,
        s.neighbor_country_id,
        s.exchange_rate,
        s.signed_energy_gw,
        s.signed_value_eur,
        s.signed_value_chf,
        sum(s.signed_energy_gw) OVER (PARTITION BY s.time_id)
            AS total_net_energy_gw_per_time
    FROM signed s
)
SELECT
    time_id,
    timestamp_utc,
    neighbor_country_id,
    sum(signed_energy_gw) AS net_energy_gw,
    sum(signed_value_eur) AS net_value_eur,
    sum(signed_value_chf) AS net_value_chf,
    max(exchange_rate)    AS exchange_rate,
    -- Identical for every row of a time_id, so max() just picks that value.
    max(total_net_energy_gw_per_time) AS total_net_energy_gw_per_time,
    -- NULLIF turns a zero EUR total into NULL, avoiding division by zero.
    (sum(signed_value_chf)
        / NULLIF(sum(signed_value_eur), 0::double precision)
        - 1::double precision) * 100::double precision AS fx_difference_percent
FROM base
GROUP BY time_id, timestamp_utc, neighbor_country_id
ORDER BY timestamp_utc, neighbor_country_id;