In [1]:
import pandas as pd
import psycopg2
import os
from dotenv import load_dotenv


In [2]:
# Read settings from the local .env file. override=True asks python-dotenv to
# let values from the file replace variables already set in the environment.
load_dotenv('.env', override=True)

# Shared settings (the same username is used for the datalake and the warehouse)
username = os.environ.get('USERNAME_DL')
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID_INTERNAL')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY_INTERNAL')

# Datalake connection settings
dl_endpoint = os.environ.get('ENDPOINT_DL')
dl_db_name = os.environ.get('DB_NAME_DL')
dl_password = os.environ.get('PASSWORD_DL')

# Warehouse connection settings
wh_endpoint = os.environ.get('ENDPOINT_WH')
wh_db_name = os.environ.get('DB_NAME_WH')
wh_password = os.environ.get('PASSWORD_WH')


In [3]:


def connect_to_db(endpoint: str, db_name: str, username: str, password: str) -> psycopg2.extensions.connection:
    """
    Open a PostgreSQL connection and switch it to autocommit mode.

    Args:
        endpoint: Host name of the database server.
        db_name: Name of the database to connect to.
        username: Login user.
        password: Password for that user.

    Returns:
        An open psycopg2 connection with autocommit enabled.
    """
    connection_settings = {
        'host': endpoint,
        'dbname': db_name,
        'user': username,
        'password': password,
    }
    connection = psycopg2.connect(**connection_settings)
    # With autocommit on, statements take effect without an explicit commit().
    connection.set_session(autocommit=True)
    return connection

def get_data_from_db(
    conn: psycopg2.extensions.connection,
    table_name: str,
    expression: str = '*',
    condition: str = '',
    params=None
) -> pd.DataFrame:
    """
    Query data from a database table and return as a pandas DataFrame.

    The statement is assembled as ``SELECT <expression> FROM <table_name>
    [WHERE <condition>];`` and printed before it is executed.

    Args:
        conn: Open database connection.
        table_name: Table to read from. Interpolated into the SQL text as-is,
            so it must come from trusted code, never from user input.
        expression: Select list (defaults to all columns). Interpolated as-is.
        condition: Optional WHERE clause body without the ``WHERE`` keyword.
            An empty string (or None) means "no filter". Interpolated as-is.
        params: Optional values bound by the driver to the placeholders used
            in ``condition`` (``%s`` / ``%(name)s`` for psycopg2). Prefer this
            over formatting values into ``condition``: the driver then handles
            quoting and escaping. When None, the query is executed without
            parameter binding, exactly as before.

    Returns:
        The query result as a DataFrame.
    """
    query = f"SELECT {expression} FROM {table_name}"
    if condition:
        query += f" WHERE {condition}"
    query += ';'
    print("Query Performed: " + query)
    return pd.read_sql_query(query, conn, params=params)

def calculate_sunshine_percentage(df_sun: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the percentage of sunshine for each hour of the day.

    For every input row and every clock hour 0-23, compute which share of
    that hour lies between the row's sunrise and sunset.

    Args:
        df_sun: DataFrame with the columns 'city', 'date', 'sunrise' and
            'sunset'. 'sunrise' / 'sunset' hold either 'HH:MM:SS' strings or
            objects exposing .hour/.minute/.second (e.g. datetime.time).
            The input frame is not modified.

    Returns:
        A long-format DataFrame with 24 rows per input row and the columns
        'city', 'sunshine_pct_hour' (0.0-100.0) and 'datetime' (the date plus
        the hour offset). Rows are grouped by hour: all input rows for hour
        0, then all for hour 1, and so on. An empty input yields an empty
        frame with the same columns.
    """

    def time_to_float(t) -> float:
        # Convert time string or datetime.time to float hours
        if isinstance(t, str):
            h, m, s = map(int, t.split(':'))
        else:
            h, m, s = t.hour, t.minute, t.second
        return h + m/60 + s/3600

    df = df_sun.copy()
    # astype(float) keeps the arithmetic below valid for an empty input too.
    sunrise = df['sunrise'].map(time_to_float).astype(float)
    sunset = df['sunset'].map(time_to_float).astype(float)

    # NOTE(review): this assumes sunrise < sunset within the same day. A row
    # whose sunset value is smaller than its sunrise gets 0 for every hour.
    hour_by_column = {f'sunshine_pct_hour_{hour}': hour for hour in range(24)}
    for column, hour in hour_by_column.items():
        # Overlap of [sunrise, sunset] with [hour, hour + 1], in hours.
        # A negative difference means the intervals do not intersect.
        overlap = sunset.clip(upper=hour + 1) - sunrise.clip(lower=hour)
        df[column] = overlap.clip(lower=0.0) * 100

    # Melt the DataFrame to have columns: city, date, hour, sunshine_pct_hour
    long_df = df.melt(
        id_vars=['city', 'date'],
        value_vars=list(hour_by_column),
        var_name='hour',
        value_name='sunshine_pct_hour'
    )
    # Translate the melted column name back into its integer hour
    long_df['hour'] = long_df['hour'].map(hour_by_column).astype(int)
    long_df['datetime'] = pd.to_datetime(long_df['date']) + pd.to_timedelta(long_df['hour'], unit='h')
    return long_df.drop(columns=['date', 'hour'])

def upload_to_db(conn: psycopg2.extensions.connection, table_name: str, df: pd.DataFrame) -> None:
    """
    Upload a DataFrame to a database table.

    The table is created first if it does not exist yet, using the fixed
    fact-table layout below (weather_id, time_id, location_id, wind_speed,
    sunshine_minutes). The rows of ``df`` are then inserted.

    Args:
        conn: Open database connection.
        table_name: Target table. Interpolated into the SQL text as-is, so it
            must come from trusted code.
        df: Data to insert. Values are matched to the table columns by
            position, so the frame must hold exactly the five columns
            weather_id, time_id, location_id, wind_speed, sunshine_minutes in
            that order. Missing values (NaN/NaT/None) are written as NULL.

    Raises:
        Any database error raised while creating the table or inserting; a
        failed insert is rolled back before the error is re-raised.
    """
    # Local import: psycopg2.extras is a submodule that `import psycopg2`
    # does not load on its own.
    from psycopg2.extras import execute_values

    with conn.cursor() as cur:
        # Create table if it does not exist
        # NOTE(review): the DDL references dim_location, while other code in
        # this notebook reads from dim_locations - confirm which name is right.
        cur.execute(f'''
            CREATE TABLE IF NOT EXISTS {table_name} (
                weather_id       INTEGER PRIMARY KEY,
                time_id          INTEGER REFERENCES dim_time(time_id),
                location_id      INTEGER REFERENCES dim_location(location_id),
                wind_speed       FLOAT,
                sunshine_minutes FLOAT
            );
        ''')
        conn.commit()

    print(f"Table '{table_name}' created successfully.")

    # A single "VALUES %s" placeholder is expanded by execute_values into
    # multi-row INSERT statements, avoiding one round trip per row.
    insert_query = f'''
        INSERT INTO {table_name} (weather_id, time_id, location_id, wind_speed, sunshine_minutes)
        VALUES %s
    '''
    # Cast to object first: on numeric columns where(..., None) keeps NaN
    # instead of storing None, and NaN is not a valid value for the INTEGER
    # columns. The object cast also keeps integers as integers rather than
    # letting .values upcast the whole frame to float.
    rows = df.astype(object).where(df.notna(), None).values.tolist()

    if rows:
        try:
            with conn.cursor() as cur:
                execute_values(cur, insert_query, rows)
            conn.commit()
        except Exception:
            # Do not leave a half-finished transaction on the connection.
            conn.rollback()
            raise

    print(f"Table '{table_name}' created and data uploaded successfully.")

def lambda_handler():
    """
    Main ETL handler function to process and upload weather data.

    Steps:
        1. Connect to the datalake and the warehouse (settings come from the
           module-level variables loaded from the environment).
        2. Read the highest time_id and weather_id in fact_weather and look
           up the timestamp of that time_id in dim_time.
        3. Load sun data from that date on and weather rows newer than that
           timestamp from the datalake.
        4. Combine both into sunshine minutes per city and hour.
        5. Map city -> location_id and datetime -> time_id, assign new
           consecutive weather_ids and upload the rows to fact_weather.

    Returns:
        The joined DataFrame whose fact columns were uploaded, or None when
        there is no new weather data or when any exception occurred (the
        error is printed, not re-raised).
    """
    dl_conn = None
    wh_conn = None
    try:
        print("Connecting to Datalake ")
        dl_conn = connect_to_db(dl_endpoint, dl_db_name, username, dl_password)
        print("Connection to Datalake successful.")

        # Connect to warehouse
        print("Connecting to WareHouse ")
        wh_conn = connect_to_db(wh_endpoint, wh_db_name, username, wh_password)
        print("Connection to WareHouse successful.")

        # Get the current situation in the warehouse
        df_init_fact = get_data_from_db(wh_conn, 'fact_weather', 'max(time_id),max(weather_id)')
        df = get_data_from_db(wh_conn, 'dim_time', 'timestamp_utc', f'time_id = {df_init_fact.iloc[0,0]}')
        print("Last timestamp in warehouse: ", df.iloc[0,0])

        # Download sunshine data from datalake
        df_sun = get_data_from_db(conn=dl_conn, table_name='tbl_weather_sun_data', condition= f"date >= '{df.iloc[0,0].date()}'")
        df_sun = calculate_sunshine_percentage(df_sun)
        print("Sunshine data downloaded.")

        # Download weather data from datalake
        df_weather = get_data_from_db(conn=dl_conn, table_name='tbl_weather_data', condition= f"datetime > '{df.iloc[0,0]}'")
        print("Weather data downloaded.")

        if df_weather.empty:
            print("No new weather data available.")
            return None
        else:
            print("New weather data available.")

        # Calculate the sunshine duration in minutes
        # NOTE(review): the formula assumes clouds_all is a 0-100 percentage - confirm.
        df_ws = df_weather.merge(df_sun, on=['city', 'datetime'], how='inner')
        df_ws['sunshine_minutes'] = (100 - df_ws['clouds_all']) * df_ws['sunshine_pct_hour'] / 10000 * 60

        # Clean up the DataFrame
        df_ws = df_ws.drop(['id','pressure','humidity','temp_min','temp_max','temp','weather_main','weather_description','rain_1h','clouds_all','sunshine_pct_hour'], axis=1)
        print("Data successfully processed.")

        # Join with city data
        # NOTE(review): the outer join also produces rows for cities without
        # weather data (dropped again by the inner time join below, as long as
        # dim_time has no NULL timestamps) and keeps weather rows for unknown
        # cities with a missing location_id - confirm this is intended.
        df_city = get_data_from_db(wh_conn, 'dim_locations')
        df_city_s = df_city[['location_id', 'city']]
        df_joined_wl = df_ws.merge(df_city_s, on=['city'], how='outer')
        df_joined_wl = df_joined_wl.drop(['city'], axis=1)

        # Join with time data
        df_time = get_data_from_db(wh_conn, 'dim_time')
        df_time_s = df_time[['time_id', 'timestamp_utc']].rename(columns={'timestamp_utc': 'datetime'})
        df_joined_wld = df_joined_wl.merge(df_time_s, on=['datetime'], how='inner')
        df_joined_wld = df_joined_wld.drop(['datetime'], axis=1)
        # Continue numbering right after the highest weather_id already stored
        df_joined_wld['weather_id'] = df_joined_wld.index + df_init_fact.iloc[0,1] + 1

        # Upload data to database
        upload_to_db(wh_conn, 'fact_weather', df_joined_wld[['weather_id', 'time_id', 'location_id', 'wind_speed', 'sunshine_minutes']])

        return df_joined_wld

    except Exception as e:
        print("Error:", e)
        return None

    finally:
        # Release both connections on every path: success, early return and error.
        for open_conn in (dl_conn, wh_conn):
            if open_conn is not None:
                open_conn.close()

# Run the ETL once. `df` holds the frame returned by lambda_handler(), or None
# if there was no new weather data or an error was caught and printed.
df = lambda_handler()

Connecting to Datalake 
Connection to Datalake successful.
Connecting to WareHouse 
Connection to WareHouse successful.
Query Performed: SELECT max(time_id),max(weather_id) FROM fact_weather;
Query Performed: SELECT timestamp_utc FROM dim_time WHERE time_id = 12141;


  df = pd.read_sql_query(query, conn)


Last timestamp in warehouse:  2025-05-20 20:00:00
Query Performed: SELECT * FROM tbl_weather_sun_data WHERE date >= '2025-05-20';
Sunshine data downloaded.
Query Performed: SELECT * FROM tbl_weather_data WHERE datetime > '2025-05-20 20:00:00';


  df = pd.read_sql_query(query, conn)
  df = pd.read_sql_query(query, conn)


Weather data downloaded.
New weather data available.
Data successfully processed.
Query Performed: SELECT * FROM dim_locations;
Query Performed: SELECT * FROM dim_time;


  df = pd.read_sql_query(query, conn)


Table 'fact_weather' created successfully.
Table 'fact_weather' created and data uploaded successfully.
