In [1]:
import pandas as pd
import datetime as dt
import psycopg2
import os
from dotenv import load_dotenv
import numpy as np
from zoneinfo import ZoneInfo


In [2]:
# Load variables from the local '.env' file. override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv('.env', override=True)

# General settings. os.getenv returns None for any variable that is not set,
# so a missing entry only shows up later, when the value is used.
# `username` is passed as the user for both the datalake and the warehouse connection.
username = os.getenv('USERNAME_DL')
# NOTE(review): the AWS credentials and the bucket name are not referenced by the
# code visible in this notebook - confirm they are still needed.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID_INTERNAL')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY_INTERNAL')
bucket_name = os.getenv('BUCKET_NAME')

# Datalake connection settings (source of the raw weather and sun tables).
dl_endpoint = os.getenv('ENDPOINT_DL')
dl_db_name = os.getenv('DB_NAME_DL')
dl_password = os.getenv('PASSWORD_DL')

# Warehouse connection settings (dimension tables are read from it, the fact
# table is written to it).
wh_endpoint = os.getenv("ENDPOINT_WH")
wh_db_name = os.getenv("DB_NAME_WH")
wh_password = os.getenv("PASSWORD_WH")


In [9]:
def connect_to_db(endpoint, db_name, username, password):
    """Open a psycopg2 connection and switch it to autocommit mode.

    Args:
        endpoint: Host name of the database server.
        db_name: Name of the database to connect to.
        username: User to authenticate as.
        password: Password for ``username``.

    Returns:
        The open connection. With autocommit enabled every statement takes
        effect immediately; closing the connection is left to the caller.
    """
    connection_kwargs = {
        "host": endpoint,
        "dbname": db_name,
        "user": username,
        "password": password,
    }
    db_connection = psycopg2.connect(**connection_kwargs)
    db_connection.set_session(autocommit=True)
    return db_connection

def get_data_from_db(conn, table_name):
    """Read every row and column of ``table_name`` into a DataFrame.

    Args:
        conn: Open database connection handed to ``pandas.read_sql_query``.
        table_name: Table to read. It is interpolated into the SQL text as-is,
            so it must be a trusted identifier, never user input.

    Returns:
        pandas.DataFrame: The full contents of the table.
    """
    return pd.read_sql_query(f"SELECT * FROM {table_name}", conn)


def calculate_sunshine_percentage(df_sun):
    """Expand daily sunrise/sunset times into one row per hour of the day.

    For every input row and every clock hour 0..23 the result holds the
    percentage (0-100) of that hour lying between sunrise and sunset.

    Args:
        df_sun: DataFrame with the columns ``city``, ``date``, ``sunrise`` and
            ``sunset``. ``sunrise``/``sunset`` are either ``'HH:MM:SS'`` strings
            or objects exposing ``hour``, ``minute`` and ``second``. The input
            frame is not modified.

    Returns:
        pandas.DataFrame: Columns ``city``, ``sunshine_pct_hour`` and
        ``datetime`` (``date`` plus the hour offset), 24 rows per input row,
        ordered hour by hour (all input rows for hour 0, then hour 1, ...).
        An empty input yields an empty frame with the same columns.

    Note:
        Sunrise and sunset are compared as times within one calendar day. A row
        whose sunset is not later than its sunrise gets 0 for every hour.
    """

    def time_to_float(t):
        """Convert a time string or time-like object to fractional hours."""
        if isinstance(t, str):
            h, m, s = map(int, t.split(':'))
        else:
            h, m, s = t.hour, t.minute, t.second
        return h + m/60 + s/3600

    sunrise = df_sun['sunrise'].apply(time_to_float).to_numpy(dtype=float)
    sunset = df_sun['sunset'].apply(time_to_float).to_numpy(dtype=float)

    # Broadcast (n_rows, 1) against (24,) to get every row/hour overlap at once.
    # A negative overlap means the sun is not up during that hour, so clipping
    # at zero reproduces the "no sun during this hour" case.
    hour_start = np.arange(24, dtype=float)
    overlap = (
        np.minimum(sunset[:, None], hour_start + 1.0)
        - np.maximum(sunrise[:, None], hour_start)
    )
    pct = np.clip(overlap, 0.0, None) * 100

    pct_columns = [f'sunshine_pct_hour_{hour}' for hour in range(24)]
    # Positional indexes on both sides keep the column-wise concat aligned
    # regardless of what index the caller's frame carries.
    wide = pd.concat(
        [
            df_sun[['city', 'date']].reset_index(drop=True),
            pd.DataFrame(pct, columns=pct_columns),
        ],
        axis=1,
    )

    # Melt the DataFrame to have columns: city, date, hour, sunshine_pct_hour
    df = wide.melt(
        id_vars=['city', 'date'],
        value_vars=pct_columns,
        var_name='hour',
        value_name='sunshine_pct_hour'
    )

    # Extract the hour as integer from the column name
    df['hour'] = df['hour'].str.extract(r'(\d+)', expand=False).astype(int)

    df['datetime'] = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    return df.drop(['date', 'hour'], axis=1)

def upload_to_db(conn, table_name, df):
    """Recreate ``table_name`` and load ``df`` into it (full refresh).

    Any existing table of that name is dropped, a new one is created with the
    fixed fact-table layout below, and every row of ``df`` is inserted.

    Args:
        conn: Open DB-API connection (psycopg2 style: ``cursor()`` usable as a
            context manager, ``%s`` placeholders).
        table_name: Target table. It is interpolated into the SQL text as-is,
            so it must be a trusted identifier, never user input.
        df: DataFrame with exactly six columns. Values are mapped to
            ``weather_id, time_id, location_id, temperature, wind_speed,
            sunshine_minutes`` by position, not by column name.

    Raises:
        ValueError: If ``df`` does not have exactly six columns. This is
            checked before the existing table is dropped.
    """
    target_columns = (
        'weather_id', 'time_id', 'location_id',
        'temperature', 'wind_speed', 'sunshine_minutes',
    )
    # Validate before the destructive DROP so a malformed frame cannot leave
    # the warehouse without its table.
    if df.shape[1] != len(target_columns):
        raise ValueError(
            f"Expected {len(target_columns)} columns {target_columns}, "
            f"got {df.shape[1]}: {list(df.columns)}"
        )

    # Missing values must reach the database as NULL. `df.where(..., None)` does
    # not achieve that on float columns (None is coerced back to NaN), so the
    # values are boxed as Python objects and replaced one by one.
    rows = [
        [None if pd.isna(value) else value for value in record]
        for record in df.astype(object).values.tolist()
    ]

    with conn.cursor() as cur:
        cur.execute(f'DROP TABLE IF EXISTS {table_name};')
        cur.execute(f'''
            CREATE TABLE {table_name} (
                weather_id BIGINT PRIMARY KEY,
                time_id BIGINT,
                location_id DOUBLE PRECISION,
                temperature DOUBLE PRECISION,
                wind_speed DOUBLE PRECISION,
                sunshine_minutes DOUBLE PRECISION
            );
        ''')
        conn.commit()

    print(f"Table '{table_name}' created successfully.")

    insert_query = f'''
        INSERT INTO {table_name} (weather_id, time_id, location_id, temperature, wind_speed, sunshine_minutes)
        VALUES (%s, %s, %s, %s, %s, %s)
    '''

    with conn.cursor() as cur:
        cur.executemany(insert_query, rows)
        conn.commit()

    print(f"Table '{table_name}' created and data uploaded successfully.")


def lambda_handler():
    """Build the ``fact_weather`` table in the warehouse from datalake data.

    Steps:
        1. Read ``tbl_weather_sun_data`` from the datalake and expand it to
           hourly sun-up percentages.
        2. Read ``tbl_weather_data`` and merge both on ``city``/``datetime``.
        3. Derive ``sunshine_minutes`` and drop the columns that are not loaded.
        4. Attach ``location_id`` (from ``dim_locations``) and ``time_id``
           (from ``dim_time``), number the rows as ``weather_id``.
        5. Recreate and fill ``fact_weather`` in the warehouse.

    Errors are printed, not raised (best-effort run).

    Returns:
        list: ``[df_joined_wld, wh_conn]`` - the frame that was (or was about
        to be) uploaded and the open warehouse connection. An element is
        ``None`` when the run failed before it was produced. Closing
        ``wh_conn`` is left to the caller.
    """
    # Bind everything that the return statement and the cleanup touch up front,
    # so a failure part-way through cannot surface as an UnboundLocalError that
    # hides the real error.
    dl_conn = None
    wh_conn = None
    df_joined_wld = None

    try:
        print("Connecting to Datalake ")
        dl_conn = connect_to_db(dl_endpoint, dl_db_name, username, dl_password)
        print("Connection to Datalake successful.")

        df_sun = get_data_from_db(dl_conn, 'tbl_weather_sun_data')
        df_sun = calculate_sunshine_percentage(df_sun)
        print("Sunshine data downloaded.")

        df_weather = get_data_from_db(dl_conn, 'tbl_weather_data')
        print("Weather data downloaded.")

        # NOTE(review): how='outer' keeps hours that exist in only one of the
        # two sources; those rows carry no usable measures - confirm intended.
        df_ws = df_weather.merge(df_sun, on=['city', 'datetime'], how='outer')

        # Sunshine minutes per hour = share of the hour the sun is up, scaled by
        # (100 - clouds_all) percent, times 60 minutes. The / 10000 turns the two
        # percentages into fractions.
        # NOTE(review): presumably clouds_all is cloud cover in percent - confirm.
        df_ws['sunshine_minutes'] = (100- df_ws['clouds_all']) * df_ws['sunshine_pct_hour'] / 10000 * 60

        # Keep only the columns needed for the fact table and the key lookups.
        df_ws = df_ws.drop(['id','pressure','humidity','temp_min','temp_max','weather_main','weather_description','rain_1h','clouds_all','sunshine_pct_hour'], axis=1)

        print("Data successfully processed.")

        print("Connecting to WareHouse ")
        wh_conn = connect_to_db(wh_endpoint, wh_db_name, username, wh_password)
        print("Connection to WareHouse successful.")

        # Resolve city -> location_id.
        # NOTE(review): the outer joins against the dimension tables also emit a
        # row for every dimension entry without weather data (all measures
        # missing) - confirm this is wanted; a left join would keep facts only.
        df_city = get_data_from_db(wh_conn, 'dim_locations')
        df_city_s = df_city[['location_id' ,'city']]
        df_joined_wl = df_ws.merge(df_city_s, on=['city'], how='outer')
        df_joined_wl = df_joined_wl.drop(['city'], axis=1)

        # Resolve datetime -> time_id (dim_time exposes it as timestamp_utc).
        df_time = get_data_from_db(wh_conn, 'dim_time')
        df_time_s = df_time[['time_id', 'timestamp_utc']].rename(
            columns={'timestamp_utc': 'datetime'}
        )
        df_fact = df_joined_wl.merge(df_time_s, on=['datetime'], how='outer')
        df_fact = df_fact.drop(['datetime'], axis=1)
        # The merged frame's positional index doubles as the surrogate key.
        df_fact['weather_id'] = df_fact.index

        # Publish the frame before uploading so it is still returned for
        # inspection when the upload itself fails.
        df_joined_wld = df_fact

        upload_to_db(wh_conn, 'fact_weather', df_joined_wld[['weather_id', 'time_id', 'location_id', 'temp', 'wind_speed', 'sunshine_minutes']])

    except Exception as e:
        print("Error:", e)
    finally:
        # The datalake connection is not handed back to the caller, so release
        # it here on success and on failure alike.
        if dl_conn is not None:
            dl_conn.close()

    return [df_joined_wld, wh_conn]

# Run the whole pipeline. As the last expression of the cell, the returned
# [DataFrame, warehouse connection] list is echoed as the cell output.
lambda_handler()

Connecting to Datalake 
Connection to Datalake successful.


  df = pd.read_sql_query(query, conn)


Sunshine data downloaded.


  df = pd.read_sql_query(query, conn)


Weather data downloaded.
Data successfully processed.
Connecting to WareHouse 
Connection to WareHouse successful.


  df = pd.read_sql_query(query, conn)
  df = pd.read_sql_query(query, conn)


Table 'fact_weather' created successfully.
Table 'fact_weather' created and data uploaded successfully.


[        temp  wind_speed  sunshine_minutes  location_id  time_id  weather_id
 0        NaN         NaN               NaN          NaN        1           0
 1        NaN         NaN               NaN          NaN        2           1
 2        NaN         NaN               NaN          NaN        3           2
 3        NaN         NaN               NaN          NaN        4           3
 4        NaN         NaN               NaN          NaN        5           4
 ...      ...         ...               ...          ...      ...         ...
 166963   NaN         NaN               NaN          NaN    13124      166963
 166964   NaN         NaN               NaN          NaN    13125      166964
 166965   NaN         NaN               NaN          NaN    13126      166965
 166966   NaN         NaN               NaN          NaN    13127      166966
 166967   NaN         NaN               NaN          NaN    13128      166967
 
 [166968 rows x 6 columns],
 <connection object at 0x000001650