### Thunderstorm Forecasting using Amateur Weather Stations Meteorological Data

Author: Jan Luis Antoc

##### Import necessary libraries

In [45]:
import polars as pl
import os
from dotenv import find_dotenv, load_dotenv
from datetime import timedelta

##### Configure Polars to print all rows of a table

In [46]:
# Set the table row limit to -1 so that printed DataFrames are shown in full
# instead of being truncated.
pl.Config(tbl_rows=-1)

<polars.config.Config at 0x198ccc25710>

##### Get the necessary variables for the database connection

In [47]:
# Find the dotenv file and load its entries into the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Database connection settings; each one is None when the variable is not set.
POSTGRES_HOST = os.environ.get("POSTGRES_HOST")
POSTGRES_DATABASE = os.environ.get("POSTGRES_DATABASE")
POSTGRES_USERNAME = os.environ.get("POSTGRES_USERNAME")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
POSTGRES_SERVER = os.environ.get("POSTGRES_SERVER")
POSTGRES_PORT = os.environ.get("POSTGRES_PORT")

##### Query for getting the meteorological data from the database

In [48]:
# USER INPUTS HERE
STATION_ID = 'IPARAA10'  # Weather station ID as indicated in Wunderground

# Query for retrieving the measurements of the selected station, together with
# date/time parts (date, month, day, hour, minute) extracted from obs_time_local.
# Use single quotation marks in the query for the column values, like the one in the WHERE clause.
# NOTE: STATION_ID is interpolated directly into the SQL text, so keep it a
# trusted constant set in this notebook; never fill it from untrusted input.
query = f"""
SELECT
	obs_time_local,
    DATE(obs_time_local) AS date,
    CAST(EXTRACT(MONTH FROM obs_time_local) AS INTEGER) AS month,
    CAST(EXTRACT(DAY FROM obs_time_local) AS INTEGER) AS day,
    CAST(EXTRACT(HOUR FROM obs_time_local) AS INTEGER) AS hour,
    CAST(EXTRACT(MINUTE FROM obs_time_local) AS INTEGER) AS minute,
	qc_status,
	solar_radiation_high,
	uv_high,
	dew_point_avg,
	temperature_avg,
	humidity_avg,
	heat_index_avg,
	pressure_max,
	pressure_trend,
	wind_direction_avg,
	wind_chill_avg,
	wind_gust_avg,
	wind_speed_avg,
	precipitation_rate,
	precipitation_total
FROM public."measurements"

WHERE station_id = '{STATION_ID}'
"""

##### Run the query and get the data from the local database

In [49]:
# Assemble the PostgreSQL connection URI from the settings read from the environment.
# NOTE(review): the credentials are inserted as-is; a password containing
# characters such as '@', ':' or '/' would need to be URL-encoded first.
credentials = f"{POSTGRES_USERNAME}:{POSTGRES_PASSWORD}"
endpoint = f"{POSTGRES_SERVER}:{POSTGRES_PORT}/{POSTGRES_DATABASE}"
connection_url = f"postgres://{credentials}@{endpoint}"

# Read the query result, order it chronologically, and attach to every row the
# timestamp of the row before it (null for the first row).
df = (
    pl.read_database_uri(query=query, uri=connection_url)
    .sort("obs_time_local")
    .with_columns(
        previous_obs_time_local=pl.col("obs_time_local").shift(1)
    )
)

df.head(10)

obs_time_local,date,month,day,hour,minute,qc_status,solar_radiation_high,uv_high,dew_point_avg,temperature_avg,humidity_avg,heat_index_avg,pressure_max,pressure_trend,wind_direction_avg,wind_chill_avg,wind_gust_avg,wind_speed_avg,precipitation_rate,precipitation_total,previous_obs_time_local
datetime[ns],date,i32,i32,i32,i32,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns]
2023-03-13 00:04:53,2023-03-13,3,13,0,4,1,0.0,0.0,21.0,26.0,77.0,26.0,1015.58,0.0,104.0,26.0,10.0,9.0,0.0,0.0,
2023-03-13 00:09:52,2023-03-13,3,13,0,9,1,0.0,0.0,21.0,26.0,77.0,26.0,1015.58,0.0,107.0,26.0,11.0,11.0,0.0,0.0,2023-03-13 00:04:53
2023-03-13 00:14:52,2023-03-13,3,13,0,14,1,0.0,0.0,21.0,25.0,77.0,26.0,1015.58,0.0,95.0,25.0,11.0,10.0,0.0,0.0,2023-03-13 00:09:52
2023-03-13 00:19:52,2023-03-13,3,13,0,19,1,0.0,0.0,21.0,25.0,77.0,26.0,1015.58,0.0,103.0,25.0,11.0,10.0,0.0,0.0,2023-03-13 00:14:52
2023-03-13 00:24:52,2023-03-13,3,13,0,24,1,0.0,0.0,21.0,25.0,78.0,26.0,1015.58,0.0,81.0,25.0,8.0,7.0,0.0,0.0,2023-03-13 00:19:52
2023-03-13 00:29:52,2023-03-13,3,13,0,29,1,0.0,0.0,21.0,25.0,77.0,26.0,1015.58,0.0,90.0,25.0,9.0,9.0,0.0,0.0,2023-03-13 00:24:52
2023-03-13 00:34:52,2023-03-13,3,13,0,34,1,0.0,0.0,21.0,25.0,77.0,26.0,1015.58,0.0,92.0,25.0,10.0,10.0,0.0,0.0,2023-03-13 00:29:52
2023-03-13 00:39:52,2023-03-13,3,13,0,39,1,0.0,0.0,21.0,25.0,77.0,26.0,1015.58,0.0,88.0,25.0,9.0,9.0,0.0,0.0,2023-03-13 00:34:52
2023-03-13 00:44:52,2023-03-13,3,13,0,44,1,0.0,0.0,21.0,25.0,76.0,26.0,1015.58,0.0,93.0,25.0,13.0,12.0,0.0,0.0,2023-03-13 00:39:52
2023-03-13 00:49:52,2023-03-13,3,13,0,49,1,0.0,0.0,21.0,25.0,77.0,26.0,1015.58,0.0,93.0,25.0,12.0,11.0,0.0,0.0,2023-03-13 00:44:52


##### Flag rows that are about 5 minutes apart from the previous row

In [60]:
# Gap between each observation and the one before it (null for the first row).
time_difference = df["obs_time_local"] - df["previous_obs_time_local"]

# Timestamps in obs_time_local drift by a second or so (e.g. 00:04:53 followed
# by 00:09:52), so any gap from 4 to 5 minutes inclusive is accepted instead of
# requiring exactly 5 minutes.
shortest_gap = pl.duration(minutes=4)
longest_gap = pl.duration(minutes=5)
gap_is_regular = (time_difference >= shortest_gap) & (time_difference <= longest_gap)

# Rows are only flagged here; none are filtered out.
df = df.with_columns(gap_is_regular.alias("is_5_minutes_interval"))



##### Add indicator columns for rainfall 10, 20, and 30 minutes ahead

In [65]:
# TODO: Do more QA here to make sure there are no logical errors in the implementation.
# TODO: For improvements, instead of checking the rainfall in the next 10, 20, and 30 minutes, check WITHIN the next 10, 20, and 30 minutes.

# Minimum precipitation_rate that counts as rain (2.5 = light rains per PAGASA).
RAIN_RATE_THRESHOLD = 2.5


def _rain_ahead_label(steps: int, name: str) -> pl.Expr:
    """Build the indicator column for rain `steps` rows ahead of each row.

    The row `steps` positions ahead is only `steps` x 5 minutes ahead when every
    row in between (and the target row itself) follows its predecessor at the
    regular interval, so the spacing check looks at the flags of the rows
    *after* the current one rather than at the current row's own flag.

    Args:
        steps: Number of rows to look ahead (2 rows ~ 10 minutes).
        name: Name of the resulting column.

    Returns:
        An expression yielding
        1  when the row `steps` ahead has rain and the rows up to it are regularly spaced,
        -1 when the row `steps` ahead has rain but the spacing is irregular
           (the row is not reliably `steps` x 5 minutes ahead),
        0  otherwise (no rain, or no row that far ahead).
    """
    rain_ahead = pl.col("precipitation_rate").shift(-steps) >= RAIN_RATE_THRESHOLD
    # Flags of rows i+1 .. i+steps; each flag describes the gap to that row's predecessor.
    evenly_spaced = pl.all_horizontal(
        [pl.col("is_5_minutes_interval").shift(-k) for k in range(1, steps + 1)]
    )
    return (
        pl.when(evenly_spaced & rain_ahead)
        .then(1)
        .when(~evenly_spaced & rain_ahead)
        .then(-1)
        .otherwise(0)
        .alias(name)
    )


with_rainfall_df = df.with_columns(
    _rain_ahead_label(2, "rain_in_10_minutes"),
    _rain_ahead_label(4, "rain_in_20_minutes"),
    _rain_ahead_label(6, "rain_in_30_minutes"),
)

# Checking out some days with rainfall to see if there are no wrong labelling
with_rainfall_df.filter((pl.col("date") == pl.date(year=2023, month=9, day=28)) & (pl.col("rain_in_10_minutes") == 1))

obs_time_local,date,month,day,hour,minute,qc_status,solar_radiation_high,uv_high,dew_point_avg,temperature_avg,humidity_avg,heat_index_avg,pressure_max,pressure_trend,wind_direction_avg,wind_chill_avg,wind_gust_avg,wind_speed_avg,precipitation_rate,precipitation_total,previous_obs_time_local,is_5_minutes_interval,rain_in_10_minutes,rain_in_20_minutes,rain_in_30_minutes
datetime[ns],date,i32,i32,i32,i32,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,datetime[ns],bool,i32,i32,i32
2023-09-28 15:59:53,2023-09-28,9,28,15,59,1,32.5,0.0,26.2,29.4,82.9,36.6,1008.81,0.0,24.0,29.4,9.9,9.0,0.0,0.0,2023-09-28 15:54:53,True,1,1,1
2023-09-28 16:04:53,2023-09-28,9,28,16,4,1,6.0,0.0,25.8,28.0,87.7,33.6,1009.48,8.5,24.0,28.0,8.1,7.5,0.0,0.0,2023-09-28 15:59:53,True,1,1,1
2023-09-28 16:09:53,2023-09-28,9,28,16,9,1,3.7,0.0,25.9,27.5,90.8,32.7,1009.48,0.0,37.0,27.5,8.4,7.6,16.76,2.79,2023-09-28 16:04:53,True,1,1,1
2023-09-28 16:14:53,2023-09-28,9,28,16,14,1,3.3,0.0,25.4,26.2,94.8,28.8,1009.48,0.0,35.0,26.2,10.4,9.2,32.0,5.33,2023-09-28 16:09:53,True,1,1,1
2023-09-28 16:19:53,2023-09-28,9,28,16,19,1,4.3,0.0,25.3,25.9,96.6,28.0,1009.48,0.0,32.0,25.9,8.7,7.8,50.29,11.43,2023-09-28 16:14:53,True,1,1,1
2023-09-28 16:24:53,2023-09-28,9,28,16,24,1,6.3,0.0,25.2,25.6,98.2,27.0,1009.48,0.0,50.0,25.6,10.7,9.6,79.25,19.05,2023-09-28 16:19:53,True,1,1,1
2023-09-28 16:29:53,2023-09-28,9,28,16,29,1,6.2,0.0,25.2,25.3,99.0,26.5,1009.48,0.0,40.0,25.3,8.4,7.8,92.96,27.43,2023-09-28 16:24:53,True,1,1,1
2023-09-28 16:34:53,2023-09-28,9,28,16,34,1,4.8,0.0,25.3,25.4,99.0,26.6,1010.5,12.83,42.0,25.4,3.9,3.8,109.73,37.85,2023-09-28 16:29:53,True,1,1,1
2023-09-28 16:39:53,2023-09-28,9,28,16,39,1,5.1,0.0,24.5,24.6,99.0,25.7,1010.5,0.0,297.0,24.6,3.5,3.5,123.44,47.5,2023-09-28 16:34:53,True,1,1,1
2023-09-28 16:44:53,2023-09-28,9,28,16,44,1,5.2,0.0,24.5,24.5,99.0,25.6,1010.5,0.0,222.0,24.5,3.0,3.0,117.35,50.8,2023-09-28 16:39:53,True,1,1,1
