In [0]:
from src.utils.locate_filepaths import storage_filepaths
from pyspark.sql.functions import col,avg, min, unix_timestamp
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import matplotlib.pyplot as plt

paths = storage_filepaths()

# Both model inputs are Delta tables stored under the 'gold' storage path.
groundwater_df = spark.read.load(f"{paths['gold']}/groundwater_for_ml", format='delta')
climate_df = spark.read.load(f"{paths['gold']}/climate_for_ml", format='delta')

# Optional sanity check of the loaded frames:
# display(groundwater_df)
# display(climate_df)

In [0]:
# Render the loaded groundwater frame so its columns and rows can be inspected
# before a station is selected in the next cell.
display(groundwater_df)

In [0]:

# Restrict the groundwater data to a single station: each ML model is trained per station
# because different soil stratification is expected to give different groundwater behaviour.
station_id_applied = '187.199'

# Number of days, counted from the first joined observation, that are dropped before training.
# NOTE(review): presumably a warm-up period for lagged/rolling climate features - confirm.
SKIPPED_INITIAL_DAYS = 200
SECONDS_PER_DAY = 24 * 3600

groundwater_df_filtered = (
    groundwater_df
    .filter(col('station_id') == station_id_applied)
    .select('observed_date', 'daily_avg_waterlevel')
)

# Attach the measured water level (the label) to the climate rows of the same date.
# NOTE: this rebinds climate_df, so re-running this cell without re-running the load cell
# would join the water level onto an already-joined frame.
climate_df = climate_df.join(groundwater_df_filtered, on='observed_date', how='inner')

# Numeric (epoch seconds) version of the date, used for the cut-off below.
climate_df = climate_df.withColumn("date_ts", unix_timestamp("observed_date"))

# `min` is the Spark aggregate imported from pyspark.sql.functions, not the Python builtin.
first_observation_day_ts = climate_df.select(min("date_ts")).first()[0]

# The aggregate is None when the join produced no rows (unknown station id, or no overlapping
# dates); fail with a clear message instead of a TypeError on `None + int`.
if first_observation_day_ts is None:
    raise ValueError(
        f"No joined climate/groundwater rows for station {station_id_applied!r}"
    )

cutoff_ts = first_observation_day_ts + SKIPPED_INITIAL_DAYS * SECONDS_PER_DAY
climate_df = climate_df.filter(col('date_ts') > cutoff_ts)



In [0]:
# Model inputs: every column of climate_df except the date and the label.
# NOTE(review): 'date_ts' (added to climate_df in the previous cell) is not excluded,
# so it is part of the feature vector - confirm this is intended.
feature_cols = [
    name
    for name in climate_df.columns
    if name not in ('observed_date', 'daily_avg_waterlevel')
]

# Pack the individual feature columns into the single vector column Spark ML expects.
assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)
df_vector = assembler.transform(climate_df)

# 80/20 random train/test split; the fixed seed keeps the split reproducible.
train_df, test_df = df_vector.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit an ordinary linear regression of the daily average water level on the features.
lr = LinearRegression(labelCol='daily_avg_waterlevel', featuresCol='features')
lr_model = lr.fit(train_df)

# Score the held-out rows; adds a 'prediction' column.
predictions = lr_model.transform(test_df)

In [0]:
# Plot measured and predicted groundwater level (y-axis) against observed date (x-axis)
# for the held-out test rows, and report the matching evaluation metrics.
row = groundwater_df.filter(col("station_id") == station_id_applied).first()
location = row["station_name"]

# lr_model.summary describes the fit on train_df, but the curves below are test predictions.
# Evaluate on test_df so the reported R2/RMSE refer to the same rows that are plotted.
test_summary = lr_model.evaluate(test_df)

plot_df = (
    predictions
    .select("observed_date", "daily_avg_waterlevel", "prediction", "7days_avg_rain")
    .toPandas()
    # Rows coming out of the join/randomSplit have no guaranteed order; sort them so the
    # line plot runs chronologically instead of zig-zagging between dates.
    .sort_values("observed_date")
    .reset_index(drop=True)
)

fig, ax = plt.subplots(figsize=(5, 3))
ax.plot(plot_df['observed_date'], plot_df['daily_avg_waterlevel'], label='Measured value')
ax.plot(plot_df['observed_date'], plot_df['prediction'], label='Prediction')
ax.set_title(f'Groundwater Prediction, Accuracy Plot\nStation: {location}\nR2={test_summary.r2:0.2f}')
ax.legend()
ax.set_xlabel('Date')
ax.set_ylabel('Groundwater level [m]')
ax.grid()

# Print evaluation metrics (computed on the test set)
print(f"RMSE: {test_summary.rootMeanSquaredError}")
print(f"R2: {test_summary.r2}")
