In [0]:
from src.utils.locate_filepaths import storage_filepaths

# Look up the storage locations; only the 'gold' entry is used in this cell.
# NOTE(review): presumably storage_filepaths() returns a dict of base paths — confirm.
paths = storage_filepaths()

# Read the two Delta tables stored under the 'gold' location.
groundwater_df = spark.read.load(f"{paths['gold']}/groundwater_for_ml", format='delta')
climate_df = spark.read.load(f"{paths['gold']}/climate_for_ml", format='delta')


In [0]:
# Quick look at the first three rows of the climate table.
# Uncomment the next line to preview the groundwater table as well.
#display(groundwater_df.limit(3))
display(climate_df.limit(3))

In [0]:
from pyspark.sql.functions import col,avg


# Keep a single station's groundwater series, reduced to the join key and the
# water-level column.
# NOTE(review): this cell rebinds groundwater_df and climate_df, so it is not
# re-runnable — a second run would filter on 'station_id', which the select
# below has already dropped. Re-run the load cell first if you need to repeat it.
groundwater_df = (
    groundwater_df
    .where(col('station_id') == '187.199')
    .select('observed_date', 'daily_avg_waterlevel')
)

# Attach the water level to every climate row sharing the same observed_date;
# the inner join drops dates that are missing from either side.
climate_df = climate_df.join(groundwater_df, on='observed_date', how='inner')

# Show the joined frame, newest dates first.
display(climate_df.orderBy(col('observed_date').desc()))




In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Every column except the date key and the label is used as a model input.
feature_cols = [
    name
    for name in climate_df.columns
    if name not in ('observed_date', 'daily_avg_waterlevel')
]

# Pack the input columns into the single vector column Spark ML estimators expect.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df_vector = assembler.transform(climate_df)

# 80/20 random train/test split with a fixed seed.
train_df, test_df = df_vector.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit a linear regression of the water level on the assembled features.
lr = LinearRegression(
    featuresCol='features',
    labelCol='daily_avg_waterlevel',
)
lr_model = lr.fit(train_df)

# Score the held-out rows; transform() adds a 'prediction' column.
predictions = lr_model.transform(test_df)

In [0]:
import matplotlib.pyplot as plt

# Bring the held-out predictions to pandas for plotting.
# The rows come from a random split of a joined frame, so their order is
# arbitrary; sort by date first, otherwise the line plot zig-zags across time.
plot_df = (
    predictions
    .select("observed_date", "daily_avg_waterlevel", "prediction")
    .orderBy("observed_date")
    .toPandas()
)

# Measured vs. predicted water level on the test rows.
fig, ax = plt.subplots(figsize=(5, 3))
ax.plot(plot_df['observed_date'], plot_df['daily_avg_waterlevel'], label='Measured value')
ax.plot(plot_df['observed_date'], plot_df['prediction'], label='Prediction')
ax.set_xlabel('observed_date')
ax.set_ylabel('daily_avg_waterlevel')
ax.set_title('Measured vs. predicted water level (test set)')
ax.legend()
plt.show()

# Print evaluation metrics.
# lr_model.summary describes the fit on the TRAINING data only, so it is shown
# for reference; the held-out metrics come from evaluating on test_df, which
# matches the rows plotted above.
test_summary = lr_model.evaluate(test_df)
print(f"RMSE (test): {test_summary.rootMeanSquaredError}")
print(f"R2 (test): {test_summary.r2}")
print(f"RMSE (train): {lr_model.summary.rootMeanSquaredError}")
print(f"R2 (train): {lr_model.summary.r2}")
