# Explore here

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np


In [4]:
# Load the dataset
df = pd.read_csv('../data/raw/Aquifer_Auser.csv')

# Explore the dataset
print(df.head())  # Preview the data
print(df.info())  # Check for missing values and data types


         Date  Rainfall_Gallicano  Rainfall_Pontetetto  Rainfall_Monte_Serra  \
0  05/03/1998                 NaN                  NaN                   NaN   
1  06/03/1998                 NaN                  NaN                   NaN   
2  07/03/1998                 NaN                  NaN                   NaN   
3  08/03/1998                 NaN                  NaN                   NaN   
4  09/03/1998                 NaN                  NaN                   NaN   

   Rainfall_Orentano  Rainfall_Borgo_a_Mozzano  Rainfall_Piaggione  \
0                NaN                       NaN                 NaN   
1                NaN                       NaN                 NaN   
2                NaN                       NaN                 NaN   
3                NaN                       NaN                 NaN   
4                NaN                       NaN                 NaN   

   Rainfall_Calavorno  Rainfall_Croce_Arcana  \
0                 NaN                    NaN   
1 

In [5]:
# Select features (X) and target variable (y)
X = df[['Rainfall_Gallicano', 'Rainfall_Pontetetto', 'Temperature_Monte_Serra']]  # Example features
y = df['Volume_POL']  # Target variable


In [6]:
# Create an imputer to fill missing values with the median
imputer = SimpleImputer(strategy='median')


In [7]:
# Fill missing values in the target variable (y_train)
y_filled = y.fillna(y.median())


In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_filled, test_size=0.2, random_state=42)

# Fit the imputer on the training data and transform both the training and test sets
X_train_imputed = imputer.fit_transform(X_train)  # Fit on train data and transform it
X_test_imputed = imputer.transform(X_test)  # Transform test data based on the training data's statistics


In [9]:
# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model on the imputed training data
lr_model.fit(X_train_imputed, y_train)


In [10]:
# Make predictions on the imputed test data
y_pred = lr_model.predict(X_test_imputed)


In [11]:
# Calculate the Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Output the result
print(f'Linear Regression RMSE: {rmse}')


Linear Regression RMSE: 1805.2881009816929
