# In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA

# Load the datasets
final_cleaned_data = pd.read_csv("final_cleaned_data.csv")
synthetic_data = pd.read_csv("ms_delta_synthetic_wells.csv")

# Align the synthetic column names with the real data *before* computing the
# overlap; otherwise the renamed columns can never be part of the intersection
# (and selecting a pre-rename name from the renamed frame raises KeyError).
synthetic_data_renamed = synthetic_data.rename(
    columns={"Well_UUID": "Well_ID", "latitude": "lat_dec", "longitude": "long_dec"}
)

# Prepare the data by focusing on overlapping columns.
# A list (in the real data's column order) is used instead of a set: pandas
# rejects sets as indexers, and a list keeps the column order deterministic.
overlapping_columns = [
    column
    for column in final_cleaned_data.columns
    if column in synthetic_data_renamed.columns
]
# Explicit copy so later in-place column assignments on this frame are
# unambiguous and do not trigger SettingWithCopyWarning.
final_cleaned_data_filtered = final_cleaned_data[overlapping_columns].copy()
synthetic_data_filtered = synthetic_data_renamed[overlapping_columns]
merged_data = pd.concat(
    [final_cleaned_data_filtered, synthetic_data_filtered], ignore_index=True
)
merged_data["Date"] = pd.to_datetime(merged_data["Date"])

# Aggregating data on a monthly basis.
# numeric_only=True skips non-numeric columns (e.g. identifiers) that would
# otherwise make the mean raise; months with any missing value are dropped.
merged_data_monthly = (
    merged_data.resample("M", on="Date").mean(numeric_only=True).dropna()
)

# Splitting the data into training and testing sets.
# shuffle=False keeps chronological order: the last 20% of months is the test set.
train, test = train_test_split(merged_data_monthly, test_size=0.2, shuffle=False)


# Define a function to fit and evaluate ARIMA model
def fit_arima(train, test, order):
    """Fit an ARIMA model on ``train`` and score its forecast against ``test``.

    Args:
        train: Series the ARIMA model is fitted on.
        test: Held-out series; its length sets the forecast horizon and its
            values are the ground truth for the error metric.
        order: ARIMA order passed straight through to ``ARIMA``.

    Returns:
        A ``(mse, predictions)`` tuple: the mean squared error between ``test``
        and the forecast, and the forecast itself.
    """
    fitted = ARIMA(train, order=order).fit()
    # Forecast exactly as many steps as there are held-out observations.
    forecast = fitted.forecast(steps=len(test))
    error = mean_squared_error(test, forecast)
    return error, forecast


# ARIMA model parameters
order = (5, 1, 0)

# Fitting and evaluating the model on merged data
mse, predictions = fit_arima(
    train["GW_measurement_smoothed"], test["GW_measurement_smoothed"], order
)

# Preparing and evaluating the model on real data.
# Rebind to an explicit copy before assigning the parsed dates, so the write
# does not target a slice of the original frame (SettingWithCopyWarning).
final_cleaned_data_filtered = final_cleaned_data_filtered.copy()
final_cleaned_data_filtered["Date"] = pd.to_datetime(
    final_cleaned_data_filtered["Date"]
)
# numeric_only=True skips non-numeric columns that would otherwise make the
# monthly mean raise; months with any missing value are dropped.
real_data_monthly = (
    final_cleaned_data_filtered.set_index("Date")
    .resample("M")
    .mean(numeric_only=True)
    .dropna()
)
# shuffle=False keeps chronological order for the train/test split.
train_real, test_real = train_test_split(
    real_data_monthly, test_size=0.2, shuffle=False
)
mse_real, predictions_real = fit_arima(
    train_real["GW_measurement_smoothed"], test_real["GW_measurement_smoothed"], order
)

# Display the MSE for both models
print("MSE with Merged Data: ", mse)
print("MSE with Real Data: ", mse_real)