In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
data = pd.read_excel('data/Data_.xlsx')

# Split the data into features (X) and target (y)
X = data.drop(columns=['EC'])
y = data['EC']

# Check for missing values and handle them
print(X.isnull().sum())  # Should be zero
print(y.isnull().sum())  # Should be zero

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Verify shapes
print(X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape)


HCO3    0
Na      0
Mg      0
Cl      0
K       0
dtype: int64
0
(57, 5) (15, 5) (57,) (15,)


In [12]:
import statsmodels.api as sm

# Add a constant term for intercept
X_train_scaled = sm.add_constant(X_train_scaled)
X_test_scaled = sm.add_constant(X_test_scaled)

# Define a simple nonlinear model (this is just an example)
def nonlinear_model(params, x):
    # params contains the parameters of the model
    # x contains the independent variables
    return params[0] + params[1] * np.sin(x[:, 0]) + params[2] * np.exp(x[:, 1])

# Define residuals for optimization
def residuals(params, y, x):
    return y - nonlinear_model(params, x)

# Initial guess for the parameters
initial_params = np.ones(X_train_scaled.shape[1])

# Fit the model
from scipy.optimize import least_squares
result = least_squares(residuals, initial_params, args=(y_train, X_train_scaled))

# Extract fitted parameters
fitted_params = result.x

# Make predictions
y_train_pred_hwm = nonlinear_model(fitted_params, X_train_scaled)
y_test_pred_hwm = nonlinear_model(fitted_params, X_test_scaled)

# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

mse_train_hwm = mean_squared_error(y_train, y_train_pred_hwm)
r2_train_hwm = r2_score(y_train, y_train_pred_hwm)
mse_test_hwm = mean_squared_error(y_test, y_test_pred_hwm)
r2_test_hwm = r2_score(y_test, y_test_pred_hwm)

print(f'Training MSE: {mse_train_hwm}, Training R²: {r2_train_hwm}')
print(f'Testing MSE: {mse_test_hwm}, Testing R²: {r2_test_hwm}')


Training MSE: 8057933.438965175, Training R²: 0.003833428437368669
Testing MSE: 5440275.988880813, Testing R²: -0.07328670389369707


In [14]:
# Create a DataFrame to save actual vs predicted values
train_results_hwm = pd.DataFrame({
    'Actual': y_train,
    'Predicted': y_train_pred_hwm
})

test_results_hwm = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_test_pred_hwm
})

# Save to Excel
with pd.ExcelWriter('results/hwm_actual_vs_predicted.xlsx') as writer:
    train_results_hwm.to_excel(writer, sheet_name='Train', index=False)
    test_results_hwm.to_excel(writer, sheet_name='Test', index=False)
