In [16]:
import numpy as np
from gplearn.genetic import SymbolicRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [17]:
# Synthetic demo data: a noisy quadratic, 3*x^2 - 2*x + 5 + N(0, 10), sampled at
# 100 points drawn uniformly from [-10, 10]. The global NumPy RNG is seeded so
# the draws are repeatable.
np.random.seed(42)
x = np.random.uniform(low=-10, high=10, size=(100, 1))  # single input feature, one column
feature = x[:, 0]
noise = np.random.normal(loc=0, scale=10, size=100)
y = 3 * feature**2 - 2 * feature + 5 + noise  # target variable

In [18]:
# Load the problem arrays. np.load on an .npz returns an archive object that
# keeps the file open, so read both arrays inside a context manager and let it
# close the handle afterwards.
with np.load('./data/problem_2.npz') as problem:
    # Transposed so that axis 0 is what train_test_split / fit split on below;
    # presumably the file stores x as (features, samples) - TODO confirm.
    x = problem['x'].T
    y = problem['y']

# Fail fast if features and targets do not describe the same number of rows.
assert x.shape[0] == y.shape[0], f"x has {x.shape[0]} rows but y has {y.shape[0]}"

x.shape, y.shape

((5000, 3), (5000,))

In [None]:
# Fit on a 10% subsample (presumably to keep the evolutionary search time
# manageable) and keep the remaining 90% as a hold-out instead of discarding it,
# so the model can be scored on rows it never saw.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.1, random_state=42
)

# Define the symbolic regressor
est = SymbolicRegressor(
    population_size=2000,
    generations=10,
    stopping_criteria=0.01,
    # The four genetic-operation probabilities below sum to exactly 1.0.
    p_crossover=0.98,
    p_subtree_mutation=0.01,
    p_point_mutation=0.01,
    p_hoist_mutation=0.00,
    max_samples=0.9,
    verbose=1,
    parsimony_coefficient=0.01,
    random_state=42
)

# Fit the model
est.fit(x_train, y_train)

# Predict on the FULL dataset (training rows included), not on a test set.
# y_pred therefore has one entry per row of x / y.
y_pred = est.predict(x)

# Print the resulting formula
print("Best formula:", est._program)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    33.27      1.53874e+07        7      4.65104e+06      6.09778e+06      3.68m
   1    35.39      4.79581e+06       63       4.6409e+06       6.1891e+06      3.77m
   2    37.15      4.79825e+06      125      4.63972e+06      6.19959e+06      3.38m
   3    34.00       4.7993e+06        3      4.67991e+06      5.83795e+06      3.33m
   4    32.81      4.79697e+06        3      4.68725e+06      5.77188e+06      3.76m


KeyboardInterrupt: 

In [None]:
# Evaluate: mean squared error between the targets y and the predictions y_pred.
# y_pred was produced for every row of x, so this score covers the whole
# dataset, including the rows the model was trained on; it is not a
# validation-set score, and the label below says so.
# NOTE(review): the recorded fit above ended in KeyboardInterrupt, so re-run
# this cell only after a completed fit to avoid scoring a stale y_pred.
mse = mean_squared_error(y, y_pred)
print(f"Mean Squared Error on full dataset (training rows included): {mse}")

Mean Squared Error on Validations Set: 23.573815744412034
