<a href="https://colab.research.google.com/github/MO230101/The-codes-for-hydrogel-study-/blob/main/Symbolic_learn_for_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# The code for swelling
import pandas as pd
import numpy as np
!pip install gplearn
from gplearn.genetic import SymbolicRegressor
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
import random
from gplearn.functions import make_function

# Fix the seeds of NumPy's and Python's global random number generators.
random_seed = 42
np.random.seed(random_seed)
random.seed(random_seed)

# Load the data. 'CA1' is the target column; 'Unnamed: 0' is dropped from the
# features as well whenever the CSV contains it.
data = pd.read_csv('file name.csv')
y = data['CA1']
X = data.drop(
    columns=['CA1', 'Unnamed: 0'] if 'Unnamed: 0' in data.columns else ['CA1']
)

# Split the data: 70 % train / 30 % test, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_seed,
)

# NOTE: no scaling is applied; despite the "_scaled" names these are plain copies.
X_train_scaled_df, X_test_scaled_df = X_train.copy(), X_test.copy()

# Custom function definitions (including guards against NaN results)
def protected_square(x):
    """Return ``x**2`` element-wise where ``|x| < 1e10`` and ``1e10`` elsewhere.

    ``np.where`` evaluates both branches for every element, so squaring the
    raw input would overflow (and emit a RuntimeWarning) for huge values even
    though those entries are replaced by the cap. Out-of-range entries are
    therefore masked to 0 before squaring.
    """
    in_range = np.abs(x) < 1e10
    # Entries that will be capped are squared as 0.0, which cannot overflow.
    safe_x = np.where(in_range, x, 0.0)
    return np.where(in_range, safe_x ** 2, 1e10)

def multiply2(x1, x2):
    """Return the product of the two arguments (``x1 * x2``)."""
    product = x1 * x2
    return product

def safe_divide(x1, x2):
    """Return ``x1 / x2`` element-wise, or ``1.0`` where ``|x2| < 1e-6``.

    ``np.where`` evaluates both branches for every element, so dividing by
    the raw denominator triggers divide-by-zero / invalid-value
    RuntimeWarnings for entries that are discarded anyway. Near-zero
    denominators are therefore replaced by 1.0 before dividing.
    """
    near_zero = np.abs(x2) < 1e-6
    # The substituted 1.0 only affects entries whose quotient is thrown away.
    safe_denominator = np.where(near_zero, 1.0, x2)
    return np.where(near_zero, 1.0, x1 / safe_denominator)

def multiply3(x1, x2, x3):
    """Return the product of the three arguments, evaluated left to right."""
    first_two = x1 * x2
    return first_two * x3

def protected_exp(x):
    """Return ``exp(x)`` element-wise where ``x < 100`` and ``1e10`` elsewhere.

    ``np.where`` evaluates both branches for every element, so calling
    ``np.exp`` on the raw input overflows (and warns) for very large values
    even though those entries are replaced by the finite cap. The exponent is
    therefore masked to 0 wherever the cap applies.
    """
    below_limit = x < 100
    # exp(0) is harmless; the masked entries are overwritten with the cap below.
    safe_x = np.where(below_limit, x, 0.0)
    return np.where(below_limit, np.exp(safe_x), 1e10)

def protected_log(x):
    """Return ``log(x)`` element-wise where ``x > 0`` and ``-1e10`` elsewhere.

    ``np.where`` evaluates both branches for every element, so calling
    ``np.log`` on the raw input emits divide-by-zero / invalid-value
    RuntimeWarnings for non-positive entries even though they are replaced
    by the large negative constant. Those entries are masked to 1 first.
    """
    positive = x > 0
    # log(1) is harmless; the masked entries are overwritten with -1e10 below.
    safe_x = np.where(positive, x, 1.0)
    return np.where(positive, np.log(safe_x), -1e10)

# Wrap the custom functions with make_function so gplearn can use them.
square_function = make_function(function=protected_square, arity=1, name='square')
multiply2_function = make_function(function=multiply2, arity=2, name='mul2')
safe_divide_function = make_function(function=safe_divide, arity=2, name='div')
multiply3_function = make_function(function=multiply3, arity=3, name='mul3')
exp_function = make_function(function=protected_exp, arity=1, name='exp')
log_function = make_function(function=protected_log, arity=1, name='log')

# Function set: gplearn functions referenced by name plus the custom wrappers
# above. No sqrt function is included.
function_set = ['add', 'sub', 'mul', 'abs', log_function, 'inv', 'neg',
                square_function,
                multiply2_function, safe_divide_function,
                multiply3_function,
                exp_function]

# Search space. 'n_features_to_select' is consumed by SelectKBest; the other
# keys are forwarded to SymbolicRegressor. Every entry has a single candidate,
# so this grid contains exactly one parameter combination.
param_grid_improved = {
    'n_features_to_select': [10],
    'function_set': [function_set],
    'population_size': [5000],
    'generations': [50],
    'tournament_size': [5],
    'stopping_criteria': [0.05],
    'p_crossover': [0.85],
    'p_subtree_mutation': [0.05],
    'p_hoist_mutation': [0.05],
    'p_point_mutation': [0.05],
    'max_samples': [1.0],
    'parsimony_coefficient': [0.01],
   }

# Parameter sampling
n_iter = 5
param_sampler = ParameterSampler(param_grid_improved, n_iter=n_iter, random_state=random_seed)
# With an all-list grid the sampler can yield fewer than n_iter settings, so
# the progress output uses the sampler's own length instead of n_iter.
n_trials = len(param_sampler)

# Best-trial state, declared up front so every summary name is defined.
best_score = -1.0
best_params = None
best_estimator = None
best_features = None
best_precision = None
best_recall = None
best_f1 = None
best_auc = None
all_results = []

for i, params in enumerate(param_sampler):
    print(f"\nTrial {i+1}/{n_trials} with parameters:\n{params}")

    # Feature selection: fitted on the training split only, then applied to
    # the test split.
    selector = SelectKBest(score_func=f_regression, k=params['n_features_to_select'])
    X_train_sel = selector.fit_transform(X_train_scaled_df, y_train)
    X_test_sel = selector.transform(X_test_scaled_df)
    selected_features = X_train.columns[selector.get_support(indices=True)].tolist()

    # Build the model
    est = SymbolicRegressor(
        random_state=random_seed,
        function_set=params['function_set'],
        metric='mse',
        population_size=params['population_size'],
        generations=params['generations'],
        tournament_size=params['tournament_size'],
        stopping_criteria=params['stopping_criteria'],
        p_crossover=params['p_crossover'],
        p_subtree_mutation=params['p_subtree_mutation'],
        p_hoist_mutation=params['p_hoist_mutation'],
        p_point_mutation=params['p_point_mutation'],
        max_samples=params['max_samples'],
        parsimony_coefficient=params['parsimony_coefficient'],
        n_jobs=-1,
        feature_names=selected_features
    )

    est.fit(X_train_sel, y_train)
    y_pred = est.predict(X_test_sel)
    # The regressor output is continuous; threshold it at 0.5 to obtain the
    # class labels needed by the classification metrics.
    y_pred_binary = (y_pred > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    # AUC and R^2 are computed from the raw (unthresholded) predictions.
    auc = roc_auc_score(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    all_results.append({
        'params': params,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc,
        'r2_score': r2,
        'features': selected_features,
        'equation': str(est._program)
    })

    # NOTE(review): the best trial is chosen by accuracy on the test split, so
    # the reported test metrics are optimistically biased; a separate
    # validation split would avoid this.
    if acc > best_score:
        best_score = acc
        best_params = params
        best_estimator = est
        best_features = selected_features
        best_precision = precision
        best_recall = recall
        best_f1 = f1
        best_auc = auc

# Best model results
print("\nBest Model Summary:")
print(f"  Accuracy: {best_score}")
print(f"  Precision: {best_precision}")
print(f"  Recall: {best_recall}")
print(f"  F1-score: {best_f1}")
print(f"  AUC: {best_auc}")
print(f"  Features: {best_features}")
print(f"  Equation: {best_estimator._program}")
print(f"  Parameters: {best_params}")

Collecting gplearn
  Downloading gplearn-0.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading gplearn-0.4.2-py3-none-any.whl (25 kB)
Installing collected packages: gplearn
Successfully installed gplearn-0.4.2

Trial 1/5 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 5000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.05, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 1.0, 'generations': 50, 'function_set': ['add', 'sub', 'mul', 'abs', <gplearn.functions._Function object at 0x7c76191e0110>, 'inv', 'neg', <gplearn.functions._Function object at 0x7c7619b0f050>, <gplearn.functions._Function object at 0x7c761c328390>, <gplearn.functions._Function object at 0x7c761a2cce90>, <gplearn.functions._Function object at 0x7c76191cff10>, <gplearn.functions._Function object at 0x7c76191cffd0>]}


  return np.where(np.abs(x2) < 1e-6, 1.0, x1 / x2)
  return np.where(x > 0, np.log(x), -1e10) # 非正の値を大きな負の数に置換
  return np.where(x > 0, np.log(x), -1e10) # 非正の値を大きな負の数に置換



Best Model Summary:
  Accuracy: 1.0
  Precision: 1.0
  Recall: 1.0
  F1-score: 1.0
  AUC: 1.0
  Features: ['MaxPartialCharge', 'NHOHCount', 'NumHDonors', 'Mobile_water_109ms', 'Mobile_water_118ms', 'Mobile_water_122ms', 'Mobile_water_130ms', 'Mobile_water_146ms', 'Mobile_water_181ms', 'Mobile_water_256ms']
  Equation: exp(mul2(log(exp(log(neg(NHOHCount)))), Mobile_water_181ms))
  Parameters: {'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 5000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.05, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 1.0, 'generations': 50, 'function_set': ['add', 'sub', 'mul', 'abs', <gplearn.functions._Function object at 0x7c76191e0110>, 'inv', 'neg', <gplearn.functions._Function object at 0x7c7619b0f050>, <gplearn.functions._Function object at 0x7c761c328390>, <gplearn.functions._Function object at 0x7c761a2cce90>, <gplearn.functions._Function object at 0x7c7619

  return np.where(x > 0, np.log(x), -1e10) # 非正の値を大きな負の数に置換
  return np.where(x > 0, np.log(x), -1e10) # 非正の値を大きな負の数に置換


In [None]:
# The code for HSQC
import pandas as pd
import numpy as np
!pip install gplearn
from gplearn.genetic import SymbolicRegressor
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
# from sklearn.preprocessing import StandardScaler # 削除
import random
from gplearn.functions import make_function

# Fix the seeds of NumPy's and Python's global random number generators.
random_seed = 42
np.random.seed(random_seed)
random.seed(random_seed)

# Load the data. 'CA1' is the target column; 'Unnamed: 0' is dropped from the
# features as well whenever the CSV contains it.
data = pd.read_csv('file name.csv')
y = data['CA1']
X = data.drop(
    columns=['CA1', 'Unnamed: 0'] if 'Unnamed: 0' in data.columns else ['CA1']
)

# Split the data: 70 % train / 30 % test, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_seed,
)

# NOTE: no scaling is applied; despite the "_scaled" names these are plain copies.
X_train_scaled_df, X_test_scaled_df = X_train.copy(), X_test.copy()

# Custom function definitions (including guards against NaN results)
def protected_square(x):
    """Return ``x**2`` element-wise where ``|x| < 1e10`` and ``1e10`` elsewhere.

    ``np.where`` evaluates both branches for every element, so squaring the
    raw input would overflow (and emit a RuntimeWarning) for huge values even
    though those entries are replaced by the cap. Out-of-range entries are
    therefore masked to 0 before squaring.
    """
    in_range = np.abs(x) < 1e10
    # Entries that will be capped are squared as 0.0, which cannot overflow.
    safe_x = np.where(in_range, x, 0.0)
    return np.where(in_range, safe_x ** 2, 1e10)

def multiply2(x1, x2):
    """Return the product of the two arguments (``x1 * x2``)."""
    product = x1 * x2
    return product

def safe_divide(x1, x2):
    """Return ``x1 / x2`` element-wise, or ``1.0`` where ``|x2| < 1e-6``.

    ``np.where`` evaluates both branches for every element, so dividing by
    the raw denominator triggers divide-by-zero / invalid-value
    RuntimeWarnings for entries that are discarded anyway. Near-zero
    denominators are therefore replaced by 1.0 before dividing.
    """
    near_zero = np.abs(x2) < 1e-6
    # The substituted 1.0 only affects entries whose quotient is thrown away.
    safe_denominator = np.where(near_zero, 1.0, x2)
    return np.where(near_zero, 1.0, x1 / safe_denominator)

def multiply3(x1, x2, x3):
    """Return the product of the three arguments, evaluated left to right."""
    first_two = x1 * x2
    return first_two * x3

def protected_exp(x):
    """Return ``exp(x)`` element-wise where ``x < 100`` and ``1e10`` elsewhere.

    ``np.where`` evaluates both branches for every element, so calling
    ``np.exp`` on the raw input overflows (and warns) for very large values
    even though those entries are replaced by the finite cap. The exponent is
    therefore masked to 0 wherever the cap applies.
    """
    below_limit = x < 100
    # exp(0) is harmless; the masked entries are overwritten with the cap below.
    safe_x = np.where(below_limit, x, 0.0)
    return np.where(below_limit, np.exp(safe_x), 1e10)

def protected_log(x):
    """Return ``log(x)`` element-wise where ``x > 0`` and ``-1e10`` elsewhere.

    ``np.where`` evaluates both branches for every element, so calling
    ``np.log`` on the raw input emits divide-by-zero / invalid-value
    RuntimeWarnings for non-positive entries even though they are replaced
    by the large negative constant. Those entries are masked to 1 first.
    """
    positive = x > 0
    # log(1) is harmless; the masked entries are overwritten with -1e10 below.
    safe_x = np.where(positive, x, 1.0)
    return np.where(positive, np.log(safe_x), -1e10)

# Wrap the custom functions with make_function so gplearn can use them.
square_function = make_function(function=protected_square, arity=1, name='square')
multiply2_function = make_function(function=multiply2, arity=2, name='mul2')
safe_divide_function = make_function(function=safe_divide, arity=2, name='div')
multiply3_function = make_function(function=multiply3, arity=3, name='mul3')
exp_function = make_function(function=protected_exp, arity=1, name='exp')
log_function = make_function(function=protected_log, arity=1, name='log')

# Function set: gplearn functions referenced by name plus the custom wrappers
# above. No sqrt function is included.
function_set = ['add', 'sub', 'mul', 'abs', log_function, 'inv', 'neg',
                square_function,
                multiply2_function, safe_divide_function,
                multiply3_function,
                exp_function]

# Search space. 'n_features_to_select' is consumed by SelectKBest; the other
# keys are forwarded to SymbolicRegressor. Only 'n_features_to_select' has
# more than one candidate, so this grid contains four parameter combinations.
param_grid_improved = {
    'n_features_to_select': [5, 10, 15, 20],
    'function_set': [function_set],
    'population_size': [1000],
    'generations': [500],
    'tournament_size': [5],
    'stopping_criteria': [0.05],
    'p_crossover': [0.95],
    'p_subtree_mutation': [0.01],
    'p_hoist_mutation': [0.01],
    'p_point_mutation': [0.01],
    'max_samples': [1.0],
    'parsimony_coefficient': [0.01],
   }

# Parameter sampling
n_iter = 5
param_sampler = ParameterSampler(param_grid_improved, n_iter=n_iter, random_state=random_seed)
# With an all-list grid the sampler can yield fewer than n_iter settings, so
# the progress output uses the sampler's own length instead of n_iter.
n_trials = len(param_sampler)

# Best-trial state, declared up front so every summary name is defined.
best_score = -1.0
best_params = None
best_estimator = None
best_features = None
best_precision = None
best_recall = None
best_f1 = None
best_auc = None
all_results = []

for i, params in enumerate(param_sampler):
    print(f"\nTrial {i+1}/{n_trials} with parameters:\n{params}")

    # Feature selection: fitted on the training split only, then applied to
    # the test split.
    selector = SelectKBest(score_func=f_regression, k=params['n_features_to_select'])
    X_train_sel = selector.fit_transform(X_train_scaled_df, y_train)
    X_test_sel = selector.transform(X_test_scaled_df)
    selected_features = X_train.columns[selector.get_support(indices=True)].tolist()

    # Build the model
    est = SymbolicRegressor(
        random_state=random_seed,
        function_set=params['function_set'],
        metric='mse',
        population_size=params['population_size'],
        generations=params['generations'],
        tournament_size=params['tournament_size'],
        stopping_criteria=params['stopping_criteria'],
        p_crossover=params['p_crossover'],
        p_subtree_mutation=params['p_subtree_mutation'],
        p_hoist_mutation=params['p_hoist_mutation'],
        p_point_mutation=params['p_point_mutation'],
        max_samples=params['max_samples'],
        parsimony_coefficient=params['parsimony_coefficient'],
        n_jobs=-1,
        feature_names=selected_features
    )

    est.fit(X_train_sel, y_train)
    y_pred = est.predict(X_test_sel)
    # The regressor output is continuous; threshold it at 0.5 to obtain the
    # class labels needed by the classification metrics.
    y_pred_binary = (y_pred > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    # AUC and R^2 are computed from the raw (unthresholded) predictions.
    auc = roc_auc_score(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    all_results.append({
        'params': params,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc,
        'r2_score': r2,
        'features': selected_features,
        'equation': str(est._program)
    })

    # NOTE(review): the best trial is chosen by accuracy on the test split, so
    # the reported test metrics are optimistically biased; a separate
    # validation split would avoid this.
    if acc > best_score:
        best_score = acc
        best_params = params
        best_estimator = est
        best_features = selected_features
        best_precision = precision
        best_recall = recall
        best_f1 = f1
        best_auc = auc

# Best model results
print("\nBest Model Summary:")
print(f"  Accuracy: {best_score}")
print(f"  Precision: {best_precision}")
print(f"  Recall: {best_recall}")
print(f"  F1-score: {best_f1}")
print(f"  AUC: {best_auc}")
print(f"  Features: {best_features}")
print(f"  Equation: {best_estimator._program}")
print(f"  Parameters: {best_params}")


Trial 1/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 20, 'max_samples': 0.9, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}


  return np.where(np.abs(x2) < 1e-6, 1.0, x1 / x2)



Trial 2/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Trial 3/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 5, 'max_samples': 0.9, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Trial 4/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Trial 5/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 20, 'max_samples': 0.9, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Trial 6/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Trial 7/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 5, 'max_samples': 0.9, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Trial 8/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 20, 'max_samples': 0.9, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Trial 9/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 20, 'max_samples': 0.9, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Trial 10/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 5, 'max_samples': 0.9, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}





Best Model Summary:
  Accuracy: 0.75
  Precision: 1.0
  Recall: 0.5
  F1-score: 0.6666666666666666
  AUC: 0.75
  Features: ['PEOE_VSA14', 'SlogP_VSA1', 'fr_COO', 'fr_COO2', 'Mobile_water_93ms', 'Mobile_water_130ms', 'Mobile_water_158ms', 'Mobile_water_195ms', 'Mobile_water_375ms', 'Mobile_chains_191ms']
  Equation: abs(mul2(sub(Mobile_chains_191ms, fr_COO), -0.425))
  Parameters: {'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 4000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7c7604dde4d0>, <gplearn.functions._Function object at 0x7c75f8984490>, <gplearn.functions._Function object at 0x7c75f8985bd0>, <gplearn.functions._Function object at 0x7c75f8985450>]}


In [None]:
# The code for HNCO
import pandas as pd
import numpy as np
!pip install gplearn
from gplearn.genetic import SymbolicRegressor
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
import random
from gplearn.functions import make_function

# Fix the seeds of NumPy's and Python's global random number generators.
random_seed = 42
np.random.seed(random_seed)
random.seed(random_seed)

# Load the data. 'CA1' is the target column; 'Unnamed: 0' is dropped from the
# features as well whenever the CSV contains it.
data = pd.read_csv('file name.csv')
y = data['CA1']
X = data.drop(
    columns=['CA1', 'Unnamed: 0'] if 'Unnamed: 0' in data.columns else ['CA1']
)

# Split the data: 70 % train / 30 % test, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_seed,
)

# Custom function definitions
def protected_square(x):
    """Return ``x**2`` element-wise where ``|x| < 1e10`` and ``1e10`` elsewhere.

    ``np.where`` evaluates both branches for every element, so squaring the
    raw input would overflow (and emit a RuntimeWarning) for huge values even
    though those entries are replaced by the cap. Out-of-range entries are
    therefore masked to 0 before squaring.
    """
    in_range = np.abs(x) < 1e10
    # Entries that will be capped are squared as 0.0, which cannot overflow.
    safe_x = np.where(in_range, x, 0.0)
    return np.where(in_range, safe_x ** 2, 1e10)

def protected_cbrt(x):
    """Return the element-wise cube root of ``x`` via ``np.cbrt``."""
    cube_root = np.cbrt(x)
    return cube_root

def multiply2(x1, x2):
    """Return the product of the two arguments (``x1 * x2``)."""
    product = x1 * x2
    return product

def safe_divide(x1, x2):
    """Return ``x1 / x2`` element-wise, or ``1.0`` where ``|x2| < 1e-6``.

    ``np.where`` evaluates both branches for every element, so dividing by
    the raw denominator triggers divide-by-zero / invalid-value
    RuntimeWarnings for entries that are discarded anyway. Near-zero
    denominators are therefore replaced by 1.0 before dividing.
    """
    near_zero = np.abs(x2) < 1e-6
    # The substituted 1.0 only affects entries whose quotient is thrown away.
    safe_denominator = np.where(near_zero, 1.0, x2)
    return np.where(near_zero, 1.0, x1 / safe_denominator)

# Wrap the custom functions with make_function so gplearn can use them.
square_function = make_function(function=protected_square, arity=1, name='square')
cbrt_function = make_function(function=protected_cbrt, arity=1, name='cbrt')
multiply2_function = make_function(function=multiply2, arity=2, name='mul2')
safe_divide_function = make_function(function=safe_divide, arity=2, name='div')

# Function set: gplearn functions referenced by name, plus the custom
# wrappers above passed directly as function objects.
function_set = ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg',
                square_function, cbrt_function,
                multiply2_function, safe_divide_function]

# Search space. 'n_features_to_select' is consumed by SelectKBest; the other
# keys (except 'max_depth', see below) are forwarded to SymbolicRegressor.
param_grid_improved = {
    'n_features_to_select': [5, 10],
    'function_set': [function_set],
    'population_size': [2000, 3000],
    'generations': [300, 500],
    'tournament_size': [5, 10],
    'stopping_criteria': [0.05],
    'p_crossover': [0.85],
    'p_subtree_mutation': [0.05],
    'p_hoist_mutation': [0.03],
    'p_point_mutation': [0.05],
    'max_samples': [0.9],
    'parsimony_coefficient': [0.001, 0.01],
    # NOTE(review): 'max_depth' is sampled but never passed to
    # SymbolicRegressor below, so trials that differ only in this value are
    # identical runs. It is kept so the sampled parameter sequence does not
    # change; wire it up or drop it from the grid.
    'max_depth': [5, 7, 10],
}

# Parameter sampling
n_iter = 10
param_sampler = ParameterSampler(param_grid_improved, n_iter=n_iter, random_state=random_seed)
# With an all-list grid the sampler can yield fewer than n_iter settings, so
# the progress output uses the sampler's own length instead of n_iter.
n_trials = len(param_sampler)

# Best-trial state, declared up front so every summary name is defined.
best_score = -1.0
best_params = None
best_estimator = None
best_features = None
best_precision = None
best_recall = None
best_f1 = None
best_auc = None
all_results = []

for i, params in enumerate(param_sampler):
    print(f"\nTrial {i+1}/{n_trials} with parameters:\n{params}")

    # Feature selection on the unscaled data: fitted on the training split
    # only, then applied to the test split.
    selector = SelectKBest(score_func=f_regression, k=params['n_features_to_select'])
    X_train_sel = selector.fit_transform(X_train, y_train)
    X_test_sel = selector.transform(X_test)
    selected_features = X_train.columns[selector.get_support(indices=True)].tolist()

    # Build the model
    est = SymbolicRegressor(
        random_state=random_seed,
        function_set=params['function_set'],
        metric='mse',
        population_size=params['population_size'],
        generations=params['generations'],
        tournament_size=params['tournament_size'],
        stopping_criteria=params['stopping_criteria'],
        p_crossover=params['p_crossover'],
        p_subtree_mutation=params['p_subtree_mutation'],
        p_hoist_mutation=params['p_hoist_mutation'],
        p_point_mutation=params['p_point_mutation'],
        max_samples=params['max_samples'],
        parsimony_coefficient=params['parsimony_coefficient'],
        n_jobs=-1,
        feature_names=selected_features
    )

    est.fit(X_train_sel, y_train)
    y_pred = est.predict(X_test_sel)
    # The regressor output is continuous; threshold it at 0.5 to obtain the
    # class labels needed by the classification metrics.
    y_pred_binary = (y_pred > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    # AUC and R^2 are computed from the raw (unthresholded) predictions.
    auc = roc_auc_score(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    all_results.append({
        'params': params,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc,
        'r2_score': r2,
        'features': selected_features,
        'equation': str(est._program)
    })

    # NOTE(review): the best trial is chosen by accuracy on the test split, so
    # the reported test metrics are optimistically biased; a separate
    # validation split would avoid this.
    if acc > best_score:
        best_score = acc
        best_params = params
        best_estimator = est
        best_features = selected_features
        best_precision = precision
        best_recall = recall
        best_f1 = f1
        best_auc = auc

# Best model results
print("\nBest Model Summary:")
print(f"  Accuracy: {best_score}")
print(f"  Precision: {best_precision}")
print(f"  Recall: {best_recall}")
print(f"  F1-score: {best_f1}")
print(f"  AUC: {best_auc}")
print(f"  Features: {best_features}")
print(f"  Equation: {best_estimator._program}")
print(f"  Parameters: {best_params}")

Collecting gplearn
  Downloading gplearn-0.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading gplearn-0.4.2-py3-none-any.whl (25 kB)
Installing collected packages: gplearn
Successfully installed gplearn-0.4.2

Trial 1/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 2000, 'parsimony_coefficient': 0.001, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 5, 'max_samples': 0.9, 'max_depth': 10, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}


  return np.where(np.abs(x2) < 1e-6, 1.0, x1 / x2)



Trial 2/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 2000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'max_depth': 7, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Trial 3/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 2000, 'parsimony_coefficient': 0.001, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'max_depth': 7, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Trial 4/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 3000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'max_depth': 10, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Trial 5/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 2000, 'parsimony_coefficient': 0.001, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 5, 'max_samples': 0.9, 'max_depth': 10, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Trial 6/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 3000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'max_depth': 7, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Trial 7/10 with parameters:
{'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 2000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 5, 'max_samples': 0.9, 'max_depth': 7, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Trial 8/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 3000, 'parsimony_coefficient': 0.001, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'max_depth': 10, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Trial 9/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 2000, 'parsimony_coefficient': 0.001, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 5, 'max_samples': 0.9, 'max_depth': 5, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Trial 10/10 with parameters:
{'tournament_size': 5, 'stopping_criteria': 0.05, 'population_size': 3000, 'parsimony_coefficient': 0.001, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'max_depth': 5, 'generations': 300, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}





Best Model Summary:
  Accuracy: 0.75
  Precision: 0.6666666666666666
  Recall: 0.6666666666666666
  F1-score: 0.6666666666666666
  AUC: 0.7666666666666666
  Features: ['BCUT2D_MWHI', 'PEOE_VSA13', 'VSA_EState10', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'DSC_area', 'DSC_peak_height', 'MSEH2O_0.12992', 'CPMGD2O_119.88928']
  Equation: abs(mul2(fr_Al_COO, VSA_EState10))
  Parameters: {'tournament_size': 10, 'stopping_criteria': 0.05, 'population_size': 2000, 'parsimony_coefficient': 0.01, 'p_subtree_mutation': 0.05, 'p_point_mutation': 0.05, 'p_hoist_mutation': 0.03, 'p_crossover': 0.85, 'n_features_to_select': 10, 'max_samples': 0.9, 'max_depth': 7, 'generations': 500, 'function_set': ['add', 'sub', 'mul', 'abs', 'sqrt', 'log', 'inv', 'neg', <gplearn.functions._Function object at 0x7fb19c1b8450>, <gplearn.functions._Function object at 0x7fb19ae057d0>, <gplearn.functions._Function object at 0x7fb1e8640e90>, <gplearn.functions._Function object at 0x7fb19beaedd0>]}
