In [1]:
#Data
import pandas as pd
import numpy as np
import os
import pickle

#Plot
import matplotlib.pyplot as plt

#Model
from pysr import PySRRegressor, TemplateExpressionSpec
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

Detected IPython. Loading juliacall extension. See https://juliapy.github.io/PythonCall.jl/stable/compat/#IPython


In [466]:
# # Import data
# with open('data/df_tc.pkl', 'rb') as f:
#     df = pickle.load(f)
# df['K_reduced'] = df['K'] - df['mAKA'] - df['mBKB']

In [467]:
# Import data without glyxerol-methanol, K_reduced
with open('data/df_filtered_reduced.pkl', 'rb') as f:
    df = pickle.load(f)

In [468]:
# # Condition where KB > KA
# swap_mask = df['KB'] > df['KA']

# # Swap values where condition is met
# df.loc[swap_mask, ['KA', 'KB']] = df.loc[swap_mask, ['KB', 'KA']].values
# df.loc[swap_mask, ['mA', 'mB']] = df.loc[swap_mask, ['mB', 'mA']].values

In [None]:
# Condition where KB > KA
swap_mask = df['KB'] > df['KA']

# Swap values where condition is met
df_new.loc[swap_mask, ['KA', 'KB']] = df_new.loc[swap_mask, ['KB', 'KA']].values
df_new.loc[swap_mask, ['mA', 'mB']] = df_new.loc[swap_mask, ['mB', 'mA']].values
df_new.loc[swap_mask, ['Tr_A', 'Tr_B']] = df_new.loc[swap_mask, ['Tr_B', 'Tr_A']].values

In [469]:
df['mix'] = df['compA'] + df['compB']
mixtures = df['mix'].unique()

train_mix, test_mix = train_test_split(mixtures, test_size=0.3, random_state=42)

df = df[df['P'] != 77000]
X_columns = ['mA', 'mB', 'KA', 'KB', 'Tr_A', 'Tr_B', 'mAKA', 'mBKB', 'Tr_mix']
X = df[X_columns]
y = df['K_reduced']

train_df = df[df['mix'].isin(train_mix)]
test_df = df[df['mix'].isin(test_mix)]

X_train = train_df[X_columns]
y_train = train_df['K_reduced']
X_test = test_df[X_columns]
y_test = test_df['K_reduced']

In [470]:
# # Path to the 'outputs' folder
# path_ref = 'logs'

# # Get a list of folder names inside the 'outputs' folder (only from 100 onwards)
# folder_names_ref = os.listdir(path_ref)

# runtotal = len(folder_names_ref) - 1
# red_runs = runtotal - 99

# # Path to the 'outputs' folder
# path = 'outputs'

# # Get a list of folder names inside the 'outputs' folder
# folder_names = [os.path.join(path,f).replace('\\', '/') for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
# folder_names_red = folder_names[-red_runs:]
# folder_names_red

In [471]:
# Read linux outputs
# Path to the 'outputs' folder
path = 'linux_outputs/10K_red'
# Get a list of folder names inside the 'outputs' folder
folder_names = [os.path.join(path,f).replace('\\', '/') for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
folder_names_red = folder_names
folder_names_red

['linux_outputs/10K_red/20250407_145313_33rUrV',
 'linux_outputs/10K_red/20250407_154456_WxKJnd',
 'linux_outputs/10K_red/20250407_163101_ATxW33',
 'linux_outputs/10K_red/20250407_170632_zSlYa0',
 'linux_outputs/10K_red/20250408_100608_0qGS3E',
 'linux_outputs/10K_red/20250408_104259_Hh5m06',
 'linux_outputs/10K_red/20250408_112111_0YVijh',
 'linux_outputs/10K_red/20250408_120046_xi7TEt',
 'linux_outputs/10K_red/20250408_130140_ZcCakH',
 'linux_outputs/10K_red/20250408_133735_bx1il9',
 'linux_outputs/10K_red/20250408_141618_TDnkvh',
 'linux_outputs/10K_red/20250408_144919_gGolrq',
 'linux_outputs/10K_red/20250414_101045_fleC2F',
 'linux_outputs/10K_red/20250414_165657_5ZjeOo',
 'linux_outputs/10K_red/20250415_095251_v7OogZ',
 'linux_outputs/10K_red/20250415_142840_1aNxVf',
 'linux_outputs/10K_red/20250430_103309_BV4HyP',
 'linux_outputs/10K_red/20250430_140015_S3Eqqh',
 'linux_outputs/10K_red/20250430_150532_fKSsHs',
 'linux_outputs/10K_red/20250430_152609_b3zCLN',
 'linux_outputs/10K_

In [472]:
equations = []
y_preds = []
rmses = []
r2_scores = []
failed_folders = []  # To keep track of folders that fail to load or produce predictions
error_messages = []  # To store error messages

for folder in folder_names_red:
    try:
        # Attempt to load the model from the specified folder
        model = PySRRegressor.from_file(run_directory=folder)
        
        # If the model is loaded, perform the calculations
        best_equation = model.get_best().equation
        equations.append(best_equation)
        
        y_pred = model.predict(X_test) #Switch between X_train and X_test
        y_preds.append(y_pred)
        
        rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # Switch between y_train and y_test
        rmses.append(rmse)
        
        r2 = r2_score(y_test, y_pred) # Switch between y_train and y_test
        r2_scores.append(r2)
    
    except Exception as e:
        # If an error occurs, store the folder and the error message
        failed_folders.append(folder)
        error_message = str(e)
        error_messages.append(error_message)
        print(f"Error in folder {folder}: {error_message}")
        
        # Append custom message based on the error
        if "Ran out of input" in error_message:
            equations.append("Ran out of input")
        else:
            equations.append("Run failed")
        
        # Append default values when the process fails
        rmses.append(1)
        r2_scores.append(0)
        y_preds.append(1)

Attempting to load model from linux_outputs\10K_red\20250407_145313_33rUrV\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250407_154456_WxKJnd\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250407_163101_ATxW33\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250407_170632_zSlYa0\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250408_100608_0qGS3E\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250408_104259_Hh5m06\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250408_112111_0YVijh\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250408_120046_xi7TEt\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250408_130140_ZcCakH\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250408_133735_bx1il9\checkpoint.pkl...
Attempting to load model from linux_outputs\10K_red\20250408_141618_TD

In [473]:
error_messages

[]

In [474]:
# # Single model inspect
# model = PySRRegressor.from_file(run_directory=folder_names[-1]) # last run
# best_equation = model.get_best().equation
# y_pred = model.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# r2 = r2_score(y_test, y_pred)
# print(f"RMSE: {rmse}")
# print(f"R2 Score: {r2}")

In [475]:
# model.get_best().equation

In [476]:
# Change y_test to original K instead of K_reduced
y_test = test_df['K']

# Calculate additive term
test_df.loc[:, 'mAKA+mBKB'] = test_df['mA'] * test_df['KA'] + test_df['mB'] * test_df['KB']
term = test_df['mAKA+mBKB'].to_numpy()

# # For using train to evaluate the model
# y_train = train_df['K']
# train_df.loc[:, 'mAKA+mBKB'] = train_df['mA'] * train_df['KA'] + train_df['mB'] * train_df['KB']
# term = train_df['mAKA+mBKB'].to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, 'mAKA+mBKB'] = test_df['mA'] * test_df['KA'] + test_df['mB'] * test_df['KB']


In [477]:
# Calculate whole equation errors
true_y_preds = []
for y_pred in y_preds:
    true_y_pred = y_pred + term
    true_y_preds.append(true_y_pred)

In [478]:
for i,e in enumerate(equations, start=1):
    print(f"Equation {i}: {e}")

Equation 1: ((mBKB * ((KB - (KA * 0.9046222)) * (abs((Tr_mix * 0.301831) - KB) - 0.035186045))) * mA) * 128.88832
Equation 2: ((sqrt(abs((KA * KA) - ((Tr_A / KB) * 0.010006066))) * -0.7621054) + sqrt(abs((0.006022795 / KB) - (KA * KA)))) * (KA * ((mA * abs(mB * (KB - KA))) * -93.397415))
Equation 3: (mA * (mB * (KB - KA))) * (0.5418495 - (0.010028673 / ((Tr_A * -1.362892) + (KA * 3.5067956))))
Equation 4: ((mB * mA) * -0.6605629) * abs(abs(KA + -0.17922655) - abs(KB + -0.17922774))
Equation 5: (mBKB * mA) * -0.08055717
Equation 6: (mBKB * -17.971628) * (sqrt(0.20960015 - KB) * abs((mAKA + mBKB) - KB))
Equation 7: abs(KA - KB) * ((((9.41914e-5 / (KA + -0.20952305)) + -0.58187085) * mA) * mB)
Equation 8: sqrt(abs(mA * ((KA - KB) * ((0.2096026 - KA) * ((0.12978531 - KA) * mB))))) * -1.5901082
Equation 9: (-0.6781911 - (0.0045934073 / (KA + (KB - 0.35398537)))) * abs((mA * (KB - KA)) * mB)
Equation 10: (mA * (abs(KB - KA) * mB)) * -0.4322214
Equation 11: ((mA * mB) / log(sqrt(abs(KA + (-0.

In [479]:
# #Screen laptop outputs
# true_rmses = []
# true_r2 = []
# for i, y in enumerate(true_y_preds, start=100):  # Start numbering from 100
#     rmse = np.sqrt(mean_squared_error(y_test, y))
#     r2 = r2_score(y_test, y)
#     true_rmses.append(rmse)
#     true_r2.append(r2)
    
#     print(f"Run {i}:")
#     print(f"RMSE: {rmse}")
#     print(f"R2 Score: {r2}")
#     print("______________")  # Separator line

In [480]:
# Screen linux outputs
true_rmses = []
true_r2 = []
for i, y in enumerate(true_y_preds, start=1):  # Start numbering from 100 for laptop; 0 for linux
    rmse = np.sqrt(mean_squared_error(y_test, y)) # Change to y_train for training set
    r2 = r2_score(y_test, y) # Change to y_train for training set
    true_rmses.append(rmse)
    true_r2.append(r2)
    
    print(f"Run {i}:")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")
    print("______________")  # Separator line

Run 1:
RMSE: 0.0058220555066541705
R2 Score: 0.9305644162009947
______________
Run 2:
RMSE: 0.0037296870167878578
R2 Score: 0.9715046375391101
______________
Run 3:
RMSE: 0.00345402395324348
R2 Score: 0.9755611875727052
______________
Run 4:
RMSE: 0.003531994641959527
R2 Score: 0.974445376888162
______________
Run 5:
RMSE: 0.005671790970600173
R2 Score: 0.9341023629740723
______________
Run 6:
RMSE: 0.009281205131288111
R2 Score: 0.8235433008263453
______________
Run 7:
RMSE: 0.0033774818238249903
R2 Score: 0.9766323279440675
______________
Run 8:
RMSE: 0.0034977287658158503
R2 Score: 0.9749388113598975
______________
Run 9:
RMSE: 0.0035019996378337684
R2 Score: 0.9748775724918112
______________
Run 10:
RMSE: 0.0034174972780574653
R2 Score: 0.9760753406546754
______________
Run 11:
RMSE: 0.003460794040993702
R2 Score: 0.9754652906967473
______________
Run 12:
RMSE: 0.0033956246979257618
R2 Score: 0.9763806046531941
______________
Run 13:
RMSE: 0.00501537022507756
R2 Score: 0.9484729353

In [481]:
run_r2 = []
for i, e in enumerate(true_r2, start=1):
    run_number = i + 1
    run_r2.append([run_number,e])
    print(f"Run {i}: {e}")

Run 1: 0.9305644162009947
Run 2: 0.9715046375391101
Run 3: 0.9755611875727052
Run 4: 0.974445376888162
Run 5: 0.9341023629740723
Run 6: 0.8235433008263453
Run 7: 0.9766323279440675
Run 8: 0.9749388113598975
Run 9: 0.9748775724918112
Run 10: 0.9760753406546754
Run 11: 0.9754652906967473
Run 12: 0.9763806046531941
Run 13: 0.9484729353415027
Run 14: 0.9719975671452724
Run 15: 0.9763683583594707
Run 16: 0.9750379445733734
Run 17: 0.9766781480997829
Run 18: 0.9766434779668098
Run 19: 0.976639573433955
Run 20: 0.9766624303090603
Run 21: 0.976438989657105


In [482]:
top10_r2 = sorted(run_r2, key=lambda x: x[1], reverse=True)[:10]
for i, e in top10_r2:
    print(f"Run {i-1}: {e}")

Run 17: 0.9766781480997829
Run 20: 0.9766624303090603
Run 18: 0.9766434779668098
Run 19: 0.976639573433955
Run 7: 0.9766323279440675
Run 21: 0.976438989657105
Run 12: 0.9763806046531941
Run 15: 0.9763683583594707
Run 10: 0.9760753406546754
Run 3: 0.9755611875727052


In [483]:
# Finding the index and value of the highest R2 score
best_index = np.argmax(true_r2)  # Gets index of the max R2 score
best_r2 = true_r2[best_index]
best_rmse = true_rmses[best_index]
best_run = best_index + 1  # Adjusting index to match run number (+100 if laptop run, +1 if linux run)

print(f"Best Run: {best_run}")
print(f"Highest R2 Score: {best_r2}")
print(f"Corresponding RMSE: {best_rmse}")
print(f"Equation: {equations[best_index]} ")

Best Run: 17
Highest R2 Score: 0.9766781480997829
Corresponding RMSE: 0.003374168856227867
Equation: (mB * mA) * ((abs((KB - KA) * (KB - 0.20966384)) / (0.20954196 - KB)) * -0.585045) 


In [484]:
import sympy

# Function to calculate the depth of the symbolic expression
def get_depth(expr):
    if isinstance(expr, sympy.Basic):
        return 1 + max([get_depth(arg) for arg in expr.args], default=0)
    return 0

# Extract the first symbolic equation found by PySR
best_equation = model.sympy()  # This is a list of SymPy expressions

# Calculate its depth
depth = get_depth(best_equation)
print(f"The depth of the equation is: {depth}")

RuntimeError: Couldn't find equation file! The equation search likely exited before a single iteration completed.

MISC

In [None]:
# X_all = df[X_columns]
# y_all = df['K']

# # Calculate additive term
# df.loc[:, 'mAKA+mBKB'] = df['mA'] * df['KA'] + df['mB'] * df['KB']
# term_all = df['mAKA+mBKB'].to_numpy()
# term_all

In [None]:
# model = PySRRegressor.from_file(run_directory=folder_names_red[6])
# y_pred_all = model.predict(X_all)
# y_pred_all = np.array(y_pred_all)
# y_pred_all

In [None]:
# true_y_pred_all = y_pred_all + term_all

In [None]:
# rmse_all = np.sqrt(mean_squared_error(true_y_pred_all, y_all))
# r2_all = r2_score(true_y_pred_all, y_all)
# print(f"RMSE: {rmse_all}")
# print(f"R2 Score: {r2_all}")

In [None]:
# true_y_pred_all = pd.Series(true_y_pred_all, name='true_y_pred_all')
# # Save X_all to a CSV file
# X_all.to_csv('data_exports/X_all.csv', index=False)

# # Save y_all to a CSV file
# y_all.to_csv('data_exports/y_all.csv', index=False)

# # Save true_y_pred_all to a CSV file
# true_y_pred_all.to_csv('data_exports/true_y_pred_all.csv', index=False)

In [None]:
# df_all = pd.concat([X_all, y_all], axis=1)
# # Save df_all to a CSV file
# df_all.to_csv('data_exports/df_all.csv', index=False)

Filippov and Novoselova (Optimized C)

In [None]:
# y_fan = X_all['KA']*X_all['mA'] + X_all['KB']*X_all['mB'] - 0.5*abs(X_all['KA'] - X_all['KB'])*X_all['mA']*X_all['mB']

In [None]:
# # Save y_fan to a CSV file
# y_fan.to_csv('data_exports/y_fan.csv', index=False)

Jamieson and Irving 

In [None]:
# def calc_K(df):
#     if df['KA'] > df['KB']:
#         return df['KA'] * df['mA'] + df['KB'] * df['mB'] - (df['KA'] - df['KB']) * df['mA'] * (1 - np.sqrt(df['mA']))
#     else:
#         return df['KA'] * df['mA'] + df['KB'] * df['mB'] - (df['KB'] - df['KA']) * df['mB'] * (1 - np.sqrt(df['mB']))

# y_jai = X_all.apply(calc_K, axis=1)

In [None]:
# # Save y_jai to a CSV file
# y_jai.to_csv('data_exports/y_jai.csv', index=False)