In [None]:
import pysr
import pandas as pd
import os
from pysr import PySRRegressor
import sympy
import numpy as np
import itertools


In [None]:
# create dataframes 
dir = "datasets/Feynman_with_units/Feynman_with_units"
df_dict={}
for file in os.listdir(dir):
    fname = os.fsdecode(file)
    df = pd.read_csv(os.path.join(dir,fname),sep=" ",header=None)
    dfname,_ = os.path.splitext(fname)
    df_dict[dfname] = df
# now we instantiate pysr

print("DFs created")
file="best_equations.csv",  
best_list=[]
c=0
for name,df in df_dict.items():
    c+=1
    X=df.iloc[:10000,:-1]
    y=df.iloc[:10000,-1] 
    print(f"Going through DF {c} solving for equation {name}, X shape: {X.shape}, y shape: {y.shape}")
    model=PySRRegressor(batching=True,batch_size=32,verbosity=0)
    model.fit(X,y)
    best=model.equations_.sort_values("score",axis=0,ascending=False).iloc[[0]] # score chooses the equation with the best complexity-accuracy trade off.
    best["name"] = name 
    best_list.append(best)
    
# now concat vertically and save
pd.concat(best_list,axis=0).to_csv('best_eqs.csv',index=False,header=["complexity","loss","equation","score","sympy format","lambda_format","Filename"])
 

In [2]:
symbols = sympy.symbols("""
sigma theta1 x1 x2 y1 y2 m1 m2 G z1
z2 m_0 v c x3 y3 mu Nn q1 q2
epsilon r Ef q B m u w r1 r2
z k_spring x t F omega omega_0 n theta2 d1
d2 Int_0 lambd d a p h I1 I2 delta
pr gamma kb n_0 mu_drift Volt mob V1 V2 rho
alpha kappa T1 T2 Pwr p_d y sigma_den chi n_rho
I rho_c_0 mom g_ Jz E_n Bx By Bz k
I_0 beta A_vec x0 x4 x5 x6 x7 x8 x9
""")
(sigma, theta1, x1, x2, y1, y2, m1, m2, G, z1,
z2, m_0, v, c, x3, y3, mu, Nn, q1, q2,
epsilon, r, Ef, q, B, m, u, w, r1, r2,
z, k_spring, x, t, F, omega, omega_0, n, theta2, d1,
d2, Int_0, lambd, d, a, p, h, I1, I2, delta,
pr, gamma, kb, n_0, mu_drift, Volt, mob, V1, V2, rho,
alpha, kappa, T1, T2, Pwr, p_d, y, sigma_den, chi, n_rho,
I, rho_c_0, mom, g_, Jz, E_n, Bx, By, Bz, k,
I_0, beta, A_vec,x0,x4,x5,x6,x7,x8,x9) = symbols

locals_ = {s.name: s for s in symbols}


In [None]:
# creating the core DF 

df_pred=pd.read_csv("best_eqs.csv")
df_original = pd.read_csv("~/projects/FeynmanEquations.csv")[["Filename","Formula"]]
df_merged = df_pred.merge(df_original,on="Filename",how='left')
df_merged["Equation"] = df_merged["Formula"].map(lambda f: sympy.sympify(f, locals=locals_))
df_merged["sympy format"] = df_merged["sympy format"].map(lambda f: sympy.sympify(f, locals=locals_))
df_merged= df_merged[["Filename","Equation","loss","sympy format"]]
df_merged.set_index("Filename")
df_merged.drop(df_merged[df_merged['Equation']==sympy.nan].index,inplace=True)




In [60]:
df_merged

Unnamed: 0,Filename,Equation,loss,sympy format
0,III.15.27,2*pi*alpha/(d*n),8.323608e-14,6.2831855*x0/(x1*x2)
1,III.4.33,h*omega/(2*pi*(exp(h*omega/(2*pi*T*kb)) - 1)),2.128536e-06,-0.07950927*x0*(-0.025731286*x0*x1**2/(x2*x3) ...
2,I.24.6,m*x**2*(omega**2 + omega_0**2)/4,1.122923e+00,x0*x3**2*(x1 + x2 - 1.8326204)
3,III.9.52,8*pi*Ef*p_d*sin(t*(omega - omega_0)/2)**2/(h*t...,2.812233e+01,16.122032*x0*x1/(x3*((-x4 + x5)**2 + 1.1870767))
4,I.34.14,omega_0*(1 + v/c)/sqrt(1 - v**2/c**2),4.642324e-04,x2 - x2/(-x0/x1 + 0.6281144)
...,...,...,...,...
95,II.10.9,sigma_den/(epsilon*(chi + 1)),6.736250e-16,x0/(x1*x2 + x1)
96,III.12.43,h*n/(2*pi),1.330285e-14,0.15915494*x0*x1
97,II.27.16,Ef**2*c*epsilon,1.120711e-10,x0*x1*x2**2
98,I.47.23,sqrt(gamma*pr/rho),1.099245e-03,x0/(0.16989318*x0 + 2.136836*x2/x1) + 0.5798818


In [61]:
# i want to know how similar these are...
# constants are an issue.. we will try to prove equivalence up to a constant 
pred_eqs = df_merged['sympy format']
orig_eqs = df_merged['Equation']


In [None]:


def check_equality(pred_expr, true_expr, true_varnames=None):
    '''
    used to check if two sympy expressions are "equal".
    equality is defined here as equal upto a constant.
    Works by sampling the true equation and then trying out permutations 
    of the input variables of the candidate solution and obtaining the ratio of the fits. 
    If the ratios are identical, the expressions are equivalent upto a constant.  
    
    '''
    # print("Free symbols in the true expression: ")
    # print(true_expr.free_symbols)
    pred_vars = sorted(pred_expr.free_symbols, key=lambda s: s.name)
    if true_varnames:
        true_vars = [sympy.Symbol(n) for n in true_varnames]
    else:
        true_vars = sorted(true_expr.free_symbols, key=lambda s: s.name)
    # print("variables in the true expression: ")
    print(true_vars)
    if len(pred_vars) != len(true_vars):
        return False
# lambdify
    f_pred = sympy.lambdify(pred_vars, pred_expr, "numpy")
    f_true = sympy.lambdify(true_vars, true_expr, "numpy")
    
# 10 points 
    N_samples = 10
    X = np.random.uniform(1.0, 5.0, (N_samples, len(true_vars)))
    
    # Calculate True Output
    try:
        y_true = f_true(*X.T) # evaluate true function over 10 points. 
    except Exception:
        return False 

    best_perm = None
    
    for perm_indices in itertools.permutations(range(len(true_vars))): # [0,1,2] eg if len is 3
        print(perm_indices) # [0,1,2], [0,2,1]
        try:
            # Reorder columns of X to match this permutation
            X_permuted = X[:, perm_indices]
            
            # Eval prediction
            y_pred = f_pred(*X_permuted.T) # get the fit on y after re-ordering. 
            
            # Calculate Ratio: y_pred / y_true
            # If equations are equivalent (up to constant), ratio should be identical for all points
            ratio = y_pred / y_true            
            # Check if ratio is constant (sigma is almost 0)
            if np.std(ratio) < 1e-5 and np.isfinite(ratio).all():
                # We found a candidate permutation! 
                # Map the indices back to symbols
                best_perm = [true_vars[i] for i in perm_indices]
                break
                
        except Exception:
            continue

    if best_perm:
        mapper = dict(zip(pred_vars, best_perm)) # make the correct mapping.
        pred_mapped = pred_expr.subs(mapper)
        
        # run simplify for a final sanity check 
        final_ratio = sympy.simplify(pred_mapped / true_expr)
        
        if final_ratio.is_constant():
            return True

    return False

In [None]:
# finally, apply the above onto the dataframe and add the col
df_merged["equivalent"] = df_merged.apply(
    lambda row: check_equality(
        row["sympy format"],   # predicted
        row["Equation"]        # ground truth
    ),
    axis=1
)



  return omega_0*(1 + v/c)/sqrt(1 - v**2/c**2)
  return m_0/sqrt(1 - v**2/c**2)
  return (t - u*x/c**2)/sqrt(1 - u**2/c**2)
  return rho_c_0/sqrt(1 - v**2/c**2)
  return arcsin(n*sin(theta2))
  return sqrt(-pi**2/d**2 + omega**2/c**2)


In [69]:
df_merged[df_merged['equivalent']==True]


Unnamed: 0,Filename,Equation,loss,sympy format,equivalent
0,III.15.27,2*pi*alpha/(d*n),8.323608e-14,6.2831855*x0/(x1*x2),True
5,III.7.38,4*pi*B*mom/h,1.810693e-11,12.566371*x0*x1/x2,True
9,II.11.3,Ef*q/(m*(-omega**2 + omega_0**2)),4.377453e-16,x0*x1/(x2*(x3 - x4)*(x3 + x4)),True
10,II.21.32,q/(4*pi*epsilon*r*(1 - v/c)),3.887186e-17,0.07957747*(x0*x3/(-x3 + x4) + x0)/(x1*x2),True
12,II.34.29b,2*pi*B*Jz*g_*mom/h,7.030817e-10,6.2831855*x0*x2*x3*x4/x1,True
13,I.14.3,g*m*z,4.462443e-12,x0*x1*x2,True
14,I.13.12,G*m1*m2*(1/r2 - 1/r1),7.885416e-13,x0*x1*x4*(x2 - x3)/(x2*x3),True
16,I.12.5,Ef*q2,2.798131e-13,x0*x1,True
17,I.43.16,Volt*mu_drift*q/d,1.222163e-12,x0*x1*x2/x3,True
23,I.14.4,k_spring*x**2/2,2.312956e-12,0.5*x0*x1**2,True


In [None]:
df_merged.shape
# score (pysr with defaults) = 46/97 (appx 47%)

(97, 5)