In [1]:
from scipy.optimize import minimize
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv('foldseek_Swiss-Prot_2002_reduced.tsv', sep='\t')

In [3]:
df

Unnamed: 0,query,target,fident,alnlen,bits,evalue,lddt,mismatch,alntmscore,qtmscore,ttmscore,query_ec_number,target_ec_number,query_ec,target_ec,identical_ec,zone,zone_mmseqs
0,O00115,O62855,0.740,362,2468,5.822000e-65,0.9119,91,0.93850,0.94880,0.93850,3.1.22.1,3.1.22.1,3.1.22,3.1.22,True,daylight zone,daylight zone
1,O00141,Q9WVC6,0.965,431,2886,8.165000e-73,0.9300,15,0.82730,0.82730,0.82730,2.7.11.1,2.7.11.1,2.7.11,2.7.11,True,daylight zone,daylight zone
2,O00167,Q23977,0.194,293,276,2.830000e-19,0.7389,203,0.03853,0.07579,0.03853,3.1.3.48,2.7.12.2,3.1.3,2.7.12,False,daylight zone,daylight zone
3,O00408,Q01062,0.922,917,6283,0.000000e+00,0.9546,65,0.85110,0.83990,0.85110,3.1.4.17,3.1.4.17,3.1.4,3.1.4,True,daylight zone,daylight zone
4,O00442,P46849,0.310,354,1351,3.241000e-40,0.7632,226,0.94020,0.87030,0.94020,6.5.1.4,6.5.1.4,6.5.1,6.5.1,True,daylight zone,daylight zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2711,Q9ZLB9,P00805,0.498,333,1820,3.233000e-49,0.9108,163,0.92960,0.97390,0.92960,3.5.1.1,3.5.1.1,3.5.1,3.5.1,True,daylight zone,daylight zone
2712,Q9ZLT0,P56868,0.356,255,858,1.388000e-24,0.7906,150,0.88620,0.88290,0.88620,5.1.1.3,5.1.1.3,5.1.1,5.1.1,True,daylight zone,daylight zone
2713,Q9ZMW8,O34667,0.445,157,710,6.026000e-19,0.8892,81,0.92190,0.95150,0.92190,4.4.1.21,4.4.1.21,4.4.1,4.4.1,True,daylight zone,daylight zone
2714,Q9ZNZ7,Q43155,0.848,1509,12352,0.000000e+00,0.9813,229,0.99400,0.92960,0.99400,1.4.7.1,1.4.7.1,1.4.7,1.4.7,True,daylight zone,daylight zone


In [4]:
def compute_f(PIDE, L, factor, exponent):
    if L <= 11:
        return PIDE - 100
    elif 11 < L <= 450:
        return factor * L**(exponent * (1 + np.exp(-L/1000)))
    else:
        return 19.5

In [5]:
# Define the objective function for Nelder-Mead optimization
def objective_function(params):
    factor, exponent = params
    kf = KFold(n_splits=10, shuffle=True, random_state=1)
    f1_scores = []

    for train_index, test_index in kf.split(df):
        train_data, test_data = df.iloc[train_index], df.iloc[test_index]
        
        y_true = []
        y_pred = []
        
        for _, row in train_data.iterrows():
            PIDE= row[2] * 100
            L = row[3] - row[7]
            f_value = compute_f(PIDE, L, factor, exponent)
            
            y_true.append(row['identical_ec'])
            y_pred.append(PIDE> f_value)  # True if above the curve, False if below

        # Calculate F1-score for the current fold
        f1 = f1_score(y_true, y_pred, pos_label=True)
        f1_scores.append(f1)

    # Return the negative mean F1-score (since we want to maximize it)
    return -np.mean(f1_scores)

In [6]:
# Initialize the simplex for Nelder-Mead
initial_simplex = np.array([
    [300, -0.3],
    [1500, -0.3],
    [300, -0.9]
])

In [7]:
# Run Nelder-Mead Optimization
result = minimize(objective_function, x0=[300, -0.3], method='Nelder-Mead', options={'initial_simplex': initial_simplex})
best_hyperparams = result.x

  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row[2] * 100
  L = row[3] - row[7]
  PIDE= row

In [None]:
print("Best hyperparameters found:", best_hyperparams)