In [9]:
class Model:
    def __init__(self):
        self.structural = {"Heme":None, "Flavin":None, "FeS":None, "ActiveSiteAccess":None, "SubstrateSize":None}
        self.theoretical = {"Redox":None, "Exergonic":None}
        self.experimental = {"Selectivity":None, "Specificity":None, "Diversity":None, 
                             "NonIntStoichiometry":None, "VariableStoichiometry":None,"KmKd":None, "DiffusionLimits":None, 
                            "ZerothOrderKinetics":None}
        self.proposal = {"ShapeChange":None, "Serial":None, "Complexity":None, "MechanisiticSteps":None,
                        "Intermemdiates":None, "Probability":None, "LongDistanceOSET":None, "HighAffinityESComplex":None}
        self.Name = None

In [10]:
def OccamsRazor(model1, model2):
    simplicity_cnt_1 = 0
    simplicity_cnt_2 = 0
    if model1.proposal["ShapeChange"] == 0:
        simplicity_cnt_1 += 1
    if model1.proposal["Serial"] == 0:
        simplicity_cnt_1 += 1
    if model1.proposal["Complexity"] == 0:
        simplicity_cnt_1 += 1
    if model1.proposal["Probability"] == 1:
        simplicity_cnt_1 += 1
    if model1.proposal["Intermediates"] == 0:
        simplicity_cnt_1 += 1
    if model1.proposal["LongDistanceOSET"] == 0:
        simplicity_cnt_1 += 1
    if model1.proposal["HighAffinityESComplex"] == 0:
        simplicity_cnt_1 += 1
        
        
        
    if model2.proposal["ShapeChange"] == 0:
        simplicity_cnt_2 += 1
    if model2.proposal["Serial"] == 0:
        simplicity_cnt_2 += 1
    if model2.proposal["Complexity"] == 0:
        simplicity_cnt_2 += 1
    if model2.proposal["Probability"] == 1:
        simplicity_cnt_2 += 1
    if model2.proposal["Intermediates"] == 1:
        simplicity_cnt_2 += 1

    if model1.proposal["MechanisticSteps"] < model2.proposal["MechanisticSteps"]:
        simplicity_cnt_1 += 1
    if model1.proposal["MechanisticSteps"] > model2.proposal["MechanisticSteps"]:
        simplicity_cnt_2 += 1

    if model2.proposal["LongDistanceOSET"] == 0:
        simplicity_cnt_2 += 1
    if model2.proposal["HighAffinityESComplex"] == 0:
        simplicity_cnt_2 += 1



    
    if simplicity_cnt_1 > simplicity_cnt_2:
        return 1
    if simplicity_cnt_1 < simplicity_cnt_2:
        return 2
        
    return 0
    

In [11]:
model1 = Model()
model2 = Model()

model1.proposal["ShapeChange"] = 1
model1.proposal["Serial"] = 1
model1.proposal["Complexity"] = 0
model1.proposal["MechanisticSteps"] = 3
model1.proposal["Probability"] = 0
model1.proposal["Intermediates"] = 1 #molecularity -> change name
model1.proposal["LongDistanceOSET"] = 1
model1.proposal["HighAffinityESComplex"] = 1

model2.proposal["ShapeChange"] = 0
model2.proposal["Serial"] = 0
model2.proposal["Complexity"] = 0
model2.proposal["MechanisticSteps"] = 2
model2.proposal["Probability"] = 1
model2.proposal["Intermediates"] = 0
model2.proposal["LongDistanceOSET"] = 0
model2.proposal["HighAffinityESComplex"] = 0


if OccamsRazor(model1, model2) == 1:
    print("model1 is simple as per simplicity measures")
else:
    print("model2 is simple as per simplicity measures")
    

model2 is simple as per simplicity measures


In [14]:
def consistency_check(model):
    access = model.structural["ActiveSiteAccess"]
    selectivity = model.experimental["Selectivity"]
    specificity = model.experimental["Specificity"]
    diversity = model.experimental["Diversity"]
    non_int_stoich = model.experimental["NonIntStoichiometry"]
    substrate_size = model.structural["SubstrateSize"]
    kmkd = model.experimental["KmKd"]
    zeroth_order_kinetics = model.experimental["ZerothOrderKinetics"]
    catalytic_rate = model.experimental["CatalyticRate"]
    substrate_bound = model.proposal["SubstrateBound"]
    int_KIE = model.experimental["IntKIE"]

    inconsistency_cnt = 0
    check_cnt = 0

    check_cnt += 1
    if access == "limited" and selectivity == False:
        inconsistency_cnt += 1
        
    check_cnt += 1
    if access == "limited" and specificity == False:
        inconsistency_cnt += 1

    check_cnt += 1
    if selectivity == False and non_int_stoich == True:
        inconsistency_cnt += 1

    check_cnt += 1
    if kmkd == True: #Km < Kd
        inconsistency_cnt += 1

    check_cnt += 1
    if substrate_size == "large":
        inconsistency_cnt += 1

    check_cnt += 1
    if zeroth_order_kinetics == 1:
        inconsistency_cnt += 1

    check_cnt += 1
    if catalytic_rate > 1e9:
        inconsistency_cnt += 1

    check_cnt += 1
    if substrate_bound == True and int_KIE == "HIGH":
        inconsistency_cnt += 1
    
    return inconsistency_cnt, check_cnt      
        

In [15]:
model = Model()
model.structural["ActiveSiteAccess"] = "limited"
model.experimental["Selectivity"] = False
model.experimental["Specificity"] = False
model.experimental["Diversity"] = True
model.experimental["NonIntStoichiometry"] = True
model.experimental["SubstrateSize"] = "large"
model.experimental["KmKd"] = True
model.experimental["ZerothOrderKinetics"] = 1
model.experimental["CatalyticRate"] = 1.5e9
model.proposal["SubstrateBound"] = True
model.experimental["IntKIE"] = "HIGH"

inconsistency_cnt, check_cnt = consistency_check(model)
print(f"{inconsistency_cnt} consistency check failures out of {check_cnt} checks")

7 consistency check failures out of 8 checks


In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import joblib  # 🔁 NEW: for saving models

# ------------------------------
# Load the Data
# ------------------------------
df = pd.read_excel("Data_CM.xlsx", sheet_name="Sheet1")

enzyme_names = df.iloc[:, 0]
label = df.iloc[:, -1]
features = df.iloc[:, 1:-1]

# ------------------------------
# Define Feature Categories
# ------------------------------
structure_idx = list(range(0, 5))
theoretical_idx = list(range(5, 7))
experimental_idx = list(range(7, 20))

categories = {
    "Structure": structure_idx,
    "Theoretical": theoretical_idx,
    "Experimental": experimental_idx
}

# ------------------------------
# Encode the Label
# ------------------------------
le_label = LabelEncoder()
y_encoded = le_label.fit_transform(label)

# ------------------------------
# Safe Encoding Helper
# ------------------------------
def safe_label_encoding(column):
    le = LabelEncoder()
    encoded = le.fit_transform(column)
    reverse_map = dict(zip(range(len(le.classes_)), le.classes_))
    return encoded, reverse_map

# ------------------------------
# Generate C-style If-Else Tree
# ------------------------------
def generate_if_else_code(tree, feature_names, label_encoder, reverse_maps):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != -2 else "undefined!"
        for i in tree_.feature
    ]

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != -2:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            threshold = int(threshold + 0.5)

            if name in reverse_maps:
                val = reverse_maps[name].get(threshold, f"<UNK_{threshold}>")
                left = recurse(tree_.children_left[node], depth + 1)
                right = recurse(tree_.children_right[node], depth + 1)
                return (
                    f'{indent}if (strcmp({name}, "{val}") == 0) {{\n'
                    f'{left}\n{indent}}} else {{\n{right}\n{indent}}}'
                )
            else:
                left = recurse(tree_.children_left[node], depth + 1)
                right = recurse(tree_.children_right[node], depth + 1)
                return (
                    f"{indent}if ({name} <= {threshold}) {{\n"
                    f"{left}\n{indent}}} else {{\n{right}\n{indent}}}"
                )
        else:
            value = np.argmax(tree_.value[node])
            class_name = label_encoder.inverse_transform([value])[0]
            return f'{indent}return "{class_name}";'

    return recurse(0, 0)

# ------------------------------
# Train Trees and Generate Output
# ------------------------------
output_lines = []

for category, indices in categories.items():
    X = features.iloc[:, indices].copy()
    reverse_maps = {}

    for col in X.columns:
        if X[col].dtype == "object":
            X[col], rev_map = safe_label_encoding(X[col])
            reverse_maps[col] = rev_map

    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X, y_encoded)

    # 🔁 Save trained model
    model_filename = f"model_{category}.pkl"
    joblib.dump(clf, model_filename)

    # Generate code
    code = generate_if_else_code(clf, list(X.columns), le_label, reverse_maps)
    output_lines.append(f"// Category: {category}")
    output_lines.append("const char* predict(...) {")
    output_lines.append(code)
    output_lines.append("}")
    output_lines.append("=" * 60)

# ------------------------------
# Save C-style Code to File
# ------------------------------
with open("result_C_code.txt", "w") as f:
    f.write("\n".join(output_lines))

print("✅ C-code generated and models saved as model_<Category>.pkl")
