In [12]:
import pandas as pd
import numpy as np

In [13]:
# Get the training data from Google Drive.
# The directory is kept in one place so the dataset location can be changed
# with a single edit; the resulting file paths are unchanged.
TRAIN_DIR_RULE_A = '/content/drive/MyDrive/SelectivityProfile/TrainingDatasets'

# Active / inactive training sets for II (rule A)
train_II_active_A = pd.read_csv(f'{TRAIN_DIR_RULE_A}/II_active_95_filtered_rule_A.csv')
train_II_inactive_A = pd.read_csv(f'{TRAIN_DIR_RULE_A}/II_inactive_95_filtered_rule_A.csv')

# Active / inactive training sets for IX (rule A)
train_IX_active_A = pd.read_csv(f'{TRAIN_DIR_RULE_A}/IX_active_95_filtered_rule_A.csv')
train_IX_inactive_A = pd.read_csv(f'{TRAIN_DIR_RULE_A}/IX_inactive_95_filtered_rule_A.csv')

In [14]:
# Get the testing data from Google Drive.
# The directory is kept in one place so the dataset location can be changed
# with a single edit; the resulting file paths are unchanged.
TEST_DIR_RULE_A = '/content/drive/MyDrive/SelectivityProfile/TestingDatasets/RuleA'

# Quadrant I-IV test sets for II (rule A)
test_II_quad1_A = pd.read_csv(f'{TEST_DIR_RULE_A}/II_quad_I_A.csv')
test_II_quad2_A = pd.read_csv(f'{TEST_DIR_RULE_A}/II_quad_II_A.csv')
test_II_quad3_A = pd.read_csv(f'{TEST_DIR_RULE_A}/II_quad_III_A.csv')
test_II_quad4_A = pd.read_csv(f'{TEST_DIR_RULE_A}/II_quad_IV_A.csv')

# Quadrant I-IV test sets for IX (rule A)
test_IX_quad1_A = pd.read_csv(f'{TEST_DIR_RULE_A}/IX_quad_I_A.csv')
test_IX_quad2_A = pd.read_csv(f'{TEST_DIR_RULE_A}/IX_quad_II_A.csv')
test_IX_quad3_A = pd.read_csv(f'{TEST_DIR_RULE_A}/IX_quad_III_A.csv')
test_IX_quad4_A = pd.read_csv(f'{TEST_DIR_RULE_A}/IX_quad_IV_A.csv')

In [15]:
def training_data_preparation(active, inactive, random_state=None):
  """Merge the active and inactive sets, shuffle the rows, and split off the labels.

  Args:
    active: DataFrame of active samples.
    inactive: DataFrame of inactive samples with the same column layout.
    random_state: Optional seed forwarded to ``DataFrame.sample`` so the
      shuffle is reproducible. ``None`` (default) keeps the shuffle unseeded.

  Returns:
    A ``(features, labels)`` tuple. ``features`` holds every column except
    the first and the last; ``labels`` holds the last column. Row ``i`` of
    both arrays comes from the same input row.
  """
  # ignore_index avoids carrying duplicate index values from the two inputs
  data = pd.concat([active, inactive], ignore_index=True)
  # frac=1 returns every row in random order, i.e. a full shuffle
  data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
  # Column 0 is skipped (presumably an identifier such as SMILES - confirm against the CSV)
  features = data.iloc[:, 1:-1].values
  labels = data.iloc[:, -1].values
  return features, labels

In [16]:
from sklearn.model_selection import train_test_split

def training_data_preparation_grande(active, inactive, test_size=0.2, random_state=42):
  """Merge, shuffle and split the active/inactive sets into train and hold-out parts.

  Args:
    active: DataFrame of active samples.
    inactive: DataFrame of inactive samples with the same column layout.
    test_size: Fraction of rows placed in the hold-out split (default 0.2).
    random_state: Seed used for both the row shuffle and ``train_test_split``
      (default 42). Seeding only the split leaves the result dependent on
      the unseeded shuffle, so both use the same seed here.

  Returns:
    ``(X_train, X_test, y_train, y_test)`` as returned by ``train_test_split``.
    Features are every column except the first and the last; labels are the
    last column.
  """
  data = pd.concat([active, inactive], ignore_index=True)
  # Seeded shuffle: without it the split below differs between runs
  data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
  features = data.iloc[:, 1:-1].values
  labels = data.iloc[:, -1].values
  X_train, X_test, y_train, y_test = train_test_split(
      features, labels, test_size=test_size, random_state=random_state)
  return X_train, X_test, y_train, y_test

In [17]:
def testing_data_preparation(quad1):
  data = quad1
  smiles = data.iloc[:, 0].values
  features = data.iloc[:, 1:-1].values
  labels = data.iloc[:, -1].values
  return smiles, features, labels

In [18]:
# Install the tabpfn and grande packages into the runtime.
# A later cell imports TabPFNClassifier from tabpfn and GRANDE from GRANDE
# (presumably provided by the "grande" package - confirm), so this cell must run first.
!pip install tabpfn grande



In [19]:
# GRANDE hyperparameters (model configuration)
params = dict(
    # Ensemble shape
    depth=5,                      # tree depth
    n_estimators=2048,            # number of estimators / trees

    # Learning rates, one per parameter group
    learning_rate_weights=0.005,  # leaf weights
    learning_rate_index=0.01,     # split indices
    learning_rate_values=0.01,    # split values
    learning_rate_leaf=0.01,      # leaves (logits)

    # Optimisation
    optimizer='adam',             # optimizer
    cosine_decay_steps=0,         # decay steps for the lr schedule (CosineDecayRestarts)

    # Loss
    loss='crossentropy',          # 'crossentropy' for binary & multi-class classification, 'mse' for regression
    focal_loss=False,             # use focal loss {True, False}
    temperature=0.0,              # temperature for stochastic re-weighted GD (0.0, 1.0)

    # Weighting
    from_logits=True,             # use logits for weighting {True, False}
    use_class_weights=True,       # use class weights for training {True, False}

    # Regularisation: dropout randomly disables individual estimators of the ensemble during training
    dropout=0.0,

    # Subsampling
    selected_variables=0.8,       # feature subset percentage (0.0, 1.0)
    data_subset_fraction=1.0,     # data subset percentage (0.0, 1.0)
)

# GRANDE training / task arguments
args = dict(
    epochs=1000,                  # number of epochs for training
    early_stopping_epochs=25,     # patience for early stopping (best weights are restored)
    batch_size=64,                # batch size for training

    cat_idx=[],                   # list of categorical feature indices
    objective='binary',           # task: {'binary', 'classification', 'regression'}

    random_seed=42,
    verbose=1,
)

In [20]:
# Initialize rule A models: an ExtraTrees classifier for II and another for IX.
# TabPFNClassifier and GRANDE are imported here but not instantiated in this cell.
from sklearn.ensemble import ExtraTreesClassifier
from tabpfn import TabPFNClassifier
from GRANDE import GRANDE

# Fixed seed so that re-fitting on the same data reproduces the same forest
model_II_A = ExtraTreesClassifier(random_state=42)
model_IX_A = ExtraTreesClassifier(random_state=42)

# Generate the shuffled (features, labels) training arrays for each target
II_A_X_train, II_A_y_train = training_data_preparation(train_II_active_A, train_II_inactive_A)
IX_A_X_train, IX_A_y_train = training_data_preparation(train_IX_active_A, train_IX_inactive_A)

In [21]:
# Fit the II model (model_II_A) on the shuffled II training features and labels
model_II_A.fit(II_A_X_train, II_A_y_train)

In [22]:
# Fit the IX model (model_IX_A) on the shuffled IX training features and labels
model_IX_A.fit(IX_A_X_train, IX_A_y_train)

In [23]:
# Predict quad values

# Rule A: Between II and IX
# Quadrant I: II active, IX active -> True Positive or False Negative
# Quadrant II: II active, IX inactive -> True Positive, True Negative, False Positive, False Negative
# Quadrant III: II inactive, IX active -> True Negative, True Positive, False Positive, False Negative
# Quadrant IV: II inactive, IX inactive -> True Negative, False Positive
from sklearn.metrics import confusion_matrix

# Check 1st quadrant with the fitted model_II_A / model_IX_A
# (both are ExtraTrees classifiers in this notebook)
smiles_II_quad1_A, features_II_quad1_A, labels_II_quad1_A = testing_data_preparation(test_II_quad1_A)
smiles_IX_quad1_A, features_IX_quad1_A, labels_IX_quad1_A = testing_data_preparation(test_IX_quad1_A)
model_II_A_pred = model_II_A.predict(features_II_quad1_A)
model_IX_A_pred = model_IX_A.predict(features_IX_quad1_A)

# Get TP, TN, FP, FN.
# labels=[0, 1] pins the matrix to 2x2 (rows: actual 0/1, columns: predicted 0/1).
# Without it, a quadrant where truth and predictions contain a single class yields a
# 1x1 matrix, which breaks the DataFrame below and the later per-quadrant sum.
# Assumes the label column is encoded as 0/1 - confirm against the CSVs.
cm_II_quad1_A = confusion_matrix(labels_II_quad1_A, model_II_A_pred, labels=[0, 1])
cm_IX_quad1_A = confusion_matrix(labels_IX_quad1_A, model_IX_A_pred, labels=[0, 1])

df_cm_II_quad1_A = pd.DataFrame(cm_II_quad1_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_IX_quad1_A = pd.DataFrame(cm_IX_quad1_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [24]:
import warnings

# Silence import-related and deprecation warnings for the rest of the session.
# simplefilter registers an "ignore" filter for the category with no message
# or module restriction.
warnings.simplefilter("ignore", ImportWarning)
warnings.simplefilter("ignore", DeprecationWarning)

In [25]:
# Display the II confusion matrix for quadrant I (rows: actual class, columns: predicted class)
df_cm_II_quad1_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,0
Actual 1,16,59


In [26]:
# Display the IX confusion matrix for quadrant I (rows: actual class, columns: predicted class)
df_cm_IX_quad1_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,0
Actual 1,16,59


In [27]:
# Predict quad values

# Rule A: Between II and IX
# Quadrant I: II active, IX active -> True Positive or False Negative
# Quadrant II: II active, IX inactive -> True Positive, True Negative, False Positive, False Negative
# Quadrant III: II inactive, IX active -> True Negative, True Positive, False Positive, False Negative
# Quadrant IV: II inactive, IX inactive -> True Negative, False Positive
from sklearn.metrics import confusion_matrix

# Check 2nd quadrant with the fitted model_II_A / model_IX_A
# (both are ExtraTrees classifiers in this notebook)
smiles_II_quad2_A, features_II_quad2_A, labels_II_quad2_A = testing_data_preparation(test_II_quad2_A)
smiles_IX_quad2_A, features_IX_quad2_A, labels_IX_quad2_A = testing_data_preparation(test_IX_quad2_A)
model_II_A_pred = model_II_A.predict(features_II_quad2_A)
model_IX_A_pred = model_IX_A.predict(features_IX_quad2_A)

# Get TP, TN, FP, FN.
# labels=[0, 1] pins the matrix to 2x2 (rows: actual 0/1, columns: predicted 0/1).
# Without it, a quadrant where truth and predictions contain a single class yields a
# 1x1 matrix, which breaks the DataFrame below and the later per-quadrant sum.
# Assumes the label column is encoded as 0/1 - confirm against the CSVs.
cm_II_quad2_A = confusion_matrix(labels_II_quad2_A, model_II_A_pred, labels=[0, 1])
cm_IX_quad2_A = confusion_matrix(labels_IX_quad2_A, model_IX_A_pred, labels=[0, 1])

df_cm_II_quad2_A = pd.DataFrame(cm_II_quad2_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_IX_quad2_A = pd.DataFrame(cm_IX_quad2_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [28]:
# Display the II confusion matrix for quadrant II (rows: actual class, columns: predicted class)
df_cm_II_quad2_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,0
Actual 1,44,31


In [29]:
# Display the IX confusion matrix for quadrant II (rows: actual class, columns: predicted class)
df_cm_IX_quad2_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,45,30
Actual 1,0,0


In [30]:
# Predict quad values

# Rule A: Between II and IX
# Quadrant I: II active, IX active -> True Positive or False Negative
# Quadrant II: II active, IX inactive -> True Positive, True Negative, False Positive, False Negative
# Quadrant III: II inactive, IX active -> True Negative, True Positive, False Positive, False Negative
# Quadrant IV: II inactive, IX inactive -> True Negative, False Positive
from sklearn.metrics import confusion_matrix

# Check 3rd quadrant with the fitted model_II_A / model_IX_A
# (both are ExtraTrees classifiers in this notebook)
smiles_II_quad3_A, features_II_quad3_A, labels_II_quad3_A = testing_data_preparation(test_II_quad3_A)
smiles_IX_quad3_A, features_IX_quad3_A, labels_IX_quad3_A = testing_data_preparation(test_IX_quad3_A)
model_II_A_pred = model_II_A.predict(features_II_quad3_A)
model_IX_A_pred = model_IX_A.predict(features_IX_quad3_A)

# Get TP, TN, FP, FN.
# labels=[0, 1] pins the matrix to 2x2 (rows: actual 0/1, columns: predicted 0/1).
# Without it, a quadrant where truth and predictions contain a single class yields a
# 1x1 matrix, which breaks the DataFrame below and the later per-quadrant sum.
# Assumes the label column is encoded as 0/1 - confirm against the CSVs.
cm_II_quad3_A = confusion_matrix(labels_II_quad3_A, model_II_A_pred, labels=[0, 1])
cm_IX_quad3_A = confusion_matrix(labels_IX_quad3_A, model_IX_A_pred, labels=[0, 1])

df_cm_II_quad3_A = pd.DataFrame(cm_II_quad3_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_IX_quad3_A = pd.DataFrame(cm_IX_quad3_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [31]:
# Display the II confusion matrix for quadrant III (rows: actual class, columns: predicted class)
df_cm_II_quad3_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69,6
Actual 1,0,0


In [32]:
# Display the IX confusion matrix for quadrant III (rows: actual class, columns: predicted class)
df_cm_IX_quad3_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,0
Actual 1,20,55


In [33]:
# Predict quad values

# Rule A: Between II and IX
# Quadrant I: II active, IX active -> True Positive or False Negative
# Quadrant II: II active, IX inactive -> True Positive, True Negative, False Positive, False Negative
# Quadrant III: II inactive, IX active -> True Negative, True Positive, False Positive, False Negative
# Quadrant IV: II inactive, IX inactive -> True Negative, False Positive
from sklearn.metrics import confusion_matrix

# Check 4th quadrant with the fitted model_II_A / model_IX_A
# (both are ExtraTrees classifiers in this notebook)
smiles_II_quad4_A, features_II_quad4_A, labels_II_quad4_A = testing_data_preparation(test_II_quad4_A)
smiles_IX_quad4_A, features_IX_quad4_A, labels_IX_quad4_A = testing_data_preparation(test_IX_quad4_A)
model_II_A_pred = model_II_A.predict(features_II_quad4_A)
model_IX_A_pred = model_IX_A.predict(features_IX_quad4_A)

# Get TP, TN, FP, FN.
# labels=[0, 1] pins the matrix to 2x2 (rows: actual 0/1, columns: predicted 0/1).
# Without it, a quadrant where truth and predictions contain a single class yields a
# 1x1 matrix, which breaks the DataFrame below and the later per-quadrant sum.
# Assumes the label column is encoded as 0/1 - confirm against the CSVs.
cm_II_quad4_A = confusion_matrix(labels_II_quad4_A, model_II_A_pred, labels=[0, 1])
cm_IX_quad4_A = confusion_matrix(labels_IX_quad4_A, model_IX_A_pred, labels=[0, 1])

df_cm_II_quad4_A = pd.DataFrame(cm_II_quad4_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_IX_quad4_A = pd.DataFrame(cm_IX_quad4_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [34]:
# Display the II confusion matrix for quadrant IV (rows: actual class, columns: predicted class)
df_cm_II_quad4_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,68,7
Actual 1,0,0


In [35]:
# Display the IX confusion matrix for quadrant IV (rows: actual class, columns: predicted class)
df_cm_IX_quad4_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,57,18
Actual 1,0,0


In [36]:
# Combine the four per-quadrant II confusion matrices into one overall matrix
quadrant_cms_II_A = [cm_II_quad1_A, cm_II_quad2_A, cm_II_quad3_A, cm_II_quad4_A]
# Guard against silent NumPy broadcasting: a 1x1 matrix (single-class quadrant)
# would otherwise be added to every cell of the 2x2 total.
assert all(cm.shape == (2, 2) for cm in quadrant_cms_II_A), "every II quadrant confusion matrix must be 2x2"
# Element-wise sum over the quadrants
cm_II_A = np.sum(quadrant_cms_II_A, axis=0)
df_cm_II_A = pd.DataFrame(cm_II_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_II_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,137,13
Actual 1,60,90


In [37]:
# Combine the four per-quadrant IX confusion matrices into one overall matrix
quadrant_cms_IX_A = [cm_IX_quad1_A, cm_IX_quad2_A, cm_IX_quad3_A, cm_IX_quad4_A]
# Guard against silent NumPy broadcasting: a 1x1 matrix (single-class quadrant)
# would otherwise be added to every cell of the 2x2 total.
assert all(cm.shape == (2, 2) for cm in quadrant_cms_IX_A), "every IX quadrant confusion matrix must be 2x2"
# Element-wise sum over the quadrants
cm_IX_A = np.sum(quadrant_cms_IX_A, axis=0)
df_cm_IX_A = pd.DataFrame(cm_IX_A, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_IX_A

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,102,48
Actual 1,36,114
