In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore", category=ImportWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Load the rule-B training tables from Google Drive: one "active" and one
# "inactive" CSV per target (II and XII).
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook.
from pathlib import Path

# Single place to edit if the training datasets are moved.
TRAIN_DIR_B = Path('/content/drive/MyDrive/SelectivityProfile/TrainingDatasets')

train_II_active_B = pd.read_csv(TRAIN_DIR_B / 'II_active_95_filtered_rule_B.csv')
train_II_inactive_B = pd.read_csv(TRAIN_DIR_B / 'II_inactive_95_filtered_rule_B.csv')

train_XII_active_B = pd.read_csv(TRAIN_DIR_B / 'XII_active_95_filtered_rule_B.csv')
train_XII_inactive_B = pd.read_csv(TRAIN_DIR_B / 'XII_inactive_95_filtered_rule_B.csv')

In [4]:
# Load the rule-B test tables from Google Drive: four quadrant CSVs per
# target (II and XII).
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook.
from pathlib import Path

# Single place to edit if the rule-B test datasets are moved.
TEST_DIR_B = Path('/content/drive/MyDrive/SelectivityProfile/TestingDatasets/RuleB')

test_II_quad1_B = pd.read_csv(TEST_DIR_B / 'II_quad_I_B.csv')
test_II_quad2_B = pd.read_csv(TEST_DIR_B / 'II_quad_II_B.csv')
test_II_quad3_B = pd.read_csv(TEST_DIR_B / 'II_quad_III_B.csv')
test_II_quad4_B = pd.read_csv(TEST_DIR_B / 'II_quad_IV_B.csv')

test_XII_quad1_B = pd.read_csv(TEST_DIR_B / 'XII_quad_I_B.csv')
test_XII_quad2_B = pd.read_csv(TEST_DIR_B / 'XII_quad_II_B.csv')
test_XII_quad3_B = pd.read_csv(TEST_DIR_B / 'XII_quad_III_B.csv')
test_XII_quad4_B = pd.read_csv(TEST_DIR_B / 'XII_quad_IV_B.csv')

In [5]:
def training_data_preparation(active, inactive, random_state=None):
  """Merge the active and inactive training tables into shuffled X / y arrays.

  The column layout is positional: the first column is dropped, the last
  column is taken as the label, and every column in between is a feature.
  (testing_data_preparation reads that first column as SMILES.)

  Args:
    active: DataFrame loaded from an "active" training CSV.
    inactive: DataFrame loaded from the matching "inactive" training CSV;
      it is stacked under `active`, so it should share the column layout.
    random_state: Optional seed forwarded to DataFrame.sample. The default
      (None) keeps the original behavior of a different shuffle on every
      call; pass an int to make the row order reproducible.

  Returns:
    A (features, labels) tuple of NumPy arrays whose rows are aligned.
  """
  data = pd.concat([active, inactive])
  # frac=1 draws every row exactly once, i.e. a full shuffle of the table.
  data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
  features = data.iloc[:, 1:-1].values
  labels = data.iloc[:, -1].values
  return features, labels

In [6]:
from sklearn.model_selection import train_test_split

def training_data_preparation_grande(active, inactive, test_size=0.2, random_state=42):
  """Merge active/inactive tables, shuffle them and return a train/hold-out split.

  The column layout is positional: the first column is dropped, the last
  column is taken as the label, and every column in between is a feature.

  Args:
    active: DataFrame loaded from an "active" training CSV.
    inactive: DataFrame loaded from the matching "inactive" training CSV.
    test_size: Fraction (or absolute number) of rows held out, forwarded to
      train_test_split. Defaults to the previously hard-coded 0.2.
    random_state: Seed used for BOTH the row shuffle and the split. The
      split was already seeded with 42, but the shuffle before it was not,
      so the returned partitions still changed on every call. Pass None to
      get a fresh random shuffle and split.

  Returns:
    (X_train, X_test, y_train, y_test) NumPy arrays.
  """
  data = pd.concat([active, inactive])
  # Seed the shuffle too; otherwise the seeded split below operates on a
  # differently ordered table each run and is not reproducible.
  data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
  features = data.iloc[:, 1:-1].values
  labels = data.iloc[:, -1].values
  X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=test_size, random_state=random_state
  )
  return X_train, X_test, y_train, y_test

In [7]:
def testing_data_preparation(quad1):
  data = quad1
  smiles = data.iloc[:, 0].values
  features = data.iloc[:, 1:-1].values
  labels = data.iloc[:, -1].values
  return smiles, features, labels

In [8]:
# Initialize the rule-B models: one ExtraTreesClassifier per target (II and XII)
from sklearn.ensemble import ExtraTreesClassifier

# Fixed seed so the estimators' own randomness does not change between runs.
# NOTE: training_data_preparation still shuffles the rows without a seed.
MODEL_RANDOM_STATE = 42

model_II_B = ExtraTreesClassifier(random_state=MODEL_RANDOM_STATE)
model_XII_B = ExtraTreesClassifier(random_state=MODEL_RANDOM_STATE)

# Build the shuffled (features, labels) training arrays for each target
II_B_X_train, II_B_y_train = training_data_preparation(train_II_active_B, train_II_inactive_B)
XII_B_X_train, XII_B_y_train = training_data_preparation(train_XII_active_B, train_XII_inactive_B)

In [9]:
# Fit the II rule-B classifier on its shuffled training features and labels
model_II_B.fit(II_B_X_train, II_B_y_train)

In [10]:
# Fit the XII rule-B classifier on its shuffled training features and labels
model_XII_B.fit(XII_B_X_train, XII_B_y_train)

In [11]:
# Predict quadrant I with both models and build one confusion matrix per target

# Rule B (the *_B models and RuleB test sets): quadrants for the II / XII pair.
# A set that is all-active can only yield TP/FN; an all-inactive set only TN/FP.
# Quadrant I: II active, XII active -> II: TP or FN; XII: TP or FN
# Quadrant II: II active, XII inactive -> II: TP or FN; XII: TN or FP
# Quadrant III: II inactive, XII active -> II: TN or FP; XII: TP or FN
# Quadrant IV: II inactive, XII inactive -> II: TN or FP; XII: TN or FP
from sklearn.metrics import confusion_matrix

# Check 1st quadrant with the ExtraTrees models for II and XII
smiles_II_quad1_B, features_II_quad1_B, labels_II_quad1_B = testing_data_preparation(test_II_quad1_B)
smiles_XII_quad1_B, features_XII_quad1_B, labels_XII_quad1_B = testing_data_preparation(test_XII_quad1_B)
# NOTE: model_II_B_pred / model_XII_B_pred are reassigned by every quadrant cell,
# so they only ever hold the most recently evaluated quadrant.
model_II_B_pred = model_II_B.predict(features_II_quad1_B)
model_XII_B_pred = model_XII_B.predict(features_XII_quad1_B)

# Get TP, TN, FP, FN (labels=[0,1] keeps the matrix 2x2 even when only one
# class is present in the quadrant)
cm_II_quad1_B = confusion_matrix(labels_II_quad1_B, model_II_B_pred, labels=[0,1])
cm_XII_quad1_B = confusion_matrix(labels_XII_quad1_B, model_XII_B_pred, labels=[0,1])

# Wrap the raw matrices in labelled DataFrames for display (rows = actual, columns = predicted)
df_cm_II_quad1_B = pd.DataFrame(cm_II_quad1_B, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_XII_quad1_B = pd.DataFrame(cm_XII_quad1_B, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [12]:
# II model on quadrant I (II active per the quadrant definition: only the "Actual 1" row is expected to be populated)
df_cm_II_quad1_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,0
Actual 1,3,22


In [13]:
# XII model on quadrant I (XII active per the quadrant definition: only the "Actual 1" row is expected to be populated)
df_cm_XII_quad1_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,0
Actual 1,13,12


In [14]:
# Predict quadrant II with both models and build one confusion matrix per target

# Rule B (the *_B models and RuleB test sets): quadrants for the II / XII pair.
# A set that is all-active can only yield TP/FN; an all-inactive set only TN/FP.
# Quadrant I: II active, XII active -> II: TP or FN; XII: TP or FN
# Quadrant II: II active, XII inactive -> II: TP or FN; XII: TN or FP
# Quadrant III: II inactive, XII active -> II: TN or FP; XII: TP or FN
# Quadrant IV: II inactive, XII inactive -> II: TN or FP; XII: TN or FP
from sklearn.metrics import confusion_matrix

# Check 2nd quadrant with the ExtraTrees models for II and XII
smiles_II_quad2_B, features_II_quad2_B, labels_II_quad2_B = testing_data_preparation(test_II_quad2_B)
smiles_XII_quad2_B, features_XII_quad2_B, labels_XII_quad2_B = testing_data_preparation(test_XII_quad2_B)
# NOTE: model_II_B_pred / model_XII_B_pred are reassigned by every quadrant cell,
# so they only ever hold the most recently evaluated quadrant.
model_II_B_pred = model_II_B.predict(features_II_quad2_B)
model_XII_B_pred = model_XII_B.predict(features_XII_quad2_B)

# Get TP, TN, FP, FN (labels=[0,1] keeps the matrix 2x2 even when only one
# class is present in the quadrant)
cm_II_quad2_B = confusion_matrix(labels_II_quad2_B, model_II_B_pred, labels=[0,1])
cm_XII_quad2_B = confusion_matrix(labels_XII_quad2_B, model_XII_B_pred, labels=[0,1])

# Wrap the raw matrices in labelled DataFrames for display (rows = actual, columns = predicted)
df_cm_II_quad2_B = pd.DataFrame(cm_II_quad2_B, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_XII_quad2_B = pd.DataFrame(cm_XII_quad2_B, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [15]:
# II model on quadrant II (II active per the quadrant definition: only the "Actual 1" row is expected to be populated)
df_cm_II_quad2_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,0
Actual 1,12,13


In [16]:
# XII model on quadrant II (XII inactive per the quadrant definition: only the "Actual 0" row is expected to be populated)
df_cm_XII_quad2_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,16,9
Actual 1,0,0


In [17]:
# Predict quadrant III with both models and build one confusion matrix per target

# Rule B (the *_B models and RuleB test sets): quadrants for the II / XII pair.
# A set that is all-active can only yield TP/FN; an all-inactive set only TN/FP.
# Quadrant I: II active, XII active -> II: TP or FN; XII: TP or FN
# Quadrant II: II active, XII inactive -> II: TP or FN; XII: TN or FP
# Quadrant III: II inactive, XII active -> II: TN or FP; XII: TP or FN
# Quadrant IV: II inactive, XII inactive -> II: TN or FP; XII: TN or FP
from sklearn.metrics import confusion_matrix

# Check 3rd quadrant with the ExtraTrees models for II and XII
smiles_II_quad3_B, features_II_quad3_B, labels_II_quad3_B = testing_data_preparation(test_II_quad3_B)
smiles_XII_quad3_B, features_XII_quad3_B, labels_XII_quad3_B = testing_data_preparation(test_XII_quad3_B)
# NOTE: model_II_B_pred / model_XII_B_pred are reassigned by every quadrant cell,
# so they only ever hold the most recently evaluated quadrant.
model_II_B_pred = model_II_B.predict(features_II_quad3_B)
model_XII_B_pred = model_XII_B.predict(features_XII_quad3_B)

# Get TP, TN, FP, FN (labels=[0,1] keeps the matrix 2x2 even when only one
# class is present in the quadrant)
cm_II_quad3_B = confusion_matrix(labels_II_quad3_B, model_II_B_pred, labels=[0,1])
cm_XII_quad3_B = confusion_matrix(labels_XII_quad3_B, model_XII_B_pred, labels=[0,1])

# Wrap the raw matrices in labelled DataFrames for display (rows = actual, columns = predicted)
df_cm_II_quad3_B = pd.DataFrame(cm_II_quad3_B, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_XII_quad3_B = pd.DataFrame(cm_XII_quad3_B, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [18]:
# II model on quadrant III (II inactive per the quadrant definition: only the "Actual 0" row is expected to be populated)
df_cm_II_quad3_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,22,3
Actual 1,0,0


In [19]:
# XII model on quadrant III (XII active per the quadrant definition: only the "Actual 1" row is expected to be populated)
df_cm_XII_quad3_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,0
Actual 1,6,19


In [20]:
# Predict quadrant IV with both models and build one confusion matrix per target

# Rule B (the *_B models and RuleB test sets): quadrants for the II / XII pair.
# A set that is all-active can only yield TP/FN; an all-inactive set only TN/FP.
# Quadrant I: II active, XII active -> II: TP or FN; XII: TP or FN
# Quadrant II: II active, XII inactive -> II: TP or FN; XII: TN or FP
# Quadrant III: II inactive, XII active -> II: TN or FP; XII: TP or FN
# Quadrant IV: II inactive, XII inactive -> II: TN or FP; XII: TN or FP
from sklearn.metrics import confusion_matrix

# Check 4th quadrant with the ExtraTrees models for II and XII
smiles_II_quad4_B, features_II_quad4_B, labels_II_quad4_B = testing_data_preparation(test_II_quad4_B)
smiles_XII_quad4_B, features_XII_quad4_B, labels_XII_quad4_B = testing_data_preparation(test_XII_quad4_B)
# NOTE: model_II_B_pred / model_XII_B_pred are reassigned by every quadrant cell,
# so they only ever hold the most recently evaluated quadrant.
model_II_B_pred = model_II_B.predict(features_II_quad4_B)
model_XII_B_pred = model_XII_B.predict(features_XII_quad4_B)

# Get TP, TN, FP, FN (labels=[0,1] keeps the matrix 2x2 even when only one
# class is present in the quadrant)
cm_II_quad4_B = confusion_matrix(labels_II_quad4_B, model_II_B_pred, labels=[0,1])
cm_XII_quad4_B = confusion_matrix(labels_XII_quad4_B, model_XII_B_pred, labels=[0,1])

# Wrap the raw matrices in labelled DataFrames for display (rows = actual, columns = predicted)
df_cm_II_quad4_B = pd.DataFrame(cm_II_quad4_B, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
df_cm_XII_quad4_B = pd.DataFrame(cm_XII_quad4_B, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [21]:
# II model on quadrant IV (II inactive per the quadrant definition: only the "Actual 0" row is expected to be populated)
df_cm_II_quad4_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,23,2
Actual 1,0,0


In [22]:
# XII model on quadrant IV (XII inactive per the quadrant definition: only the "Actual 0" row is expected to be populated)
df_cm_XII_quad4_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,21,4
Actual 1,0,0


In [23]:
# Aggregate the four per-quadrant II confusion matrices element-wise into one
# overall 2x2 matrix, then label it for display.
cm_II_B = sum((cm_II_quad1_B, cm_II_quad2_B, cm_II_quad3_B, cm_II_quad4_B))
df_cm_II_B = pd.DataFrame(
  data=cm_II_B,
  index=["Actual 0", "Actual 1"],
  columns=["Predicted 0", "Predicted 1"],
)
df_cm_II_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,45,5
Actual 1,15,35


In [24]:
# Aggregate the four per-quadrant XII confusion matrices element-wise into one
# overall 2x2 matrix, then label it for display.
cm_XII_B = sum((cm_XII_quad1_B, cm_XII_quad2_B, cm_XII_quad3_B, cm_XII_quad4_B))
df_cm_XII_B = pd.DataFrame(
  data=cm_XII_B,
  index=["Actual 0", "Actual 1"],
  columns=["Predicted 0", "Predicted 1"],
)
df_cm_XII_B

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,37,13
Actual 1,19,31
