# **Installation of the RDKit**

In [None]:
import torch
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, AllChem
# Ask the RDKit notebook integration to draw molecules as SVG.
IPythonConsole.ipython_useSVG=True 
# Smoke test: parse one SMILES string (caffeine); as the last expression of
# the cell, the resulting molecule is displayed.
Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C")




---


# Method configuration

In [None]:
# Center output.
# NOTE: only the stylesheet text is defined here. The HTML(...) call that would
# inject it into the page is commented out, so CSS currently has no effect.
#from IPython.display import display, HTML
CSS = """
.output {
    align-items: center;
}
"""
#HTML('<style>{}</style>'.format(CSS))

# Enables large output display
#from IPython.core.display import display, HTML
#display(HTML("<style>div.output_scroll { height: 44em; }</style>"))

#from google.colab import data_table
#data_table.enable_dataframe_formatter()

from IPython.display import display, HTML

def show(obj):
    """Display ``obj`` in the notebook as raw HTML.

    ``obj`` must provide ``to_html(escape=...)`` (e.g. a pandas DataFrame).
    The markup is generated with ``escape=False`` and handed to IPython's
    ``display`` wrapped in an ``HTML`` object.
    """
    markup = obj.to_html(escape=False)
    display(HTML(markup))


In [None]:
from IPython.display import display
import FLuID as fluid
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

###
# Consortium size
k = 8

# Single configuration dictionary handed to the fluid helpers.
params = {
    # experiment details
    'details': 4,  # level of detail of the experiment (low=1, medium=2, high=3, full=4)

    # data files
    'training_data_file': 'hERG_lhasa_training',
    'test_data_file': 'hERG_lhasa_test',
    'transfer_data_file': 'FLuID_full',
    'fluid_label_file': 'FLuID_labels',

    # data sampling
    'validation_ratio': 0.2,  # ratio validation/training
    'transfer_size': 50000,   # sample for the transfer data (-1 = all)
    'test_size': -1,          # sample for the test data (-1 = all)
    'training_size': -1,      # sample for the training data (-1 = all)

    # number of teachers/clusters (kMean)
    'k': k,                   # number of clusters (kMean)
    'smooth_factor': 0.05,    # level of post-clustering mixing to avoid fully biased teachers

    # teachers
    'teacher_algorithm': 'rf',  # algorithm used to build the teacher models

    # students
    'federated_student': f"F{k}",
    'student_size': 10000,      # size of the student (number of labelled Cronos data used)
    'student_sizes': [100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000],  # student sizes used to study the impact of the size
    'student_algorithm': 'rf',  # default algorithm used to build the student models
    'student_mode': 'balanced', # default mode used to select the student data

    # random seed for reproducibility
    'random_state': 42,

    # t-SNE settings
    'tsne_size': 500,
    'tsne_iterations': 1000,

    # replication level
    'replicate_count': 3,

    # fonts
    'figure_font': {"family": "Arial", "size": 14, "color": "black"},
    'small_figure_font': {"family": "Arial", "size": 10, "color": "black"},

    # colors
    'figure_color_scale': [(0, "red"), (0.2, "orange"), (0.3, 'yellow'), (1, 'green')],
    'bar_colors': px.colors.qualitative.Prism,
    'green_map': plt.get_cmap('Greens'),
}

In [None]:
import importlib
# Re-import the local FLuID module so edits to its source are picked up
# without restarting the kernel.
importlib.reload(fluid)



---
# **From Private Data to Public Knowledge**
---

## Abstract

In this notebook, we demonstrate data-driven federated learning using the Teacher-Student approach to transfer knowledge from confidential data into public knowledge whilst preserving the privacy of the source data.

We use the hERG endpoint to validate the concept in the context of a classification task. 

- The sources of knowledge to federate are simulated by clustering a set of curated ChEMBL data into subspaces representing different chemical domains

- Using a k-means algorithm and FCFP2 as the domain representation, we split the ChEMBL data into k different clusters, each cluster representing a virtual member of the federation.

- Next, we train k teacher models from the associated cluster data. These teacher models represent private models that can be trained within the private space of each federation member.

- We then annotate a large collection (100k) of unlabelled public data using these teachers (one label per teacher), leading to a matrix of k labels for each structure. The collection of unlabelled data is called the transfer data.

- We consolidate the k labels into a single federated label using a weighted average scheme over the class probabilities across the k labels. This step results in a new dataset of 100k structures annotated with a federated label; this label can be seen as a surrogate for an experimental label.

- Next we build a student model using the annotated transfer data as a training set

- In order to validate the approach, we compare the performance of the teacher models with the student model. More specifically, we will use the Matthews Correlation Coefficient (MCC) as the accuracy metric and compare the average teacher performance with the student performance when validated against an external test set.

- If the student is performing better than the average teacher performance we consider the concept as validated

The idea is to demonstrate that although the teachers capture source knowledge about different chemical spaces (cluster separation), the student will be able to federate the knowledge across the teachers and outperform individual teachers when validated against an external  dataset 

We will also look at different aspects of this experiment like the individual contribution of each teacher, the impact of the number of teachers, the size of the transfer data set, etc.

## Data preparation

We will use three datasets:

- Public ChEMBL hERG data as a source of knowledge for the teachers
- Preissner as benchmark data from the [Preissner et al. paper](https://pubs.acs.org/doi/10.1021/acs.jcim.8b00150)
- Cronos as transfer data

We are going to use ChEMBL hERG data as the source for the teachers' data. The ChEMBL data will be clustered to simulate different sources (teachers) covering distinct chemical spaces. Duplicates are removed.

As we load the datasets we create an activity table and compute the fingerprints

We use the Preissner dataset as an external test set to benchmark our teachers and students. Preissner compounds overlapping with ChEMBL data are removed. We also make sure there are no internal duplicates (including stereo-isomers and tautomers).

Cronos is our dataset prepared from 100 million PubChem structures sampled down to 1M and further tiled down to 350k structures that evenly cover the PubChem space and hence represent a large homogeneous chemical space.

In [None]:
####
# Load and prepare the three data sets (training, test, transfer).
# If force_sdf is True we force reloading from the SDF files.
force_sdf = False 
training_full = fluid.load_training_data(params, force_sdf)
# The test loader also receives the training data. Presumably this is where
# test compounds overlapping with the training set are removed, as described
# in the text above; confirm in fluid.
test_full = fluid.load_test_data(training_full, params, force_sdf)
transfer_full = fluid.load_transfer_data(params, force_sdf)

Sampling the data according to the experiment configuration:

In [None]:
####
# Sample the desired size of data according to the experiment parameters.
# Four data sets come back: training, test, transfer and validation.
training_data, test_data, transfer_data, validation_data = fluid.sample_data(training_full, test_full, transfer_full, params)

## Creating Teachers
### Teacher source space

We use the ChEMBL training dataset as the source space for the teachers. 
Here we see the global activity distribution of the source data.

Use the ChEMBL data as the source data for the experiment. This data is going to be split in training and validation sets. The training set will be the base of the simulation (split in 'k' individual member data sets).

In [None]:
%matplotlib inline
import plotly.io as pio
# Make "iframe" the default plotly renderer for every figure shown below.
pio.renderers.default = "iframe"

In [None]:
####
# Use the training data as teacher source space
# and plot the class distribution in this space.
# teacher_space is an alias of training_data (same object, no copy is made).
teacher_space = training_data
fluid.display_distribution(teacher_space,'ACTIVITY',"Source space class distribution")

### Training space
The training data will be used as the teacher domain space. This space will be divided in 'k' individual member spaces.

In [None]:
# Distribution of the 'ACTIVITY' column over the training data.
fluid.display_distribution(training_data,'ACTIVITY',"ChEMBL training space")

### Validation space
The validation data will be used to validate the models within the original source space.

In [None]:
# Distribution of the 'ACTIVITY' column over the validation data.
fluid.display_distribution(validation_data,'ACTIVITY',"ChEMBL validation space")

### Test space
The Preissner data will be used as the external validation space. It is independent from the original ChEMBL domain.

In [None]:
# Distribution of the 'ACTIVITY' column over the external test data.
fluid.display_distribution(test_data,'ACTIVITY',"Preissner test space")

### Create teacher data

Next we create k teachers using k clusters from the ChEMBL training data using kMean clustering.
We use the scikit-learn k-means implementation. *Unfortunately the default and only distance metric in this version is Euclidean but ideally we would like to work with the Tanimoto similarity.*

As we can see in the bar plot below, the k clusters are distributed into different sizes and different bias between active and inactive compounds. This gives us a good variety of virtual contributors to the federated model and fits with a real world situation.

We extract the teacher data from the ChEMBL clusters; each cluster provides the data for one individual teacher. We have added one special teacher T0 (not used in the federation process) that is trained on all the ChEMBL data. T0 will allow us to have a reference point to measure the performance we would obtain if we directly build a model using all the data, in other words, this would be the best performance (upper bound) we can expect if we had access to all the private data without confidentiality protection nor need for federation.

In [None]:
####
# Cluster the source data into k teacher training sets.
# Later cells index the result from 0 to k; per the text above, index 0 is the
# reference teacher T0 built on all the data (presumably added by this call).
teacher_data = fluid.cluster_data_space(teacher_space, 'Teacher', 'T', params['k'], params['smooth_factor'], params)


### Teacher space projection

We can see that the k clusters occupy separate chemical spaces, which will make the experiment challenging: "Will the federated student be able to rebuild a global chemical space knowledge by learning from the relatively disjoint teacher chemical spaces?"

We can observe that some clusters are strongly biased towards a specific class whilst others are more balanced. This again gives us a good simulation of the real world.

In [None]:
# Project the training data, presumably coloured by teacher cluster
# (inferred from the helper's name; confirm in fluid).
fluid.project_teacher_cluster_space(training_data, params)

In [None]:
# Same training data, presumably coloured by activity class
# (inferred from the helper's name; confirm in fluid).
fluid.project_teacher_activity_space(training_data, params)

### Transfer space projection

Here we can verify that the training, test and transfer spaces overlap. Also the test space is relatively well spread across the training space whilst being external.

In [None]:
# Plot the training, test and transfer sets together to check their overlap.
fluid.plot_transfer_space(training_data, test_data, transfer_data, params)

In [None]:
# Each entry is (data set, label, number). The number is presumably how many
# points are sampled for the plot; confirm in fluid.plot_data_space.
fluid.plot_data_space([(transfer_data, "transfer", 1000), (training_data,"training", 500)], params)

In [None]:
# Transfer data plus one entry per federated teacher T1..Tk (index 0 is skipped).
# The third tuple element is presumably a per-set sample size; confirm in fluid.
datasets = [
    (transfer_data, "transfer", 5000),
    *((teacher_data[idx], f"T{idx}", 100) for idx in range(1, params['k'] + 1)),
]

fluid.plot_data_space(datasets, params)

In [None]:
# Transfer data against the data of teachers T1..Tk pooled into one frame.
datasets = [
    (transfer_data, "transfer", 1000),
    (
        pd.concat([teacher_data[idx] for idx in range(1, params['k'] + 1)]),
        "teachers",
        500,
    ),
]

fluid.plot_data_space(datasets, params)

### Building the teacher models

We use the selected machine learning algorithm to build a model for each teacher.

In [None]:
# Train one model per teacher data set. Later cells read teacher_models[i][2]
# as the training frame of teacher i, so each entry is an indexable record.
teacher_models = fluid.build_teacher_models(teacher_data, params)

### Teachers internal cross-validation

Not surprisingly, the teachers' internal validation displays relatively good performance, with an MCC ~= 0.5

In [None]:
# Internal cross-validation of each teacher on its own data.
fluid.cross_validate_teachers(teacher_data, params)

### Teachers  validation on the ChEMBL space

The teachers are first validated against the held-out ChEMBL validation set, i.e. within the original source space. This gives a reference point for the external Preissner validation in the next section.

In [None]:
# Validate every teacher model on the validation split.
# NOTE: both result names are rebound by the test-set validation further down.
teacher_validation_table, teacher_average_table = fluid.validate_teachers(teacher_models, validation_data, params)

### Teachers  validation on the Preissner space

The performance of the teacher against the external Preissner test set allows us to conclude that the teacher models slightly overfit in their respective focused chemical space. Their performance is on average lower when tested against a more global chemical space as in the Preissner test set.

In [None]:
# Validate every teacher model on the external test set.
# NOTE: this rebinds teacher_validation_table / teacher_average_table, so the
# validation-split results computed earlier are no longer reachable by name.
teacher_validation_table, teacher_average_table = fluid.validate_teachers(teacher_models, test_data, params)

## Annotating the transfer data

### Labeling the FLuID transfer data

When annotating the transfer data, we associate each public structure with a hERG predicted label. Each label is expressed as a probability distribution between the ACTIVE and INACTIVE classes and it is therefore interesting to analyse how these two classes are distributed over the whole transfer data.

In [None]:
# force_annotation is passed straight to the annotation helper. Presumably
# True recomputes the teacher labels instead of reusing stored ones (compare
# 'fluid_label_file' in params); confirm in fluid.
force_annotation = True
label_table = fluid.annotate_transfer_data(transfer_data, teacher_models, teacher_data, params, force_annotation)

### Federating the labels

In [None]:
# Consolidate the per-teacher labels into the federated label.
# NOTE: label_table is rebound to the result, so re-running this cell feeds
# the already federated table back in.
label_table = fluid.federate_teacher_annotations(label_table, params)

In [None]:
# 800 and 600 are presumably the figure width and height; confirm in fluid.
fluid.plot_annotation_distributions(label_table, 800, 600, params)

In [None]:
# The return value is not captured; anything this helper shows or stores is a
# side effect (its behaviour is not visible from this notebook).
fluid.compute_teacher_probability_distributions(label_table, params)

In [None]:
# 900 and 800 are presumably the figure width and height; confirm in fluid.
fluid.plot_confidence_distributions(label_table, 900,800, params)

## Building the student training data
We now have a full set of non-sensitive transfer data annotated by a federation of k teachers. We could use all this data to build a student model, or we can ask ourselves if it is preferable to select a subset of data that would allow us to reduce the cost of building the student model and potentially to improve the performance of the student by selecting only the *best* data. We can therefore define what a good data point is and on that basis explore the performance of the student as the number of selected points increases.

### Selecting the student data points

#### Assertiveness ('best') 
The most intuitive criterion for selecting student training data from the annotated transfer data is the assertiveness of the label associated with the data. The label is the result of the consolidation of the individual labels provided by the individual teachers. The probability distribution of the class is a good indicator of the assertiveness resulting from the ensemble of teachers. If the probability of the most likely class is close to 1.0 (100%), then all the teachers agreed and were themselves assertive, and we could think that this data point is worth contributing to the training of our student. On the other hand, if this probability is close to 0.5 (50%), either the teachers were in disagreement or they were not assertive, or both, and the data point does not seem to be valuable. We will therefore in a first instance select the data points with the highest probability of the most likely class. The drawback of this selection mode is that the most confident labels are not distributed evenly across the FLuID space, and this creates a weaker coverage of the chemical space.

#### Max diversity ('random', 'balanced') 
Another criterion can be to maximize the coverage of the chemical space. Since the FLuID space is diverse and homogeneous, we can simply pick points randomly in this space ('random'). The drawback of this selection method is that the decidability level of the points is arbitrary and the classes may not be balanced. The latter is taken into account by taking the same number of instances from each class ('balanced').

#### Assertive and diverse ('mixed')
If we want to combine assertiveness with diversity, we can randomly select a batch of points and subsequently cluster these points in separate chemical spaces; then we can select from each cluster a subset of the most assertive points. We repeat this operation until we have collected the desired number of FLuID points. This approach selects assertive and diverse points from the FLuID space.

#### Non-assertiveness ('worst')
In order to have a baseline, we introduce a selection method that selects the least assertive data points in the FLuID space. This will combine the weaknesses of 'best' and 'random' and shall provide a good baseline to improve upon.

In [None]:
# Benchmark on the validation split. Four results come back: the student and
# teacher validation tables plus the hybrid data sets and hybrid models.
# NOTE: teacher_validation_table is rebound here (it was first set by
# validate_teachers).
student_validation_table, teacher_validation_table, hybrid_data, hybrid_models = fluid.benchmark(label_table, teacher_models, validation_data, params)

In [None]:
# Derive the student data for the federated student named in params
# (params['federated_student'], i.e. 'F' followed by k) from the label table.
federated_data = fluid.add_class_to_student_data(label_table, params['federated_student'])

In [None]:
# Cross-domain validation involving the teacher models, the federated data
# and the external test set (details live in fluid; not visible here).
fluid.cross_domain_validate(teacher_models, federated_data, test_data, params)

In [None]:
# Impact of the number of teachers, measured on the validation split.
fluid.benchmark_teacher_count(label_table, teacher_validation_table, validation_data, params)

In [None]:
# Applicability-domain (AD) benchmark of teachers vs. hybrids on the
# validation split, using radius 2.
fluid.ADbenchmark(teacher_data, hybrid_data, teacher_models, hybrid_models, validation_data, radius = 2)

In [None]:

# Build "hybrid" training sets (each teacher's own data plus one shared
# selection of federated transfer data) and train one classifier per set.

# Teachers T1..Tk, derived from the configured consortium size instead of a
# hard-coded 8 so the list stays in sync with params['k'].
teachers = list(range(1, params['k'] + 1))

JPselected_data = fluid.select_federated_data(label_table, teachers, 10000, params)

print("teacher")

JPhybrid_data = []
for data in teacher_data:
    print(data.shape)
    # Every teacher gets the same federated selection appended to its data.
    JPhybrid_data.append(pd.concat([data, JPselected_data]))

print(len(JPhybrid_data))

print("hybrid")
for data in JPhybrid_data:
    print(data.shape)

print("learning models")
JPhybrid_models = [fluid.create_trained_classifier('rf', data) for data in JPhybrid_data]

print(len(JPhybrid_models))
                           
    

In [None]:

# AD benchmark of the teachers against the JP hybrids built in the previous
# cell; the hybrid data and the hybrid models passed here belong together.
fluid.ADbenchmark(teacher_data, JPhybrid_data, teacher_models, JPhybrid_models, validation_data, radius = 2)

In [None]:
# Validate hybrid model 1 on the validation split and record it as 'H1',
# together with the size of its training set, in a fresh validation table.
fluid_new_validation_table = fluid.add_classifier_validation(
    fluid.create_validation_table(),
    hybrid_models[1],
    validation_data,
    'H1',
    len(hybrid_data[1]),
)

display(fluid_new_validation_table)


In [None]:

# "Double hybrid" experiment: each teacher's data plus the federated selection
# added twice, which doubles the weight of the federated points.
JP_doubelhybrid = []

for data in teacher_data:
    print(data.shape)
    JP_doubelhybrid.append(pd.concat([data, JPselected_data, JPselected_data]))

double_h_models = []

for data in JP_doubelhybrid:
    double_h_models.append(fluid.create_trained_classifier('rf', data))
    print(data.shape)

# Benchmark the models against the data they were actually trained on. The
# original passed hybrid_data, which belongs to a different set of models.
fluid.ADbenchmark(teacher_data, JP_doubelhybrid, teacher_models, double_h_models, validation_data, radius = 2)

In [None]:
# Pick up any edits made to the fluid module before retraining.
importlib.reload(fluid)

# Retrain one 'rf' classifier per entry of hybrid_data.
h_models = []

for data in hybrid_data:
    h_models.append(fluid.create_trained_classifier('rf', data))
    print(data.shape)
    
print("length of hybrid models: " + str(len(h_models)))

# AD benchmark of the teachers against the freshly retrained hybrid models.
fluid.ADbenchmark(teacher_data, hybrid_data, teacher_models, h_models, validation_data, radius = 2)

In [None]:
import pandas as pd

# Rebuild the hybrid data sets and models by hand from the experiment settings.
k = params['k']
mode = params['student_mode']
size = params['student_size']
student = params['federated_student']
algorithm = params['teacher_algorithm']

student_data = fluid.select_student_data(label_table, student, size, mode, params)
# Binary target: 1 for 'Active', 0 for anything else in the student's class column.
student_data['CLASS'] = [1 if c == 'Active' else 0 for c in student_data['C-' +  student]]

hybrid_data_list_made = []
hybrid_models_made = []

for i in range(0, k + 1):
    # Loop-local names on purpose: the original reused `teacher_data` and
    # `hybrid_data` here, which replaced the notebook-level lists with single
    # DataFrames and broke every later cell relying on them.
    teacher_frame = teacher_models[i][2]
    hybrid_frame = pd.concat([student_data, teacher_frame])
    hybrid_data_list_made.append(hybrid_frame)
    hybrid_models_made.append(fluid.create_trained_classifier(algorithm, hybrid_frame))

print(len(hybrid_models_made))
        

In [None]:
importlib.reload(fluid)
# Benchmark the hand-built hybrid models against the hybrid data sets they
# were trained on (hybrid_data_list_made), not against `hybrid_data`.
fluid.ADbenchmark(teacher_data, hybrid_data_list_made, teacher_models, hybrid_models_made, validation_data, radius = 2)

In [None]:
# Reload fluid, then run the AD benchmark on the validation split using the
# hybrid models currently bound to `hybrid_models`.
importlib.reload(fluid)
fluid.ADbenchmark(teacher_data, hybrid_data, teacher_models, hybrid_models, validation_data, radius = 2)

In [None]:
# Same benchmark as before, now against the external test set.
# NOTE: this rebinds all four result names, so the validation-split results
# are replaced from here on.
student_validation_table, teacher_validation_table, hybrid_data, hybrid_models = fluid.benchmark(label_table, teacher_models, test_data, params)

In [None]:
# Reload fluid, then run the AD benchmark on the external test set.
importlib.reload(fluid)
fluid.ADbenchmark(teacher_data, hybrid_data, teacher_models, hybrid_models, test_data, radius = 2)

In [None]:
# Impact of the number of teachers, measured on the external test set.
# NOTE(review): the validation-split call of this helper passes
# teacher_validation_table as second argument, whereas this call passes
# student_validation_table[0]. Confirm which one the helper expects.
fluid.benchmark_teacher_count(label_table, student_validation_table[0], test_data, params)

### Impact of the student training size 

Here we study the impact of the number of federated datapoints used to train the student.

#fluid.benchmark_student_size(label_table, training_data, test_data, params)

### Applicability Domain

Here we can see...

In [None]:
importlib.reload(fluid)

# Feature dictionaries (FD) for teachers, student and hybrids, all computed
# with radius 2, followed by the matching domain columns on the test set.

print("calculating FD for all the teachers")
teacherFD = fluid.ADCalculateFD(teacher_data, 2)

fluid.ADPrintFDSize(teacherFD)


print("calculating FD for the students")
# Single-element list: ADCalculateFD is called with a list of data sets.
student_list = []

student_list.append(fluid.select_student_data(label_table, params['federated_student'], params['student_size'], params['student_mode'], params))

# Pass the radius explicitly, like every other AD call in this cell, so the
# student FD matches the radius used when its domain column is appended below.
studentFD = fluid.ADCalculateFD(student_list, 2)

fluid.ADPrintFDSize(studentFD)

print("Calculating FD for the hybrids")

hybridFD = fluid.ADCalculateFD(hybrid_data, 2)

fluid.ADPrintFDSize(hybridFD)

# The calls below add the Domain_T<i> / Domain_H<i> columns that later cells
# read back from test_data.
print("Appending domain columns for teacher")
fluid.ADAppendDomainColumn(test_data, teacherFD, "Domain_T", 2)

print("Appending domain columns for hybrid")
fluid.ADAppendDomainColumn(test_data, hybridFD, "Domain_H", 2)

print("Appending domain columns for student")
fluid.ADAppendDomainColumn(test_data, studentFD, "Domain_S", 2)
                                           
                     

In [None]:

# For every teacher/hybrid pair (index 0..k), keep the test compounds that are
# outside the teacher's domain but inside the hybrid's domain.
domain_test = []

for i in range(0, params['k'] + 1):
    teacherColName = "Domain_T"+ str(i)
    hybridColName = "Domain_H" + str(i)
    # Combine both conditions into one mask over the full frame. The original
    # chained two boolean selections, applying a mask built on the full frame
    # to an already filtered frame.
    outside_teacher = test_data[teacherColName] == False
    inside_hybrid = test_data[hybridColName] == True
    domain_test.append(test_data[outside_teacher & inside_hybrid])
    print(len(domain_test[i]))
    

In [None]:
importlib.reload(fluid)

# Nine references to the same test frame (no copies are made), one per
# teacher/hybrid pair, so ADValidate scores each pair on the whole test set.
test_data_multiplex = [test_data] * 9

display(fluid.ADValidate(test_data_multiplex))

In [None]:
importlib.reload(fluid)

# Validate on the per-pair subsets held in domain_test.
val_table = fluid.ADValidate(domain_test)

display(val_table)

          
                     

In [None]:
importlib.reload(fluid)

# Test compounds inside each teacher's domain (index 0..k).
teacher_in_data = []

for i in range(0, params['k'] + 1):
    teacherColName = "Domain_T" + str(i)
    teacher_in_data.append(test_data[test_data[teacherColName] == True])

print(len(teacher_in_data))

# Test compounds inside each hybrid's domain (index 0..k).
hybrid_in_data = []

for i in range(0, params['k'] + 1):
    hybridColName = "Domain_H" + str(i)
    hybrid_in_data.append(test_data[test_data[hybridColName] == True])

# Report the list that was just built (the original printed teacher_in_data twice).
print(len(hybrid_in_data))

                                                     
                     

In [None]:
importlib.reload(fluid)

# Validation restricted to the test compounds inside each teacher's domain.
print("Teacher:")

display(fluid.ADValidate(teacher_in_data))

# Validation restricted to the test compounds inside each hybrid's domain.
print("Hybrid:")

display(fluid.ADValidate(hybrid_in_data))


In [None]:
# Same applicability-domain preparation as for the test set, now applied to
# the validation split.
print("Appending domain columns for teacher")
fluid.ADAppendDomainColumn(validation_data, teacherFD, "Domain_T", 2)

print("Appending domain columns for hybrid")
fluid.ADAppendDomainColumn(validation_data, hybridFD, "Domain_H", 2)

print("Appending domain columns for student")
fluid.ADAppendDomainColumn(validation_data, studentFD, "Domain_S", 2)

# Shape before and after the prediction columns are appended. Report the
# frame this cell works on (the original printed test_data.shape).
print(validation_data.shape)

fluid.ADAppendPredictionColumn(validation_data, teacher_models, "Predicted_T")

fluid.ADAppendPredictionColumn(validation_data, hybrid_models, "Predicted_H")

print(validation_data.shape)

In [None]:
# For every teacher/hybrid pair (index 0..k), keep the validation compounds
# that are outside the teacher's domain but inside the hybrid's domain.
domain_validate_test = []

for i in range(0, params['k'] + 1):
    teacherColName = "Domain_T"+ str(i)
    hybridColName = "Domain_H" + str(i)
    # One combined mask over the full frame instead of two chained boolean
    # selections with a mask that no longer matches the filtered frame.
    outside_teacher = validation_data[teacherColName] == False
    inside_hybrid = validation_data[hybridColName] == True
    domain_validate_test.append(validation_data[outside_teacher & inside_hybrid])
    print(len(domain_validate_test[i]))

In [None]:
# Nine references to the same validation frame (no copies are made), one per
# teacher/hybrid pair, so ADValidate scores each pair on the whole split.
validation_data_multiplex = [validation_data] * 9

display(fluid.ADValidate(validation_data_multiplex))

In [None]:
# Validation compounds inside each teacher's domain (index 0..k).
teacher_val_in_data = []

for i in range(0, params['k'] + 1):
    teacherColName = "Domain_T" + str(i)
    teacher_val_in_data.append(validation_data[validation_data[teacherColName] == True])

# Report the list built above (the original printed the test-set list
# teacher_in_data here).
print(len(teacher_val_in_data))

# Validation compounds inside each hybrid's domain (index 0..k).
hybrid_val_in_data = []

for i in range(0, params['k'] + 1):
    hybridColName = "Domain_H" + str(i)
    hybrid_val_in_data.append(validation_data[validation_data[hybridColName] == True])

print(len(hybrid_val_in_data))

In [None]:

# Validation restricted to the validation compounds inside each teacher's domain.
print("Teacher:")

display(fluid.ADValidate(teacher_val_in_data))

# Validation restricted to the validation compounds inside each hybrid's domain.
print("Hybrid:")

display(fluid.ADValidate(hybrid_val_in_data))