In [13]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import pickle
import os
import fcd
from fcd import get_fcd, load_ref_model,canonical_smiles, get_predictions, calculate_frechet_distance
# Seed NumPy's global RNG so the random sub-sampling done later with
# np.random.choice gives the same samples on every Restart & Run All.
np.random.seed(0)

# Restrict CUDA to the first GPU (device index 0).
# NOTE(review): this is set after `fcd` has been imported; presumably the
# backend only reads the variable when it first initializes CUDA — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]= '0' #set gpu

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Load the reference model once; later cells pass it to get_predictions / get_fcd.
model = load_ref_model()

### Load and prepare data
Take care when preparing data.
- Different canonicalizations change the FCD, so always use RDKit canonicalization.
- If the generated "molecules" contain invalid SMILES, decide whether you want to include them in the FCD.
- Keep sample sizes consistent between the sets you compare, because the FCD varies with sample size.

In [15]:
# Load the generated molecules: the input file contains one generated SMILES per line.
gen_mol_file = "generated_smiles/LSTM_Segler.smi"
gen_mol = pd.read_csv(gen_mol_file, header=None)[0]

# IMPORTANT: take at least 10000 molecules, as the FCD can vary with sample size.
N_SAMPLES = 10000

# Fail early with an actionable message instead of the generic error that
# np.random.choice(..., replace=False) raises when the pool is too small.
if len(gen_mol) < N_SAMPLES:
    raise ValueError(
        "{} contains only {} SMILES; at least {} are required to draw a "
        "sample without replacement.".format(gen_mol_file, len(gen_mol), N_SAMPLES)
    )

# Draw two random subsets from the same pool. Each subset is drawn without
# replacement, but the two subsets may share molecules with each other.
sample1 = np.random.choice(gen_mol, N_SAMPLES, replace=False)
sample2 = np.random.choice(gen_mol, N_SAMPLES, replace=False)

# Canonicalize the SMILES and drop the invalid ones (they come back as None).
# NOTE: after filtering, each list may hold fewer than N_SAMPLES entries.
can_sample1 = [w for w in canonical_smiles(sample1) if w is not None]
can_sample2 = [w for w in canonical_smiles(sample2) if w is not None]

RDKit ERROR: [11:11:12] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 6 N, 5, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [11:11:12] Explicit valence for ato

## Calculation of FCD

In [16]:
# Get the ChemNet (reference model) activations of the generated molecules and
# summarize each sample by the mean vector and covariance matrix of its activations.
act1 = get_predictions(model, can_sample1)
mu1, sigma1 = np.mean(act1, axis=0), np.cov(act1.T)

act2 = get_predictions(model, can_sample2)
mu2, sigma2 = np.mean(act2, axis=0), np.cov(act2.T)

# The FCD is the Frechet distance computed from those two pairs of statistics.
fcd_score = calculate_frechet_distance(mu1=mu1, sigma1=sigma1, mu2=mu2, sigma2=sigma2)

print('FCD: ',fcd_score)

FCD:  0.3338613001233881


In [17]:
"""if you don't need to store the activations you can also take a shortcut."""
# One-call variant: get_fcd takes the model and the two canonicalized SMILES
# lists and returns the score directly, without exposing the intermediate
# activations or their statistics.
fcd_score = get_fcd(model, can_sample1, can_sample2)

print('FCD: ',fcd_score)

FCD:  0.3338613001233881


In [18]:
"""This is what happens if you do not canonicalize the smiles"""
# Deliberately mismatched inputs: can_sample1 is canonicalized and filtered,
# whereas sample2 is the raw random sub-sample (not canonicalized, invalid
# SMILES not removed). Compare the resulting score with the canonicalized case.
fcd_score = get_fcd(model, can_sample1, sample2)

print('FCD: ',fcd_score)

FCD:  25.63927611890624
