### Baseline Metrics

In [None]:
from pref_opt_for_mols.metrics import internal_diversity, frac_unique, frac_valid, fcd_score
import pandas as pd

# Baseline comparison of the GPT and RNN base-model samples: validity,
# uniqueness, internal diversity, and FCD against the data/test.csv SMILES.
N_WORKERS = 8          # worker count passed to every metric call in this cell
FCD_DEVICE = "cuda:1"  # device string passed to fcd_score; needs a second GPU to exist

# Each CSV is expected to expose a "smiles" column; it is loaded as a plain list.
gpt_smiles, rnn_smiles, ref_smiles = [
    pd.read_csv(csv_path)["smiles"].tolist()
    for csv_path in (
        "generated_smiles/smiles-gpt-base-10k.csv",
        "generated_smiles/smiles-rnn-base-10k.csv",
        "data/test.csv",
    )
]

# Every metric is evaluated for the RNN sample first, then the GPT sample.
model_samples = (rnn_smiles, gpt_smiles)

rnn_int_div, gpt_int_div = [
    internal_diversity(sample, n_workers=N_WORKERS) for sample in model_samples
]
rnn_unique, gpt_unique = [
    frac_unique(sample, n_workers=N_WORKERS) for sample in model_samples
]
rnn_valid, gpt_valid = [
    frac_valid(sample, n_workers=N_WORKERS) for sample in model_samples
]
rnn_fcd, gpt_fcd = [
    fcd_score(sample, ref_smiles, device=FCD_DEVICE, n_workers=N_WORKERS)
    for sample in model_samples
]

# Plain-text report: one section per metric, values rounded to 3 decimals.
RULE = "--------------------------"
report_rows = [
    ("    Fraction Valid", rnn_valid, gpt_valid),
    ("    Fraction Unique", rnn_unique, gpt_unique),
    ("    Internal Diversity", rnn_int_div, gpt_int_div),
    ("    FCD Score", rnn_fcd, gpt_fcd),
]
# Sections end with a newline and are joined by one more, which yields the
# single blank line between sections; the report also starts with a newline.
results = "\n" + "\n".join(
    f"{title}\n{RULE}\nRNN: {round(rnn_score, 3)}\nGPT: {round(gpt_score, 3)}\n"
    for title, rnn_score, gpt_score in report_rows
)

print(results)

### Scaffold Matching Task

**Unsupervised fine-tuned model**

In [None]:
from pref_opt_for_mols.metrics import frac_contains_scaffold, internal_diversity, frac_unique, frac_valid, strip_invalid
from rdkit.Chem import MolFromSmiles, MolFromSmarts
import pandas as pd 

# Scaffold-matching metrics for the unsupervised fine-tuned GPT and RNN models.
#
# NOTE(review): the previous inline comment labelled this pattern "scaffold 5",
# but the files read below are the scaff_2 samples, so this is presumably
# scaffold 2 -- confirm. The scaffold-5 pattern is "*c1c[nH]c2ncnc(*)c12".
scaffold_smiles = "*NC(=O)c1cccc(*)c1"
# Parsed as SMARTS so it can be used as a substructure query.
scaffold_mol = MolFromSmarts(scaffold_smiles)

gpt_smiles, rnn_smiles = [
    pd.read_csv(csv_path)["smiles"].tolist()
    for csv_path in (
        "generated_smiles/smiles-gpt-scaff_2-10k.csv",
        "generated_smiles/smiles-rnn-scaff_2-10k.csv",
    )
]

# Metrics below are evaluated for the GPT sample first, then the RNN sample.
model_samples = (gpt_smiles, rnn_smiles)

# Subset of each sample that matches the scaffold query. strip_invalid is
# applied first; presumably it removes SMILES that MolFromSmiles cannot parse
# (otherwise HasSubstructMatch would be called on None) -- verify.
gpt_smiles_w_scaffold, rnn_smiles_w_scaffold = [
    [
        smi
        for smi in strip_invalid(sample)
        if MolFromSmiles(smi).HasSubstructMatch(scaffold_mol)
    ]
    for sample in model_samples
]

frac_contains_scaff_gpt, frac_contains_scaff_rnn = [
    frac_contains_scaffold(sample, scaffold_smiles) for sample in model_samples
]
frac_unique_gpt, frac_unique_rnn = map(frac_unique, model_samples)
frac_valid_gpt, frac_valid_rnn = map(frac_valid, model_samples)
int_div_gpt, int_div_rnn = map(internal_diversity, model_samples)

# Internal diversity restricted to the scaffold-containing molecules.
int_div_gpt_scaff, int_div_rnn_scaff = map(
    internal_diversity, (gpt_smiles_w_scaffold, rnn_smiles_w_scaffold)
)

# Plain-text report: one section per metric, values rounded to 3 decimals.
RULE = "--------------------------"
report_rows = [
    ("    Fraction Valid", frac_valid_rnn, frac_valid_gpt),
    ("    Fraction Unique", frac_unique_rnn, frac_unique_gpt),
    ("    Internal Diversity", int_div_rnn, int_div_gpt),
    ("Internal Diversity (Scaffold)", int_div_rnn_scaff, int_div_gpt_scaff),
    # The leading newline reproduces the two blank lines that precede this
    # final section in the printed report.
    ("\nFraction Contains Scaffold", frac_contains_scaff_rnn, frac_contains_scaff_gpt),
]
results = "\n" + "\n".join(
    f"{title}\n{RULE}\nRNN: {round(rnn_score, 3)}\nGPT: {round(gpt_score, 3)}\n"
    for title, rnn_score, gpt_score in report_rows
)

print(results)

**DPO fine-tuned model**

In [None]:
from pref_opt_for_mols.metrics import frac_contains_scaffold, internal_diversity, frac_unique, frac_valid, strip_invalid
from rdkit.Chem import MolFromSmiles, MolFromSmarts
import pandas as pd 

# Scaffold-matching metrics for the DPO fine-tuned GPT and RNN models
# (scaffold 5, read from the scaff_5-dpo-v2 sample files).
scaffold_smiles = "*c1c[nH]c2ncnc(*)c12"
# Parsed as SMARTS so it can be used as a substructure query.
scaffold_mol = MolFromSmarts(scaffold_smiles)

gpt_smiles, rnn_smiles = [
    pd.read_csv(csv_path)["smiles"].tolist()
    for csv_path in (
        "generated_smiles/smiles-gpt-scaff_5-dpo-v2-10k.csv",
        "generated_smiles/smiles-rnn-scaff_5-dpo-v2-10k.csv",
    )
]

# Metrics below are evaluated for the GPT sample first, then the RNN sample.
model_samples = (gpt_smiles, rnn_smiles)

# Subset of each sample that matches the scaffold query. strip_invalid is
# applied first; presumably it removes SMILES that MolFromSmiles cannot parse
# (otherwise HasSubstructMatch would be called on None) -- verify.
gpt_smiles_w_scaffold, rnn_smiles_w_scaffold = [
    [
        smi
        for smi in strip_invalid(sample)
        if MolFromSmiles(smi).HasSubstructMatch(scaffold_mol)
    ]
    for sample in model_samples
]

frac_contains_scaff_gpt, frac_contains_scaff_rnn = [
    frac_contains_scaffold(sample, scaffold_smiles) for sample in model_samples
]
frac_unique_gpt, frac_unique_rnn = map(frac_unique, model_samples)
frac_valid_gpt, frac_valid_rnn = map(frac_valid, model_samples)
int_div_gpt, int_div_rnn = map(internal_diversity, model_samples)

# Internal diversity restricted to the scaffold-containing molecules.
int_div_gpt_scaff, int_div_rnn_scaff = map(
    internal_diversity, (gpt_smiles_w_scaffold, rnn_smiles_w_scaffold)
)

# Plain-text report: one section per metric, values rounded to 3 decimals.
RULE = "--------------------------"
report_rows = [
    ("    Fraction Valid", frac_valid_rnn, frac_valid_gpt),
    ("    Fraction Unique", frac_unique_rnn, frac_unique_gpt),
    ("    Internal Diversity", int_div_rnn, int_div_gpt),
    ("Internal Diversity (Scaffold)", int_div_rnn_scaff, int_div_gpt_scaff),
    # The leading newline reproduces the two blank lines that precede this
    # final section in the printed report.
    ("\nFraction Contains Scaffold", frac_contains_scaff_rnn, frac_contains_scaff_gpt),
]
results = "\n" + "\n".join(
    f"{title}\n{RULE}\nRNN: {round(rnn_score, 3)}\nGPT: {round(gpt_score, 3)}\n"
    for title, rnn_score, gpt_score in report_rows
)

print(results)

## Med Chem Filtering

**Baseline model**

In [None]:
from pref_opt_for_mols.metrics import frac_contains_scaffold, internal_diversity, frac_unique, frac_valid, strip_invalid
import pandas as pd 

# Med-chem filter metrics for the baseline GPT and RNN samples. The
# "-filtered" CSVs are read for their "smiles" and "label" columns.
gpt_data, rnn_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        "generated_smiles/smiles-gpt-base-10k-filtered.csv",
        "generated_smiles/smiles-rnn-base-10k-filtered.csv",
    )
]
model_frames = (gpt_data, rnn_data)

# Mean of the "label" column, reported below as "Passes Filters"
# (presumably label is 1 for a molecule that passes -- confirm).
gpt_passes, rnn_passes = [frame["label"].mean() for frame in model_frames]

gpt_smiles, rnn_smiles = [frame["smiles"].tolist() for frame in model_frames]

# Metrics are evaluated for the GPT sample first, then the RNN sample.
model_samples = (gpt_smiles, rnn_smiles)
frac_unique_gpt, frac_unique_rnn = map(frac_unique, model_samples)
frac_valid_gpt, frac_valid_rnn = map(frac_valid, model_samples)
int_div_gpt, int_div_rnn = map(internal_diversity, model_samples)

# Plain-text report: one section per metric, values rounded to 3 decimals.
RULE = "--------------------------"
report_rows = [
    ("    Fraction Valid", frac_valid_rnn, frac_valid_gpt),
    ("    Fraction Unique", frac_unique_rnn, frac_unique_gpt),
    ("    Internal Diversity", int_div_rnn, int_div_gpt),
    ("    Passes Filters", rnn_passes, gpt_passes),
]
# Sections end with a newline and are joined by one more, which yields the
# single blank line between sections; the report also starts with a newline.
results = "\n" + "\n".join(
    f"{title}\n{RULE}\nRNN: {round(rnn_score, 3)}\nGPT: {round(gpt_score, 3)}\n"
    for title, rnn_score, gpt_score in report_rows
)

print(results)

**DPO fine-tuned model**

In [None]:
from pref_opt_for_mols.metrics import frac_contains_scaffold, internal_diversity, frac_unique, frac_valid, strip_invalid
import pandas as pd 

# Med-chem filter metrics for the DPO fine-tuned GPT and RNN samples. The
# "-filtered" CSVs are read for their "smiles" and "label" columns.
gpt_data, rnn_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        "generated_smiles/smiles-gpt-mcf-dpo-10k-filtered.csv",
        "generated_smiles/smiles-rnn-mcf-dpo-10k-filtered.csv",
    )
]
model_frames = (gpt_data, rnn_data)

# Mean of the "label" column, reported below as "Passes Filters"
# (presumably label is 1 for a molecule that passes -- confirm).
gpt_passes, rnn_passes = [frame["label"].mean() for frame in model_frames]

gpt_smiles, rnn_smiles = [frame["smiles"].tolist() for frame in model_frames]

# Metrics are evaluated for the GPT sample first, then the RNN sample.
model_samples = (gpt_smiles, rnn_smiles)
frac_unique_gpt, frac_unique_rnn = map(frac_unique, model_samples)
frac_valid_gpt, frac_valid_rnn = map(frac_valid, model_samples)
int_div_gpt, int_div_rnn = map(internal_diversity, model_samples)

# Plain-text report: one section per metric, values rounded to 3 decimals.
RULE = "--------------------------"
report_rows = [
    ("    Fraction Valid", frac_valid_rnn, frac_valid_gpt),
    ("    Fraction Unique", frac_unique_rnn, frac_unique_gpt),
    ("    Internal Diversity", int_div_rnn, int_div_gpt),
    ("    Passes Filters", rnn_passes, gpt_passes),
]
# Sections end with a newline and are joined by one more, which yields the
# single blank line between sections; the report also starts with a newline.
results = "\n" + "\n".join(
    f"{title}\n{RULE}\nRNN: {round(rnn_score, 3)}\nGPT: {round(gpt_score, 3)}\n"
    for title, rnn_score, gpt_score in report_rows
)

print(results)

## EGFR Predicted Activity

**ChEMBL models**

In [None]:
import pandas as pd 
from pref_opt_for_mols.metrics import internal_diversity, frac_unique, frac_valid, strip_invalid

# EGFR predicted-activity metrics for the ChEMBL-trained RNN and GPT samples.
rnn_data, gpt_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        "generated_smiles/smiles-rnn-chembl-10k-raw.csv",
        "generated_smiles/smiles-gpt-chembl-10k-raw.csv",
    )
]
rnn_smiles, gpt_smiles = [frame["smiles"].tolist() for frame in (rnn_data, gpt_data)]

# Fraction predicted active: the sum of the "label" column in the "-egfr" file
# divided by the number of rows in the *raw* file, not the "-egfr" file.
rnn_egfr_labels = pd.read_csv("generated_smiles/smiles-rnn-chembl-10k-egfr.csv")["label"]
rnn_active = rnn_egfr_labels.sum() / len(rnn_smiles)
gpt_egfr_labels = pd.read_csv("generated_smiles/smiles-gpt-chembl-10k-egfr.csv")["label"]
gpt_active = gpt_egfr_labels.sum() / len(gpt_smiles)

# Metrics are evaluated for the GPT sample first, then the RNN sample.
model_samples = (gpt_smiles, rnn_smiles)
frac_unique_gpt, frac_unique_rnn = map(frac_unique, model_samples)
frac_valid_gpt, frac_valid_rnn = map(frac_valid, model_samples)
int_div_gpt, int_div_rnn = map(internal_diversity, model_samples)

# Plain-text report: one section per metric, values rounded to 3 decimals.
RULE = "--------------------------"
report_rows = [
    ("    Fraction Valid", frac_valid_rnn, frac_valid_gpt),
    ("    Fraction Unique", frac_unique_rnn, frac_unique_gpt),
    ("    Internal Diversity", int_div_rnn, int_div_gpt),
    ("    Fraction Pred. Active", rnn_active, gpt_active),
]
# Sections end with a newline and are joined by one more, which yields the
# single blank line between sections; the report also starts with a newline.
results = "\n" + "\n".join(
    f"{title}\n{RULE}\nRNN: {round(rnn_score, 3)}\nGPT: {round(gpt_score, 3)}\n"
    for title, rnn_score, gpt_score in report_rows
)

print(results)


In [None]:
import pandas as pd 
from pref_opt_for_mols.metrics import internal_diversity, frac_unique, frac_valid, strip_invalid

# EGFR predicted-activity metrics for the DPO fine-tuned RNN and GPT samples.
rnn_data, gpt_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        "generated_smiles/smiles-rnn-EGFR-dpo-10k-raw.csv",
        "generated_smiles/smiles-gpt-EGFR-dpo-10k-raw.csv",
    )
]
rnn_smiles, gpt_smiles = [frame["smiles"].tolist() for frame in (rnn_data, gpt_data)]

# Fraction predicted active: the sum of the "label" column in the "-egfr" file
# divided by the number of rows in the *raw* file, not the "-egfr" file.
rnn_egfr_labels = pd.read_csv("generated_smiles/smiles-rnn-EGFR-dpo-10k-egfr.csv")["label"]
rnn_active = rnn_egfr_labels.sum() / len(rnn_smiles)
gpt_egfr_labels = pd.read_csv("generated_smiles/smiles-gpt-EGFR-dpo-10k-egfr.csv")["label"]
gpt_active = gpt_egfr_labels.sum() / len(gpt_smiles)

# Metrics are evaluated for the GPT sample first, then the RNN sample.
model_samples = (gpt_smiles, rnn_smiles)
frac_unique_gpt, frac_unique_rnn = map(frac_unique, model_samples)
frac_valid_gpt, frac_valid_rnn = map(frac_valid, model_samples)
int_div_gpt, int_div_rnn = map(internal_diversity, model_samples)

# Plain-text report: one section per metric, values rounded to 3 decimals.
RULE = "--------------------------"
report_rows = [
    ("    Fraction Valid", frac_valid_rnn, frac_valid_gpt),
    ("    Fraction Unique", frac_unique_rnn, frac_unique_gpt),
    ("    Internal Diversity", int_div_rnn, int_div_gpt),
    ("    Fraction Pred. Active", rnn_active, gpt_active),
]
# Sections end with a newline and are joined by one more, which yields the
# single blank line between sections; the report also starts with a newline.
results = "\n" + "\n".join(
    f"{title}\n{RULE}\nRNN: {round(rnn_score, 3)}\nGPT: {round(gpt_score, 3)}\n"
    for title, rnn_score, gpt_score in report_rows
)

print(results)