In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

In [2]:
df_sw_tr_org = pd.read_csv("../data/original/0.sweet-train.tsv", sep="\t")
df_sw_te_org = pd.read_csv("../data/original/0.sweet-test.tsv", sep="\t")
df_bt_tr_org = pd.read_csv("../data/original/0.bitter-train.tsv", sep="\t")
df_bt_te_org = pd.read_csv("../data/original/0.bitter-test.tsv", sep="\t")

In [3]:
df_sw_tr_org

Unnamed: 0,Name,Taste,Reference,SMILES,Canonical SMILES,Sweet
0,Sucrose,Sweet,Rojas et al. (2017),OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,True
1,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,Rojas et al. (2017),ClCC1OC(C(C1O)O)(CCl)OC1OC(CO)C(C(C1O)O)Cl,ClCC1OC(C(C1O)O)(CCl)OC1OC(CO)C(C(C1O)O)Cl,True
2,Aspartame/Aspartyl-phenylalanine methylester,Sweet,Rojas et al. (2017),COC(=O)C(NC(=O)C(CC(=O)O)N)Cc1ccccc1,COC(=O)C(NC(=O)C(CC(=O)O)N)Cc1ccccc1,True
3,Tagatose,Sweet,Rojas et al. (2017),OCC1(O)OCC(C(C1O)O)O,OCC1(O)OCC(C(C1O)O)O,True
4,Isomaltulose/Palatinose,Sweet,Rojas et al. (2017),OCC1OC(OCC2OC(C(C2O)O)(O)CO)C(C(C1O)O)O,OCC1OC(OCC2OC(C(C2O)O)(O)CO)C(C(C1O)O)O,True
...,...,...,...,...,...,...
2200,6-Methyl-2-pyridinemethanol,Bitter,The Good Scents Company Database,CC1=NC(=CC=C1)CO,OCc1cccc(n1)C,False
2201,4-hydroxybenzyl alcohol,Bitter,The Good Scents Company Database,C1=CC(=CC=C1CO)O,OCc1ccc(cc1)O,False
2202,4-Benzoylpyridine,Bitter,The Good Scents Company Database,C1=CC=C(C=C1)C(=O)C2=CC=NC=C2,O=C(c1ccncc1)c1ccccc1,False
2203,4-(5-Methyl-2-furyl)-2-butanone,Bitter,The Good Scents Company Database,CC1=CC=C(O1)CCC(=O)C,CC(=O)CCc1ccc(o1)C,False


In [4]:
# def filter_by_length(df, min_len, max_len):
#     return df[(df["SMILES"].str.len() >= min_len) & (df["SMILES"].str.len() <= max_len)]
def filter_by_taste(df, taste):
    df = df[df["Taste"] == taste]
    df = df[df[taste] == 1]
    df = df[df["SMILES"].str.len() <= 70]  # filter out too long SMILES
    return df

In [5]:
df_sw_tr = filter_by_taste(df_sw_tr_org, "Sweet")
df_sw_te = filter_by_taste(df_sw_te_org, "Sweet")
df_bt_tr = filter_by_taste(df_bt_tr_org, "Bitter")
df_bt_te = filter_by_taste(df_bt_te_org, "Bitter")

In [6]:
df_sw_tr

Unnamed: 0,Name,Taste,Reference,SMILES,Canonical SMILES,Sweet
0,Sucrose,Sweet,Rojas et al. (2017),OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,True
1,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,Rojas et al. (2017),ClCC1OC(C(C1O)O)(CCl)OC1OC(CO)C(C(C1O)O)Cl,ClCC1OC(C(C1O)O)(CCl)OC1OC(CO)C(C(C1O)O)Cl,True
2,Aspartame/Aspartyl-phenylalanine methylester,Sweet,Rojas et al. (2017),COC(=O)C(NC(=O)C(CC(=O)O)N)Cc1ccccc1,COC(=O)C(NC(=O)C(CC(=O)O)N)Cc1ccccc1,True
3,Tagatose,Sweet,Rojas et al. (2017),OCC1(O)OCC(C(C1O)O)O,OCC1(O)OCC(C(C1O)O)O,True
4,Isomaltulose/Palatinose,Sweet,Rojas et al. (2017),OCC1OC(OCC2OC(C(C2O)O)(O)CO)C(C(C1O)O)O,OCC1OC(OCC2OC(C(C2O)O)(O)CO)C(C(C1O)O)O,True
...,...,...,...,...,...,...
2154,D-mannitol,Sweet,SuperSweet,C([C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O)O,OC[C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O,True
2158,Hernandulcin,Sweet,SuperSweet,CC1=CC(=O)[C@@H](CC1)[C@](C)(CCC=C(C)C)O,CC(=CCC[C@@]([C@@H]1CCC(=CC1=O)C)(O)C)C,True
2159,Phyllodulcin,Sweet,SuperSweet,COC1=C(C=C(C=C1)[C@H]2CC3=C(C(=CC=C3)O)C(=O)O2)O,COc1ccc(cc1O)[C@@H]1OC(=O)c2c(C1)cccc2O,True
2160,alpha-D-glucose,Sweet,The Good Scents Company Database,C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O,OC[C@H]1O[C@H](O)[C@@H]([C@H]([C@@H]1O)O)O,True


In [7]:
df_sw_tr = df_sw_tr[["Canonical SMILES", "Taste"]]
df_sw_te = df_sw_te[["Canonical SMILES", "Taste"]]
df_bt_tr = df_bt_tr[["Canonical SMILES", "Taste"]]
df_bt_te = df_bt_te[["Canonical SMILES", "Taste"]]

df_sw_tr["mol"] = df_sw_tr["Canonical SMILES"].apply(Chem.MolFromSmiles)
df_sw_te["mol"] = df_sw_te["Canonical SMILES"].apply(Chem.MolFromSmiles)
df_bt_tr["mol"] = df_bt_tr["Canonical SMILES"].apply(Chem.MolFromSmiles)
df_bt_te["mol"] = df_bt_te["Canonical SMILES"].apply(Chem.MolFromSmiles)

# filter out unconvertable SMILES
df_sw_tr = df_sw_tr.dropna()
df_sw_te = df_sw_te.dropna()
df_bt_tr = df_bt_tr.dropna()
df_bt_te = df_bt_te.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sw_tr["mol"] = df_sw_tr["Canonical SMILES"].apply(Chem.MolFromSmiles)
[12:13:27] Explicit valence for atom # 2 N, 4, is greater than permitted
[12:13:27] Explicit valence for atom # 2 N, 4, is greater than permitted
[12:13:27] Explicit valence for atom # 3 N, 4, is greater than permitted
[12:13:27] Explicit valence for atom # 5 N, 4, is greater than permitted
[12:13:27] Explicit valence for atom # 6 N, 4, is greater than permitted
[12:13:27] Explicit valence for atom # 7 N, 4, is greater than permitted
[12:13:27] Explicit valence for atom # 9 N, 4, is greater than permitted
[12:13:27] Explicit valence for atom # 11 N, 4, is greater than permitted
[12:13:27] Explicit valence for atom # 3 N, 4, is greater than permitted
[12:

In [8]:
def get_random_n(df, n, seed=0):
    return df.sample(n, random_state=seed)

In [9]:
df_sw_tr_100 = get_random_n(df_sw_tr, 100)
df_sw_te_20 = get_random_n(df_sw_te, 20)
df_bt_tr_100 = get_random_n(df_bt_tr, 100)
df_bt_te_20 = get_random_n(df_bt_te, 20)

In [10]:
df_sw_tr_100

Unnamed: 0,Canonical SMILES,Taste,mol
2020,COc1ccccc1OC(=O)Cc1ccccc1,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d7ca50>
936,OC[C@@H]([C@@H]([C@H]([C@H](CO)O)O)O)O,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d709e0>
520,OC/C=C(/CCC=C(C)C)\C,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d68270>
578,OCC=C(C)C,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d69bd0>
873,COC(=O)[C@H](NC(=O)[C@H](CC(=O)O)N)Cc1ccccc1,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d6fa70>
...,...,...,...
709,OC(=O)C1CCCCC1,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d6d5b0>
2037,CC(COC(=O)c1ccccc1N)C,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d7d1c0>
219,OCC(C(=O)O)N,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d62e30>
716,CCCC(CCOC(=O)C)SC,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d6d8c0>


In [11]:
print(df_sw_tr_100["mol"].isnull().sum())
print(df_sw_te_20["mol"].isnull().sum())
print(df_bt_tr_100["mol"].isnull().sum())
print(df_bt_te_20["mol"].isnull().sum())

0
0
0
0


In [12]:
# change label to 0 or 1 if sweet (1) or bitter (0)
df_sw_tr_100["Taste"] = 1
df_sw_te_20["Taste"] = 1
df_bt_tr_100["Taste"] = 0
df_bt_te_20["Taste"] = 0

In [13]:
df_sw_tr_100.head()

Unnamed: 0,Canonical SMILES,Taste,mol
2020,COc1ccccc1OC(=O)Cc1ccccc1,1,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d7ca50>
936,OC[C@@H]([C@@H]([C@H]([C@H](CO)O)O)O)O,1,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d709e0>
520,OC/C=C(/CCC=C(C)C)\C,1,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d68270>
578,OCC=C(C)C,1,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d69bd0>
873,COC(=O)[C@H](NC(=O)[C@H](CC(=O)O)N)Cc1ccccc1,1,<rdkit.Chem.rdchem.Mol object at 0x7f5e05d6fa70>


In [14]:
# save as csv
import os

save_data_dir = "../data/sampled_mix"
os.makedirs(save_data_dir, exist_ok=True)
df_sw_tr_100.iloc[:, :-1].to_csv(os.path.join(save_data_dir, "1.sweet-train-100.csv"))
df_sw_te_20.iloc[:, :-1].to_csv(os.path.join(save_data_dir, "1.sweet-test-20.csv"))
df_bt_tr_100.iloc[:, :-1].to_csv(os.path.join(save_data_dir, "1.bitter-train-100.csv"))
df_bt_te_20.iloc[:, :-1].to_csv(os.path.join(save_data_dir, "1.bitter-test-20.csv"))

In [19]:
# save as like prompt

train_data = pd.concat([df_sw_tr_100, df_bt_tr_100])
test_data = pd.concat([df_sw_te_20, df_bt_te_20])

# mix data
train_data = train_data.sample(frac=1)
test_data = test_data.sample(frac=1)


str_format = "smiles: {}\nsweet_or_bitter: {}\n\n"
train_text = ""
for i, row in train_data.iterrows():
    train_text += str_format.format(row["Canonical SMILES"], row["Taste"])

test_text = ""
for i, row in test_data.iterrows():
    test_text += str_format.format(row["Canonical SMILES"], row["Taste"])

unlabeld_text = ""
for i, row in test_data.iterrows():
    unlabeld_text += str_format.format(row["Canonical SMILES"], "")

In [20]:
with open(os.path.join(save_data_dir, "1.train.txt"), "w") as f:
    f.write(train_text)

with open(os.path.join(save_data_dir, "1.test.txt"), "w") as f:
    f.write(test_text)

with open(os.path.join(save_data_dir, "1.unlabeled_test.txt"), "w") as f:
    f.write(unlabeld_text)