In [1]:
from data.utils.smiles_to_graphs import smiles2graph
import pickle
from numpy import random

def add_smiles_list(env, smiles_list, train_percentage, test_percentage, start_index=0):
    """Convert SMILES strings to graphs and store them, pickled, in an LMDB environment.

    Every SMILES that ``smiles2graph`` can convert is written under the key
    ``"<split>_<index>"``, where ``<split>`` is drawn at random from
    ``"train"`` / ``"valid"`` and ``<index>`` is ``start_index`` plus the
    position of the SMILES in ``smiles_list``. All writes happen inside a
    single write transaction.

    Args:
        env: Writable LMDB environment (only ``env.begin(write=True)`` is used).
        smiles_list: Iterable of SMILES strings.
        train_percentage: Probability of assigning a molecule to ``"train"``.
        test_percentage: Probability of assigning a molecule to ``"valid"``.
            The two probabilities are passed to ``numpy.random.choice`` as
            ``p`` and therefore have to sum to 1.
        start_index: Index used for the first element of ``smiles_list``.
            Keys only depend on this index, so when the function is called
            several times on the same environment (e.g. once per input file)
            pass a running offset here; with the default of 0 a later call
            overwrites the records of an earlier one that share an index.

    Returns:
        int: Number of SMILES that were skipped because ``smiles2graph``
        returned ``None``.
    """
    splits = ["train", "valid"]
    split_probabilities = [train_percentage, test_percentage]

    n_invalid = 0
    with env.begin(write=True) as txn:
        for index, smiles in enumerate(smiles_list, start=start_index):
            graph = smiles2graph(smiles)

            # Unparseable SMILES are counted and skipped; their index is
            # simply left unused.
            if graph is None:
                n_invalid += 1
                continue

            split = random.choice(splits, p=split_probabilities)
            txn.put(f"{split}_{index}".encode("ascii"), pickle.dumps(graph))

    return n_invalid

In [2]:
import lmdb
import os 
import pandas as pd

# Destination database. With subdir=False below, this path is the database
# file itself rather than a directory.
# NOTE(review): `file` and `dir` are kept as names because later cells read
# them, even though `dir` shadows a builtin.
file = "data/graphs/pubchem.lmdb"

# Make sure the parent directory exists so opening the database does not fail
# on a fresh checkout.
os.makedirs(os.path.dirname(file), exist_ok=True)

# Always start from an empty database: a previous run's file is deleted.
if os.path.exists(file):
    os.remove(file)

env = lmdb.open(
    file,
    subdir=False,
    readonly=False,
    lock=False,         # no LMDB-level locking: only one writer may use `env`
    readahead=False,
    meminit=False,
    max_readers=1,
    map_size=1 << 40,   # 1 TiB (1099511627776 bytes) upper bound on DB size
)

# Directory holding the input SMILES CSV files. The listing is sorted because
# os.listdir returns entries in arbitrary order, which would make the
# ingestion order differ from run to run.
dir = 'data/smiles/pubchem'
files = sorted(os.listdir(dir))

# Probabilities used when randomly assigning a molecule to the train or
# validation split; they have to sum to 1.
train_percentage = 0.95
test_percentage = 0.05

In [3]:
from tqdm import tqdm
from multiprocessing import Pool

n_threads = 8  # number of worker processes; 1 converts in this process


def process_file(file_name):
    """Read one SMILES CSV and convert every row to a pickled graph.

    This is the CPU-heavy step and is what gets distributed over the worker
    processes. It deliberately never touches the LMDB environment: `env` is
    opened with lock=False, so only a single process may write to it.

    Returns a list aligned with the CSV rows: the pickled graph for a SMILES
    that could be converted, None for one that could not.
    """
    smiles_list = pd.read_csv(os.path.join(dir, file_name)).values[:, 0]
    payloads = []
    for smiles in smiles_list:
        graph = smiles2graph(smiles)
        payloads.append(None if graph is None else pickle.dumps(graph))
    return payloads


def store_payloads(payloads, first_index):
    """Write one file's converted molecules to `env`; return the invalid count.

    Keys have the form "<split>_<index>" where <index> is `first_index` plus
    the row position, so records coming from different files never share a
    key.
    """
    n_skipped = 0
    with env.begin(write=True) as txn:
        for index, data in enumerate(payloads, start=first_index):
            if data is None:
                n_skipped += 1
                continue
            split = random.choice(["train", "valid"], p=[train_percentage, test_percentage])
            txn.put(f"{split}_{index}".encode("ascii"), data)
    return n_skipped


def ingest(converted_files):
    """Store every converted file in order and return (n_total, n_invalid)."""
    n_total = 0
    n_invalid = 0
    for payloads in tqdm(converted_files, total=len(files)):
        # n_total doubles as the running global index of the next molecule.
        n_invalid += store_payloads(payloads, first_index=n_total)
        n_total += len(payloads)
    return n_total, n_invalid


if n_threads == 1:
    n_total, n_invalid = ingest(map(process_file, files))
else:
    # imap yields results in the order of `files`, so keys are the same as in
    # the single-process path. Workers may run ahead of the writer, in which
    # case their finished payloads wait in memory until they are stored.
    with Pool(n_threads) as p:
        n_total, n_invalid = ingest(p.imap(process_file, files))

print(f"stored {n_total - n_invalid} of {n_total} molecules ({n_invalid} invalid SMILES skipped)")
        

100%|██████████| 8/8 [00:51<00:00,  6.47s/it]


In [2]:
# Scratch cell: materialises one billion Python ints, which needs roughly
# tens of gigabytes of RAM and can take the kernel down.
# NOTE(review): looks like a memory stress test - confirm it is still needed,
# otherwise delete the cell.
# The result is bound to `big_list` instead of `list` so the builtin `list`
# stays callable in the rest of the notebook.
big_list = [i for i in range(10**9)]

: 

In [7]:
import lmdb
import pickle

# Two independent single-file LMDB databases used to try out a metadata record.
file1, file2 = "database1.lmdb", "database2.lmdb"

# Both environments are opened with exactly the same settings, so the keyword
# arguments are written once and shared.
lmdb_options = dict(
    subdir=False,
    readonly=False,
    lock=False,
    readahead=False,
    meminit=False,
    max_readers=1,
    map_size=1099511627776,  # 2**40 bytes = 1 TiB
)

env1 = lmdb.open(file1, **lmdb_options)
env2 = lmdb.open(file2, **lmdb_options)

# Store a pickled description string under the key "global" in each database.
# Both write transactions are open at the same time, one per environment.
with env1.begin(write=True) as txn1, env2.begin(write=True) as txn2:
    global_key = "global".encode("ascii")
    txn1.put(global_key, pickle.dumps("some info about the database 1"))
    txn2.put(global_key, pickle.dumps("some info about the database 2"))
