In [3]:
def retrieveMoleculePDB(ligand_path):
    """
    Load one PDB file as an RDKit molecule.

    -- args
    ligand_path (str): location of the molecule's PDB file

    -- returns
    The object produced by rdmolfiles.MolFromPDBFile for that file,
    read with sanitisation switched on
    """
    return rdmolfiles.MolFromPDBFile(ligand_path, sanitize=True)

###################################################
### Molecular properties:                       ###
###                                             ###
###################################################

def computeLigMolProps(transfrm_path,
                    working_dir="features/MOLPROPS/",
                    target_columns=None,
                    verbose=False):
    """
    Compute molecular properties for the molecules in given transfrm_path and write to file.

    Every path matched by transfrm_path + "*" is read as a PDB file, featurised with
    all mordred descriptors (3D descriptors included) and the resulting table is
    pickled to working_dir + "molprops.pickle".

    --args
    transfrm_path (str): path to directory containing ligand files. It is used as a
    plain string prefix (globbed as transfrm_path + "*" and stripped from every match
    to build the molecule name), so a directory path must end with a path separator.
    working_dir (str): path to directory to pickle into; the directory is created
    when it does not exist yet
    target_columns (list-like of str; optional): descriptor columns to keep, in that
    order. Pass the columns of the training set when featurising a test set. When
    None or empty, all float64/int64 descriptor columns are kept.
    verbose (bool): whether or not to print the resulting dataframe to stdout

    --returns
    molprops_set (pandas dataframe): set of molecules with molecular properties,
    indexed by molecule name (file name up to its first ".")

    --raises
    ValueError: if one or more matched files cannot be parsed into a molecule
    """
    import os

    mol_paths = glob.glob(transfrm_path + "*")

    # generate RDKit mol objects from paths; this is a module-level function,
    # so the reader is called directly (there is no `self` here):
    mols_rdkit = [retrieveMoleculePDB(path) for path in mol_paths]

    # a failed parse yields None; stop here with the offending file names
    # instead of failing later inside the descriptor calculation:
    unreadable = [path for path, mol in zip(mol_paths, mols_rdkit) if mol is None]
    if unreadable:
        raise ValueError("Could not parse PDB file(s): " + ", ".join(unreadable))

    # generate molecule name from paths for indexing:
    mols_names = [path.replace(transfrm_path, "").split(".")[0] for path in mol_paths]

    # generate all descriptors available in mordred:
    calc = Calculator(descriptors, ignore_3D=False)
    print("Computing molecular properties:")
    molprops_set = calc.pandas(mols_rdkit)

    # remove columns with bools or strings (not fit for subtraction protocol):
    if target_columns is not None and len(target_columns) > 0:
        # if variable is input the function is handling a testset and must
        # keep the same columns as train dataset:
        molprops_set = molprops_set[target_columns]
    else:
        # if making a training dataset, decide which columns to retain:
        molprops_set = molprops_set.select_dtypes(include=["float64", "int64"])

    molprops_set.index = mols_names

    # pickle dataframe to specified directory, creating it first if needed.
    # working_dir is concatenated as a prefix, so derive the directory from
    # the final file path rather than from working_dir itself:
    pickle_path = working_dir + "molprops.pickle"
    pickle_dir = os.path.dirname(pickle_path)
    if pickle_dir:
        os.makedirs(pickle_dir, exist_ok=True)
    molprops_set.to_pickle(pickle_path)

    if verbose:
        print(molprops_set)

    return molprops_set

def computePertMolProps(
                        perturbation_paths,
                        molprops_set=None,
                        free_path="SOLVATED/",
                        working_dir="features/MOLPROPS/"):
    """
    Read featurised FEP molecules and generate matches based on user input perturbations.
    Writes each perturbation's features (ligand B minus ligand A) by appending one row
    to working_dir + "featurised_molprops.h5"; the column names are written once to
    working_dir + "featurised_molprops.csv".

    --args
    perturbation_paths (iterable of str): one string per perturbation. free_path is
    removed from the string and everything from the first "." onwards is dropped;
    what remains must have the form "ligandA~ligandB", where both names are index
    labels of molprops_set.

    molprops_set (pandas dataframe; optional): dataframe object that contains the
    featurised FEP dataset. If None, it is loaded from working_dir + "molprops.pickle"

    free_path (str): path to directory containing perturbation directories; stripped
    from every entry of perturbation_paths

    working_dir (str): path to directory to read the pickled dataset from and to
    write the output files to

    --returns
    None

    --raises
    FileNotFoundError: if molprops_set is None and the pickle file does not exist
    KeyError: if a ligand name is not present in the index of molprops_set
    IndexError: if a perturbation name does not contain "~"
    """
    # test if input is there; without per-ligand data nothing can be computed,
    # so fail here instead of continuing with molprops_set = None:
    if molprops_set is None:
        try:
            molprops_set = pd.read_pickle(working_dir+"molprops.pickle")
        except FileNotFoundError as err:
            raise FileNotFoundError(
                "Unable to load pickle file with per-ligand molprop data in absence of molprops_set function input."
            ) from err

    h5_path = working_dir + "featurised_molprops.h5"

    # write list of column names to file for future testset feature generation:
    pd.DataFrame(molprops_set.columns).transpose().to_csv(working_dir+"featurised_molprops.csv", header=False)

    # clean slate featurised perturbations dataset: mode="w" creates a fresh
    # HDF5 file (replacing any previous one); the context manager closes the
    # store even when an iteration raises:
    with pd.HDFStore(h5_path, mode="w") as store:
        # iterate over perturbations:
        for perturbation in tqdm(perturbation_paths):
            perturbation_name = perturbation.replace(free_path, "").split(".")[0]

            ligand_names = perturbation_name.split("~")
            ligandA = ligand_names[0]
            ligandB = ligand_names[1]

            # extract molprops from per-ligand:
            ligandA_molprops = molprops_set.loc[ligandA]
            ligandB_molprops = molprops_set.loc[ligandB]

            # subtract and transform to a one-row dataframe named after the perturbation:
            perturbation_molprops = ligandB_molprops.subtract(ligandA_molprops).to_frame(name=perturbation_name).transpose()

            # append to the molprops HDF5 file. The node key is the file path
            # string itself; it is kept unchanged so that existing readers using
            # this key keep working. min_itemsize reserves room for string
            # values so later, longer perturbation names still fit the table.
            store.append(
                        h5_path,
                        perturbation_molprops,
                        format="table",
                        index=False,
                        min_itemsize=500
                        )

In [4]:
# Featurise every file matched by "./input/ligands/*"; working_dir,
# target_columns and verbose are left at their defaults.
computeLigMolProps(transfrm_path="./input/ligands/")

TypeError: computeLigMolProps() got multiple values for argument 'transfrm_path'