## Finding the IDs for receptor filenames

In [148]:
import pandas as pd
import numpy as np


def get_filesnames(rtype):
    """Match the receptors of one class to their files and save the mapping.

    Reads ``all_files.csv`` (must contain a ``path`` column) and
    ``should_have/<rtype>.csv`` (must contain ``id``, ``description``,
    ``name``, ``uniprot`` and ``num_actives`` columns). Each receptor
    description is normalised into ``name_fmt`` and compared with every file
    name after stripping ``decoys_`` and ``.sdf`` and lower-casing it. The
    result is written to ``<rtype>_paths.csv`` with the matched path in a
    ``path`` column (empty when no file matched).

    Parameters
    ----------
    rtype : str
        Receptor class; used to build both the input and the output file name.

    Raises
    ------
    AssertionError
        If more than one file maps to the same receptor.
    """
    # Read file list
    file_list = pd.read_csv("all_files.csv")
    # Read receptor list
    receptors = pd.read_csv("should_have/" + rtype + ".csv")
    # "-" and " " become "_"; quotes, slashes, brackets and commas are dropped.
    name_table = str.maketrans({"-": "_", " ": "_", "'": None, "/": None,
                                "[": None, "]": None, ",": None})
    receptors["name_fmt"] = receptors.description.apply(
        lambda x: x.translate(name_table).lower())
    print("There should be:", len(receptors))

    # Row position of the first receptor carrying each formatted name, so a
    # file is resolved with one dict lookup instead of a scan of the table.
    first_position = {}
    for position, name in enumerate(receptors.name_fmt):
        first_position.setdefault(name, position)

    # Paths are collected in a plain list and assigned as a whole column
    # afterwards; writing through ``receptors.path[i] = ...`` is chained
    # assignment, which warns and is not guaranteed to modify the frame.
    matched_paths = [""] * len(receptors)
    # Store matches
    locs = []
    # Store queries
    queries = set()
    # Search matches
    for path in file_list.path:
        fname = path.split("/")[-1]
        query = fname.replace("decoys_", "").replace(".sdf", "").lower()
        queries.add(query)
        position = first_position.get(query)
        # Save path
        if position is not None:
            locs.append(position)
            matched_paths[position] = path
    receptors["path"] = matched_paths

    # Check that the paths are all unique: report every receptor that was
    # matched by more than one file (once each) before failing.
    seen = set()
    repeated = []
    for position in locs:
        if position in seen and position not in repeated:
            repeated.append(position)
        seen.add(position)
    for position in repeated:
        print(receptors.name_fmt.iloc[position])
    assert len(seen) == len(locs), "Not all paths are unique!"
    print("All paths are unique!")
    print("Found:", len(locs))

    # Count missing items
    unk = [name for name in receptors.name_fmt if name not in queries]
    print("Missing:", len(unk))
    print(receptors.shape)
    print("Saving csv.")
    receptors = receptors.loc[:, ["id", "description", "name", "uniprot",
                                  "num_actives", "name_fmt", "path"]]
    receptors.to_csv(rtype + "_paths.csv", index=False)

In [149]:
# Receptor classes to process. Each entry is passed to get_filesnames, which
# reads "should_have/<class>.csv" and writes "<class>_paths.csv".
rnames = [
    'Enzyme',
    'Epigenetic',
    'GPCR',
    'IC',
    'Kinase',
    'NR',
    'TF',
    'TF; Epigenetic',
    'Transporter',
]
for r in rnames:
    # Separator and banner so each class's report is easy to find in the output.
    print("=========================")
    print("Searching for:", r)
    get_filesnames(r)

Searching for: Enzyme
There should be: 348


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


All paths are unique!
Found: 83
Missing: 265
(348, 12)
Saving csv.
Searching for: Epigenetic
There should be: 42
All paths are unique!
Found: 30
Missing: 12
(42, 12)
Saving csv.
Searching for: GPCR
There should be: 189
All paths are unique!
Found: 172
Missing: 17
(189, 12)
Saving csv.
Searching for: IC
There should be: 91
All paths are unique!
Found: 46
Missing: 45
(91, 12)
Saving csv.
Searching for: Kinase
There should be: 205
All paths are unique!
Found: 48
Missing: 157
(205, 12)
Saving csv.
Searching for: NR
There should be: 28
All paths are unique!
Found: 24
Missing: 4
(28, 12)
Saving csv.
Searching for: TF
There should be: 6
All paths are unique!
Found: 6
Missing: 0
(6, 12)
Saving csv.
Searching for: TF; Epigenetic
There should be: 5
All paths are unique!
Found: 5
Missing: 0
(5, 12)
Saving csv.
Searching for: Transporter
There should be: 35
All paths are unique!
Found: 16
Missing: 19
(35, 12)
Saving csv.
