In [1]:
import zipfile
import pandas as pd
import os

### Check the zipped folders of pdbqt files for missing frames

In [2]:
# Receptor identifiers; the per-receptor lists in this notebook are indexed
# in this same order.
receptors = ["adora2a", "adrb2", "oprd1", "oprk1"]
# One "<receptor>_tested.zip" archive per receptor.
shared_files = ["{}_tested.zip".format(name) for name in receptors]

In [79]:

# One record per pdbqt file that is expected but absent from a receptor's
# archive; the three lists stay aligned row by row.
missing_frames_pdbqt = {"receptor":[], "trajectory_index":[], "missing_file":[]}

prefix = "at-frame"
postfix = ".pdbqt"
frame_stride = 200   # spacing between consecutive expected frame numbers
n_frames = 3000      # number of frames expected for each receptor
# Loop-invariant, so built once: 0, 200, 400, ..., 599800.
expected_frames = set(range(0, frame_stride * n_frames, frame_stride))

for i, file in enumerate(shared_files):
    print(receptors[i])
    # The context manager closes the archive. The handle is deliberately not
    # called `zip`, so the builtin stays usable in later cells.
    with zipfile.ZipFile(file) as archive:
        name_list = archive.namelist()

    # Frame number = the text between "at-frame" and ".pdbqt" in an entry name.
    frame_nums = set()
    for elem in name_list:
        start = elem.find(prefix)
        # Skip entries without the prefix or with another extension; slicing
        # those would yield a wrong or non-numeric frame number.
        if start == -1 or not elem.endswith(postfix):
            continue
        digits = elem[start + len(prefix):-len(postfix)]
        if digits:
            frame_nums.add(int(digits))

    diff_frames = sorted(expected_frames - frame_nums)
    # Integer division keeps the indices exact ints (no float round trip);
    # the result is already sorted because diff_frames is.
    diff_ind = [x // frame_stride for x in diff_frames]
    print(len(diff_frames))
    print(diff_frames)
    print(diff_ind)
    missing_frames_pdbqt["receptor"].extend([receptors[i]] * len(diff_ind))
    missing_frames_pdbqt["trajectory_index"].extend(diff_ind)
    missing_frames_pdbqt["missing_file"].extend([prefix + str(x) + postfix for x in diff_frames])
    

adora2a
120
[3200, 17800, 30800, 31200, 34400, 35600, 36400, 39200, 57000, 61800, 68000, 70000, 72000, 83800, 85000, 90800, 95200, 95800, 97400, 97600, 100200, 100600, 103000, 128800, 129200, 129800, 135800, 150000, 157000, 165600, 167600, 168600, 177600, 177800, 179200, 186000, 191000, 191400, 201800, 204600, 209400, 213800, 215800, 217200, 223600, 238600, 241400, 254200, 255400, 259600, 260000, 276200, 276400, 286600, 303400, 304000, 306000, 306200, 306600, 307200, 307600, 316600, 317600, 319800, 328600, 330800, 332400, 333600, 336400, 338600, 342400, 346600, 348800, 356600, 358600, 366000, 370400, 377000, 382400, 388600, 394000, 400600, 403000, 408400, 424800, 425600, 437400, 438800, 439400, 441400, 448400, 460000, 461800, 464000, 464600, 467200, 468000, 479800, 486800, 489400, 491400, 510800, 511400, 513000, 516400, 521800, 522600, 527800, 533400, 533600, 538400, 557400, 574400, 574800, 575000, 575600, 581600, 591400, 596800, 597200]
[16, 89, 154, 156, 172, 178, 182, 196, 285, 309,

In [80]:
# Turn the aligned lists into a table with one row per missing pdbqt file,
# then display it.
missing_pdbqts_df = pd.DataFrame.from_dict(missing_frames_pdbqt)
missing_pdbqts_df

Unnamed: 0,receptor,trajectory_index,missing_file
0,adora2a,16,at-frame3200.pdbqt
1,adora2a,89,at-frame17800.pdbqt
2,adora2a,154,at-frame30800.pdbqt
3,adora2a,156,at-frame31200.pdbqt
4,adora2a,172,at-frame34400.pdbqt
...,...,...,...
896,oprk1,2397,at-frame479400.pdbqt
897,oprk1,2398,at-frame479600.pdbqt
898,oprk1,2401,at-frame480200.pdbqt
899,oprk1,2580,at-frame516000.pdbqt


In [81]:
# Number of missing pdbqt files for each receptor.
missing_pdbqts_df.groupby("receptor").size()

receptor
adora2a    120
adrb2      556
oprd1      170
oprk1       55
dtype: int64

In [29]:
# Save the table of missing pdbqt files. With pandas' default `index=True`
# the row index is written as an unnamed first column.
missing_pdbqts_df.to_csv("missing_pdbqt_files.csv")

## Get all the active ligands per receptor

In [42]:
# Ligand files per receptor, in the same order as `receptors`. The two opioid
# receptors each have separate agonist and antagonist files.
actives_dir = "../actives_list(DUDE,GDD)/"
actives_files = [
    [actives_dir + "ADORA2A_actives_final.mol2"],
    [actives_dir + "ADRB2_actives_final.mol2"],
    [actives_dir + "OPRD_Agonist_Ligands.sdf", actives_dir + "OPRD_Antagonist_Ligands.sdf"],
    [actives_dir + "OPRK_Agonist_Ligands.sdf", actives_dir + "OPRK_Antagonist_Ligands.sdf"],
]

In [59]:
def get_molecule(receptor, file):
    """Return the molecule identifiers found in a ligand structure file.

    The file is scanned line by line; the line immediately following every
    line that contains the receptor's marker is recorded as an identifier.
    The marker is ``@<TRIPOS>MOLECULE`` for "adora2a"/"adrb2" and
    ``> <PC_CID>`` for "oprd1"/"oprk1".

    Parameters
    ----------
    receptor : str
        One of "adora2a", "adrb2", "oprd1", "oprk1"; selects the marker.
    file : str
        Path of the file to scan. It is echoed to stdout before reading.

    Returns
    -------
    list of str
        Identifiers in file order, duplicates kept, with surrounding
        whitespace removed.

    Raises
    ------
    ValueError
        If ``receptor`` has no known marker.
    """
    print(file)

    markers = {
        "adora2a": "@<TRIPOS>MOLECULE",
        "adrb2": "@<TRIPOS>MOLECULE",
        "oprd1": "> <PC_CID>",
        "oprk1": "> <PC_CID>",
    }
    keyword = markers.get(receptor)
    if keyword is None:
        # Fail with a clear message instead of the obscure TypeError that
        # `None in line` would raise further down.
        raise ValueError("unknown receptor: {!r}".format(receptor))

    molecules = []
    with open(file) as f_handle:
        take_next = False
        for line in f_handle:
            if take_next:
                # strip() removes the line terminator and any padding whatever
                # its length; chopping a fixed two characters truncates the
                # name on a plain "\n" ending or a final line without newline.
                # NOTE(review): assumes the trailing characters previously
                # dropped were whitespace -- confirm against the input files.
                molecules.append(line.strip())
                take_next = False

            # Checked on every line, including one that was just recorded.
            if keyword in line:
                take_next = True
    return molecules
            

In [60]:
# Echo the configured file lists as a quick visual check of the paths.
actives_files

[['../actives_list(DUDE,GDD)/ADORA2A_actives_final.mol2'],
 ['../actives_list(DUDE,GDD)/ADRB2_actives_final.mol2'],
 ['../actives_list(DUDE,GDD)/OPRD_Agonist_Ligands.sdf',
  '../actives_list(DUDE,GDD)/OPRD_Antagonist_Ligands.sdf'],
 ['../actives_list(DUDE,GDD)/OPRK_Agonist_Ligands.sdf',
  '../actives_list(DUDE,GDD)/OPRK_Antagonist_Ligands.sdf']]

In [61]:
# Long-format record of (receptor, ligand) pairs: one entry per identifier
# returned by get_molecule, across all files of each receptor.
active_molecules = {"receptor": [], "ligand": []}
for idx, paths in enumerate(actives_files):
    name = receptors[idx]
    print(name)
    for path in paths:
        found = get_molecule(name, path)
        # Repeat the receptor name so both columns stay the same length.
        active_molecules["receptor"] += [name] * len(found)
        active_molecules["ligand"] += found

adora2a
../actives_list(DUDE,GDD)/ADORA2A_actives_final.mol2
adrb2
../actives_list(DUDE,GDD)/ADRB2_actives_final.mol2
oprd1
../actives_list(DUDE,GDD)/OPRD_Agonist_Ligands.sdf
../actives_list(DUDE,GDD)/OPRD_Antagonist_Ligands.sdf
oprk1
../actives_list(DUDE,GDD)/OPRK_Agonist_Ligands.sdf
../actives_list(DUDE,GDD)/OPRK_Antagonist_Ligands.sdf


In [62]:
# Table with one row per (receptor, ligand) entry, displayed for inspection.
active_molecules_pd = pd.DataFrame.from_dict(active_molecules)
active_molecules_pd

Unnamed: 0,receptor,ligand
0,adora2a,CHEMBL48055
1,adora2a,CHEMBL48055
2,adora2a,CHEMBL52124
3,adora2a,CHEMBL52124
4,adora2a,CHEMBL18732
...,...,...
1970,oprk1,966
1971,oprk1,442
1972,oprk1,442
1973,oprk1,528459


In [63]:
# Save the receptor/ligand table. With pandas' default `index=True` the row
# index is written as an unnamed first column.
active_molecules_pd.to_csv("active_molecules.csv")

In [64]:
# Number of ligand entries per receptor (duplicate identifiers are counted).
active_molecules_pd.groupby("receptor").size()

receptor
adora2a    844
adrb2      447
oprd1      377
oprk1      307
dtype: int64

## See the contents of docking results

#### Step 1 - extract frame:molecule information

In [3]:
# Docking result files, listed in the same order as `receptors`.
files = [
    "adora2a-sorted_results-003.txt",
    # NOTE(review): spelled "abrb2" (not "adrb2") -- presumably this matches
    # the file name on disk; confirm before renaming.
    "abrb2-sorted_results.txt",
    "oprd1-sorted_results.txt",
    "oprk1-sorted_results.txt",
]

In [4]:

# Per receptor (same order as `files`):
#   all_docking_res[k] maps frame (string) -> list of ligand names seen for it
#   all_ligands[k]     is the set of every ligand name seen in that file
all_docking_res = []
all_ligands = []

prefix = "at-frame"
suffix = ".pdbqt"

for f_i, file in enumerate(files):
    print(receptors[f_i])
    ligands = set()
    frame_maps = {}

    with open(file) as f:
        for i, line in enumerate(f, start=1):
            split_line = line.split()
            # A blank line has no tokens; indexing [-1] would raise IndexError.
            mol_line = split_line[-1] if split_line else ""
            # The last token is expected to look like
            # "at-frame<frame>_<ligand>.pdbqt".
            start = mol_line.find(prefix)
            if start != -1:
                dash_pos = mol_line.find("_", start)
                # Without a "_" separator the slices below would produce
                # garbage, so such tokens are skipped.
                if dash_pos != -1:
                    frame = mol_line[start + len(prefix):dash_pos]
                    # Assumes the token ends with ".pdbqt"; the last
                    # len(suffix) characters are dropped unconditionally.
                    mol = mol_line[dash_pos+1:-len(suffix)]
                    ligands.add(mol)
                    frame_maps.setdefault(frame, []).append(mol)
            # Progress marker every 100000 lines read.
            if i % 100000 == 0:
                print(i)
    all_docking_res.append(frame_maps)
    all_ligands.append(ligands)

adora2a
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
9400000
9500000
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000
10900000
11000000
11100000
11200000
11300000
11400000
11500000
11600000
11700000
11800000
11900000
12000000
12100000
12200000
12

22300000
22400000


#### Step 2 - see what frames are missing from the docking results

In [70]:
# One record per expected frame that has no entry at all in a receptor's
# docking results; the two lists stay aligned row by row.
missing_frames_result = {"receptor": [], "missing_frame":[]}

frame_stride = 200   # spacing between consecutive expected frame numbers
n_frames = 3000      # number of frames expected for each receptor
# Loop-invariant, so built once: 0, 200, 400, ..., 599800.
expected_frames = set(range(0, frame_stride * n_frames, frame_stride))

for i, results in enumerate(all_docking_res):
    receptor = receptors[i]
    print(receptor)
    # Frame keys are strings taken from the result lines; compare as ints.
    res_frames = {int(x) for x in results}

    diff_frames = sorted(expected_frames - res_frames)
    # Integer division keeps the indices exact ints; this list is only printed.
    diff_ind = [x // frame_stride for x in diff_frames]
    print(len(diff_frames))
    print(diff_frames)
    print(diff_ind)
    missing_frames_result["receptor"].extend([receptor] * len(diff_frames))
    missing_frames_result["missing_frame"].extend(["at-frame"+str(x) for x in diff_frames])

adora2a
746
[800, 1200, 2200, 2400, 3200, 3600, 5000, 6000, 6600, 7000, 8200, 8600, 9800, 10800, 11600, 12000, 13000, 14000, 14400, 14800, 15000, 15200, 17800, 18600, 19000, 19400, 19800, 20400, 21000, 21800, 22400, 23200, 24800, 25200, 25600, 26600, 27000, 27400, 28000, 28400, 29400, 29800, 30000, 30800, 31000, 31200, 31400, 32400, 34400, 35400, 35600, 36200, 36400, 36600, 37000, 37400, 37600, 38000, 38400, 39000, 39200, 39400, 39800, 40000, 40600, 42000, 42200, 42600, 43800, 44200, 44400, 44800, 46200, 46600, 47000, 48200, 49000, 50200, 51600, 53800, 54400, 56800, 57000, 57200, 57600, 58800, 59600, 61800, 62600, 65000, 65400, 65800, 67400, 68000, 70000, 71400, 72000, 73600, 74000, 74600, 75200, 78800, 79200, 79400, 81000, 81400, 82400, 83800, 84200, 85000, 86800, 87800, 88000, 88400, 89600, 90800, 92600, 94200, 94600, 95200, 95800, 96200, 96800, 97400, 97600, 97800, 98000, 100200, 100600, 101600, 102600, 103000, 105400, 106000, 107400, 107600, 107800, 111200, 111800, 112200, 112400, 

In [76]:
# Tabulate the collected records and save them. With pandas' default
# `index=True` the row index is written as an unnamed first column.
missing_frames_result_df = pd.DataFrame.from_dict(missing_frames_result)
missing_frames_result_df.to_csv("missing_frames_in_docking_results.csv")

In [77]:
# Number of rows in the table for each receptor.
missing_frames_result_df.groupby("receptor").size()

receptor
adora2a     746
adrb2      1054
oprd1      1116
oprk1      1854
dtype: int64

#### Step 3 - see what ligand-frame results are missing from docking results

In [6]:

# For every frame present in a receptor's docking results, record each ligand
# that appears somewhere in that receptor's results but not for this frame.
# NOTE: this rebinds `missing_frames_result` with a different set of keys than
# the Step 2 cell uses, so re-run Step 2 before reusing its table.
missing_frames_result = {"receptor": [], "frame":[], "ligand": []}
for i, rec in enumerate(receptors):
    print(i)
    print(rec)
    # Built once per receptor instead of being copied for every frame.
    ligs = set(all_ligands[i])
    frame_res = all_docking_res[i]
    for frame, mols in frame_res.items():
        # Sorted so the row order (and the CSV written from it) is the same on
        # every run; plain set iteration order of strings varies between
        # kernel sessions.
        missing_mols = sorted(ligs.difference(mols))
        missing_frames_result["receptor"].extend([rec] * len(missing_mols))
        missing_frames_result["frame"].extend([frame] * len(missing_mols))
        missing_frames_result["ligand"].extend(missing_mols)
                

0
adora2a
1
adrb2
2
oprd1
3
oprk1


In [7]:
# Table with one row per record collected in `missing_frames_result`.
missing_frames_result_df = pd.DataFrame.from_dict(missing_frames_result)

In [8]:
# Save the table. With pandas' default `index=True` the row index is written
# as an unnamed first column.
missing_frames_result_df.to_csv("missing_frame_ligand_docking_result.csv")

In [9]:
# Row count per (receptor, frame). Frame values are strings sliced from the
# result lines, so groups sort lexicographically ("1000" < "10000" < "100000"),
# not numerically.
missing_frames_result_df.groupby(['receptor', 'frame']).size()

receptor  frame 
adora2a   0          251
          1000      1412
          10000     3699
          100000     490
          100400    1807
                    ... 
oprk1     99200     4898
          99400        7
          995000       7
          99600        1
          998200    5071
Length: 7325, dtype: int64

In [10]:
# Total number of rows in the table for each receptor.
missing_frames_result_df.groupby("receptor").size()

receptor
adora2a     2584203
adrb2      17543825
oprd1      11553713
oprk1       2543262
dtype: int64