Check in the OPM PDB files whether two planes are used to represent the membrane cylinder.
Look at the DUM atoms to see whether there are two distinct z coordinates or not.

In [13]:
import glob, json

In [2]:
# Directory that is globbed for "*.pdb" files by the scanning and summary cells below.
pdb_dir = "/mnt/arwen-dev/data/databases/mobi/detbelt/proteins/opmPDB_20210114-102025"

In [3]:
import pyproteinsExt.structure.coordinates as PDB
%load_ext autoreload
%autoreload 2

In [4]:
def createZDic(pdbObj):
    """Sort the z coordinates of a parsed structure into DUM and non-DUM atoms.

    Iterates over ``pdbObj.atomRecord`` and returns a dict with two entries:

    - ``"DUM"``: maps each z coordinate found on an atom whose residue name
      is ``"DUM"`` to the number of such atoms at that z;
    - ``"others"``: list of the z coordinates of every other atom, in
      record order.
    """
    dum_counts = {}
    other_z = []
    for record in pdbObj.atomRecord:
        if record.resName != "DUM":
            other_z.append(record.z)
            continue
        # One more DUM atom on this plane.
        dum_counts[record.z] = dum_counts.get(record.z, 0) + 1
    return {"DUM": dum_counts, "others": other_z}

In [5]:
def isProtIntoMembrane(up, down, atoms_z):
    """Tell whether a set of atom z coordinates crosses both membrane planes.

    Parameters:
        up: z coordinate of the upper plane.
        down: z coordinate of the lower plane.
        atoms_z: sequence of atom z coordinates.

    Returns:
        True when the lowest atom is strictly below ``down`` AND the highest
        atom is strictly above ``up``; False otherwise, including when
        ``atoms_z`` is empty (``min``/``max`` would otherwise raise
        ValueError on an empty sequence).
    """
    if not atoms_z:
        # No atoms at all: nothing can span the membrane.
        return False
    return min(atoms_z) < down and max(atoms_z) > up

In [6]:
# Single parser instance, reused for every file loaded in the scanning loop below.
parser = PDB.Parser()

In [17]:
# Scan every "*.pdb" file of pdb_dir and sort it into categories according to
# the z coordinates of its DUM atoms and the z extent of the other atoms.
i = 0  # number of files visited so far (progress counter)
one_membrane = []  # DUM atoms found at a single z coordinate
two_membrane = []  # DUM atoms found at exactly two z coordinates
two_membrane_incorrect = [] # 2 z coordinates but not 0 origin
parsing_error = []  # (pdb_code, pdb_file) tuples for files the parser rejected
number_dum_error = []  # two symmetric disks holding a different number of DUM atoms
others = []  # any other number of distinct DUM z coordinates (including none)
ectopic_prot = []  # other atoms do not extend beyond both planes
validated_prot = []  # other atoms extend beyond both planes
for pdb in glob.glob(pdb_dir + "/*.pdb"):
    i += 1
    pdb_file = pdb.split("/")[-1]
    pdb_code = pdb_file.split(".")[0].upper()
    try:
        pdbObj = parser.load(file = pdb, heteroatoms = True)
    except Exception:
        # Record the unparsable file and keep going. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate,
        # so a long scan can still be interrupted.
        parsing_error.append((pdb_code, pdb_file))
        continue

    z_dic = createZDic(pdbObj)
    dum_dic = z_dic["DUM"]
    other_atoms = z_dic["others"]
    if len(dum_dic) == 1: #only one part of membrane
        one_membrane.append(pdb_file)
        #check membrane with artificial second disk
        z = next(iter(dum_dic))
        coords = [z, -z]
        if isProtIntoMembrane(max(coords), min(coords), other_atoms):
            validated_prot.append(pdb_file)
        else:
            ectopic_prot.append(pdb_file)
    elif len(dum_dic) == 2:
        two_membrane.append(pdb_file)
        coords = list(dum_dic)
        if abs(coords[0]) != abs(coords[1]): #  2 z coordinates but not 0 origin
            two_membrane_incorrect.append(pdb_file)
        elif dum_dic[coords[0]] != dum_dic[coords[1]]: #not the same number of atoms in 2 disk
            number_dum_error.append(pdb_file)
        elif isProtIntoMembrane(max(coords), min(coords), other_atoms):
            validated_prot.append(pdb_file)
        else:
            ectopic_prot.append(pdb_file)
    else:
        others.append(pdb_file)

    # Progress indicator every 100 files.
    if i % 100 == 0:
        print(i)

1A0T


In [8]:
# Report how many structures ended up in each category, one "<count> <label>" line each.
summary = (
    (glob.glob(pdb_dir + "/*.pdb"), "total prot"),
    (parsing_error, "parsing error"),
    (one_membrane, "1 disk"),
    (two_membrane, "2 disks"),
    (others, "other number of disk"),
    (validated_prot, "valid"),
    (ectopic_prot, "ectopic"),
    (number_dum_error, "different number of DUM for the 2 disks"),
    (two_membrane_incorrect, "inconsistency in z coords for the 2 disks"),
)
for entries, label in summary:
    print(len(entries), label)

1941 total prot
10 parsing error
49 1 disk
1881 2 disks
1 other number of disk
1822 valid
108 ectopic
0 different number of DUM for the 2 disks
0 inconsistency in z coords for the 2 disks


In [11]:
# Dump each category to a text file in the working directory, one entry per line.
with open("ectopic_prot_210121.txt", "w") as o:
    o.write("\n".join(ectopic_prot))
with open("validated_prot_210121.txt", "w") as o:
    o.write("\n".join(validated_prot))
with open("parsing_error_prot_210121.txt", "w") as o:
    # parsing_error holds (pdb_code, pdb_file) tuples, which str.join rejects
    # with a TypeError; write one tab-separated "code<TAB>file" line per entry.
    o.write("\n".join("\t".join(entry) for entry in parsing_error))

## Filter database by creating new json

In [12]:
# current_json: existing database file, loaded in the next cell.
# new_json: path intended for the filtered copy (presumably written by a later cell — not visible here).
current_json = "/mnt/arwen-dev/data/databases/mobi/detbelt/proteins/whiteDB_20210114-102025.json"
new_json = "/mnt/arwen-dev/data/databases/mobi/detbelt/proteins/whiteDB_20210114-102025_filter.json"

In [15]:
# Load the current database; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(current_json) as json_file:
    current = json.load(json_file)
# Peek at the first entry only, to inspect the record layout.
for c in current:
    print(c)
    break

{'pdbCode': '1PTH', 'name': 'Ram Prostaglandin H2 synthase-1 (COX-1)', 'species': 'Ovis aries', 'taxonomicDomain': 'Eukaryota', 'expressedInSpecies': '', 'resolution': '3.4', 'description': 'In complex with bromoaspirin.', 'bibliography': {'pubMedId': '7552725', 'authors': 'Loll PJ, Picot D, &amp; Garavito RM', 'year': '1995', 'title': 'The structural basis of aspirin activity inferred from the crystal structure of inactivated prostaglandin H2 synthase.', 'journal': 'Nat Struct Biol', 'volume': '2', 'issue': '', 'pages': '637-643', 'doi': '', 'notes': ''}, 'secondaryBibliographies': '', 'relatedPdbEntries': '', 'group': 'MONOTOPIC MEMBRANE PROTEINS', 'subgroup': 'Cyclooxygenases', 'representativeOf': 'Ram Prostaglandin H2 synthase-1 (cyclooxygenase-1 or COX-1)'}
