# Collect protein PDBs

Collect additional protein structures from the PDB database.  
Various summaries of the current data in the PDB archive are available on the [RCSB PDB summaries page](https://www.rcsb.org/pages/general/summaries).
Download [`pdb_entry_type.txt`](ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_entry_type.txt), which lists every PDB entry ID together with its molecule type and structure determination method. Based on the entry IDs, we will download the corresponding protein `*.pdb` files.
We are only interested in proteins whose structure was determined by **EM**.

In [26]:
import pandas as pd
import random
import pathlib

In [2]:
# Load the wwPDB entry-type index: a tab-separated table read without a header row,
# so the three column names are supplied explicitly.
entry_type_url = "ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_entry_type.txt"
entry_columns = ["id", "acid", "structure_determination"]
df = pd.read_csv(entry_type_url, sep="\t", names=entry_columns, header=None)

In [3]:
# Report the table dimensions as a (rows, columns) tuple, then preview the first rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)

(168358, 3)


Unnamed: 0,id,acid,structure_determination
0,100d,nuc,diffraction
1,101d,nuc,diffraction
2,101m,prot,diffraction
3,102d,nuc,diffraction
4,102l,prot,diffraction


In [4]:
# Distinct values of the structure determination column.
df["structure_determination"].unique()

array(['diffraction', 'NMR', 'other', 'EM'], dtype=object)

In [5]:
# Distinct values of the acid (molecule type) column.
df["acid"].unique()

array(['nuc', 'prot', 'prot-nuc', 'other'], dtype=object)

In [6]:
# Keep only rows whose structure was determined by EM and whose acid type is
# either 'prot' or 'prot-nuc'.
is_em = df["structure_determination"] == "EM"
has_protein = df["acid"].isin(["prot", "prot-nuc"])
df_EM = df.loc[is_em & has_protein]

# Show how many entries survive the filter, then preview them.
print(df_EM.shape)
df_EM.head(n=5)

(5520, 3)


Unnamed: 0,id,acid,structure_determination
3804,1d3e,prot,EM
3808,1d3i,prot,EM
4221,1dgi,prot,EM
4799,1dyl,prot,EM
5383,1eg0,prot-nuc,EM


In [7]:
# Number of EM entries to draw, capped at the table size so sampling cannot fail
# when fewer than 20 rows are available.
N_PROTEINS = min(20, len(df_EM))

# Positional row indices into df_EM, drawn without replacement so no entry is picked
# twice. range(len(df_EM)) ends at len(df_EM) - 1, the last valid position;
# random.randint(0, len(df_EM)) includes its upper bound and could index one past the end.
# The draw is unseeded, so each run selects different entries; call random.seed(...)
# beforehand if a repeatable selection is needed.
rand_proteins = random.sample(range(len(df_EM)), N_PROTEINS)

In [8]:
# Translate the sampled row positions into the corresponding entry IDs (plain list).
sampled_ids = df_EM["id"].iloc[rand_proteins]
selected_proteins = [pdb_id for pdb_id in sampled_ids.values]

In [9]:
# Directory that receives the downloaded *.pdb files
# (alternative location: "/mnt/scratch/students/PDB").
PDB_DIR = "/home/jelena/PDB"

# Create the directory together with any missing parents; an existing directory is fine.
pdb_dir_path = pathlib.Path(PDB_DIR)
pdb_dir_path.mkdir(exist_ok=True, parents=True)

In [10]:
# Download the .pdb file of every selected entry from RCSB into PDB_DIR.
for pdb_id in selected_proteins:
    pdb_file = pathlib.Path(PDB_DIR) / f"{pdb_id}.pdb"
    get_ipython().system_raw(f'wget http://files.rcsb.org/download/{pdb_id}.pdb -O {pdb_file}')

    # wget opens the -O target before the transfer starts, so a failed request
    # leaves a zero-byte file behind. Remove such files and report the failure
    # instead of letting an empty "structure" reach later processing steps.
    if pdb_file.is_file() and pdb_file.stat().st_size == 0:
        pdb_file.unlink()
        print(f"Download failed for {pdb_id}: removed empty file {pdb_file}")
    elif not pdb_file.is_file():
        print(f"Download failed for {pdb_id}: {pdb_file} was not created")

In [14]:
# List the contents of the download directory, one entry per list element.
pdb_listing_cmd = f"ls {PDB_DIR}"
get_ipython().getoutput(pdb_listing_cmd, split=True)

['2x8q.pdb',
 '5a79.pdb',
 '5aj0.pdb',
 '5oh0.pdb',
 '5tau.pdb',
 '5v4s.pdb',
 '5zzm.pdb',
 '6gv9.pdb',
 '6gzq.pdb',
 '6hu9.pdb',
 '6lve.pdb',
 '6o6c.pdb',
 '6peq.pdb',
 '6red.pdb',
 '6ulg.pdb',
 '6vaa.pdb',
 '6vum.pdb',
 '6w2d.pdb',
 '6xt9.pdb',
 '7jg8.pdb']

# EMAN2 script for PDB to MRC conversion

Installation instructions are available [here](https://blake.bcm.edu/emanwiki/EMAN2/Install/BinaryInstallAnaconda/2.31).  
The download is available [here](https://cryoem.bcm.edu/cryoem/downloads/view_eman2_versions).  
Usage instructions for the `pdb2mrc` command are available [here](https://blake.bcm.edu/emanwiki/PdbToMrc).

In [28]:
# Directory that receives the converted *.mrc files
# (alternative location: "/mnt/scratch/students/MRC").
MRC_DIR = "/home/jelena/MRC"

# Create the directory together with any missing parents; an existing directory is fine.
mrc_dir_path = pathlib.Path(MRC_DIR)
mrc_dir_path.mkdir(exist_ok=True, parents=True)

In [29]:
# Root directory of EMAN2; the conversion step prepends its bin/ directory to PATH
# and runs bin/e2pdb2mrc.py from it.
EMAN2 = "/home/jelena/EMAN2"

In [32]:
# Convert every selected entry from PDB_DIR/<id>.pdb to MRC_DIR/<id>.mrc with
# EMAN2's e2pdb2mrc.py.
for pdb_id in selected_proteins:
    pdb_file = pathlib.Path(PDB_DIR) / f"{pdb_id}.pdb"
    mrc_file = pathlib.Path(MRC_DIR) / f"{pdb_id}.mrc"

    # A missing or zero-byte input (e.g. left behind by a failed download) cannot be
    # converted; skip it with a message instead of invoking the converter on it.
    if not pdb_file.is_file() or pdb_file.stat().st_size == 0:
        print(f"Skipping {pdb_id}: no usable PDB file at {pdb_file}")
        continue

    # EMAN2's bin/ directory is put first on PATH for the duration of the command.
    # NOTE(review): `res=5` is passed as a bare positional token; confirm that
    # e2pdb2mrc.py honours this form rather than expecting an option such as --res=5.
    get_ipython().system_raw(f'export PATH="{EMAN2}/bin:$PATH";{EMAN2}/bin/e2pdb2mrc.py {pdb_file} {mrc_file} res=5')

    # system_raw does not surface the command's result here, so detect failures by
    # checking for the output file.
    if not mrc_file.is_file():
        print(f"Conversion failed for {pdb_id}: {mrc_file} was not created")

In [33]:
# List the contents of the conversion output directory, one entry per list element.
mrc_listing_cmd = f"ls {MRC_DIR}"
get_ipython().getoutput(mrc_listing_cmd, split=True)

['2x8q.mrc',
 '5a79.mrc',
 '5oh0.mrc',
 '5v4s.mrc',
 '5zzm.mrc',
 '6gv9.mrc',
 '6hu9.mrc',
 '6lve.mrc',
 '6o6c.mrc',
 '6peq.mrc',
 '6red.mrc',
 '6ulg.mrc',
 '6vaa.mrc',
 '6vum.mrc',
 '6w2d.mrc',
 '6xt9.mrc',
 '7jg8.mrc']