In [1]:
import pandas as pd
from Bio import SeqIO

# Constants: relative paths of the input files read by this notebook
PATH_ORIGINAL_DB = '../datasets/original.txt'  # identifiers of the original dataset, one per line
PATH_REFERENCE_DB = '../data/SwissProt_humans_reference_all.fasta'  # FASTA file of human SwissProt entries
PATH_PDB_UNIPROT_REL = '../data/pdb_chain_uniprot.tsv'  # tab-separated PDB chain / UniProt relation table

In [2]:
# Collect the accession of every record in the human SwissProt reference.
# Record ids have the form sp|P46108|CRK_HUMAN; the accession is the middle field.
human = SeqIO.parse(PATH_REFERENCE_DB, 'fasta')
list_human = [record.id.split('|')[1] for record in human]

print('There are {} human proteins in SwissProt'.format(len(list_human)))

There are 20367 human proteins in SwissProt


In [3]:
# Import original dataset: one protein identifier per line.
# Strip surrounding whitespace instead of slicing off the last character, so the
# final identifier is not truncated when the file lacks a trailing newline, and
# skip blank lines so no empty identifiers end up in the list.
with open(PATH_ORIGINAL_DB) as file:
    original_proteins = [line.strip() for line in file if line.strip()]

In [4]:
# Sanity check: report how many identifiers were read and preview the first five
n_original = len(original_proteins)
print('Proteins in the original dataset:', n_original)
original_proteins[:5]

Proteins in the original dataset: 92


['Q7KZ85', 'O75791', 'P62993', 'Q13588', 'Q06124']

In [5]:
# Load the PDB - UniProt relation table (tab-separated).
# header=1: column names are taken from the second line of the file.
pdb_rel = pd.read_csv(PATH_PDB_UNIPROT_REL, sep='\t', header=1)
# Lower-case the column names so they can be accessed as attributes below
pdb_rel.columns = [column.lower() for column in pdb_rel.columns.values]
pdb_rel.head()

Unnamed: 0,pdb,chain,sp_primary,res_beg,res_end,pdb_beg,pdb_end,sp_beg,sp_end
0,101m,A,P02185,1,154,0,153.0,1,154
1,102l,A,P00720,1,40,1,40.0,1,40
2,102l,A,P00720,42,165,41,,41,164
3,102m,A,P02185,1,154,0,153.0,1,154
4,103l,A,P00720,1,40,1,,1,40


In [6]:
# Retrieve the proteins of the original dataset that appear in the PDB - UniProt relation
original_in_pdb = set(original_proteins).intersection(pdb_rel.sp_primary)
print('Proteins of the original dataset in PDB:', len(original_in_pdb))

Proteins of the original dataset in PDB: 60


In [7]:
# First version of the PDB dataset: relation rows whose UniProt accession
# belongs to the original dataset, reduced to the three columns of interest
is_original = pdb_rel['sp_primary'].isin(original_proteins)
pdb_dataset = pdb_rel.loc[is_original, ['pdb', 'sp_primary', 'chain']].copy()
pdb_dataset.head()

Unnamed: 0,pdb,sp_primary,chain
185,1a07,P12931,A
186,1a07,P12931,B
187,1a08,P12931,A
188,1a08,P12931,B
189,1a09,P12931,A


In [8]:
# (rows, columns) of the first version of the PDB dataset
pdb_dataset.shape

(1252, 3)

In [9]:
# Final version of the PDB dataset: the rows selected so far plus human
# proteins that occur in the same PDB entries as the original proteins.
# NOTE(review): the match is on the PDB id, not on the chain.
same_entry = pdb_rel['pdb'].isin(pdb_dataset['pdb'])
already_included = pdb_rel['sp_primary'].isin(pdb_dataset['sp_primary'])
is_human = pdb_rel['sp_primary'].isin(list_human)
frames = [pdb_dataset, pdb_rel[same_entry & ~already_included & is_human]]
pdb_dataset = pd.concat(frames, sort=False)[['pdb', 'sp_primary', 'chain']]
pdb_dataset.head()

Unnamed: 0,pdb,sp_primary,chain
185,1a07,P12931,A
186,1a07,P12931,B
187,1a08,P12931,A
188,1a08,P12931,B
189,1a09,P12931,A


In [10]:
# (rows, columns) of pdb_dataset at this point in the notebook
pdb_dataset.shape

(1443, 3)

In [11]:
# Persist the final dataset as comma-separated text (the DataFrame index is written too).
# NOTE(review): the extension '.cvs' looks like a typo for '.csv' — confirm no
# downstream reader depends on this exact file name before renaming.
pdb_dataset.to_csv('../datasets/pdb.cvs')