In [3]:
import random
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import pandas as pd
import math
import matplotlib.lines as mlines

In [4]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [5]:
def readNPDBFiles(path, n):
    file = open(path, "r")
    lines = file.readlines()
    pdbs = [(line.strip()[:4], line.strip()[4:5]) for line in lines[1:]]
    random.shuffle(pdbs)
    return pdbs[1:n]

In [6]:
def getPDBFile(pdb):
        try:
            url = "https://files.rcsb.org/view/{}.pdb".format(pdb)
            response = urllib.request.urlopen(url)
            data = response.read()
            text = data.decode('utf-8')
            text = text.split('\n')
            return(text)
        except:
            return([])

In [113]:
def getAAsInHelix(lines, chain):
    helices = [line for line in lines if line.startswith("HELIX")]
    aaList = []
    for helix in helices:
        chainID = helix[19].strip()
        if(chainID == chain):       
            initSeqNum = int(helix[22:26].strip()) 
            endSeqNum = int(helix[34:38].strip())
            helixClass = helix[39:41].strip()
            for i in range(initSeqNum, endSeqNum+1):
                aaList.append((i, helixClass))
    return aaList

def getAAName(lines, chain, aaNum):
    atoms = [line for line in lines if line.startswith("ATOM")]
    for atom in atoms:
        name = atom[17:21].strip()
        chainID = atom[21].strip()
        resSeq = int(atom[23:26].strip())
        if(chainID == chain and aaNum == resSeq):
            return name
    return "???"
        
def AnalyzePDB_Q2(pdbs):
    processed = 1
    helixTypes = {}
    for pdb in pdbs:
        update_progress(processed / len(pdbs))
        print("Processing {}/{}".format(processed, len(pdbs)))
        lines = getPDBFile(pdb[0])
        
        # Do Helix
        #Get a list of all amino acid numbers in helices with chain pdb[1]
        aaList = getAAsInHelix(lines, pdb[1])
        #For each aa in the returned list, find it's name and count it.
        for aa in aaList:
            Type = aa[1]
            if(Type in helixTypes):
                name = getAAName(lines, pdb[1], aa[0])
                if(name == "???"):
                    continue
                if(name in helixTypes[Type]):
                    helixTypes[Type][name] += 1
                else:
                    helixTypes[Type][name] = 1
            else:
                helixTypes[Type] = {}
                
        #Do Sheet
        processed+=1
    print(helixTypes)
    return helixTypes

In [114]:
%%time
pdbs = readNPDBFiles("cullpdb_pc30_res3.0_R1.0_d191017_chains18877", 100)
helixDict = AnalyzePDB_Q2(pdbs)

Progress: [####################] 100.0%
Processing 99/99
{'1': {'PRO': 238, 'ALA': 1145, 'GLU': 876, 'ARG': 619, 'TRP': 154, 'GLY': 521, 'TYR': 379, 'ILE': 624, 'VAL': 668, 'LEU': 1231, 'ASP': 586, 'HIS': 220, 'LYS': 668, 'SER': 583, 'THR': 513, 'GLN': 465, 'MET': 233, 'PHE': 442, 'CYS': 93, 'ASN': 428}, '5': {'PHE': 60, 'TRP': 25, 'LEU': 118, 'TYR': 45, 'GLY': 87, 'HIS': 37, 'MET': 14, 'SER': 86, 'ARG': 55, 'ASP': 108, 'GLN': 50, 'THR': 55, 'ALA': 107, 'PRO': 98, 'LYS': 79, 'ILE': 55, 'ASN': 59, 'VAL': 49, 'GLU': 123, 'CYS': 12}}
CPU times: user 33.3 s, sys: 878 ms, total: 34.2 s
Wall time: 2min 9s


In [117]:
for item in helixDict:
    print(helixDict[item])
    print()

{'PRO': 238, 'ALA': 1145, 'GLU': 876, 'ARG': 619, 'TRP': 154, 'GLY': 521, 'TYR': 379, 'ILE': 624, 'VAL': 668, 'LEU': 1231, 'ASP': 586, 'HIS': 220, 'LYS': 668, 'SER': 583, 'THR': 513, 'GLN': 465, 'MET': 233, 'PHE': 442, 'CYS': 93, 'ASN': 428}

{'PHE': 60, 'TRP': 25, 'LEU': 118, 'TYR': 45, 'GLY': 87, 'HIS': 37, 'MET': 14, 'SER': 86, 'ARG': 55, 'ASP': 108, 'GLN': 50, 'THR': 55, 'ALA': 107, 'PRO': 98, 'LYS': 79, 'ILE': 55, 'ASN': 59, 'VAL': 49, 'GLU': 123, 'CYS': 12}

