In [1]:
import requests
import pdbreader
from Bio import PDB
from Bio.Align import substitution_matrices
from Bio.Align import PairwiseAligner

In [2]:
def read_pdb(pdb_filename, chain_id="A"):
    """
    Parse a PDB file stored locally and return one chain of its first model.

    The structure id passed to the parser is the last "/"-separated
    component of the path, truncated at the first ".pdb".
    """
    base_name = pdb_filename.split("/")[-1]
    structure_id = base_name.split(".pdb")[0]
    structure = PDB.PDBParser(QUIET=True).get_structure(structure_id, pdb_filename)
    # Model 0 is the first model of the structure; pick the requested chain from it.
    return structure[0][chain_id]

def fetch_pdb(entry_id, chain_id, timeout=30):
    """
    Download a PDB entry from the RCSB file server, save it in the current
    working directory and return one of its chains.

    Parameters:
        entry_id: PDB entry identifier (e.g. "3ran"); used in the download
            URL and, lower-cased, as the local file name.
        chain_id: id of the chain to read from the downloaded file.
        timeout: seconds to wait for the server before giving up.

    Returns:
        (pdb_filename, chain) on success, or (None, None) when the request
        fails or the server answers with a status other than 200.
    """
    search_url = f"https://files.rcsb.org/download/{entry_id}.pdb"
    try:
        # Without a timeout, requests may wait on a stalled server indefinitely.
        response = requests.get(search_url, timeout=timeout)
    except requests.RequestException as error:
        # Network-level failures are reported the same way as HTTP errors.
        print(f"Error: {error}")
        return (None, None)

    if response.status_code != 200:
        print(f"Error: {response.status_code} {response.reason}")
        return (None, None)

    pdb_filename = f"{entry_id.lower()}.pdb"
    with open(pdb_filename, "w") as ofile:
        ofile.write(response.text)
    chain = read_pdb(pdb_filename, chain_id)
    return (pdb_filename, chain)

In [3]:
def get_atoms_count_in_chain_by_seqNum(chain, aa_sequence_number):
    """
    1. Count the atoms of the residue(s) in the chain whose sequence number
       (second field of the residue id) equals aa_sequence_number.

    Returns 0 when no residue carries that sequence number.
    """
    total = 0
    for residue in chain:
        if residue.id[1] != aa_sequence_number:
            continue
        # Iterating a residue yields its atoms, one per step.
        total += sum(1 for _ in residue)
    return total


def get_amino_acids_sequence(chain):
    """
    2. Return the one-letter sequence of the residues in the chain.

    Every residue yielded by iterating the chain contributes exactly one
    character: the one-letter code for residues recognised as amino acids,
    and 'X' for everything else.
    """
    three_to_one = PDB.Polypeptide.protein_letters_3to1
    letters = []
    for residue in chain:
        if PDB.is_aa(residue):
            # is_aa() with its default arguments also accepts modified
            # residues that have no entry in the standard 3-to-1 table;
            # map those to 'X' instead of raising KeyError.
            letters.append(three_to_one.get(residue.get_resname(), 'X'))
        else:
            letters.append('X')  # Placeholder for a residue that is not an amino acid
    return "".join(letters)


def get_atoms_count_in_side_chain(chain, aa_sequence_number):
    """
    3. Show the number of atoms in the side chain for particular aa in a given chain
       (aa sequence number should be provided).

    Backbone atoms (N, CA, C, O and the terminal OXT) are not part of the
    side chain and are excluded from the count.

    Returns the count for the first amino-acid residue with the given
    sequence number, or None when the chain has no such residue.
    """
    backbone_atoms = {"N", "CA", "C", "O", "OXT"}
    for residue in chain:
        if residue.id[1] == aa_sequence_number and PDB.is_aa(residue):
            return sum(1 for atom in residue if atom.id not in backbone_atoms)
    return None
        
def count_amino_acids_in_chain(chain):
    """
    4. Count the residues of the chain that PDB.is_aa recognises as amino acids.
    """
    total = 0
    for residue in chain:
        if PDB.is_aa(residue):
            total += 1
    return total

def get_helices_and_sheets_count(pdb_filename, chain_id="A"):
    """
    5. Count the HELIX and SHEET records of the file that start and end
       on the given chain.

    Returns a (helix_count, sheet_count) tuple; a record type that is not
    present in the parsed file counts as 0.
    """
    records = pdbreader.read_pdb(pdb_filename)

    totals = []
    for record_name in ("HELIX", "SHEET"):
        if record_name not in records:
            totals.append(0)
            continue
        table = records[record_name]
        # Keep only the rows whose first and last residue lie on chain_id.
        on_chain = (table['init_chain'] == chain_id) & (table['end_chain'] == chain_id)
        totals.append(table[on_chain].shape[0])

    return tuple(totals)

def get_atom_coordinates(chain, aa_sequence_number, atom_name):
    """
    6. Look up an atom by residue sequence number and atom name and return
       its coordinates.

    Only residues recognised as amino acids are searched. Returns None
    when no matching atom is found.
    """
    for residue in chain:
        if residue.id[1] != aa_sequence_number or not PDB.is_aa(residue):
            continue
        for atom in residue:
            if atom.id == atom_name:
                return atom.get_coord()
    return None

def get_name_and_number_of_aa(pdb_filename, helix_number, chain_id="A"):
    """
    7. Return the residue name and number at the start and at the end of
       the helix with the given number on the given chain.

    Returns (start_resname, start_resnum, end_resname, end_resnum), or
    (0, 0, 0, 0) when the file has no HELIX records or no helix matches.
    """
    not_found = (0, 0, 0, 0)
    records = pdbreader.read_pdb(pdb_filename)

    if "HELIX" not in records:
        return not_found

    helices = records['HELIX']
    # Restrict to helices that start and end on the requested chain ...
    on_chain = helices[(helices['init_chain'] == chain_id) & (helices['end_chain'] == chain_id)]
    # ... and then to the helix carrying the requested number.
    selected = on_chain[on_chain['seq'].astype(int) == helix_number]

    start_columns = selected[['init_resname', 'init_resid']]
    if start_columns.empty:
        return not_found

    start_resname, start_resnum = start_columns.values[0]
    end_resname, end_resnum = selected[['end_resname', 'end_resid']].values[0]
    return (start_resname, start_resnum, end_resname, end_resnum)


def calculate_distance(chain, aa1_sequence_number, atom1_name, aa2_sequence_number, atom2_name):
    """
    8. Return the distance between two atoms of the chain, each identified
       by residue sequence number and atom name.

    Returns None when either atom cannot be found.
    """
    first = get_atom_coordinates(chain, aa1_sequence_number, atom1_name)
    second = get_atom_coordinates(chain, aa2_sequence_number, atom2_name)
    if first is None or second is None:
        return None
    # The norm of the difference vector is the distance between the two points.
    difference = PDB.Vector(first) - PDB.Vector(second)
    return difference.norm()

def global_align(target, query):
    """
    9. Globally align two sequences with the BLOSUM62 matrix and a gap
       score of -2, and report on the first alignment returned.

    Returns (alignment_score, identity_percentage), where the percentage is
    the number of identical aligned positions divided by len(query), times 100.
    """
    # Configure the aligner: global mode, gap score -2, BLOSUM62 substitutions.
    scorer = PairwiseAligner()
    scorer.mode = "global"
    scorer.gap_score = -2
    scorer.substitution_matrix = substitution_matrices.load("BLOSUM62")

    # Only the first alignment of the result is evaluated.
    top_hit = scorer.align(target, query)[0]
    alignment_score = top_hit.score
    identical_positions = top_hit.counts().identities
    identity_percentage = (identical_positions / len(query)) * 100

    return (alignment_score, identity_percentage)

In [4]:
# Ask where the structure comes from and load the chain to analyse.
print("How you want to read pdb? 1. from personal computer 2. fetch from online servers")
try:
    choice = int(input("Enter your choice either 1 or 2 : "))  # only enter 1 or 2 as choice
except ValueError:
    # Non-numeric input is treated like any other invalid choice.
    choice = None

# Define both names up front so later code never hits a NameError
# when the choice is invalid.
pdb_filename, chain = None, None

if choice == 1:
    pdb_filename = input("Enter pdb file path: ")
    chain = read_pdb(pdb_filename)
elif choice == 2:
    entry_id = input("Enter protein id: ")
    chain_id = input("Enter chain id: ")
    # fetch_pdb returns (None, None) when the download fails.
    pdb_filename, chain = fetch_pdb(entry_id, chain_id)
else:
    print("Invalid choice! Either choose 1 or 2.")


How you want to read pdb? 1. from personal computer 2. fetch from online servers
Enter your choice either 1 or 2 : 2
Enter protein id: 3ran
Enter chain id: D


In [5]:
def _ask_int(prompt):
    """Prompt repeatedly until the user enters a valid integer, then return it."""
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            print("Please enter a whole number.")


def menu(pdb_filename, chain):
    """
    Interactive text menu that runs the analysis functions on a loaded chain.

    Parameters:
        pdb_filename: path of the PDB file the chain was read from; used by
            the options that read HELIX/SHEET records (5 and 7).
        chain: the chain object to analyse; its id selects the chain for
            options 5 and 7.

    The loop runs until the user chooses 0.
    """
    while True:
        print("\nProtein Analysis Menu:")
        print("1. Atom Count by Amino Acid Sequence")
        print("2. Amino Acid Sequence (with missing structural info)")
        print("3. Side Chain Atom Count by Amino Acid Sequence")
        print("4. Amino Acid Count in Chain")
        print("5. Helices and Sheets Count")
        print("6. Atom Coordinates by Amino Acid and Atom Name")
        print("7. Amino Acid at Helix Start/End by Helix Number")
        print("8. Distance Between Two Atoms by Amino Acid and Atom Names")
        print("9. Global Alignment against sequences.txt")
        print("0. Exit")

        try:
            choice = int(input("Enter your choice: "))
        except ValueError:
            # Non-numeric input falls through to the invalid-choice message.
            choice = None

        if choice == 1:
            aa_sequence_number = _ask_int("Enter the amino acid sequence number: ")
            atoms_count = get_atoms_count_in_chain_by_seqNum(chain, aa_sequence_number)
            print(f"Number of atoms in amino acid {aa_sequence_number}: {atoms_count}")

        elif choice == 2:
            amino_acids_sequence = get_amino_acids_sequence(chain)
            print(f"Amino acids sequence: {amino_acids_sequence}")

        elif choice == 3:
            aa_sequence_number = _ask_int("Enter the amino acid sequence number: ")
            side_chain_atoms_count = get_atoms_count_in_side_chain(chain, aa_sequence_number)
            print(f"Number of atoms in the side chain of amino acid {aa_sequence_number}: {side_chain_atoms_count}")

        elif choice == 4:
            amino_acids_count = count_amino_acids_in_chain(chain)
            print(f"Number of amino acids in the chain: {amino_acids_count}")

        elif choice == 5:
            # Use the id of the chain that was actually loaded, not a fixed "A".
            helix_count, sheet_count = get_helices_and_sheets_count(pdb_filename, chain_id=chain.id)
            print(f"Number of helices: {helix_count}")
            print(f"Number of sheets: {sheet_count}")

        elif choice == 6:
            aa_sequence_number = _ask_int("Enter the amino acid sequence number: ")
            atom_name = input("Enter the name of the atom: ")
            atom_coordinates = get_atom_coordinates(chain, aa_sequence_number, atom_name)
            print(f"Coordinates of {atom_name} in amino acid {aa_sequence_number}: {atom_coordinates}")

        elif choice == 7:
            helix_number = _ask_int("Enter the helix number: ")
            # Same as option 5: query the helices of the loaded chain.
            start_resname, start_resnum, end_resname, end_resnum = get_name_and_number_of_aa(
                pdb_filename, helix_number, chain_id=chain.id
            )
            print(f"At the beginning of helix {helix_number}: {start_resname} {start_resnum}")
            print(f"At the end of helix {helix_number}: {end_resname} {end_resnum}")

        elif choice == 8:
            aa1_sequence_number = _ask_int("Enter the first amino acid sequence number: ")
            atom1_name = input("Enter the name of the first atom: ")
            aa2_sequence_number = _ask_int("Enter the second amino acid sequence number: ")
            atom2_name = input("Enter the name of the second atom: ")
            distance = calculate_distance(chain, aa1_sequence_number, atom1_name, aa2_sequence_number, atom2_name)
            print(f"Distance between {atom1_name} in amino acid {aa1_sequence_number} and {atom2_name} in amino acid {aa2_sequence_number}: {distance}")

        elif choice == 9:
            # 9. Global alignment against all chains in a list.
            # NOTE(review): every listed chain is aligned against every other
            # listed chain (all-vs-all), as in the original code; the chain
            # passed to menu() is not used here. Confirm this is intended.
            sequences_list = []
            with open("sequences.txt", "r") as f:
                for entry in f:
                    fields = entry.split()
                    if not fields:
                        continue  # skip blank lines
                    if len(fields) != 2:
                        print(f"Skipping malformed line in sequences.txt: {entry.strip()}")
                        continue
                    protein_id, entry_chain_id = fields
                    filename, chain_unit = fetch_pdb(protein_id, entry_chain_id)
                    if chain_unit is None:
                        # fetch_pdb already reported the error; nothing to align.
                        print(f"Skipping {protein_id}: could not be downloaded.")
                        continue
                    chain_seq = get_amino_acids_sequence(chain_unit)
                    sequences_list.append((protein_id, chain_seq))

            for seqA in sequences_list:
                print(f"Target Protein ID: {seqA[0]}")
                for seqB in sequences_list:
                    print(f"\tQuery Protein ID: {seqB[0]}")
                    alignment_score, identity_percentage = global_align(seqA[1], seqB[1])
                    print(f"\t\tGlobal alignment score: {alignment_score}")
                    print(f"\t\tGlobal identity percentage: {identity_percentage}")

        elif choice == 0:
            print("Exiting the program.")
            break

        else:
            print("Invalid choice. Please choose a valid option from 0 to 9.")

if __name__ == "__main__":
    # Execute the program, but only when a chain was actually loaded:
    # a failed download leaves chain as None and every menu option would crash.
    if chain is None:
        print("No chain was loaded; cannot start the analysis menu.")
    else:
        menu(pdb_filename, chain)



Protein Analysis Menu:
1. Atom Count by Amino Acid Sequence
2. Amino Acid Sequence (with missing structural info)
3. Side Chain Atom Count by Amino Acid Sequence
4. Amino Acid Count in Chain
5. Helices and Sheets Count
6. Atom Coordinates by Amino Acid and Atom Name
7. Amino Acid at Helix Start/End by Helix Number
8. Distance Between Two Atoms by Amino Acid and Atom Names
9. Global Alignment against sequences.txt
0. Exit
Enter your choice: 1
Enter the amino acid sequence number: 10
Number of atoms in amino acid 10: 9

Protein Analysis Menu:
1. Atom Count by Amino Acid Sequence
2. Amino Acid Sequence (with missing structural info)
3. Side Chain Atom Count by Amino Acid Sequence
4. Amino Acid Count in Chain
5. Helices and Sheets Count
6. Atom Coordinates by Amino Acid and Atom Name
7. Amino Acid at Helix Start/End by Helix Number
8. Distance Between Two Atoms by Amino Acid and Atom Names
9. Global Alignment against sequences.txt
0. Exit
Enter your choice: 2
Amino acids sequence: QGEPQVQ

		Global alignment score: -46.0
		Global identity percentage: 41.15523465703971
	Query Protein ID: 5pso
		Global alignment score: 502.0
		Global identity percentage: 100.0
	Query Protein ID: 7osn
		Global alignment score: -32.0
		Global identity percentage: 18.272425249169437
Target Protein ID: 7osn
	Query Protein ID: 1mot
		Global alignment score: -419.0
		Global identity percentage: 89.28571428571429
	Query Protein ID: 1g8g
		Global alignment score: -709.0
		Global identity percentage: 13.72340425531915
	Query Protein ID: 4hhb
		Global alignment score: 10.0
		Global identity percentage: 36.318407960199
	Query Protein ID: 5o5z
		Global alignment score: 91.0
		Global identity percentage: 20.545073375262053
	Query Protein ID: 1crn
		Global alignment score: -338.0
		Global identity percentage: 65.21739130434783
	Query Protein ID: 1tsu
		Global alignment score: -57.0
		Global identity percentage: 34.61538461538461
	Query Protein ID: 3ran
		Global alignment score: 73.0
		Global identity pe