In [183]:
import multiprocessing

import pandas as pd
from Bio import SeqIO
from Bio import pairwise2
from Bio.PDB import *


In [185]:
from Bio import SeqIO

def get_gene_sequences(
    header_list,
    fasta_file='../genome_data_sets/query_proteomes/fasta_files/TriTrypDB-63_All_species_clean.fa',
):
    """Collect the FASTA records whose ID is one of the requested headers.

    Args:
        header_list: Record IDs to look up. Either an iterable of header
            strings or a single header string (a lone string is treated as
            one header and matched exactly, not as a substring pattern).
        fasta_file: Path of the FASTA file to scan. Defaults to the
            TriTrypDB proteome this notebook works with.

    Returns:
        A list of ``(record.id, record.seq)`` tuples, in the order the
        matching records appear in the FASTA file. Empty when no header
        is requested or none matches.
    """
    # A bare string would otherwise make `record.id in header_list` a
    # substring test, matching any record whose ID is contained in the header.
    if isinstance(header_list, str):
        wanted = {header_list}
    else:
        # Set membership keeps the scan linear in the number of records.
        wanted = set(header_list)

    # Nothing to look for: do not parse the (large) FASTA file at all.
    if not wanted:
        return []

    return [
        (record.id, record.seq)
        for record in SeqIO.parse(fasta_file, "fasta")
        if record.id in wanted
    ]

def get_gene_sequences_parallel(num_threads):
    """Fetch the sequence of every header in the header-to-UniProt table.

    The headers are read from the first whitespace-separated column of
    ``fasta_header_to_uniprot.tsv`` and split into one chunk per worker, so
    each worker scans the FASTA file once for its whole chunk instead of
    once per header.

    Args:
        num_threads: Number of worker processes in the pool.

    Returns:
        A list of ``(record_id, sequence)`` tuples in the order the headers
        appear in the table. Headers with no matching FASTA record are
        skipped.
    """
    headers = []
    with open('../../config/mandatory_files/fasta_header_to_uniprot.tsv') as file:
        for line in file:
            fields = line.split()
            # Blank lines carry no header; skip them instead of failing.
            if fields:
                headers.append(fields[0])

    if not headers:
        return []

    # Round-robin the headers into at most `num_threads` non-empty chunks.
    chunks = [headers[start::num_threads] for start in range(num_threads)]
    chunks = [chunk for chunk in chunks if chunk]

    # The context manager tears the pool down even if a worker raises.
    with multiprocessing.Pool(num_threads) as pool:
        per_chunk_hits = pool.map(get_gene_sequences, chunks)

    # Keep the first record seen for each ID, then restore table order.
    first_hit = {}
    for hits in per_chunk_hits:
        for record_id, sequence in hits:
            first_hit.setdefault(record_id, (record_id, sequence))

    return [first_hit[header] for header in headers if header in first_hit]


In [176]:

def align_fasta_sequence(parameters_list):
    """Score a global alignment between a FASTA sequence and a PDB model.

    Args:
        parameters_list: Two-item sequence ``[fasta_record, pdb_file]``.
            ``fasta_record[1]`` is the sequence to align (the shape of the
            ``(record.id, record.seq)`` tuples built by
            ``get_gene_sequences``); ``pdb_file`` is the path of a PDB file
            readable by ``PDBParser``.

    Returns:
        ``[score, len(fasta_sequence), len(PDBsequence)]`` where ``score``
        is the ``pairwise2.align.globalxx`` score of the two sequences.
    """
    fasta_sequence = parameters_list[0][1]
    pdb_file = parameters_list[1]
    print(pdb_file)

    structure = PDBParser().get_structure("xname", pdb_file)
    # NOTE(review): only the first polypeptide built from the structure is
    # used; a model with several chains or chain breaks would be compared on
    # its first fragment only -- confirm this is intended.
    PDBsequence = PPBuilder().build_peptides(structure)[0].get_sequence()

    # Only the score is needed, so skip recovering the alignments themselves.
    score = pairwise2.align.globalxx(fasta_sequence, PDBsequence, score_only=True)

    return [score, len(fasta_sequence), len(PDBsequence)]


def align_fasta_sequence_parallel(num_threads, parameters_lists=None):
    """Run ``align_fasta_sequence`` over many inputs in a process pool.

    Args:
        num_threads: Number of worker processes in the pool.
        parameters_lists: Iterable of ``parameters_list`` arguments, one per
            ``align_fasta_sequence`` call. When omitted, the notebook-level
            global ``file_names`` is used, which is the name this function
            originally read. NOTE(review): no visible cell defines that
            global, so prefer passing the inputs explicitly.

    Returns:
        A list with one ``align_fasta_sequence`` result per input, in input
        order.
    """
    if parameters_lists is None:
        parameters_lists = file_names

    # The context manager tears the pool down even if a worker raises.
    with multiprocessing.Pool(num_threads) as pool:
        return pool.map(align_fasta_sequence, parameters_lists)

In [186]:
# NOTE(review): get_gene_sequences() requires a header_list argument, so this
# call raises TypeError as written. The recorded output below shows pool
# workers, so this cell was presumably run as get_gene_sequences_parallel(...)
# -- TODO confirm and restore the intended call.
SEQUENCE = get_gene_sequences()

Process ForkPoolWorker-285:
Process ForkPoolWorker-287:
Process ForkPoolWorker-281:
Process ForkPoolWorker-282:
Process ForkPoolWorker-292:
Process ForkPoolWorker-284:


KeyboardInterrupt: 

Process ForkPoolWorker-283:
Process ForkPoolWorker-299:
Process ForkPoolWorker-286:
Process ForkPoolWorker-288:
Process ForkPoolWorker-305:
Process ForkPoolWorker-291:
Process ForkPoolWorker-308:
Process ForkPoolWorker-304:
Process ForkPoolWorker-301:
Process ForkPoolWorker-303:
Process ForkPoolWorker-309:
Process ForkPoolWorker-310:
Process ForkPoolWorker-300:
Process ForkPoolWorker-293:
Process ForkPoolWorker-296:
Process ForkPoolWorker-297:
Process ForkPoolWorker-306:
Process ForkPoolWorker-298:
Process ForkPoolWorker-295:
Process ForkPoolWorker-294:
Process ForkPoolWorker-307:
Process ForkPoolWorker-289:
Process ForkPoolWorker-290:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/jtrinidad/anaconda3/lib/pyt

In [177]:
# Spot-check a single protein against its AlphaFold model.
# `r` and `UNIPROTaccession` are not defined in this cell; they must already
# exist in the kernel.
align_fasta_sequence(
    [
        r[0],
        f'../genome_data_sets/query_proteomes/pdb_files/prot_structure_download_from_AlphaFoldDB/AF-{UNIPROTaccession}-F1-model_v4.pdb',
    ]
)

../genome_data_sets/query_proteomes/pdb_files/prot_structure_download_from_AlphaFoldDB/AF-A0A7G2BYP5-F1-model_v4.pdb


[621.0, 621, 621]

In [178]:
# Header -> UniProt accession table; the file has no header row, so the
# columns are numbered 0 and 1.
df = pd.read_csv(
    '../../config/mandatory_files/fasta_header_to_uniprot.tsv',
    sep='\t',
    header=None,
)

In [None]:
# Walk the header -> UniProt table and build the [header, AlphaFold PDB path]
# pair for each row. `pars` is overwritten on every iteration, so only the
# pair for the last row survives the loop.
with open('../../config/mandatory_files/fasta_header_to_uniprot.tsv', 'r') as file:
    for line in file:
        line = line.split()
        UNIPROTaccession = line[1]
        pars = [
            line[0],
            f'../genome_data_sets/query_proteomes/pdb_files/prot_structure_download_from_AlphaFoldDB/AF-{UNIPROTaccession}-F1-model_v4.pdb',
        ]

In [182]:
# Display the [header, PDB path] pair left in `pars` by the loop cell.
pars

['TvY486_1118050',
 '../genome_data_sets/query_proteomes/pdb_files/prot_structure_download_from_AlphaFoldDB/AF-G0U9N5-F1-model_v4.pdb']

In [104]:
# Look up the UniProt accession recorded for one FASTA header.
df.loc[df[0] == 'ADEAN_000005000']

Unnamed: 0,0,1
0,ADEAN_000005000,A0A7G2BYP5


In [107]:
# UniProt accession listed for header ADEAN_000005000 in the mapping table.
UNIPROTaccession = 'A0A7G2BYP5'

In [114]:
# Path of the AlphaFold model for the accession currently in UNIPROTaccession.
pdb_file = f'../genome_data_sets/query_proteomes/pdb_files/prot_structure_download_from_AlphaFoldDB/AF-{UNIPROTaccession}-F1-model_v4.pdb'

In [110]:
# `Structure.fr` was an unfinished tab-completion and is not a valid attribute;
# the recorded output is the repr of `Structure` itself.
Structure

<module 'Bio.PDB.Structure' from '/home/jtrinidad/anaconda3/lib/python3.8/site-packages/Bio/PDB/Structure.py'>

In [129]:
# Parse the structure at `pdb_file` (set in an earlier cell) and print the
# sequence of every polypeptide that PPBuilder builds from it.
p = PDBParser()
structure = p.get_structure("xname", pdb_file)
ppb=PPBuilder()
for pp in ppb.build_peptides(structure):
    print(pp.get_sequence() )


MIVLFEREGNFEEAYSWRTRLETMQRHPSPQWPLHLKCPSLLDIQSLLRLSVAHSKTMDMLIEVQKFLQTDYYRGMRTSRTIKTTSLSMEDVRLAVERGKFELCDELNRALDQYARSSDGGPRTRFGRLPSEYHSVNAFYVTELKGRRRLITEPILNSIVDVETIPSVESLGRLEKRQALKHCKWLLQIDFNSFYDAIPLHDVHVRNKFVFLGKDRQYYRLRTLPTGARWSVMVGQTITSTIVDIDTPLTILTMIDNIIIGGREGQEEVFVVTVRRILERIRCVNLETSPDRDVLLCTSSDDLLSMALANNTFLGEEYAWDTRLKERVVRNSTKTVAKLNLCMRKCPYYTCRSFVSMISLVLFAFHTTNKNPAGLFFLLKVYRAVYTEVSRTGGEWDVALNHVAPSVQEELRRVIDLLSRNEYARVSDKMQVTYDENAYDFIVYTDASDAGWGAIVHDTQTGETTGLQKEWVDELVVNRYYGPRGEERTTWFNRKHSAHAEPRCIVEVLQYLIETRVLTAGKRVAVVTDHEAIVAAQRKLNGFGGIGRGYTLNRLFELTYNMLYTEGILVAYFYIAGPQNPADTLSRVFHHHNSFGEIRTSDASGLRLPSLKETFCPLAED
