In [17]:
import random
from Bio.PDB import PDBParser, Polypeptide
from Bio.SeqUtils import seq1
import os, gzip
from molytica_m.data_tools import alpha_fold_tools
from tqdm import tqdm

parser = PDBParser()

def get_amino_acid_sequence(uniprot_id, parser, alphafold_folder_path="data/curated_chembl/alpha_fold_data", fixed_size = None):
    concat_character = " " # TODO: Find out what the correct character is to join the sequences

    combined_sequence = []

    pdb_file_name = os.path.join(alphafold_folder_path, f"AF-{uniprot_id}-F1-model_v4.pdb.gz")
    with gzip.open(pdb_file_name, 'rt') as f_in:
        structure = parser.get_structure("protein", f_in)
        for model in structure:
            for chain in model:
                sequence = ""
                for residue in chain:
                    if Polypeptide.is_aa(residue):
                        sequence += seq1(residue.get_resname()) + " "
                combined_sequence.append(sequence)

        combined_sequence = concat_character.join(combined_sequence)

    if fixed_size:
        if len(combined_sequence) < fixed_size:
            combined_sequence += " " * (fixed_size - len(combined_sequence))
        elif len(combined_sequence) > fixed_size:
            combined_sequence = combined_sequence[:fixed_size]

    return combined_sequence
                

# Replace 'your_file.pdb' with the path to your PDB file
af_uniprots = alpha_fold_tools.get_alphafold_uniprot_ids()
random.shuffle(af_uniprots)
count = 0
for af_uniprot in tqdm(af_uniprots, desc="Getting protein lengths"):
    protein_sequence = get_amino_acid_sequence(af_uniprot, parser, fixed_size=512)

    print(f"Protein {af_uniprot} has {protein_sequence} sub-proteins")

print(f"Total number of proteins with sub-proteins: {count}")

Getting protein lengths:   0%|          | 0/20504 [00:00<?, ?it/s]

Protein Q9BVA6 has M M L I P M A S V M A V T E P K W V S V W S R F L W V T L L S M V L G S L L A L L L P L G A V E E Q C L A V L K G L Y L L R S K P D R A Q H A A T K C T S P S T E L S I T S R G A T L L V A K T K A S P A G K L E A R A A L N Q A L E M K R Q G K R E K A Q K L F M H A L K M D P D F V D A L T E F G I F S E E D K D I I Q A D Y L Y T R A L T I S P Y H E K A L V N R D R T L P L V E E I D Q R Y F S I I D S K V K K V M S I P K G N S A L R R V M E E T Y Y H H I Y H T V A I E G N T L T L S E I R H I L E T R Y A V P G K  sub-proteins


Getting protein lengths:   0%|          | 2/20504 [00:00<46:09,  7.40it/s]

Protein P06213 has M A T G G R R G A A A A P L L V A V A A L L L G A A G H L Y P G E V C P G M D I R N N L T R L H E L E N C S V I E G H L Q I L L M F K T R P E D F R D L S F P K L I M I T D Y L L L F R V Y G L E S L K D L F P N L T V I R G S R L F F N Y A L V I F E M V H L K E L G L Y N L M N I T R G S V R I E K N N E L C Y L A T I D W S R I L D S V E D N Y I V L N K D D N E E C G D I C P G T A K G K T N C P A T V I N G Q F V E R C W T H S H C Q K V C P T I C K S H G C T A E G L C C H S E C L G N C S Q P D D P T K C V A C R  sub-proteins


Getting protein lengths:   0%|          | 6/20504 [00:00<38:14,  8.93it/s]  

Protein Q5VTE6 has M E A W R C V R K G Y G H C V V G R G R Y P M F P H H S R S L G R D W T T P W E N L Q R C C W N R H I S S C M R W P G H Y S R A P Y P Y F S S R H F S L N W R P P C L F E S R T Q F Q Y C N W R P D N L S Q T S L I H L S S Y V M N A E G D E P S S K R R K H Q G V I K R N W E Y I C S H D K E K T K I L G D K N V D P K C E D S E N K F D F S V M S Y N I L S Q D L L E D N S H L Y R H C R R P V L H W S F R F P N I L K E I K H F D A D V L C L Q E V Q E D H Y G A E I R P S L E S L G Y H C E Y K M R T G R K P D G C A I  sub-proteins
Protein A0A0B4J2D5 has M A A V R A L V A S R L A A A S A F T S L S P G G R T P S Q R A A L H L S V P R P A A R V A L V L S G C G V Y D G T E I H E A S A I L V H L S R G G A E V Q I F A P D V P Q M H V I D H T K G Q P S E G E S R N V L T E S A R I A R G K I T D L A N L S A A N H D A A I F P G G F G A A K N L S T F A V D G K D C K V N K E V E R V L K E F H Q A G K P I G L C C I A P V L A A K V L R G V E V T V G H E Q E E G G K W P Y A G T A E A I K A L 

Getting protein lengths:   0%|          | 11/20504 [00:01<33:15, 10.27it/s]

Protein Q99952 has M S R S L D S A R S F L E R L E A R G G R E G A V L A G E F S D I Q A C S A A W K A D G V C S T V A G S R P E N V R K N R Y K D V L P Y D Q T R V I L S L L Q E E G H S D Y I N G N F I R G V D G S L A Y I A T Q G P L P H T L L D F W R L V W E F G V K V I L M A C R E I E N G R K R C E R Y W A Q E Q E P L Q T G L F C I T L I K E K W L N E D I M L R T L K V T F Q K E S R S V Y Q L Q Y M S W P D R G V P S S P D H M L A M V E E A R R L Q G S G P E P L C V H C S A G C G R T G V L C T V D Y V R Q L L L T Q M I P P  sub-proteins
Protein Q13451 has M T T D E G A K N N E E S P T A T V A E Q G E D I T S K K D R G V L K I V K R V G N G E E T P M I G D K V Y V H Y K G K L S N G K K F D S S H D R N E P F V F S L G K G Q V I K A W D I G V A T M K K G E I C H L L C K P E Y A Y G S A G S L P K I P S N A T L F F E I E L L D F K G E D L F E D G G I I R R T K R K G E G Y S N P N E G A T V E I H L E G R C G G R M F D C R D V A F T V G E G E D H D I P I G I D K A L E K M Q R E E Q C I L Y 

Getting protein lengths:   0%|          | 14/20504 [00:01<27:02, 12.63it/s]

Protein P0CG31 has M E T D L A E M P E K G V L S S Q D S P H F Q E K S T E E G E V A A L R L T A R S Q A A A A A A A P G S R S L R G V H V P P P L H P A P A R E E I K S T C S L K A C F S L S L T L T Y Y R T A F L L S T E N E G N L H F Q C P S D V E T R P Q S K D S T S V Q D F S K A E S C K V A I I D R L T R N S V Y D S N L E A A L E C E N W L E K Q Q G N Q E R H L R E M F T H M N S L S E E T D H E H D V Y W K S F N Q K S V L I T E D R V P K G S Y A F H T L E K S L K Q K S N L M K K Q R T Y K E K K P H K C N D C G E L F T C H  sub-proteins


Getting protein lengths:   0%|          | 16/20504 [00:01<41:21,  8.26it/s]

Protein Q9H0G5 has M A I P G R Q Y G L I L P K K T Q Q L H P V L Q K P S V F G N D S D D D D E T S V S E S L Q R E A A K K Q A M K Q T K L E I Q K A L A E D A T V Y E Y D S I Y D E M Q K K K E E N N P K L L L G K D R K P K Y I H N L L K A V E I R K K E Q E K R M E K K I Q R E R E M E K G E F D D K E A F V T S A Y K K K L Q E R A E E E E R E K R A A A L E A C L D V T K Q K D L S G F Y R H L L N Q A V G E E E V P K C S F R E A R S G I K E E K S R G F S N E V S S K N R I P Q E K C I L Q T D V K V E E N P D A D S D F D A K S S A  sub-proteins
Protein Q9UHC9 has M A E A G L R G W L L W A L L L R L A Q S E P Y T T I H Q P G Y C A F Y D E C G K N P E L S G S L M T L S N V S C L S N T P A R K I T G D H L I L L Q K I C P R L Y T G P N T Q A C C S A K Q L V S L E A S L S I T K A L L T R C P A C S D N F V N L H C H N T C S P N Q S L F I N V T R V A Q L G A G Q L P A V V A Y E A F Y Q H S F A E Q S Y D S C S R V R V P A A A T L A V G T M C G V Y G S A L C N A Q R W L N F Q G D T G N G L A P L D I 

Getting protein lengths:   0%|          | 20/20504 [00:02<36:38,  9.32it/s]

Protein O15533 has M K S L S L L L A V A L G L A T A V S A G P A V I E C W F V E D A S G K G L A K R P G A L L L R Q G P G E P P P R P D L D P E L Y L S V H D P A G A L Q A A F R R Y P R G A P A P H C E M S R F V P L P A S A K W A S G L T P A Q N C P R A L D G A W L M V S I S S P V L S L S S L L R P Q P E P Q Q E P V L I T M A T V V L T V L T H T P A P R V R L G Q D A L L D L S F A Y M P P T S E A A S S L A P G P P P F G L E W R R Q H L G K G H L L L A A T P G L N G Q M P A A Q E G A V A F A A W D D D E P W G P W T G N G T F  sub-proteins
Protein Q6UWW8 has M E R A V R V E S G V L V G V V C L L L A C P A T A T G P E V A Q P E V D T T L G R V R G R Q V G V K G T D R L V N V F L G I P F A Q P P L G P D R F S A P H P A Q P W E G V R D A S T A P P M C L Q D V E S M N S S R F V L N G K Q Q I F S V S E D C L V L N V Y S P A E V P A G S G R P V M V W V H G G A L I T G A A T S Y D G S A L A A Y G D V V V V T V Q Y R L G V L G F F S T G D E H A P G N Q G F L D V V A A L R W V Q E N I A P F G G 

Getting protein lengths:   0%|          | 25/20504 [00:02<35:07,  9.72it/s]

Protein Q86UY5 has M S R S R H L G K I R K R L E D V K S Q W V R P A R A D F S D N E S A R L A T D A L L D G G S E A Y W R V L S Q E G E V D F L S S V E A Q Y I Q A Q A R E P P C P P D T L G G A E A G P K G L D S S S L Q S G T Y F P V A S E G S E P A L L H S W A S A E K P Y L K E K S S A T V Y F Q T V K H N N I R D L V R R C I T R T S Q V L V I L M D V F T D V E I F C D I L E A A N K R G V F V C V L L D Q G G V K L F Q E M C D K V Q I S D S H L K N I S I R S V E G E I Y C A K S G R K F A G Q I R E K F I I S D W R F V L S G S  sub-proteins
Protein Q2I0M5 has M R A P L C L L L L V A H A V D M L A L N R R K K Q V G T G L G G N C T G C I I C S E E N G C S T C Q Q R L F L F I R R E G I R Q Y G K C L H D C P P G Y F G I R G Q E V N R C K K C G A T C E S C F S Q D F C I R C K R Q F Y L Y K G K C L P T C P P G T L A H Q N T R E C Q G E C E L G P W G G W S P C T H N G K T C G S A W G L E S R V R E A G R A G H E E A A T C Q V L S E S R K C P I Q R P C P G E R S P G Q K K G R K D R R P R K D R K 

Getting protein lengths:   0%|          | 31/20504 [00:03<26:17, 12.98it/s]

Protein Q9H1E5 has M A G G R C G P Q L T A L L A A W I A A V A A T A G P E E A A L P P E Q S R V Q P M T A S N W T L V M E G E W M L K F Y A P W C P S C Q Q T D S E W E A F A K N G E I L Q I S V G K V D V I Q E P G L S G R F F V T T L P A F F H A K D G I F R R Y R G P G I F E D L Q N Y I L E K K W Q S V E P L T G W K S P A S L T M S G M A G L F S I S G K I W H L H N Y F T V T L G I P A W C S Y V F F V I A T L V F G L F M G L V L V V I S E C F Y V P L P R H L S E R S E Q N R R S E E A H R A E Q L Q D A E E E K D D S N E E E N  sub-proteins
Protein Q9H6Z9 has M P L G H I M R L D L E K I A L E Y I V P C L H E V G F C Y L D N F L G E V V G D C V L E R V K Q L H C T G A L R D G Q L A G P R A G V S K R H L R G D Q I T W I G G N E E G C E A I S F L L S L I D R L V L Y C G S R L G K Y Y V K E R S K A M V A C Y P G N G T G Y V R H V D N P N G D G R C I T C I Y Y L N K N W D A K L H G G I L R I F P E G K S F I A D V E P I F D R L L F F W S D R R N P H E V Q P S Y A T R Y A M T V W Y F D A E E R 

Getting protein lengths:   0%|          | 33/20504 [00:03<41:43,  8.18it/s]

Protein Q8IZH2 has M G V P K F Y R W I S E R Y P C L S E V V K E H Q I P E F D N L Y L D M N G I I H Q C S H P N D D D V H F R I S D D K I F T D I F H Y L E V L F R I I K P R K V F F M A V D G V A P R A K M N Q Q R G R R F R S A K E A E D K I K K A I E K G E T L P T E A R F D S N C I T P G T E F M A R L H E H L K Y F V N M K I S T D K S W Q G V T I Y F S G H E T P G E G E H K I M E F I R S E K A K P D H D P N T R H C L Y G L D A D L I M L G L T S H E A H F S L L R E E V R F G G K K T Q R V C A P E E T T F H L L H L S L M R E  sub-proteins
Protein Q96LU7 has M D V V G E N E A L Q Q F F E A Q G A N G T L E N P A L D T S L L E E F L G N D F D L G A L Q R Q L P D T P P Y S A S D S C S P P Q V K G A C Y P T L R P T A G R T P A P F L H P T A A P A M P P M H P L Q S T S G M G D S C Q I H G G F H S C H S N A S H L A T P L D Q S V S S H L G I G C S Y P Q Q P L C H S P G A S L P P T K K R K C T Q A L E D S G E C R V W A C H C R P M T S R S R S S E V Q D P D S E G Q N R M P T D Q C S P A L K W Q 

Getting protein lengths:   0%|          | 35/20504 [00:04<47:17,  7.21it/s]

Protein Q8N2M8 has M W H E A R K H E R K L R G M M V D Y K K R A E R R R E Y Y E K I K K D P A Q F L Q V H G R A C K V H L D S A V A L A A E S P V N M M P W Q G D T N N M I D R F D V R A H L D H I P D Y T P P L L T T I S P E Q E S D E R K C N Y E R Y R G L V Q N D F A G I S E E Q C L Y Q I Y I D E L Y G G L Q R P S E D E K K K L A E K K A S I G Y T Y E D S T V A K V E K A A E K P E E E E S A A E E E S N S D E D E V I P D I D V E V D V D E L N Q E Q V A D L N K Q A T T Y G M A D G D F V R M L R K D K E E A E A I K H A K A L E  sub-proteins


Getting protein lengths:   0%|          | 39/20504 [00:04<42:35,  8.01it/s]

Protein O75122 has M A M G D D K S F D D E E S V D G N R P S S A A S A F K V P A P K T S G N P A N S A R K P G S A G G P K V G G A S K E G G A G A V D E D D F I K A F T D V P S I Q I Y S S R E L E E T L N K I R E I L S D D K H D W D Q R A N A L K K I R S L L V A G A A Q Y D C F F Q H L R L L D G A L K L S A K D L R S Q V V R E A C I T V A H L S T V L G N K F D H G A E A I V P T L F N L V P N S A K V M A T S G C A A I R F I I R H T H V P R L I P L I T S N C T S K S V P V R R R S F E F L D L L L Q E W Q T H S L E R H A A V L V  sub-proteins
Protein P35227 has M H R T T R I K I T E L N P H L M C A L C G G Y F I D A T T I V E C L H S F C K T C I V R Y L E T N K Y C P M C D V Q V H K T R P L L S I R S D K T L Q D I V Y K L V P G L F K D E M K R R R D F Y A A Y P L T E V P N G S N E D R G E V L E Q E K G A L S D D E I V S L S I E F Y E G A R D R D E K K G P L E N G D G D K E K T G V R F L R C P A A M T V M H L A K F L R N K M D V P S K Y K V E V L Y E D E P L K E Y Y T L M D I A Y I Y P W R 

Getting protein lengths:   0%|          | 45/20504 [00:05<32:19, 10.55it/s]

Protein Q6NY19 has M A K F A L N Q N L P D L G G P R L C P V P A A G G A R S P S S P Y S V E T P Y G F H L D L D F L K Y I E E L E R G P A A R R A P G P P T S R R P R A P R P G L A G A R S P G A W T S S E S L A S D D G G A P G I L S Q G A P S G L L M Q P L S P R A P V R N P R V E H T L R E T S R R L E L A Q T H E R A P S P G R G V P R S P R G S G R S S P A P N L A P A S P G P A Q L Q L V R E Q M A A A L R R L R E L E D Q A R T L P E L Q E Q V R A L R A E K A R L L A G R A Q P E P D G E A E T R P D K L A Q L R R L T E R L A T  sub-proteins
Protein Q9BUK0 has M P S V T Q R L R D P D I N P C L S E S D A S T R C L D E N N Y D R E R C S T Y F L R Y K N C R R F W N S I V M Q R R K N G V K P F M P T A A E R D E I L R A V G N M P Y                                                                                                                                                                                                                                                                           

Getting protein lengths:   0%|          | 49/20504 [00:05<32:45, 10.41it/s]

Protein Q9BV36 has M G K K L D L S K L T D E E A Q H V L E V V Q R D F D L R R K E E E R L E A L K G K I K K E S S K R E L L S D T A H L N E T H C A R C L Q P Y Q L L V N S K R Q C L E C G L F T C K S C G R V H P E E Q G W I C D P C H L A R V V K I G S L E W Y Y E H V K A R F K R F G S A K V I R S L H G R L Q G G A G P E L I S E E R S G D S D Q T D E D G E P G S E A Q A Q A Q P F G S K K K R L L S V H D F D F E G D S D D S T Q P Q G H S L H L S S V P E A R D S P Q S L T D E S C S E K A A P H K A E G L E E A D T G A S G C H S  sub-proteins
Protein P27701 has M G S A C I K V T K Y F L F L F N L I F F I L G A V I L G F G V W I L A D K S S F I S V L Q T S S S S L R M G A Y V F I G V G A V T M L M G F L G C I G A V N E V R C L L G L Y F A F L L L I L I A Q V T A G A L F Y F N M G K L K Q E M G G I V T E L I R D Y N S S R E D S L Q D A W D Y V Q A Q V K C C G W V S F Y N W T D N A E L M N R P E V T Y P C S C E V K G E E D N S L S V R K G F C E A P G N R T Q S G N H P E D W P V Y Q E G C M E 

Getting protein lengths:   0%|          | 52/20504 [00:05<26:40, 12.78it/s]

Protein P0CB38 has M N V A A K Y R M A S L Y V G D L H A D V T E D L L F R K F S T V G P V L S I R I C R D Q V T R R S L G Y A Y V N F L Q L A D A Q K A L D T M N F D I I K G K S I R L M W S Q R D A Y L R R S G I G N V F I K N L D K S I D N K T L Y E H F S A F G K I L S S K V M S D D Q G S K G Y A F V H F Q N Q S A A D R A I E E M N G K L L K G C K V F V G R F K N R K D R E A E L R S K A S E F T N V Y I K N F G G D M D D E R L K D V F S K Y G K T L S V K V M T D S S G K S K G F G F V S F D S H E A A K K A V E E M N G R D I N  sub-proteins


Getting protein lengths:   0%|          | 57/20504 [00:06<25:27, 13.39it/s]

Protein Q8WW52 has M V C R E Q L S K N Q V K W V F A G I T C V S V V V I A A I V L A I T L R R P G C E L E A C S P D A D M L D Y L L S L G Q I S R R D A L E V T W Y H A A N S K K A M T A A L N S N I T V L E A D V N V E G L G T A N E T G V P I M A H P P T I Y S D N T L E Q W L D A V L G S S Q K G I K L D F K N I K A V G P S L D L L R Q L T E E G K V R R P I W I N A D I L K G P N M L I S T E V N A T Q F L A L V Q E K Y P K A T L S P G W T T F Y M S T S P N R T Y T Q A M V E K M H E L V G G V P Q R V T F P V R S S M V R A A W P  sub-proteins
Protein Q9UPG8 has M T T F F T S V P P W I Q D A K Q E E E V G W K L V P R P R G R E A E S Q V K C Q C E I S G T P F S N G E K L R P H S L P Q P E Q R P Y S C P Q L H C G K A F A S K Y K L Y R H M A T H S A Q K P H Q C M Y C D K M F H R K D H L R N H L Q T H D P N K E A L H C S E C G K N Y N T K L G Y R R H L A M H A A S S G D L S C K V C L Q T F E S T Q A L L E H L K A H S R R V A G G A K E K K H P C D H C D R R F Y T R K D V R R H L V V H T G R K D 

Getting protein lengths:   0%|          | 62/20504 [00:06<24:18, 14.01it/s]

Protein Q6ZRP5 has M R I F R G C T Q P S T L G Q G V H S P L M K A Q F I T H H S R K Q V K P G E G W G R S S F T R A C R D H T T I L S G N R S F S A V A A T P A K H K H M H T R T H T H M H T H T G M H T L T G T H V H T P H T Q M H T R I L T L S H M H T H A H T H A H T H G H T H T R A H S T H A H T H A H S H Y H T R T L T L T H S H A H S C T L T S T I T H M H T H T H M H T H T S T L T R T L T L T H T H M H T F L S L V S H L A G Y I S C Q F I F S S E N P R L C H                                                                    sub-proteins
Protein Q4KMZ1 has M E P E L L V R K V S A L Q A C V R G F L V R R Q F Q S L R A E Y E A I V R E V E G D L G T L Q W T E G R I P R P R F L P E K A K S H Q T W K A G D R V A N P E Q G L W N H F P C E E S E G E A T W E E M V L K K S G E S S A N Q G S L C R D H S S W L Q M K Q N R K P S Q E K T R D T T R M E N P E A T D Q R L P H S Q P Q L Q E L Q Y H R S H L A M E L L W L Q Q A I N S R K E Y L L L K Q T L R S P E A G P I R E E P R V F L E H G E Q A C E 

Getting protein lengths:   0%|          | 66/20504 [00:06<28:19, 12.03it/s]

Protein Q9UKC9 has M V F S N N D E G L I N K K L P K E L L L R I F S F L D I V T L C R C A Q I S K A W N I L A L D G S N W Q R I D L F N F Q T D V E G R V V E N I S K R C G G F L R K L S L R G C I G V G D S S L K T F A Q N C R N I E H L N L N G C T K I T D S T C Y S L S R F C S K L K H L D L T S C V S I T N S S L K G I S E G C R N L E Y L N L S W C D Q I T K D G I E A L V R G C R G L K A L L L R G C T Q L E D E A L K H I Q N Y C H E L V S L N L Q S C S R I T D E G V V Q I C R G C H R L Q A L C L S G C S N L T D A S L T A L G  sub-proteins
Protein Q9P1C3 has M V R P H L L K K K I L G R V W W L M P V V L A L W E A E V G G S L E V R S L R P A W P T W                                                                                                                                                                                                                                                                                                                                                         

Getting protein lengths:   0%|          | 68/20504 [00:07<38:15,  8.90it/s]

Protein P0DSO1 has M A E P G R P W A Q A R S A Y R A S E V L R R G T G R R R D P G P Q S N G P G Q E D A R A P G R M A R L R G Q L R A E A A S R S E V P R L L K L V E R A G A G A A G R G R E D R R A Q P R A P C A R Y A G S P A A G P P T R R G S W R S A S G G C R R A W R Q C A R S W A P G L R R C A R S F E R S W M P C A R C C R R R R P R L P A A S P A P S P A P R P A A R P C R G R S A P L A P W S P P P G P Q T T P R T A Q Q N A E R T E P R P G R T T R R C Q C R L G P R K V A G T E G G R T R A A                                  sub-proteins
Protein Q6RW13 has M E L P A V N L K V I L L G H W L L T T W G C I V F S G S Y A W A N F T I L A L G V W A V A Q R D S I D A I S M F L G G L L A T I F L D I V H I S I F Y P R V S L T D T G R F G V G M A I L S L L L K P L S C C F V Y H M Y R E R G G E L L V H T G F L G S S Q D R S A Y Q T I D S A E A P A D P F A V P E G R S Q D A R G Y                                                                                                                       

Getting protein lengths:   0%|          | 71/20504 [00:07<33:28, 10.17it/s]

Protein A0FGR9 has M R A E E P C A P G A P S A L G A Q R T P G P E L R L S S Q L L P E L C T F V V R V L F Y L G P V Y L A G Y L G L S I T W L L L G A L L W M W W R R N R R G K L G R L A A A F E F L D N E R E F I S R E L R G Q H L P A W I H F P D V E R V E W A N K I I S Q T W P Y L S M I M E S K F R E K L E P K I R E K S I H L R T F T F T K L Y F G Q K C P R V N G V K A H T N T C N R R R V T V D L Q I C Y I G D C E I S V E L Q K I Q A G V N G I Q L Q G T L R V I L E P L L V D K P F V G A V T V F F L Q K P H L Q I N W T G L T  sub-proteins


Getting protein lengths:   0%|          | 73/20504 [00:07<42:34,  8.00it/s]

Protein Q3V5L5 has M I T V N P D G K I M V R R C L V T L R P F R L F V L G I G F F T L C F L M T S L G G Q F S A R R L G D S P F T I R T E V M G G P E S R G V L R K M S D L L E L M V K R M D A L A R L E N S S E L H R A G G D L H F P A D R M P P G A G L M E R I Q A I A Q N V S D I A V K V D Q I L R H S L L L H S K V S E G R R D Q C E A P S D P K F P D C S G K V E W M R A R W T S D P C Y A F F G V D G T E C S F L I Y L S E V E W F C P P L P W R N Q T A A Q R A P K P L P K V Q A V F R S N L S H L L D L M G S G K E S L I F M K K  sub-proteins
Protein Q14739 has M P S R K F A D G E V V R G R W P G S S L Y Y E V E I L S H D S T S Q L Y T V K Y K D G T E L E L K E N D I K P L T S F R Q R K G G S T S S S P S R R R G S R S R S R S R S P G R P P K S A R R S A S A S H Q A D I K E A R R E V E V K L T P L I L K P F G N S I S R Y N G E P E H I E R N D A P H K N T Q E K F S L S Q E S S Y I A T Q Y S L R P R R E E V K L K E I D S K E E K Y V A K E L A V R T F E V T P I R A K D L E F G G V P G V F L I 

Getting protein lengths:   0%|          | 76/20504 [00:08<43:33,  7.82it/s]

Protein Q96KP1 has M S R S R Q P P L V T G I S P N E G I P W T K V T I R G E N L G T G P T D L I G L T I C G H N C L L T A E W M S A S K I V C R V G Q A K N D K G D I I V T T K S G G R G T S T V S F K L L K P E K I G I L D Q S A V W V D E M N Y Y D M R T D R N K G I P P L S L R P A N P L G I E I E K S K F S Q K D L E M L F H G M S A D F T S E N F S A A W Y L I E N H S N T S F E Q L K M A V T N L K R Q A N K K S E G S L A Y V K G G L S T F F E A Q D A L S A I H Q K L E A D G T E K V E G S M T Q K L E N V L N R A S N T A D T L  sub-proteins
Protein Q9BWT6 has M S K K K G L S A E E K R T R M M E I F S E T K D V F Q L K D L E K I A P K E K G I T A M S V K E V L Q S L V D D G M V D C E R I G T S N Y Y W A F P S K A L H A R K H K L E V L E S Q L S E G S Q K H A S L Q K S I E K A K I G R C E T E E R T R L A K E L S S L R D Q R E Q L K A E V E K Y K D C D P Q V V E E I R Q A N K V A K E A A N R W T D N I F A I K S W A K R K F G F E E N K I D R T F G I P E D F D Y I D                           

Getting protein lengths:   0%|          | 79/20504 [00:08<53:21,  6.38it/s]

Protein Q6AWC2 has M P R R A G S G Q L P L P R G W E E A R D Y D G K V F Y I D H N T R R T S W I D P R D R L T K P L S F A D C V G D E L P W G W E A G F D P Q I G V Y Y I D H I N K T T Q I E D P R K Q W R G E Q E K M L K D Y L S V A Q D A L R T Q K E L Y H V K E Q R L A L A L D E Y V R L N D A Y K E K S S S H T S L F S G S S S S T K Y D P D I L K A E I S T T R L R V K K L K R E L S Q M K Q E L L Y K E Q G F E T L Q Q I D K K M S G G Q S G Y E L S E A K A I L T E L K S I R K A I S S G E K E K Q D L M Q S L A K L Q E R F H L D  sub-proteins
Protein O75129 has M A A A G A R L S P G P G S G L R G R P R L C F H P G P P P L L P L L L L F L L L L P P P P L L A G A T A A A S R E P D S P C R L K T V T V S T L P A L R E S D I G W S G A R A G A G A G T G A G A A A A A A S P G S P G S A G T A A E S R L L L F V R N E L P G R I A V Q D D L D N T E L P F F T L E M S G T A A D I S L V H W R Q Q W L E N G T L Y F H V S M S S S G Q L A Q A T A P T L Q E P S E I V E E Q M H I L H I S V M G G L I A L L L 

Getting protein lengths:   0%|          | 83/20504 [00:09<40:38,  8.37it/s]  

Protein P0DPD8 has M A S P G A G R A P P E L P E R N C G Y R E V E Y W D Q R Y Q G A A D S A P Y D W F G D F S S F R A L L E P E L R P E D R I L V L G C G N S A L S Y E L F L G G F P N V T S V D Y S S V V V A A M Q A R H A H V P Q L R W E T M D V R K L D F P S A S F D V V L E K G T L D A L L A G E R D P W T V S S E G V H T V D Q V L S E V G F Q K G T R Q L L G S R T Q L E L V L A G A S L L L A A L L L G C L V A L G V Q Y H R D P S H S T C L T E A C I R V A G K I L E S L D R G V S P C E D F Y Q F S C G G W I R R N P L P D G R  sub-proteins
Protein P0C7X0 has M M A R R D P T S W A K R L V R A Q T L Q K Q R R A P V G P R A P P P D E E D P R L K C K N C G A F G H T A R S T R C P M K C W K A A L V P A T L G K K E G K E N L K P W K P R G E A N P G P L N K D K G E K E E R P R Q Q D P Q R K A L L H M F S G K P P E K P L P N G K G S T E S S D Y L R V A S G P M P V H T T S K R P R L D P V L A D R S A T E M S G R G S V L A S L S P L R K A S L S S S S S L G P K E R Q T G A A A D M P Q P A V R H Q 

Getting protein lengths:   0%|          | 88/20504 [00:09<36:08,  9.41it/s]

Protein P78371 has M A S L S L A P V N I F K A G A D E E R A E T A R L T S F I G A I A I G D L V K S T L G P K G M D K I L L S S G R D A S L M V T N D G A T I L K N I G V D N P A A K V L V D M S R V Q D D E V G D G T T S V T V L A A E L L R E A E S L I A K K I H P Q T I I A G W R E A T K A A R E A L L S S A V D H G S D E V K F R Q D L M N I A G T T L S S K L L T H H K D H F T K L A V E A V L R L K G S G N L E A I H I I K K L G G S L A D S Y L D E G F L L D K K I G V N Q P K R I E N A K I L I A N T G M D T D K I K I F G S R V  sub-proteins
Protein P48764 has M W G L G A R G P D R G L L L A L A L G G L A R A G G V E V E P G G A H G E S G G F Q V V T F E W A H V Q D P Y V I A L W I L V A S L A K I G F H L S H K V T S V V P E S A L L I V L G L V L G G I V W A A D H I A S F T L T P T V F F F Y L L P P I V L D A G Y F M P N R L F F G N L G T I L L Y A V V G T V W N A A T T G L S L Y G V F L S G L M G D L Q I G L L D F L L F G S L M A A V D P V A V L A V F E E V H V N E V L F I I V F G E S L 

Getting protein lengths:   0%|          | 90/20504 [00:09<30:46, 11.05it/s]

Protein Q6ZSC3 has M A S V L N V K E S K A P E R T V V V A G L P V D L F S D Q L L A V L V K S H F Q D I K N E G G D V E D V I Y P T R T K G V A Y V I F K E K K V A E N V I R Q K K H W L A R K T R H A E L T V S L R V S H F G D K I F S S V N A I L D L S V F G K E V T L E T L V K D L K K K I P S L S F S P L K P N G R I S V E G S F L A V K R L R E S L L A R A C S L L E K D R N F T S E E R K W N R Q N P Q R N L Q R S N N S L A S V R T L V P E T A R S G E M L V L D T D V F L Y L K H K C G S Y E S T L K K F H I L S Q E K V D G E I  sub-proteins
Protein Q9NP97 has M A E V E E T L K R L Q S Q K G V Q G I I V V N T E G I P I K S T M D N P T T T Q Y A S L M H S F I L K A R S T V R D I D P Q N D L T F L R I R S K K N E I M V A P D K D Y F L I V I Q N P T E                                                                                                                                                                                                                                                     

Getting protein lengths:   0%|          | 92/20504 [00:10<35:23,  9.61it/s]

Protein P09104 has M S I E K I W A R E I L D S R G N P T V E V D L Y T A K G L F R A A V P S G A S T G I Y E A L E L R D G D K Q R Y L G K G V L K A V D H I N S T I A P A L I S S G L S V V E Q E K L D N L M L E L D G T E N K S K F G A N A I L G V S L A V C K A G A A E R E L P L Y R H I A Q L A G N S D L I L P V P A F N V I N G G S H A G N K L A M Q E F M I L P V G A E S F R D A M R L G A E V Y H T L K G V I K D K Y G K D A T N V G D E G G F A P N I L E N S E A L E L V K E A I D K A G Y T E K I V I G M D V A A S E F Y R D G K  sub-proteins
Protein Q9ULE3 has M D M F S L D M I I S D P A A E A S R A G K K Q L R G V Q N P C P S A R A R P R H K S L N I K D K I S E W E G K K E V P T P A P S R R A D G Q E D Y L P S S T V E R R S S D G V R T Q V T E A K N G M R P G T E S T E K E R N K G A V N V G G Q D P E P G Q D L S Q P E R E V D P S W G R G R E P R L G K L R F Q N D P L S V L K Q V K K L E Q A L K D G S A G L D P Q L P G T C Y S P H C P P D K A E A G S T L P E N L G G G S G S E V S Q R V H 

Getting protein lengths:   0%|          | 94/20504 [00:10<36:12,  9.39it/s]

Protein Q8TF42 has M A Q Y G H P S P L G M A A R E E L Y S K V T P R R N R Q Q R P G T I K H G S A L D V L L S M G F P R A R A Q K A L A S T G G R S V Q A A C D W L F S H V G D P F L D D P L P R E Y V L Y L R P T G P L A Q K L S D F W Q Q S K Q I C G K N K A H N I F P H I T L C Q F F M C E D S K V D A L G E A L Q T T V S R W K C K F S A P L P L E L Y T S S N F I G L F V K E D S A E V L K K F A A D F A A E A A S K T E V H V E P H K K Q L H V T L A Y H F Q A S H L P T L E K L A Q N I D V K L G C D W V A T I F S R D I R F A N H  sub-proteins


Getting protein lengths:   0%|          | 96/20504 [00:10<42:32,  8.00it/s]

Protein Q9H903 has M T V P V R G F S L L R G R L G R A P A L G R S T A P S V R A P G E P G S A F R G F R S S G V R H E A I I I S G T E M A K H I Q K E I Q R G V E S W V S L G N R R P H L S I I L V G D N P A S H T Y V R N K I R A A S A V G I C S E L I L K P K D V S Q E E L L D V T D Q L N M D P R V S G I L V Q L P L P D H V D E R T I C N G I A P E K D V D G F H I I N I G R L C L D Q H S L I P A T A S A V W E I I K R T G I Q T F G K N V V V A G R S K N V G M P I A M L L H T D G E H E R P G G D A T V T I A H R Y T P K E Q L K I  sub-proteins
Protein Q8N8A6 has M A L F Y V A R Y P G P D A A A A A G P E G A E A G A H G R A R A L L E R L Q S R A R E R Q Q Q R E P A Q T E A A A S T E P A T R R R R R P R R R R R V N D A E P G S P E A P Q G K R R K A D G E D A G A E S N E E A P G E P S A G S S E E A P G E P S A G S S E E A P G E R S T S A S A E A A P D G P A L E E A A G P L V P G L V L G G F G K R K A P K V Q P F L P R W L A E P N C V R R N V T E D L V P I E D I P D V H P D L Q K Q L R A H G I 

Getting protein lengths:   0%|          | 98/20504 [00:10<39:37,  8.58it/s]

Protein Q9UPW0 has M G L Y G Q A C P S V T S L R M T S E L E S S L T S M D W L P Q L T M R A A I Q K S D A T Q N A H G T G I S K K N A L L D P N T T L D Q E E V Q Q H K D G K P P Y S Y A S L I T F A I N S S P K K K M T L S E I Y Q W I C D N F P Y Y R E A G S G W K N S I R H N L S L N K C F L K V P R S K D D P G K G S Y W A I D T N P K E D V L P T R P K K R A R S V E R A S T P Y S I D S D S L G M E C I I S G S A S P T L A I N T V T N K V T L Y N T D Q D G S D S P R S S L N N S L S D Q S L A S V N L N S V G S V H S Y T P V T S  sub-proteins


Getting protein lengths:   0%|          | 101/20504 [00:11<40:47,  8.34it/s]

Protein A6NI47 has M V A E A G S M P A A S S V K K P F G L R S K M G K W C R H C F P W C R G S G K S N V G T S G D H D D S A M K T L R S K M G K W C R H C F P W C R G S G K S N V G T S G D H D D S A M K T L R S K M G K W C C H C F P C C R G S G K S K V G P W G D Y D D S A F M E P R Y H V R R E D L D K L H R A A W W G K V P R K D L I V M L K D T D M N K K D K Q K R T A L H L A S A N G N S E V V K L L L D R R C Q L N I L D N K K R T A L T K A V Q C Q E D E C A L M L L E H G T D P N I P D E Y G N T A L H Y A I Y N E D K L M A K  sub-proteins
Protein Q9C0H6 has M S V S G K K E F D V K Q I L R L R W R W F S H P F Q G S T N T G S C L Q Q E G Y E H R G T P V Q G R L K S H S R D R N G L K K S N S P V H H N I L A P V P G P A P A H Q R A V Q N L Q Q H N L I V H F Q A N E D T P K S V P E K N L F K E A C E K R A Q D L E M M A D D N I E D S T A R L D T Q H S E D M N A T R S E E Q F H V I N H A E Q T L R K M E N Y L K E K Q L C D V L L I A G H L R I P A H R L V L S A V S D Y F A A M F T N D V L E A 

Getting protein lengths:   1%|          | 105/20504 [00:12<53:09,  6.40it/s]  

Protein Q5S007 has M A S G S C Q G C E E D E E T L K K L I V R L N N V Q E G K Q I E T L V Q I L E D L L V F T Y S E R A S K L F Q G K N I H V P L L I V L D S Y M R V A S V Q Q V G W S L L C K L I E V C P G T M Q S L M G P Q D V G N D W E V L G V H Q L I L K M L T V H N A S V N L S V I G L K T L D L L L T S G K I T L L I L D E E S D I F M L I F D A M H S F P A N D E V Q K L G C K A L H V L F E R V S E E Q L T E F V E N K D Y M I L L S A L T N F K D E E E I V L H V L H C L H S L A I P C N N V E V L M S G N V R C Y N I V V E A  sub-proteins
Protein P23258 has M P R E I I T L Q L G Q C G N Q I G F E F W K Q L C A E H G I S P E G I V E E F A T E G T D R K D V F F Y Q A D D E H Y I P R A V L L D L E P R V I H S I L N S P Y A K L Y N P E N I Y L S E H G G G A G N N W A S G F S Q G E K I H E D I F D I I D R E A D G S D S L E G F V L C H S I A G G T G S G L G S Y L L E R L N D R Y P K K L V Q T Y S V F P N Q D E M S D V V V Q P Y N S L L T L K R L T Q N A D C V V V L D N T A L N R I A T D R L 

Getting protein lengths:   1%|          | 107/20504 [00:12<58:23,  5.82it/s]

Protein Q96NX5 has M G R K E E D D C S S W K K Q T T N I R K T F I F M E V L G S G A F S E V F L V K Q R L T G K L F A L K C I K K S P A F R D S S L E N E I A V L K K I K H E N I V T L E D I Y E S T T H Y Y L V M Q L V S G G E L F D R I L E R G V Y T E K D A S L V I Q Q V L S A V K Y L H E N G I V H R D L K P E N L L Y L T P E E N S K I M I T D F G L S K M E Q N G I M S T A C G T P G Y V A P E V L A Q K P Y S K A V D C W S I G V I T Y I L L C G Y P P F Y E E T E S K L F E K I K E G Y Y E F E S P F W D D I S E S A K D F I C H  sub-proteins
Protein Q9ULM2 has M R R N S S L S F Q M E R P L E E Q V Q S K W S S S Q G R T G T G G S D V L Q M Q N S E H H G Q S I K T Q T D S I S L E D V A V N F T L E E W A L L D P G Q R N I Y R D V M R A T F K N L A C I G E K W K D Q D I E D E H K N Q G R N L R S P M V E A L C E N K E D C P C G K S T S Q I P D L N T N L E T P T G L K P C D C S V C G E V F M H Q V S L N R H M R S H T E Q K P N E C H E Y G E K P H K C K E C G K T F T R S S S I R T H E R I H T G 

Getting protein lengths:   1%|          | 109/20504 [00:12<52:27,  6.48it/s]

Protein O00410 has M A A A A A E Q Q Q F Y L L L G N L L S P D N V V R K Q A E E T Y E N I P G Q S K I T F L L Q A I R N T T A A E E A R Q M A A V L L R R L L S S A F D E V Y P A L P S D V Q T A I K S E L L M I I Q M E T Q S S M R K K V C D I A A E L A R N L I D E D G N N Q W P E G L K F L F D S V S S Q N V G L R E A A L H I F W N F P G I F G N Q Q Q H Y L D V I K R M L V Q C M Q D Q E H P S I R T L S A R A T A A F I L A N E H N V A L F K H F A D L L P G F L Q A V N D S C Y Q N D D S V L K S L V E I A D T V P K Y L R P H L E  sub-proteins
Protein Q9H4I3 has M D G E E Q Q P P H E A N V E P V V P S E A S E P V P R V L S G D P Q N L S D V D A F N L L L E M K L K R R R Q R P N L P R T V T Q L V A E D G S R V Y V V G T A H F S D D S K R D V V K T I R E V Q P D V V V V E L C Q Y R V S M L K M D E S T L L R E A Q E L S L E K L Q Q A V R Q N G L M S G L M Q M L L L K V S A H I T E Q L G M A P G G E F R E A F K E A S K V P F C K F H L G D R P I P V T F K R A I A A L S F W Q K V R L A W G L C F 

Getting protein lengths:   1%|          | 114/20504 [00:13<39:59,  8.50it/s]

Protein O75891 has M K I A V I G Q S L F G Q E V Y C H L R K E G H E V V G V F T V P D K D G K A D P L G L E A E K D G V P V F K Y S R W R A K G Q A L P D V V A K Y Q A L G A E L N V L P F C S Q F I P M E I I S A P R H G S I I Y H P S L L P R H R G A S A I N W T L I H G D K K G G F S I F W A D D G L D T G D L L L Q K E C E V L P D D T V S T L Y N R F L F P E G I K G M V Q A V R L I A E G K A P R L P Q P E E G A T Y E G I Q K K E T A K I N W D Q P A E A I H N W I R G N D K V P G A W T E A C E Q K L T F F N S T L N T S G L V P  sub-proteins
Protein Q3LI66 has M C G S Y Y G N Y Y G D H G Y G C C G Y E G L G Y G Y G S L R C G Y S S C C G Y G H G Y G S R F F C G C G Y G C G S G Y Y Y                                                                                                                                                                                                                                                                                                                         

Getting protein lengths:   1%|          | 116/20504 [00:13<42:39,  7.96it/s]

Protein P52569 has M I P C R A A L T F A R C L I R R K I V T L D S L E D T K L C R C L S T M D L I A L G V G S T L G A G V Y V L A G E V A K A D S G P S I V V S F L I A A L A S V M A G L C Y A E F G A R V P K T G S A Y L Y T Y V T V G E L W A F I T G W N L I L S Y V I G T S S V A R A W S G T F D E L L S K Q I G Q F L R T Y F R M N Y T G L A E Y P D F F A V C L I L L L A G L L S F G V K E S A W V N K V F T A V N I L V L L F V M V A G F V K G N V A N W K I S E E F L K N I S A S A R E P P S E N G T S I Y G A G G F M P Y G F T G  sub-proteins
Protein Q8IYN2 has M Q K S C E E N E G K P Q N M P K A E E D R P L E D V P Q E A E G N P Q P S E E G V S Q E A E G N P R G G P N Q P G Q G F K E D T P V R H L D P E E M I R G V D E L E R L R E E I R R V R N K F V M M H W K Q R H S R S R P Y P V C F R P                                                                                                                                                                                                           

Getting protein lengths:   1%|          | 121/20504 [00:14<36:58,  9.19it/s]

Protein P11498 has M L K F R T V H G G L R L L G I R R T S T A P A A S P N V R R L E Y K P I K K V M V A N R G E I A I R V F R A C T E L G I R T V A I Y S E Q D T G Q M H R Q K A D E A Y L I G R G L A P V Q A Y L H I P D I I K V A K E N N V D A V H P G Y G F L S E R A D F A Q A C Q D A G V R F I G P S P E V V R K M G D K V E A R A I A I A A G V P V V P G T D A P I T S L H E A H E F S N T Y G F P I I F K A A Y G G G G R G M R V V H S Y E E L E E N Y T R A Y S E A L A A F G N G A L F V E K F I E K P R H I E V Q I L G D Q Y G N  sub-proteins
Protein Q9NX00 has M G G G W W W A R A A R L A R L R F R R S L L P P Q R P R S G G A R G S F A P G H G P R A G A S P P P V S E L D R A D A W L L R K A H E T A F L S W F R N G L L A S G I G V I S F M Q S D M G R E A A Y G F F L L G G L C V V W G S A S Y A V G L A A L R G P M Q L T L G G A A V G A G A V L A A S L L W A C A V G L Y M G Q L E L D V E L V P E D D G T A S A E G P D E A G R P P P E                                                             

Getting protein lengths:   1%|          | 127/20504 [00:14<30:29, 11.14it/s]

Protein Q8IWY8 has M M A K S A L R E N G T N S E T F R Q R F R R F H Y Q E V A G P R E A F S Q L W E L C C R W L R P E V R T K E Q I V E L L V L E Q F L T V L P G E I Q N W V Q E Q C P E N G E E A V T L V E D L E R E P G R P R S S V T V S V K G Q E V R L E K M T P P K S S Q E L L S V R Q E S V E P Q P R G V P K K E R A R S P D L G P Q E Q M N P K E K L K P F Q R S G L P F P K S G V V S R L E Q G E P W I P D L L G S K E K E L P S G S H I G D R R V H A D L L P S K K D R R S W V E Q D H W S F E D E K V A G V H W G Y E E T R T L  sub-proteins
Protein Q9Y4U1 has M E P K V A E L K Q K I E D T L C P F G F E V Y P F Q V A W Y N E L L P P A F H L P L P G P T L A F L V L S T P A M F D R A L K P F L Q S C H L R M L T D P V D Q C V A Y H L G R V R E S L P E L Q I E I I A D Y E V H P N R R P K I L A Q T A A H V A G A A Y Y Y Q R Q D V E A D P W G N Q R I S G V C I H P R F G G W F A I R G V V L L P G I E V P D L P P R K P H D C V P T R A D R I A L L E G F N F H W R D W T Y R D A V T P Q E R Y S E E 

Getting protein lengths:   1%|          | 129/20504 [00:15<35:43,  9.51it/s]

Protein Q9P055 has M A V D I Q P A C L G L Y C G K T L L F K N G S T E I Y G E C G V C P R G Q R T N A Q K Y C Q P C T E S P E L Y D W L Y L G F M A M L P L V L H W F F I E W Y S G K K S S S A L F Q H I T A L F E C S M A A I I T L L V S D P V G V L Y I R S C R V L M L S D W Y T M L Y N P S P D Y V T T V H C T H E A V Y P L Y T I V F I Y Y A F C L V L M M L L R P L L V K K I A C G L G K S D R F K S I Y A A L Y F F P I L T V L Q A V G G G L L Y Y A F P Y I I L V L S L V T L A V Y M S A S E I E N C Y D L L V R K K R L I V L F S  sub-proteins
Protein Q9P1J3 has M S E Q N I C N Q K D K S T L P F C Q A H L C E E T T N R L C V S N K A V Y S L E C K W A E S E N R V S E G R W G R G C F I G V G                                                                                                                                                                                                                                                                                                                   

Getting protein lengths:   1%|          | 131/20504 [00:15<37:34,  9.04it/s]

Protein Q9P2K9 has M D T E D D P L L Q D V W L E E E Q E E E E A T G E T F L G A Q K P G P Q P G A G G Q C C W R H W P L A S R P P A S G F W S T L G W A F T N P C C A G L V L F L G C S I P M A L S A F M F L Y Y P P L D I D I S Y N A F E I R N H E A S Q R F D A L T L A L K S Q F G S W G R N R R D L A D F T S E T L Q R L I S E Q L Q Q L H L G N R S R Q A S R A P R V I P A A S L G G P G P Y R D T S A A Q K P T A N R S G R L R R E T P P L E D L A A N Q S E D P R N Q R L S K N G R Y Q P S I P P H A A V A A N Q S R A R R G A S R W  sub-proteins
Protein Q7LBR1 has M S N M E K H L F N L K F A A K E L S R S A K K C D K E E K A E K A K I K K A I Q K G N M E V A R I H A E N A I R Q K N Q A V N F L R M S A R V D A V A A R V Q T A V T M G K V T K S M A G V V K S M D A T L K T M N L E K I S A L M D K F E H Q F E T L D V Q T Q Q M E D T M S S T T T L T T P Q N Q V D M L L Q E M A D E A G L D L N M E L P Q G Q T G S V G T S V A S A E Q D E L S Q R L A R L R D Q V                                       

Getting protein lengths:   1%|          | 135/20504 [00:15<35:26,  9.58it/s]

Protein Q99614 has M G E K S E N C G V P E D L L N G L K V T D T Q E A E C A G P P V P D P K N Q H S Q S K L L R D D E A H L Q E D Q G E E E C F H D C S A S F E E E P G A D K V E N K S N E D V N S S E L D E E Y L I E L E K N M S D E E K Q K R R E E S T R L K E E G N E Q F K K G D Y I E A E S S Y S R A L E M C P S C F Q K E R S I L F S N R A A A R M K Q D K K E M A I N D C S K A I Q L N P S Y I R A I L R R A E L Y E K T D K L D E A L E D Y K S I L E K D P S I H Q A R E A C M R L P K Q I E E R N E R L K E E M L G K L K D L G N  sub-proteins
Protein Q9NUQ2 has M L L S L V L H T Y S M R Y L L P S V V L L G T A P T Y V L A W G V W R L L S A F L P A R F Y Q A L D D R L Y C V Y Q S M V L F F F E N Y T G V Q I L L Y G D L P K N K E N I I Y L A N H Q S T V D W I V A D I L A I R Q N A L G H V R Y V L K E G L K W L P L Y G C Y F A Q H G G I Y V K R S A K F N E K E M R N K L Q S Y V D A G T P M Y L V I F P E G T R Y N P E Q T K V L S A S Q A F A A Q R G L A V L K H V L T P R I K A T H V A F D C M 

Getting protein lengths:   1%|          | 139/20504 [00:16<40:50,  8.31it/s]

Protein O60306 has M A A P A Q P K K I V A P T V S Q I N A E F V T Q L A C K Y W A P H I K K K S P F D I K V I E D I Y E K E I V K S R F A I R K I M L L E F S Q Y L E N Y L W M N Y S P E V S S K A Y L M S I C C M V N E K F R E N V P A W E I F K K K P D H F P F F F K H I L K A A L A E T D G E F S L H E Q T V L L L F L D H C F N S L E V D L I R S Q V Q Q L I S L P M W M G L Q L A R L E L E L K K T P K L R K F W N L I K K N D E K M D P E A R E Q A Y Q E R R F L S Q L I Q K F I S V L K S V P L S E P V T M D K V H Y C E R F I E L  sub-proteins
Protein Q5FYA8 has M T R N A R P N I V L L M A D D L G V G D L C C Y G N N S V S T P N I D R L A S E G V R L T Q H L A A A S M C T P S R A A F L T G R Y P I R S G M V S A Y N L N R A F T W L G G S G G L P T N E T T F A K L L Q H R G Y R T G L I G K W H L G L S C A S R N D H C Y H P L N H G F H Y F Y G V P F G L L S D C Q A S K T P E L H R W L R I K L W I S T V A L A L V P F L L L I P K F A R W F S V P W K V I F V F A L L A F L F F T S W Y S S Y G F T 

Getting protein lengths:   1%|          | 141/20504 [00:16<48:27,  7.00it/s]

Protein Q14593 has M S S A P R G P P S V A P L P A G I G R S T A K T P G L P G S L E M G P L T F R D V A I E F S L E E W Q C L D T S Q Q N L Y R N V M L D N Y R N L V F L G I A V S K P D L I T C L E Q G K E P C N M K R H A M V A K P P V V C S H F A Q D L W P K Q G L K D S F Q K V I L R R Y G K Y G H E N L Q L R K G C K S A D E H K V H K R G Y N G L N Q C L T T T Q S K I F Q C D K Y V K V L H K F S N S N I H K K R Q T G K K P F K C K E C G K S C C I L S Q L T Q H K K T A T R V N F Y K C K T C G K A F N Q F S N L T K H K I I H  sub-proteins
Protein P20823 has M V S K L S Q L Q T E L L A A L L E S G L S K E A L I Q A L G E P G P Y L L A G E G P L D K G E S C G G G R G E L A E L P N G L G E T R G S E D E T D D D G E D F T P P I L K E L E N L S P E E A A H Q K A V V E T L L Q E D P W R V A K M V K S Y L Q Q H N I P Q R E V V D T T G L N Q S H L S Q H L N K G T P M K T Q K R A A L Y T W Y V R K Q R E V A Q Q F T H A G Q G G L I E E P T G D E L P T K K G R R N R F K W G P A S Q Q I L F Q A Y 

Getting protein lengths:   1%|          | 144/20504 [00:17<44:16,  7.67it/s]

Protein Q6UXZ4 has M G R A A A T A G G G G G A R R W L P W L G L C F W A A G T A A A R G T D N G E A L P E S I P S A P G T L P H F I E E P D D A Y I I K S N P I A L R C K A R P A M Q I F F K C N G E W V H Q N E H V S E E T L D E S S G L K V R E V F I N V T R Q Q V E D F H G P E D Y W C Q C V A W S H L G T S K S R K A S V R I A Y L R K N F E Q D P Q G R E V P I E G M I V L H C R P P E G V P A A E V E W L K N E E P I D S E Q D E N I D T R A D H N L I I R Q A R L S D S G N Y T C M A A N I V A K R R S L S A T V V V Y V N G G W S  sub-proteins
Protein Q8N0V5 has M M G S W K H C L F S A S L I S A L I F V F V Y N T E L W E N K R F L R A A L S N A S L L A E A C H Q I F E G K V F Y P T E N A L K T T L D E A T C Y E Y M V R S H Y V T E T L S E E E A G F P L A Y T V T I H K D F G T F E R L F R A I Y M P Q N V Y C V H L D Q K A T D A F K G A V K Q L L S C F P N A F L A S K K E S V V Y G G I S R L Q A D L N C L E D L V A S E V P W K Y V I N T C G Q D F P L K T N R E I V Q Y L K G F K G K N I T P G 

Getting protein lengths:   1%|          | 147/20504 [00:17<38:35,  8.79it/s]

Protein Q6ZNJ1 has M A A S E R L Y E L W L L Y Y A Q K D L G Y L Q Q W L K A F V G A F K K S I S L S S L E P R R P E E A G A E V P L L P L D E L H V L A E Q L H Q A D L E Q A L L L L K L F I I L C R N L E N I E A G R G Q V L V P R V L A L L T K L V A E L K G C P P P Q G R G T Q L E N V A L H A L L L C E G L F D P Y Q T W R R Q R S G E V I S S K E K S K Y K F P P A A L P Q E F S A F F Q E S L Q N A D H L P P I L L L R L I H L F C A V L A G G K E N G Q M A V S D G S V K G L L S V V R G W S R G P A P D P C L V P L A L E A L V G  sub-proteins


Getting protein lengths:   1%|          | 152/20504 [00:17<32:31, 10.43it/s]

Protein Q9HDC9 has M S E A D G L R Q R R P L R P Q V V T D D D G Q A P E A K D G S S F S G R V F R V T F L M L A V S L T V P L L G A M M L L E S P I D P Q P L S F K E P P L L L G V L H P N T K L R Q A E R L F E N Q L V G P E S I A H I G D V M F T G T A D G R V V K L E N G E I E T I A R F G S G P C K T R D D E P V C G R P L G I R A G P N G T L F V A D A Y K G L F E V N P W K R E V K L L L S S E T P I E G K N M S F V N D L T V T Q D G R K I Y F T D S S S K W Q R R D Y L L L V M E G T D D G R L L E Y D T V T R E V K V L L D Q L  sub-proteins
Protein Q3SY46 has M S Y N C C S R N F S S C S H G G Y L H Y P G S S C G S S Y P S N L V Y S T D L C S P S T C Q L G S S L Y R G C Q E T C W R P N S C Q T L C V E S S P C H T S C Y Y P R T H M L C N S C L T M H V G S R G F G S N S C C S L S C G S R S C S S L G C G S N G F R Y L N Y R I H T S P S Q S Y R S R F C H P I Y F P P R R W F H S S C Y Q P F C R S G F Y                                                                                             

Getting protein lengths:   1%|          | 160/20504 [00:18<24:51, 13.64it/s]

Protein Q13177 has M S D N G E L E D K P P A P P V R M S S T I F S T G G K D P L S A N H S L K P L P S V P E E K K P R H K I I S I F S G T E K G S K K K E K E R P E I S P P S D F E H T I H V G F D A V T G E F T G M P E Q W A R L L Q T S N I T K L E Q K K N P Q A V L D V L K F Y D S N T V K Q K Y L S F T P P E K D G F P S G T P A L N A K G T E A P A V V T E E E D D D E E T A P P V I A P R P D H T K S I Y T R S V I D P V P A P V G D S H V D G A A K S L D K Q K K K T K M T D E E I M E K L R T I V S I G D P K K K Y T R Y E K I G  sub-proteins
Protein O15172 has M A S A S C S P G G A L A S P E P G R K I L P R M I S H S E L R K L F Y S A D A V C F D V D S T V I S E E G I G C F H W I W R K C D Q A T S Q G                                                                                                                                                                                                                                                                                                     

Getting protein lengths:   1%|          | 164/20504 [00:18<29:46, 11.38it/s]

Protein Q9Y223 has M E K N G N N R K L R V C V A T C N R A D Y S K L A P I M F G I K T E P E F F E L D V V V L G S H L I D D Y G N T Y R M I E Q D D F D I N T R L H T I V R G E D E A A M V E S V G L A L V K L P D V L N R L K P D I M I V H G D R F D A L A L A T S A A L M N I R I L H I E G G E V S G T I D D S I R H A I T K L A H Y H V C C T R S A E Q H L I S M C E D H D R I L L A G C P S Y D K L L S A K N K D Y M S I I R M W L G D D V K S K D Y I V A L Q H P V T T D I K H S I K M F E L T L D A L I S F N K R T L V L F P N I D A  sub-proteins
Protein Q9UHW9 has M H P P E T T T K M A S V R F M V T P T K I D D I P G L S D T S P D L S S R S S S R V R F S S R E S V P E T S R S E P M S E M S G A T T S L A T V A L D P P S D R T S H P Q D V I E D L S Q N S I T G E H S Q L L D D G H K K A R N A Y L N N S N Y E E G D E Y F D K N L A L F E E E M D T R P K V S S L L N R M A N Y T N L T Q G A K E H E E A E N I T E G K K K P T K T P Q M G T F M G V Y L P C L Q N I F G V I L F L R L T W V V G T A G V L 

Getting protein lengths:   1%|          | 166/20504 [00:18<33:08, 10.23it/s]

Protein P04080 has M M C G A P S A T Q P A T A E T Q H I A D Q V R S Q L E E K E N K K F P V F K A V S F K S Q V V A G T N Y F I K V H V G D E D F V H L R V F Q S L P H E N K P L T L S N Y Q T N K A K H D E L T Y F                                                                                                                                                                                                                                                                                                                              sub-proteins
Protein P15884 has M H H Q Q R M A A L G T D K E L S D L L D F S A M F S P P V S S G K N G P T S L A S G H F T G S N V E D R S S S G S W G N G G H P S P S R N Y G D G T P Y D H M T S R D L G S H D N L S P P F V N S R I Q S K T E R G S Y S S Y G R E S N L Q G C H Q Q S L L G G D M D M G N P G T L S P T K P G S Q Y Y Q Y S S N N P R R R P L H S S A M E V Q T K K V R K V P P G L P S S V Y A P S A S T A D Y N R D S P G Y P S S K P A T S T F P S S F F M Q 

Getting protein lengths:   1%|          | 168/20504 [00:19<31:00, 10.93it/s]

Protein A0A1W2PN81 has M R C S P G G V W L A L A A S L L H V S L Q G E F Q R K L Y K E L V K N Y N P L E R P V A N D S Q P L T V Y F S L S L L Q I M D V D Q K R Q V L T T N I W L Q M S W T D H Y L Q W N V S E Y P G V K T V R F P D G Q I W K P D I L L Y N S A D E R F D A T F H T N V L V N S S G H C Q Y L P P G I F K S S C Y I D V R W F P F D V Q H C K L K F G S W S Y G G W S L D L Q M Q E A D I S G Y I P N G E W D L V G I P G K R S E R F Y E C C K E P Y P D V T F T V T M R R R T L Y Y G L N L L I P C V L I S A L A L L V F L L P A  sub-proteins


Getting protein lengths:   1%|          | 170/20504 [00:19<36:53,  9.19it/s]

Protein Q14123 has M E S P T K E I E E F E S N S L K Y L Q P E Q I E K I W L R L R G L R K Y K K T S Q R L R S L V K Q L E R G E A S V V D L K K N L E Y A A T V L E S V Y I D E T R R L L D T E D E L S D I Q S D A V P S E V R D W L A S T F T R Q M G M M L R R S D E K P R F K S I V H A V Q A G I F V E R M Y R R T S N M V G L S Y P P A V I E A L K D V D K W S F D V F S L N E A S G D H A L K F I F Y E L L T R Y D L I S R F K I P I S A L V S F V E A L E V G Y S K H K N P Y H N L M H A A D V T Q T V H Y L L Y K T G V A N W L T E L  sub-proteins
Protein Q2VPK5 has M C Q V G E D Y G E P A P E E P P P A P R P S R E Q K C V K C K E A Q P V V V I R A G D A F C R D C F K A F Y V H K F R A M L G K N R L I F P G E K V L L A W S G G P S S S S M V W Q V L E G L S Q D S A K R L R F V A G V I F V D E G A A C G Q S L E E R S K T L A E V K P I L Q A T G F P W H V V A L E E V F S L P P S V L W C S A Q E L V G S E G A Y K A A V D S F L Q Q Q H V L G A G G G P G P T Q G E E Q P P Q P P L D P Q N L A R P P A 

Getting protein lengths:   1%|          | 172/20504 [00:19<45:49,  7.40it/s]

Protein O00468 has M A G R S H P G P L R P L L P L L V V A A C V L P G A G G T C P E R A L E R R E E E A N V V L T G T V E E I L N V D P V Q H T Y S C K V R V W R Y L K G K D L V A R E S L L D G G N K V V I S G F G D P L I C D N Q V S T G D T R I F F V N P A P P Y L W P A H K N E L M L N S S L M R I T L R N L E E V E F C V E D K P G T H F T P V P P T P P D A C R G M L C G F G A V C E P N A E G P G R A S C V C K K S P C P S V V A P V C G S D A S T Y S N E C E L Q R A Q C S Q Q R R I R L L S R G P C G S R D P C S N V T C S F G  sub-proteins
Protein Q9UK08 has M S N N M A K I A E A R K T V E Q L K L E V N I D R M K V S Q A A A E L L A F C E T H A K D D P L V T P V P A A E N P F R D K R L F C V L L                                                                                                                                                                                                                                                                                                         

Getting protein lengths:   1%|          | 177/20504 [00:20<40:18,  8.41it/s]

Protein Q14839 has M A S G L G S P S P C S A G S E E E D M D A L L N N S L P P P H P E N E E D P E E D L S E T E T P K L K K K K K P K K P R D P K I P K S K R Q K K E R M L L C R Q L G D S S G E G P E F V E E E E E V A L R S D S E G S D Y T P G K K K K K K L G P K K E K K S K S K R K E E E E E E D D D D D S K E P K S S A Q L L E D W G M E D I D H V F S E E D Y R T L T N Y K A F S Q F V R P L I A A K N P K I A V S K M M M V L G A K W R E F S T N N P F K G S S G A S V A A A A A A A V A V V E S M V T A T E V A P P P P P V E V P  sub-proteins
Protein Q9H8V8 has M K P D W P R R G A A G T R V R S R G E G D G T Y F A R R G A G R R R R E I K A P I R A A W S P P S A A M S G L Q S G R R W R P Q G T G T G A R A A G A L A A L R L G P R L R A A P L L A P L W L L A P T P D S H M T P A P L A L R A S R G W R E N N L S D Y Q Y S W M Q K C                                                                                                                                                                       

Getting protein lengths:   1%|          | 179/20504 [00:20<45:43,  7.41it/s]

Protein Q75T13 has M F L H S V N L W N L A F Y V F M V F L A T L G L W D V F F G F E E N K C S M S Y M F E Y P E Y Q K I E L P K K L A K R Y P A Y E L Y L Y G E G S Y A E E H K I L P L T G I P V L F L P G N A G S Y K Q V R S I G S I A L R K A E D I D F K Y H F D F F S V N F N E E L V A L Y G G S L Q K Q T K F V H E C I K T I L K L Y K G Q E F A P K S V A I I G H S M G G L V A R A L L T L K N F K H D L I N L L I T Q A T P H V A P V M P L D R F I T D F Y T T V N N Y W I L N A R H I N L T T L S V A G G F R D Y Q V R S G L T F L  sub-proteins
Protein Q8N370 has M A P T L A T A H R R R W W M A C T A V L E N L L F S A V L L G W G S L L I M L K S E G F Y S Y L C T E P E N V T N G T V G G T A E P G H E E V S W M N G W L S C Q A Q D E M L N L A F T V G S F L L S A I T L P L G I V M D K Y G P R K L R L L G S A C F A V S C L L I A Y G A S K P N A L S V L I F I A L A L N G F G G M C M T F T S L T L P N M F G D L R S T F I A L M I G S Y A S S A V T F P G I K L I Y D A G V S F I V V L V V W A G C S 

Getting protein lengths:   1%|          | 184/20504 [00:21<35:43,  9.48it/s]

Protein Q9Y580 has M G A A A A E A D R T L F V G N L E T K V T E E L L F E L F H Q A G P V I K V K I P K D K D G K P K Q F A F V N F K H E V S V P Y A M N L L N G I K L Y G R P I K I Q F R S G S S H A P Q D V S L S Y P Q H H V G N S S P T S T S P S R Y E R T M D N M T S S A Q I I Q R S F S S P E N F Q R Q A V M N S A L R Q M S Y G G K F G S S P L D Q S G F S P S V Q S H S H S F N Q S S S S Q W R Q G T P S S Q R K V R M N S Y P Y L A D R H Y S R E Q R Y T D H G S D H H Y R G K R D D F F Y E D R N H D D W S H D Y D N R R D S S  sub-proteins
Protein O15232 has M P R P A P A R R L P G L L L L L W P L L L L P S A A P D P V A R P G F R R L E T R G P G G S P G R R P S P A A P D G A P A S G T S E P G R A R G A G V C K S R P L D L V F I I D S S R S V R P L E F T K V K T F V S R I I D T L D I G P A D T R V A V V N Y A S T V K I E F Q L Q A Y T D K Q S L K Q A V G R I T P L S T G T M S G L A I Q T A M D E A F T V E A G A R E P S S N I P K V A I I V T D G R P Q D Q V N E V A A R A Q A S G I E L Y 

Getting protein lengths:   1%|          | 189/20504 [00:21<31:58, 10.59it/s]

Protein P25440 has M L Q N V T P H N K L P G E G N A G L L G L G P E A A A P G K R I R K P S L L Y E G F E S P T M A S V P A L Q L T P A N P P P P E V S N P K K P G R V T N Q L Q Y L H K V V M K A L W K H Q F A W P F R Q P V D A V K L G L P D Y H K I I K Q P M D M G T I K R R L E N N Y Y W A A S E C M Q D F N T M F T N C Y I Y N K P T D D I V L M A Q T L E K I F L Q K V A S M P Q E E Q E L V V T I P K N S H K K G A K L A A L Q G S V T S A H Q V P A V S S V S H T A L Y T P P P E I P T T V L N I P H P S V I S S P L L K S L H S  sub-proteins
Protein Q8WU68 has M A E Y L A S I F G T E K D K V N C S F Y F K I G V C R H G D R C S R L H N K P T F S Q T I V L L N L Y R N P Q N T A Q T A D G S H C H V S D V E V Q E H Y D S F F E E V F T E L Q E K Y G E I E E M N V C D N L G D H L V G N V Y V K F R R E E D G E R A V A E L S N R W F N G Q A V H G E L S P V T D F R E S C C R Q Y E M G E C T R G G F C N F M H L R P I S Q N L Q R Q L Y G R G P R R R S P P R F H T G H H P R E R N H R C S P D H W H G 

Getting protein lengths:   1%|          | 194/20504 [00:22<35:21,  9.57it/s]

Protein Q6AI39 has M D D D D D S C L L D L I G D P Q A L N Y F L H G P S N K S S N D D L T N A G Y S A A N S N S I F A N S S N A D P K S S L K G V S N Q L G E G P S D G L P L S S S L Q F L E D E L E S S P L P D L T E D Q P F D I L Q K S L Q E A N I T E Q T L A E E A Y L D A S I G S S Q Q F A Q A Q L H P S S S A S F T Q A S N V S N Y S G Q T L Q P I G V T H V P V G A S F A S N T V G V Q H G F M Q H V G I S V P S Q H L S N S S Q I S G S G Q I Q L I G S F G N H P S M M T I N N L D G S Q I I L K G S G Q Q A P S N V S G G L L V H  sub-proteins
Protein Q49AM1 has M L W K L L L R S Q S C R L C S F R K M R S P P K Y R P F L A C F T Y T T D K Q S S K E N T R T V E K L Y K C S V D I R K I R R L K G W V L L E D E T Y V E E I A N I L Q E L G A D E T A V A S I L E R C P E A I V C S P T A V N T Q R K L W Q L V C K N E E E L I K L I E Q F P E S F F T I K D Q E N Q K L N V Q F F Q E L G L K N V V I S R L L T A A P N V F H N P V E K N K Q M V R I L Q E S Y L D V G G S E A N M K V W L L K L L S Q N P F 

Getting protein lengths:   1%|          | 198/20504 [00:22<33:49, 10.01it/s]

Protein Q8IYI6 has M A M A M S D S G A S R L R R Q L E S G G F E A R L Y V K Q L S Q Q S D G D R D L Q E H R Q R I Q A L A E E T A Q N L K R N V Y Q N Y R Q F I E T A R E I S Y L E S E M Y Q L S H L L T E Q K S S L E S I P L T L L P A A A A A G A A A A S G G E E G V G G A G G R D H L R G Q A G F F S T P G G A S R D G S G P G E E G K Q R T L T T L L E K V E G C R H L L E T P G Q Y L V Y N G D L V E Y D A D H M A Q L Q R V H G F L M N D C L L V A T W L P Q R R G M Y R Y N A L Y S L D G L A V V N V K D N P P M K D M F K L L M F  sub-proteins
Protein O75298 has M G Q V L P V F A H C K E A P S T A S S T P D S T E G G N D D S D F R E L H T A R E F S E E D E E E T T S Q D W G T P R E L T F S Y I A F D G V V G S G G R R D S T A R R P R P Q G R S V S E P R D Q H P Q P S L G D S L E S I P S L S Q S P E P G R R G D P D T A P P S E R P L E D L R L R L D H L G W V A R G T G S G E D S S T S S S T P L E D E E P Q E P N R L E T G E A G E E L D L R L R L A Q P S S P E V L T P Q L S P G S G T P Q A G T 

Getting protein lengths:   1%|          | 200/20504 [00:23<43:49,  7.72it/s]

Protein P54289 has M A A G C L L A L T L T L F Q S L L I G P S S E E P F P S A V T I K S W V D K M Q E D L V T L A K T A S G V N Q L V D I Y E K Y Q D L Y T V E P N N A R Q L V E I A A R D I E K L L S N R S K A L V R L A L E A E K V Q A A H Q W R E D F A S N E V V Y Y N A K D D L D P E K N D S E P G S Q R I K P V F I E D A N F G R Q I S Y Q H A A V H I P T D I Y E G S T I V L N E L N W T S A L D E V F K K N R E E D P S L L W Q V F G S A T G L A R Y Y P A S P W V D N S R T P N K I D L Y D V R R R P W Y I Q G A A S P K D M L I  sub-proteins
Protein Q8TF21 has M K T L R A R F K K T E L R L S P T D L G S C P P C G P C P I P K P A A R G R R Q S Q D W G K S D E R L L Q A V E N N D A P R V A A L I A R K G L V P T K L D P E G K S A F H L A A M R G A A S C L E V M I A H G S N V M S A D G A G Y N A L H L A A K Y G H P Q C L K Q L L Q A S C V V D V V D S S G W T A L H H A A A G G C L S C S E V L C S F K A H L N P Q D R S G A T P L I I A A Q M C H T D L C R L L L Q Q G A A A N D Q D L Q G R T A L 

Getting protein lengths:   1%|          | 204/20504 [00:23<33:55,  9.97it/s]

Protein P0CZ25 has M H S L P R S G S I R R T H S D T Q A T G W P P P Q R I G D S P G P S P A F L S C P P S L C G G A A Q T G D P V A L P H G P E K W V W G G G L S P R N P H S W G I K A H G L R P P W A P R L E R C M V P E S E W A P W Q P Q L P C E P K W L G S R K S K P H R E S G L R G G G P S R C A K R G T H S C G P R E S G G P D T C H L P C H                                                                                                                                                                                            sub-proteins
Protein P0CW20 has M A F S G R A R P C I I P E N E E I P R A A L N T V H E A N G T E D E R A V S K L Q R R H S D V K V Y K E F C D F Y A K F N M A N A L A S A T C E R C K G G F A P A E T I V N S N G E L Y H E Q C F V C A Q C F Q Q F P E G L F Y E E R T                                                                                                                                                                                                           

Getting protein lengths:   1%|          | 207/20504 [00:23<27:01, 12.52it/s]

Protein Q7Z769 has M A L L V D R V R G H W R I A A G L L F N L L V S I C I V F L N K W I Y V Y H G F P N M S L T L V H F V V T W L G L Y I C Q K L D I F A P K S L P P S R L L L L A L S F C G F V V F T N L S L Q N N T I G T Y Q L A K A M T T P V I I A I Q T F C Y Q K T F S T R I Q L T L I P I T L G V I L N S Y Y D V K F N F L G M V F A A L G V L V T S L Y Q V W V G A K Q H E L Q V N S M Q L L Y Y Q A P M S S A M L L V A V P F F E P V F G E G G I F G P W S V S A L L M V L L S G V I A F M V N L S I Y W I I G N T S P V T Y N M F  sub-proteins
Protein Q96C57 has M A A P S G T V S D S E S S N S S S D A E E L E R C R E A A M P A W G L E Q R P H V A G K P R A G A A N S Q L S T S Q P S L R H K V N E H E Q D G N E L Q T T P E F R A H V A K K L G A L L D S F I T I S E A A K E P A K A K V Q K V A L E D D G F R L F F T S V P G G R E K E E S P Q P R R K R Q P S S S S E D S D E E W R R C R E A A V S A S D I L Q E S A I H S P G T V E K E A K K K R K L K K K A K K V A S V D S A V A A T T P T S M A T V 

Getting protein lengths:   1%|          | 209/20504 [00:23<30:31, 11.08it/s]

Protein O15034 has M R E A A E R R Q Q L Q L E H D Q A L A V L S A K Q Q E I D L L Q K S K V R E L E E K C R T Q S E Q F N L L S R D L E K F R Q H A G K I D L L G G S A V A P L D I S T A P S K P F P Q F M N G L A T S L G K G Q E S A I G G S S A I G E Y I R P L P Q P G D R P E P L S A K P T F L S R S G S A R C R S E S D M E N E R N S N T S K Q R Y S G K V H L C V A R Y S Y N P F D G P N E N P E A E L P L T A G K Y L Y V Y G D M D E D G F Y E G E L L D G Q R G L V P S N F V D F V Q D N E S R L A S T L G N E Q D Q N F I N H S G  sub-proteins
Protein Q149N8 has M S S R R K R A P P V R V D E E K R Q Q L H W N M H E D R R N E P I I I S D D D E Q P C P G S D T S S A H Y I I L S D S L K E E V A H R D K K R C S K V V S F S K P I E K E E T V G I F S P L S V K L N I V I S P Y H F D N S W K A F L G E L T L Q L L P A Q S L I E N F S E R S I T L M S S E S S N Q F L I Y V H S K G E D V E K Q K K E P M S I C D K G I L V E S S F S G E M L E D L G W L Q K K R R I K L Y Q K P E G N H I I K V G I Y L L E 

Getting protein lengths:   1%|          | 210/20504 [00:24<38:59,  8.67it/s]


KeyboardInterrupt: 

In [None]:
Polypeptide.three_to_one('LEU')

AttributeError: module 'Bio.PDB.Polypeptide' has no attribute 'three_to_one'

In [None]:
from Bio.SeqUtils import seq1

three_letter_sequence = "GLU"
one_letter_sequence = seq1(three_letter_sequence)
print(one_letter_sequence)


E


In [None]:
def get_amino_acid_sequence(uniprot_id):
    concat_character = " " # TODO: Find out what the correct character is to join the sequences

    combined_sequence = []

    pdb_file_name = os.path.join("data", "curated_chembl", "alpha_fold_data", f"AF-{uniprot_id}-F1-model_v4.pdb.gz")
    with gzip.open(pdb_file_name, 'rt') as f_in:
        structure = parser.get_structure("protein", f_in)
        for model in structure:
            for chain in model:
                sequence = ""
                for residue in chain:
                    if Polypeptide.is_aa(residue):
                        sequence += seq1(residue.get_resname()) + " "
                combined_sequence.append(sequence)

        combined_sequence = concat_character.join(combined_sequence)

    return combined_sequence