In [1]:
import os

# Directory holding the input files read by this notebook (viralzone.csv and
# uniprot_human_reviewed.xml.gz). Defaults to the original location; set the
# NOTEBOOK_DATA_DIR environment variable to use another directory without editing the code.
DIR = os.environ.get('NOTEBOOK_DATA_DIR', r'c://downloads')

In [2]:
import csv
import json

from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.SubsMat.MatrixInfo import blosum62

# Raise the csv module's per-field size limit so that very long fields can be parsed.
csv.field_size_limit(2 ** 31 - 1)

viralzone_records = []

# The csv module expects files handed to csv.reader to be opened with newline=''; otherwise
# line breaks embedded inside quoted fields may be interpreted incorrectly.
with open(os.path.join(DIR, 'viralzone.csv'), 'r', newline = '') as f:

    csv_reader = csv.reader(f)

    # Skip the header line.
    next(csv_reader)

    # Every row must have exactly 9 columns; columns 5-8 are ignored and the last one is parsed as JSON.
    for group, family, genus, strain, _, _, _, _, raw_genome in csv_reader:
        viralzone_records.append({'group': group, 'family': family, 'genus': genus, 'strain': strain,
                'genome': json.loads(raw_genome)})
                
def get_record_by_strain(strain):
    """
    Return the single entry of the module-level `viralzone_records` list whose 'strain' value
    equals `strain`.

    Raises ValueError if no record, or more than one record, has that strain.
    """
    # Reading a module-level name needs no `global` declaration.
    matches = [record for record in viralzone_records if record['strain'] == strain]

    if len(matches) != 1:
        raise ValueError('Expected exactly one record for strain %r, found %d' % (strain, len(matches)))

    return matches[0]
    
def get_proteins(viralzone_record):
    """
    Collect a (product, translation) tuple for every coding region of every genome segment in
    the given record, in segment order and then coding-region order.
    """
    proteins = []

    for segment in viralzone_record['genome']:
        for region in segment['coding_regions']:
            proteins.append((region['product'], region['translation']))

    return proteins

record1 = get_record_by_strain('Human papillomavirus 1')
record2 = get_record_by_strain('European elk papillomavirus')
proteins1 = get_proteins(record1)
proteins2 = get_proteins(record2)

# Gap penalties, named once so that the scoring pass and the final displayed alignment are
# guaranteed to use the same parameters.
GAP_OPEN_PENALTY = -10.0
GAP_EXTEND_PENALTY = -0.5

best_score = None
closest_match = None
checked_pairs = 0
total_pairs = len(proteins1) * len(proteins2)

for name1, seq1 in proteins1:
    for name2, seq2 in proteins2:

        # A pair with an empty sequence cannot be normalized by length (division by zero), so it is
        # counted as checked but never scored.
        if seq1 and seq2:

            # We measure resemblance by the score of global alignment normalized by sequence length (in order
            # not to give an unfair advantage to sequences of extreme lengths). We normalize by the length of
            # the shorter sequence in each pair.
            raw_score = pairwise2.align.globalds(seq1, seq2, blosum62, GAP_OPEN_PENALTY, GAP_EXTEND_PENALTY,
                    score_only = True)
            score = raw_score / min(len(seq1), len(seq2))

            # Strict comparison: on ties the first pair encountered is kept.
            if best_score is None or score > best_score:
                best_score = score
                closest_match = (name1, seq1, name2, seq2)

        checked_pairs += 1
        print('%d/%d' % (checked_pairs, total_pairs), end = '\r')

if closest_match is None:
    raise ValueError('No pair of non-empty protein sequences was available to compare')

name1, seq1, name2, seq2 = closest_match
print('Closest match: %s <---> %s (score: %.2f)' % (name1, name2, best_score))

# Re-run the alignment for the winning pair, this time keeping the alignment itself for display.
alignment, = pairwise2.align.globalds(seq1, seq2, blosum62, GAP_OPEN_PENALTY, GAP_EXTEND_PENALTY,
        one_alignment_only = True)
print(format_alignment(*alignment))

Closest match: major capsid L1 protein <---> major capsid L1 protein (score: 2.58)
MYNVFQMAVWLPAQNKFYLPPQPITRILSTDEYVTRTNLFYHATSERLLLVGHPLFEIS-SNQTVTIPKVSPNAFRVFRVRFADPNRFAFGDKAIFNPETERLVWGLRGIEIGRGQPLGIGITGHPLLNKLDDAENPTNYINTHANGDSRQNTAFDAKQTQMFLVGCTPASGEHWTSSR-CPGEQVKLGDCPRVQMIESVIEDGDMMDIGFGAMDFAALQQDKSDVPLDVVQATCKYPDYIRMNHEAYGNSMFFFARREQMYTRHFFTRGGSVGDKEAVPQSLYLTADAEPRTTLATTNYVGTPSGSMVSSDVQLFNRSYWLQRCQGQNNGICWRNQLFITVGDNTRGTSLSISMKNNAS--TTYSNANFNDFLRHTEEFDLSFIVQLCKVKLTPENLAYIHTMDPNILEDWQLSVSQPPTNP-LEDQYRFLGSSLAAKCPEQAPPEPQTDPYSQYKFWEVDLTERMSEQLDQFPLGRKFLYQSGM----TQRTA-TSSTTKRKTVRVSTSAKRRRKA
      ||.|.|.| ..||||.|.|..|....|..|...|||...||.|.||||..||. |....||||||||..||||....|||.||..|||...|..|||||...|....||||||....||...|.|.||||.....|.....|..|. ..|.||.|..|.|||||.||.||..| |.......|.||........|||||||||||||..|..|...|||.|||.....|.||||..|..||.|||||||||.||.|.||...|||.  |||..|....|.......|........|.||||.||.|.|||||.|||.|.||.||||||.||||.|||||||||.|.|......|  |.|....||.|.||.||..|.|..|||.|.|.||..

In [3]:
import gzip
from collections import Counter

from Bio import SeqIO

def get_total_feature_length(uniprot_record, feature_type):
    """
    Add up (location.end - location.start) over all features of the record whose type equals
    `feature_type`. Returns 0 when no feature matches.
    """
    total_length = 0

    for feature in uniprot_record.features:
        if feature.type == feature_type:
            total_length = total_length + (feature.location.end - feature.location.start)

    return total_length

subcellular_location_counter = Counter()
highest_helix_proportion = None
highest_helix_proportion_record = None

# To get the file, run the following query:
# http://www.uniprot.org/uniprot/?query=*&fil=reviewed%3Ayes+AND+organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22&sort=score
# and then choose: Download --> Format XML (Compressed) --> Go
uniprot_path = os.path.join(DIR, 'uniprot_human_reviewed.xml.gz')

with gzip.open(uniprot_path, 'rt') as f:
    for i, record in enumerate(SeqIO.parse(f, 'uniprot-xml')):

        # Progress indicator, rewritten in place via the carriage return.
        print('Processing record #%d...' % (i + 1), end = '\r')

        # Records lacking the annotation contribute nothing to the tally.
        locations = record.annotations.get('comment_subcellularlocation_location', [])
        subcellular_location_counter.update(locations)

        # Fraction of the sequence covered by features of type 'helix'.
        helix_length = get_total_feature_length(record, 'helix')
        helix_proportion = helix_length / len(record.seq)

        # Strict comparison: on ties the earliest record is kept.
        is_new_maximum = highest_helix_proportion is None or helix_proportion > highest_helix_proportion

        if is_new_maximum:
            highest_helix_proportion = helix_proportion
            highest_helix_proportion_record = record

print('%s has the highest helix proportion: %.2f%%' % (highest_helix_proportion_record.id, 100 * highest_helix_proportion))
print('The 5 most common subcellular locations: %s' % subcellular_location_counter.most_common(5))

Q16873 has the highest helix proportion: 86.67%
The 5 most common subcellular locations: [('Cytoplasm', 5193), ('Nucleus', 5175), ('Cell membrane', 3435), ('Membrane', 2090), ('Secreted', 2069)]
