# Setup

In [1]:
import os
import gzip
import csv
import json
from collections import defaultdict

from Bio import SeqIO, pairwise2
from Bio.pairwise2 import format_alignment
from Bio.SubsMat.MatrixInfo import blosum62
from Bio.SeqUtils import molecular_weight

from scipy.stats import pearsonr

# A patch required to let the csv library parse long fields: raise its field size limit.
csv.field_size_limit(2 ** 31 - 1)

# Directory holding the downloaded input files. It defaults to the original location; set the
# BIOINF_DATA_DIR environment variable to point the notebook elsewhere without editing the code.
DIR = os.environ.get('BIOINF_DATA_DIR', r'c://downloads')

# Q1

In [2]:
# Parse the sequence of the human ring finger protein (C3H2C3 type) 6
# Downloaded from UniProt at: https://www.uniprot.org/uniprot/A0A024RDP2.fasta
# SeqIO.read returns the single record of the file, and raises a ValueError if the file does not
# contain exactly one record.
human_record = SeqIO.read(os.path.join(DIR, 'A0A024RDP2.fasta'), 'fasta')
human_seq = human_record.seq
print(len(human_seq))
print(human_seq)

685
MNQSRSRSDGGSEETLPQDHNHHENERRWQQERLHREEAYYQFINELNDEDYRLMRDHNLLGTPGEITSEELQQRLDGVKEQLASQPDLRDGTNYRDSEVPRESSHEDSLLEWLNTFRRTGNATRSGQNGNQTWRAVSRTNPNNGEFRFSLEIHVNHENRGFEIHGEDYTDIPLSDSNRDHTANRQQRSTSPVARRTRSQTSVNFNGSSSNIPRTRLASRGQNPAEGSFSTLGRLRNGIGGAAGIPRANASRTNFSSHTNQSGGSELRQREGQRFGAAHVWENGARSNVTVRNTNQRLEPIRLRSTSNSRSRSPIQRQSGTVYHNSQRESRPVQQTTRRSVRRRGRTRVFLEQDRERERRGTAYTPFSNSRLVSRITVEEGEESSRSSTAVRRHPTITLDLQVRRIRPGENRDRDSIANRTRSRVGLAENTVTIESNSGGFRRTISRLERSGIRTYVSTITVPLRRISENELVEPSSVALRSILRQIMTGFGELSSLMEADSESELQRNGQHLPDMHSELSNLGTDNNRSQHREGSSQDRQAQGDSTEMHGENETTQPHTRNSDSRGGRQLRNPNNLVETGTLPILRLAHFFLLNESDDDDRIRGLTKEQIDNLSTRHYEHNSIDSELGKICSVCISDYVTGNKLRQLPCMHEFHIHCIDRWLSENCTCPICRQPVLGSNIANNG


In [3]:
# Parse the ViralZone records into a list of dicts (one dict per CSV row).

viralzone_records = []

# The csv module expects its input file to be opened with newline = '', so that line breaks embedded
# in quoted fields are interpreted by the csv reader itself rather than by the file object.
with open(os.path.join(DIR, 'viralzone.csv'), 'r', newline = '') as f:

    csv_reader = csv.reader(f)

    # Skip the header line.
    next(csv_reader)

    # Every row must have exactly nine columns (otherwise the unpacking raises a ValueError). Only the
    # first four columns and the last one (the genome, stored as a JSON string) are kept.
    for group, family, genus, strain, _, _, _, _, raw_genome in csv_reader:
        viralzone_records.append({'group': group, 'family': family, 'genus': genus, 'strain': strain, \
                'genome': json.loads(raw_genome)})

print('Parsed %d ViralZone records.' % len(viralzone_records))

Parsed 384 ViralZone records.


In [4]:
# Collect, from the ViralZone records, every protein of the Herpesviridae family members that infect
# humans (a record qualifies when "human" appears, case-insensitively, in its strain name).
# Each collected entry is a (translation, product, strain) tuple.

relevant_herpes_proteins = []

for record in viralzone_records:

    # Skip records of other families, and Herpesviridae strains that are not human-related.
    if record['family'] != 'Herpesviridae' or 'human' not in record['strain'].lower():
        continue

    # Flatten all the coding regions of all the genome segments of this record.
    relevant_herpes_proteins.extend(
        (coding_region['translation'], coding_region['product'], record['strain'])
        for genome_segment in record['genome']
        for coding_region in genome_segment['coding_regions']
    )

print('Extracted %d relevant proteins for the search.' % len(relevant_herpes_proteins))

Extracted 556 relevant proteins for the search.


In [5]:
# Among the candidates, find the viral protein with the best match.
# Note that we choose to run a local alignment, because we want to allow distant homologues (including small pieces hijacked
# by the virus).
# Like in the lab exercise, we normalize the score by the length of the alignment, but we require it to be not too short (as
# an arbitrary choice, at least MIN_ALIGNMENT_LENGTH residues).

MIN_ALIGNMENT_LENGTH = 30

best_score = None
best_match = None
# The alignment of the best match is kept, so that it doesn't have to be recomputed after the search.
best_alignment = None

for i, (seq, protein_name, viral_strain) in enumerate(relevant_herpes_proteins):

    print('%d/%d' % (i + 1, len(relevant_herpes_proteins)), end = '\r')

    candidate_alignments = pairwise2.align.localds(human_seq, seq, blosum62, -10.0, -0.5, one_alignment_only = True)

    # An empty result (e.g. for an empty candidate sequence) means there is nothing to score for this candidate.
    if not candidate_alignments:
        continue

    candidate_alignment = candidate_alignments[0]
    _, _, score, begin, end = candidate_alignment
    alignment_length = end - begin

    # Very short alignments are ignored, as their normalized scores are not comparable.
    if alignment_length < MIN_ALIGNMENT_LENGTH:
        continue

    # Normalize the score by the length of the aligned segment.
    score /= alignment_length

    if best_score is None or best_score < score:
        best_score = score
        best_match = (seq, protein_name, viral_strain)
        best_alignment = candidate_alignment

if best_match is None:
    # No candidate passed the minimal length requirement (or there were no candidates at all).
    print('No alignment of at least %d residues was found.' % MIN_ALIGNMENT_LENGTH)
else:
    seq, protein_name, viral_strain = best_match
    alignment = best_alignment
    print('Best match: %s, of the viral strain %s (score: %.2f)' % (protein_name, viral_strain, best_score))
    print('Lengths: human = %d, viral = %d' % (len(human_seq), len(seq)))
    print('Alignment:')
    print(format_alignment(*alignment))

Best match: ubiquitin E3 ligase ICP0, of the viral strain Human herpesvirus 2 (score: 2.35)
Lengths: human = 685, viral = 824
Alignment:
MNQSRSRSDGGSEETLPQDHNHHENERRWQQERLHREEAYYQFINELNDEDYRLMRDHNLLGTPGEITSEELQQRLDGVKEQLASQPDLRDGTNYRDSEVPRESSHEDSLLEWLNTFRRTGNATRSGQNGNQTWRAVSRTNPNNGEFRFSLEIHVNHENRGFEIHGEDYTDIPLSDSNRDHTANRQQRSTSPVARRTRSQTSVNFNGSSSNIPRTRLASRGQNPAEGSFSTLGRLRNGIGGAAGIPRANASRTNFSSHTNQSGGSELRQREGQRFGAAHVWENGARSNVTVRNTNQRLEPIRLRSTSNSRSRSPIQRQSGTVYHNSQRESRPVQQTTRRSVRRRGRTRVFLEQDRERERRGTAYTPFSNSRLVSRITVEEGEESSRSSTAVRRHPTITLDLQVRRIRPGENRDRDSIANRTRSRVGLAENTVTIESNSGGFRRTISRLERSGIRTYVSTITVPLRRISENELVEPSSVALRSILRQIMTGFGELSSLMEADSESELQRNGQHLPDMHSELSNLGTDNNRSQHREGSSQDRQAQGDSTEMHGENETTQPHTRNSDSRGGRQLRNPNNLVETGTLPILRLAHFFLLNESDDDDRIRGLTKEQIDNLSTRHYEHNSIDSELGKICSVCISDYVTGNKLRQLPCMHEFHIHCIDRWLSENCTCPICRQPVLGSNIANNG----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Virus-human homologues and, more generally, virus-host homologues, sometimes occur through the process of horizontal gene transfer. Such horizontal transfer can occur in both directions (i.e. either pieces of the viral DNA are integrated into the host's genome, or the virus hijacks genes from the host). See for example: [Liu, H. et al. Widespread horizontal gene transfer from double-stranded RNA viruses to eukaryotic nuclear genomes. J. Virol. 84, 11876–11887 (2010).](https://jvi.asm.org/content/84/22/11876.short)

In this specific case, the homology we have found seems quite weak. If I were forced to make a guess, I'd say that the homology does seem genuine, but I am far from certain (a more educated judgement would require more advanced statistical tools and evolutionary models, and/or better expertise with the subject matter).

The reasons that I tend to accept this homology are:
1. The local alignment is continuous (without any gaps in it).
2. There are many exact matches in the shared segment (17 of 48 residues, within proteins of 685 and 824 amino-acids).
3. Even among the mismatches, many are reasonable amino-acid pairs (e.g. K/R, D/E).

On the other hand, we need to keep in mind that we have attempted 556 protein pairs, searching for the best local alignment within each pair, so some measure of similarity is expected even at random.

# Q2

About whether the researchers' conclusion is justified - hell no! The human genome is far larger than the E. coli genome (~3 billion compared to ~5 million bp - a three orders-of-magnitude difference!), so it is expected that any random sequence would find a better match to some segment of the human genome than of the E. coli genome. Another major problem stems from the fact that we are talking about a meta-genomic sample, which is likely a mix of many organisms and viruses. In reality, the sample is likely comprised of both eukaryotes AND prokaryotes (so a meaningful conclusion from this analysis could be, for example, an estimate of the relative abundance of organisms belonging to each of the two domains in the sequenced sample).

If we want this analysis to make sense, it should be accompanied by some sort of probabilistic calculation that accounts for the different genome lengths of the two species. It would also be better to compare the sequences to all prokaryote and eukaryote organisms (instead of just human and E. coli as two arbitrary representatives).

# Q3

In [6]:
# For each of the reviewed human proteins downloaded from UniProt, we extract the total number of residues annotated with
# each of the three secondary structures listed below, and calculate the protein's molecular weight.

# The feature types counted as secondary structures.
SS_OPTIONS = ['helix', 'strand', 'turn']

def get_total_feature_length(uniprot_record, feature_type):
    """
    Sum the lengths of all the features of a given type in a record.

    uniprot_record: an object with a `features` iterable, where each feature has a `type` attribute and a
        `location` with `start` and `end` attributes.
    feature_type: the feature type to count (compared to each feature's `type` for equality).

    Returns the sum of (end - start) over the matching features (0 if there are none).
    """
    matching_locations = (feature.location for feature in uniprot_record.features if feature.type == feature_type)
    return sum(location.end - location.start for location in matching_locations)

# Per-protein results, collected only for proteins that have some secondary structure annotation:
# the mass per residue, and (per secondary structure) the fraction of residues annotated with it.
relative_masses = []
relative_lengths_per_ss = defaultdict(list)

# From: http://www.uniprot.org/uniprot/?query=organism%3A%22Homo+sapiens+%5B9606%5D%22+AND+reviewed%3Ayes&sort=score
uniprot_path = os.path.join(DIR, 'uniprot_human_reviewed.xml.gz')

with gzip.open(uniprot_path, 'r') as f:
    for i, record in enumerate(SeqIO.parse(f, 'uniprot-xml')):

        # Progress indicator (overwritten in place).
        print(i, end = '\r')

        n_residues = len(record.seq)
        ss_fractions = {ss: get_total_feature_length(record, ss) / n_residues for ss in SS_OPTIONS}

        # If there are no residues with any secondary structure (i.e. the sum of the relative lengths, over all
        # secondary structures, is 0), then the protein is excluded from the analysis (it probably just has missing
        # annotations).
        if not sum(ss_fractions.values()) > 0:
            continue

        for ss in SS_OPTIONS:
            relative_lengths_per_ss[ss].append(ss_fractions[ss])

        relative_masses.append(molecular_weight(record.seq, seq_type = 'protein') / n_residues)

print('Correlations between relative mass to any of the three 2D structures:')

# Pearson correlation between the mass per residue and the fraction of each secondary structure.
for ss, ss_relative_lengths in relative_lengths_per_ss.items():
    r, p_value = pearsonr(relative_masses, ss_relative_lengths)
    print('\t' + '%s: r = %.2f, p-value = %.2e' % (ss.capitalize(), r, p_value))

Correlations between relative mass to any of the three 2D structures:
	Helix: r = 0.23, p-value = 3.59e-72
	Strand: r = 0.03, p-value = 3.24e-02
	Turn: r = 0.13, p-value = 8.05e-24


All three secondary structures give a significant, positive correlation with molecular weight (though in the case of Strand, the significance is borderline). The correlation is strongest for Helix (Pearson's coefficient of 0.23), and most significant (a p-value of 3.6e-72 cannot be attributed to chance). We conclude that molecular weight is in fact associated with secondary structure, but whether this is considered "substantial" is a matter of definition (personally, I would consider it a medium strength association for Helix and Turn, and weak, if any, for Strand). 