In [1]:
import os

# Directory that the data files opened in later cells are resolved against.
# NOTE(review): hard-coded absolute local path -- adjust before running on another machine.
DIR = r'c://downloads'

In [2]:
import json
from collections import Counter

# Load the genome annotation JSON. The context manager closes the file even
# when json.load() raises (e.g. on malformed input), which the manual
# open()/close() pair did not guarantee.
with open(os.path.join(DIR, 'herpesvirus_genome.json'), 'r') as f:
    data = json.load(f)

def noramlize(counter):
    """Convert a mapping of counts into a mapping of relative frequencies.

    Args:
        counter: Mapping from key to numeric count (e.g. a ``Counter``).

    Returns:
        A new dict with the same keys, in the same order, where each value
        is the key's count divided by the sum of all counts. An empty
        mapping yields an empty dict.

    Raises:
        ZeroDivisionError: If the mapping is non-empty but its counts sum
            to zero.
    """
    total = sum(counter.values())
    frequencies = {}
    for key, count in counter.items():
        frequencies[key] = count / total
    return frequencies

# Accumulate counts over the 'translation' entry of every coding region.
all_aa_counter = Counter()
for region in data['coding_regions']:
    all_aa_counter.update(region['translation'])

# Convert the pooled counts into relative frequencies; these serve as the
# baseline that the per-category frequencies are compared against.
all_aa_freq = noramlize(all_aa_counter)
print(all_aa_freq)

{'M': 0.023263426153234668, 'E': 0.058988567019709615, 'L': 0.08994412101657666, 'R': 0.07816043580931005, 'G': 0.06993056042645718, 'P': 0.06144350143789016, 'S': 0.07776297023684273, 'D': 0.0585443407916579, 'A': 0.11942671436253537, 'Y': 0.027238081877907928, 'C': 0.020644829440508754, 'V': 0.06700801945243272, 'Q': 0.026934137616609385, 'T': 0.05536461621191929, 'N': 0.028383717939725515, 'W': 0.011456360618175867, 'F': 0.03574852119426714, 'H': 0.02029412452362582, 'I': 0.03523415398283884, 'K': 0.03422879988777443}


In [3]:
from operator import itemgetter

# One counter per category keyword; a coding region contributes to every
# category whose keyword occurs (case-insensitively) in its 'product' text.
aa_counter_per_category = {category: Counter() for category in ('envelope', 'membrane', 'capsid')}

for coding_region in data['coding_regions']:
    # Lower-case once per region rather than once per category.
    product_name = coding_region['product'].lower()
    for category, aa_counter in aa_counter_per_category.items():
        if category not in product_name:
            continue
        aa_counter.update(coding_region['translation'])
            
# Relative frequency of each counted key within each category.
aa_freq_per_category = {
    category: noramlize(aa_counter)
    for category, aa_counter in aa_counter_per_category.items()
}

# Ratio of the in-category frequency to the overall frequency
# (> 1 means over-represented in the category, < 1 under-represented).
aa_relative_freq_per_category = {
    category: {aa: freq / all_aa_freq[aa] for aa, freq in aa_freq.items()}
    for category, aa_freq in aa_freq_per_category.items()
}
    
def get_aa_stats(aa, category):
    """Format a one-line frequency summary for `aa` within `category`.

    Reads the module-level dicts ``aa_relative_freq_per_category``,
    ``aa_freq_per_category`` and ``all_aa_freq``.

    Args:
        aa: Key to look up in the frequency dicts.
        category: Category key of the per-category dicts.

    Returns:
        A string with the frequency ratio (2 decimals), the in-category
        frequency and the general frequency (4 decimals each).

    Raises:
        KeyError: If `category` or `aa` is missing from the dicts.
    """
    freq_ratio = aa_relative_freq_per_category[category][aa]
    category_freq = aa_freq_per_category[category][aa]
    general_freq = all_aa_freq[aa]
    template = '%s: freq ratio = %.2f, freq in category = %.4f, general freq = %.4f'
    return template % (aa, freq_ratio, category_freq, general_freq)
    
# For each category, report the key with the highest and the key with the
# lowest frequency ratio relative to the overall frequencies.
for category, aa_relative_freq in aa_relative_freq_per_category.items():

    print('%s: ' % category)

    # A category whose keyword matched no product has no frequencies at all;
    # max()/min() would raise ValueError on the empty sequence, so report it
    # and move on instead of aborting the whole cell.
    if not aa_relative_freq:
        print('\t' + 'no matching coding regions')
        continue

    max_aa, _ = max(aa_relative_freq.items(), key = itemgetter(1))
    min_aa, _ = min(aa_relative_freq.items(), key = itemgetter(1))

    print('\t' + get_aa_stats(max_aa, category))
    print('\t' + get_aa_stats(min_aa, category))

envelope: 
	T: freq ratio = 1.43, freq in category = 0.0792, general freq = 0.0554
	R: freq ratio = 0.75, freq in category = 0.0586, general freq = 0.0782
membrane: 
	T: freq ratio = 1.29, freq in category = 0.0712, general freq = 0.0554
	E: freq ratio = 0.67, freq in category = 0.0396, general freq = 0.0590
capsid: 
	Q: freq ratio = 1.40, freq in category = 0.0377, general freq = 0.0269
	W: freq ratio = 0.71, freq in category = 0.0081, general freq = 0.0115


In [None]:
import csv

# Column names applied, in order, to each row read from the CSV file.
HEADERS = [
    'entry_id', 'family', 'genus', 'title',
    'resolution', 'genome', 't', 'subunits',
    'inner_radius', 'outer_radius', 'average_radius',
    'net_surface_charge', 'outside_sasa',
]

# Read every data row of the CSV into a dict keyed by HEADERS.
# - The context manager closes the file even if parsing raises.
# - newline='' is what the csv module asks for, so newlines embedded in
#   quoted fields are parsed correctly.
with open(os.path.join(DIR, 'viperdb.csv'), 'r', newline='') as f:
    csv_reader = csv.reader(f)
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(csv_reader, None)
    dict_records = [dict(zip(HEADERS, record)) for record in csv_reader]

In [4]:
# Keep, for every genus, the record with the largest integer 'outer_radius'.
# On a tie the record seen first is kept (strict '>').
biggest_record_per_genus = {}

for record in dict_records:
    genus = record['genus']
    current_best = biggest_record_per_genus.get(genus)
    if current_best is None or int(record['outer_radius']) > int(current_best['outer_radius']):
        biggest_record_per_genus[genus] = record

print('There are %d genera, the biggest records are:' % len(biggest_record_per_genus))

# List the winners alphabetically by genus name.
for genus in sorted(biggest_record_per_genus):
    record = biggest_record_per_genus[genus]
    summary = '%s: %s (outer radius = %s)' % (genus, record['entry_id'], record['outer_radius'])
    print('\t' + summary)

There are 72 genera, the biggest records are:
	Alfamovirus: amv (outer radius = 111)
	Allolevivirus: 1qbe (outer radius = 147)
	Alphanodavirus: 1nov (outer radius = 179)
	Alphapapillomavirus: 3j6r (outer radius = 301)
	Alpharetrovirus: 2x8q (outer radius = 120)
	Alphavirus: 2xfc (outer radius = 349)
	Aphthovirus: 1qgc (outer radius = 216)
	Aquabirnavirus: 3ide (outer radius = 133)
	Aquareovirus: 3iyl (outer radius = 476)
	Avibirnavirus: 1wce (outer radius = 374)
	Betatetravirus: 2qqp (outer radius = 210)
	Brevidensovirus: 3n7x (outer radius = 123)
	Bromovirus: ccmv_swln_2 (outer radius = 168)
	Calicivirus: 3m8l (outer radius = 208)
	Cardiovirus: 1tmf (outer radius = 168)
	Carmovirus: 3zx9 (outer radius = 188)
	Chlorovirus: 1m4x (outer radius = 929)
	Chrysovirus: 3j3i (outer radius = 208)
	Circovirus: 3r0r (outer radius = 102)
	Comovirus: 1bmv (outer radius = 160)
	Cripavirus: 3nap (outer radius = 168)
	Cucumovirus: 1f15 (outer radius = 151)
	Cypovirus: 3j17 (outer radius = 484)
	Cystov

In [5]:
# Collect the 'outside_sasa' values of the biggest record of each genus,
# grouped by which genome-type label occurs in the record's 'genome' field.
sasas_per_group = {'ssRNA': [], 'dsRNA': [], 'ssDNA': [], 'dsDNA': []}

for record in biggest_record_per_genus.values():

    # Records whose value is the 'N/A' placeholder cannot be averaged.
    if record['outside_sasa'] == 'N/A':
        continue

    sasa = int(record['outside_sasa'])

    for group, sasas in sasas_per_group.items():
        # Substring match: a record is added to every group whose label
        # appears in its genome description.
        if group in record['genome']:
            sasas.append(sasa)

for group, sasas in sasas_per_group.items():
    # A group with no usable records would make the average divide by zero.
    if not sasas:
        print('%s: Avg. SASA = N/A (0 genera)' % group)
        continue
    # %d truncates the average to an integer for display.
    print('%s: Avg. SASA = %d (%d genera)' % (group, sum(sasas) / len(sasas), len(sasas)))

ssRNA: Avg. SASA = 1710132 (29 genera)
dsRNA: Avg. SASA = 1994869 (12 genera)
ssDNA: Avg. SASA = 895916 (6 genera)
dsDNA: Avg. SASA = 4403428 (21 genera)
