# JSON

In [None]:
import json

# Let the context manager close the file even if parsing fails midway.
with open(r'c://downloads/herpesvirus_genome.json', 'r') as f:
    data = json.load(f)

print(type(data))

In [None]:
# It's safer to check how big something is before printing it.
# For the loaded JSON object this is its number of top-level keys.
print(len(data))

In [None]:
# List every top-level key together with the type of the value stored under it.
for key in data:
    value = data[key]
    print(key, type(value))

In [None]:
# Print every top-level entry except 'coding_regions', which is
# inspected on its own in the cells that follow.
for key, value in data.items():
    if key == 'coding_regions':
        continue
    print('%s: %s' % (key, value))

In [None]:
# Pull out the coding regions and report their container type and count.
coding_regions = data['coding_regions']
print(type(coding_regions), len(coding_regions), sep='\n')

In [None]:
# Take the first coding region as a sample to explore its structure.
coding_region = coding_regions[0]
print(type(coding_region))

In [None]:
# Number of keys in the sample coding region.
print(len(coding_region))

In [None]:
# List each key of the sample coding region with the type of its value.
for key in coding_region:
    value = coding_region[key]
    print(key, type(value))

In [None]:
# Print every entry of the sample coding region except 'intervals',
# which is inspected on its own in the cells that follow.
for key, value in coding_region.items():
    if key == 'intervals':
        continue
    print('%s: %s' % (key, value))

In [None]:
# How many intervals does the sample coding region have?
print(len(coding_region['intervals']))

In [None]:
# Single-target unpacking: raises ValueError unless there is exactly one interval.
[interval] = coding_region['intervals']
print(type(interval))

In [None]:
# Number of keys in the interval.
print(len(interval))

In [None]:
# List each key of the interval with the type of its value.
for key in interval:
    value = interval[key]
    print(key, type(value))

In [None]:
# Its size was checked above, so print the interval in full.
print(interval)

In [None]:
# Print the whole sample coding region, including its 'intervals' entry.
print(coding_region)

In [None]:
# Gather the 'product' entry of every coding region, in file order.
products = [region['product'] for region in coding_regions]

print(products)

In [None]:
# For each keyword, collect the length of the 'translation' of every coding
# region whose product name contains that keyword. A product that contains
# several keywords is counted in each matching group.
lengths_per_group = {'envelope': [], 'membrane': [], 'capsid': []}
all_lengths = []

for coding_region in coding_regions:

    # Lowercase the name so the keyword match is case-insensitive.
    product_name = coding_region['product'].lower()
    length = len(coding_region['translation'])

    for group_name, group_lengths in lengths_per_group.items():
        if group_name in product_name:
            group_lengths.append(length)

    all_lengths.append(length)

# Added only after the loop so that 'all' is never used as a keyword to match.
lengths_per_group['all'] = all_lengths

for group_name, group_lengths in sorted(lengths_per_group.items()):
    # A keyword may match no product at all (or there may be no coding
    # regions); report NaN rather than crash with ZeroDivisionError.
    if group_lengths:
        avg = sum(group_lengths) / len(group_lengths)
    else:
        avg = float('nan')
    print('%s: # = %d, avg. = %.2f aa' % (group_name, len(group_lengths), avg))

In [None]:
# The with-block flushes and closes the file even if serialization fails.
with open(r'c://downloads/protein_lengths_per_group.json', 'w') as f:
    json.dump(lengths_per_group, f)

In [None]:
# Serialize to an in-memory string instead of writing to a file.
raw_json = json.dumps(lengths_per_group)
print(type(raw_json), raw_json, sep='\n')

In [None]:
# Parse the JSON text back into Python objects.
data = json.loads(raw_json)
print(type(data), data, sep='\n')

# CSV

In [None]:
# Reading human gene annotations from gencode.v29lift37.annotation.gtf.gz at:
# https://www.gencodegenes.org/human/release_29lift37.html

import gzip

# 'rt' opens the compressed file in text mode, so read() returns str.
# The with-block closes the file even if reading fails.
with gzip.open(r'c://downloads/gencode.v29lift37.annotation.gtf.gz', 'rt') as f:
    print(f.read(1000))

In [None]:
# When using gzip with Python 3, you need to explicitly ask for text (t) mode;
# plain 'r' means binary, so this prints a bytes object instead of text.
with gzip.open(r'c://downloads/gencode.v29lift37.annotation.gtf.gz', 'r') as f:
    print(f.read(1000))

In [None]:
import csv

from itertools import islice

# The with-block closes the file even if reading or parsing fails.
with gzip.open(r'c://downloads/gencode.v29lift37.annotation.gtf.gz', 'rt') as f:

    # Default delimiter is comma (,)
    csv_reader = csv.reader(f, delimiter='\t')

    # Skip the 5 first header lines, then keep the next 100 rows. Unlike
    # repeated next() calls, islice simply stops early (no StopIteration)
    # if the file turns out to be shorter than that.
    annotations = list(islice(csv_reader, 5, 105))

print(len(annotations))
print(annotations[:5])

In [None]:
# If you want to go over all lines, just iterate over the csv reader with a for loop (no need to use the 'next' function)
# (this will take too long for this demo)
# A reader on a freshly opened file is required here: the file behind the
# csv_reader created earlier is already closed, so iterating that reader
# would raise "ValueError: I/O operation on closed file".

with gzip.open(r'c://downloads/gencode.v29lift37.annotation.gtf.gz', 'rt') as f:
    csv_reader = csv.reader(f, delimiter='\t')
    for line in csv_reader:
        # Do something...
        pass

In [None]:
genes = []


def parse_extra_fields(raw_extra_fields):
    """Parse a ';'-separated 'key value' attribute string into a dict.

    Each field looks like `key "value"` or `key value`; surrounding double
    quotes are stripped from the value. A trailing ';' is optional, empty
    fields are ignored, and a value may itself contain spaces. A key with no
    value maps to the empty string. When a key occurs more than once, the
    last occurrence wins.

    Args:
        raw_extra_fields: the raw attribute text (str).

    Returns:
        A dict mapping each key (str) to its unquoted value (str).
    """
    extra_fields = {}

    for raw_extra_field in raw_extra_fields.split(';'):
        raw_extra_field = raw_extra_field.strip()

        # Skip the empty piece after a trailing ';' (or an entirely empty input).
        if not raw_extra_field:
            continue

        # Split on the first space only, so values containing spaces survive.
        key, _, raw_value = raw_extra_field.partition(' ')
        extra_fields[key] = raw_value.strip().strip('"')

    return extra_fields

# Keep only the 'gene' records, stored as [name, type, chromosome, start, end].
for a_chr, _, a_type, a_start, a_end, _, _, _, raw_extra_fields in annotations:
    if a_type != 'gene':
        continue
    extra_fields = parse_extra_fields(raw_extra_fields)
    gene_name, gene_type = extra_fields['gene_name'], extra_fields['gene_type']
    genes.append([gene_name, gene_type, a_chr, int(a_start), int(a_end)])

print(len(genes), genes, sep='\n')

In [None]:
# newline='' lets the csv module control line endings itself; the with-block
# flushes and closes the file even if writing fails.
with open(r'c://downloads/genes.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(genes)

Notes:
* `newline = ''` prevents the blank line that would otherwise appear after every row on Windows, because the csv writer already emits its own `\r\n` line endings
* `writerows` expects a list of lists of strings (or objects to convert to strings)
* You can write one row at a time using `writerow`, which expects a list of strings