In [1]:
import os
import gzip
import csv
import json
from collections import defaultdict

# Directory that holds the GENCODE input file; the JSON output of Q1A is written here too.
# Set the GENCODE_DIR environment variable to point the notebook at another location
# without editing the code; when it is unset, the original hard-coded path is used.
DIR = os.environ.get('GENCODE_DIR', r'c://downloads')

# Preprocessing and exploratory analysis of GENCODE

In [2]:
# File taken from:
# ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/GRCh37_mapping/gencode.v29lift37.annotation.gtf.gz
# Opened in text mode ('rt') so that csv.reader receives str lines. The handle is kept in `f`
# because `csv_reader` is lazy: the rows are only read (and `f` closed) further down in this cell.
f = gzip.open(os.path.join(DIR, 'gencode.v29lift37.annotation.gtf.gz'), 'rt')

# Header/comment lines start with '#'. They are filtered out by content rather than by skipping
# a fixed number of lines, so a file with a different header length is still parsed correctly.
# quoting=csv.QUOTE_NONE: the file is plain tab-separated text, not quoted CSV, so the double
# quotes in the attribute column must be passed through untouched.
csv_reader = csv.reader((line for line in f if not line.startswith('#')), delimiter = '\t',
        quoting = csv.QUOTE_NONE)

def parse_extra_fields(raw_extra_fields):
    """
    Parse a ';'-separated attribute string into a dictionary.

    Each attribute is expected to look like `key value` or `key "value"`; surrounding double
    quotes are removed from the value. Empty segments (e.g. the one after the final ';', or an
    entirely empty input) are ignored, and a value may itself contain spaces.

    If the same key appears more than once, the last occurrence wins.

    Raises ValueError if a non-empty segment has a key but no value.

    NOTE: a ';' inside a quoted value is not supported; it is treated as a separator.
    """

    extra_fields = {}

    for raw_extra_field in raw_extra_fields.split(';'):

        raw_extra_field = raw_extra_field.strip()

        if not raw_extra_field:
            # Nothing between two separators (typically the tail after the last ';').
            continue

        # Split only on the first run of whitespace, so values containing spaces stay intact.
        key, raw_value = raw_extra_field.split(None, 1)
        extra_fields[key] = raw_value.strip('"')

    return extra_fields

# A dictionary from chromosome name to a list of genes.
# Each gene is a tuple of 4 elements: gene_name (str), gene_type (str), start (int), end (int)
genes_per_chromosome = defaultdict(list)
# A dictionary from gene name to a list of exons.
# Each exon is a (start, end) tuple
# NOTE(review): exons are keyed by gene name and appended once per 'exon' row, so if the file lists
# the same exon under several transcripts, or reuses a gene name, entries accumulate - confirm this
# is the intended definition before relying on per-gene exon statistics.
exons_per_gene = defaultdict(list)

# We explore the GENCODE database, and are first interested to see what annotation types and gene types are available
# within this dataset.
all_annotation_types = set()
all_gene_types = set()

try:
    for a_chr, _, a_type, a_start, a_end, _, _, _, raw_extra_fields in csv_reader:

        all_annotation_types.add(a_type)

        if a_type == 'gene':
            extra_fields = parse_extra_fields(raw_extra_fields)
            all_gene_types.add(extra_fields['gene_type'])
            genes_per_chromosome[a_chr].append((extra_fields['gene_name'], extra_fields['gene_type'], int(a_start), \
                    int(a_end)))
        elif a_type == 'exon':
            # Read the gene name from the exon row itself instead of reusing whatever the most recent
            # 'gene' row left behind; this no longer depends on the row order of the file.
            extra_fields = parse_extra_fields(raw_extra_fields)
            exons_per_gene[extra_fields['gene_name']].append((int(a_start), int(a_end)))
finally:
    # Release the file handle even if a malformed row aborts the loop.
    f.close()

print('All annotation types: ' + ', '.join(sorted(all_annotation_types)))
print('*' * 50)
print('All gene types: ' + ', '.join(sorted(all_gene_types)))
print('*' * 50)

n_genes = sum(map(len, genes_per_chromosome.values()))
n_exons = sum(map(len, exons_per_gene.values()))
print('Extracted %d genes in %d chromosomes and %d exons in %d genes.' % (n_genes, len(genes_per_chromosome), n_exons, \
        len(exons_per_gene)))

All annotation types: CDS, Selenocysteine, UTR, exon, gene, start_codon, stop_codon, transcript
**************************************************
All gene types: 3prime_overlapping_ncRNA, IG_C_gene, IG_C_pseudogene, IG_D_gene, IG_J_gene, IG_J_pseudogene, IG_V_gene, IG_V_pseudogene, IG_pseudogene, Mt_rRNA, Mt_tRNA, TEC, TR_C_gene, TR_D_gene, TR_J_gene, TR_J_pseudogene, TR_V_gene, TR_V_pseudogene, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, miRNA, misc_RNA, non_coding, polymorphic_pseudogene, processed_pseudogene, processed_transcript, protein_coding, pseudogene, rRNA, scRNA, sense_intronic, sense_overlapping, snRNA, snoRNA, transcribed_processed_pseudogene, transcribed_unitary_pseudogene, transcribed_unprocessed_pseudogene, translated_processed_pseudogene, unitary_pseudogene, unprocessed_pseudogene, vaultRNA
**************************************************
Extracted 60880 genes in 37 chromosomes and 1267972 exons in 57781 genes.


# Q1A

In [3]:
def filter_genes_of_type(genes, gene_type):
    """
    Return a new list with the gene records whose second element equals `gene_type`.

    `genes` is an iterable of indexable records; the original order is preserved and the
    input is not modified.
    """
    matching_genes = []
    for gene_record in genes:
        if gene_record[1] == gene_type:
            matching_genes.append(gene_record)
    return matching_genes

# Chromosome name -> list of the protein-coding gene tuples found on that chromosome.
protein_coding_genes_per_chromosome = {}

# Chromosomes are visited in lexicographic order of their names, which fixes the order of the report.
for chromosome, genes in sorted(genes_per_chromosome.items()):
    protein_coding_genes = filter_genes_of_type(genes, 'protein_coding')
    miRNA_genes = filter_genes_of_type(genes, 'miRNA')
    protein_coding_genes_per_chromosome[chromosome] = protein_coding_genes
    print('%s: %d genes, of which %d protein coding and %d miRNA' % (chromosome, len(genes), len(protein_coding_genes), \
            len(miRNA_genes)))

# Persist the protein-coding genes. The context manager closes the file even if json.dump raises.
# Note that the gene tuples are serialized as JSON arrays.
with open(os.path.join(DIR, 'protein_coding_genes_per_chromosome.json'), 'w') as f:
    json.dump(protein_coding_genes_per_chromosome, f)

GL000192.1: 1 genes, of which 0 protein coding and 0 miRNA
GL000193.1: 3 genes, of which 0 protein coding and 0 miRNA
GL000195.1: 16 genes, of which 0 protein coding and 0 miRNA
GL000199.1: 1 genes, of which 0 protein coding and 0 miRNA
GL000202.1: 1 genes, of which 0 protein coding and 0 miRNA
GL000204.1: 1 genes, of which 0 protein coding and 0 miRNA
GL000205.1: 2 genes, of which 0 protein coding and 0 miRNA
GL000212.1: 8 genes, of which 0 protein coding and 0 miRNA
GL000220.1: 10 genes, of which 0 protein coding and 0 miRNA
GL000228.1: 6 genes, of which 0 protein coding and 0 miRNA
GL000237.1: 1 genes, of which 0 protein coding and 0 miRNA
GL000241.1: 1 genes, of which 0 protein coding and 0 miRNA
chr1: 5556 genes, of which 2094 protein coding and 240 miRNA
chr10: 2367 genes, of which 753 protein coding and 120 miRNA
chr11: 3333 genes, of which 1328 protein coding and 136 miRNA
chr12: 3073 genes, of which 1056 protein coding and 122 miRNA
chr13: 1375 genes, of which 328 protein codi

# Q1B

In [4]:
# For every chromosome (in lexicographic order of its name), report the mean length of the exons
# recorded for its protein-coding genes. Chromosomes without any such exon are not reported.
for chromosome, genes in sorted(genes_per_chromosome.items()):

    # Coordinates are 1-based and inclusive, hence the +1.
    exon_lengths = [
        exon_end - exon_start + 1
        for name, kind, _, _ in genes
        if kind == 'protein_coding'
        for exon_start, exon_end in exons_per_gene[name]
    ]

    if not exon_lengths:
        continue

    avg_exon_length = sum(exon_lengths) / len(exon_lengths)
    # %d truncates the average to an integer for display.
    print('%s: avg exon length = %d' % (chromosome, avg_exon_length))

chr1: avg exon length = 248
chr10: avg exon length = 242
chr11: avg exon length = 234
chr12: avg exon length = 225
chr13: avg exon length = 261
chr14: avg exon length = 241
chr15: avg exon length = 239
chr16: avg exon length = 230
chr17: avg exon length = 223
chr18: avg exon length = 247
chr19: avg exon length = 230
chr2: avg exon length = 239
chr20: avg exon length = 248
chr21: avg exon length = 245
chr22: avg exon length = 234
chr3: avg exon length = 235
chr4: avg exon length = 246
chr5: avg exon length = 244
chr6: avg exon length = 255
chr7: avg exon length = 236
chr8: avg exon length = 240
chr9: avg exon length = 244
chrM: avg exon length = 876
chrX: avg exon length = 258
chrY: avg exon length = 228


# Q1C

In [5]:
# Best candidate found so far, across all chromosomes: its length in bp, and a
# (chromosome, first coordinate, last coordinate) tuple describing it.
longest_intergenic_region_length = 0
longest_intergenic_region = None

for chromosome, genes in genes_per_chromosome.items():

    # Take the (start, end) coordinates of every gene on this chromosome, whatever its type, ordered by
    # start coordinate. The sweep below relies on visiting the genes in increasing order of start.
    gene_coordinates = sorted((gene[2], gene[3]) for gene in genes)

    # The right-most coordinate covered by any gene visited so far on this chromosome. Everything up to and
    # including this coordinate has already been accounted for. It starts at 0, so the stretch between the
    # beginning of the chromosome and its first gene is examined like any other gap.
    last_processed_coordinate = 0

    for start, end in gene_coordinates:

        # The candidate gap runs from the coordinate right after the covered prefix up to the coordinate
        # right before the current gene (coordinates are 1-based and inclusive).
        gap_start = last_processed_coordinate + 1
        gap_end = start - 1

        # When the current gene overlaps or directly follows the covered prefix, this length is zero or
        # negative and can never beat the current record, so no special handling is needed.
        intergenic_length = gap_end - gap_start + 1

        if intergenic_length > longest_intergenic_region_length:
            # New record: this is the longest intergenic region seen so far.
            longest_intergenic_region_length = intergenic_length
            longest_intergenic_region = (chromosome, gap_start, gap_end)

        # Extend the covered prefix to the end of the current gene. A gene nested inside a previously
        # visited one ends before last_processed_coordinate, in which case the prefix must not shrink.
        if end > last_processed_coordinate:
            last_processed_coordinate = end

print('Longest intergenic region: %s:%d-%d (length = %d bp)' % (*longest_intergenic_region,
        longest_intergenic_region_length))

Longest intergenic region: chrY:28780800-59001390 (length = 30220591 bp)


According to this analysis, the longest intergenic region is a ~30Mbp region on chromosome Y.

Of course, the very notion of an "intergenic region" is ill-defined. Crucially, it depends on (i) what we define as a gene, and (ii) which gene annotations we use. The notion of the "longest intergenic region" is particularly sensitive, because a single additional gene annotation in the middle of that region is enough to cut its length in half, so that it may no longer be the longest intergenic region.

In our case, we relied on GENCODE (release 29, mapped to version GRCh37 of the human reference genome), and we considered all the genomic elements described as "genes". It is very likely that if we used different annotations (even just a different version of GENCODE) we would get a totally different genomic region, due to the considerations mentioned above.