In [None]:
import os
import gzip

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from ipywidgets import FloatProgress

%matplotlib inline

DIR = r'c://downloads'

plt.style.use('ggplot')

# PyVCF

In [None]:
# Source data: chromosome 17 genotypes from phase 3 of the 1000 Genomes Project
# (2504 individuals), taken from
# ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/
VCF_FILE_NAME = 'ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'
vcf_file_path = os.path.join(DIR, VCF_FILE_NAME)

# Decompress on the fly in text mode and show the first 100000 characters of the raw file.
with gzip.open(vcf_file_path, mode = 'rt') as raw_vcf:
    head_text = raw_vcf.read(100000)
    print(head_text)

In [None]:
import vcf

# Open the gzip-compressed VCF file through PyVCF's reader.
vcf_reader = vcf.Reader(
    filename = vcf_file_path,
    compressed = True,
)

In [None]:
# Show the reader's metadata mapping as a whole, then a separator line,
# then one "key: value" line per entry.
header_metadata = vcf_reader.metadata
print(header_metadata)
print('*' * 50)

for metadata_entry in header_metadata.items():
    # Each entry is a (key, value) pair, which fills the two placeholders directly.
    print('%s: %s' % metadata_entry)

In [None]:
# One line per entry of the reader's INFO definitions: name, type and description.
for info_id, info_field in vcf_reader.infos.items():
    summary = (info_id, info_field.type, info_field.desc)
    print('%s [%s]: %s' % summary)

In [None]:
# Number of samples known to the reader, followed by the first ten entries.
sample_names = vcf_reader.samples
print(len(sample_names))
print(sample_names[:10])

In [None]:
# Pull the next record from the reader and summarise it.
# NOTE: the reader is consumed as an iterator, so re-running this cell moves on to
# the following record.
record = next(vcf_reader)
record_summary = (record.CHROM, record.POS, record.REF, record.ALT, record.ID)
print('Chromosome %s at %d: %s --> %s (%s)' % record_summary)
print(type(record.ALT))
print(record.INFO)

In [None]:
# Variant classification of the current record, followed by its alleles.
variant_kind = record.var_type
snp_flag, indel_flag = record.is_snp, record.is_indel
print(variant_kind)
print(snp_flag, indel_flag)
print(record.alleles)

In [None]:
# How many per-sample entries the record carries, and the first ten of them.
sample_calls = record.samples
print(len(sample_calls))
print(sample_calls[:10])

In [None]:
# Inspect the record's first per-sample entry: the object itself, then selected attributes.
call = record.samples[0]
print(call)
for attribute in ('sample', 'gt_alleles', 'gt_bases'):
    print(getattr(call, attribute))
print(call.is_variant, call.is_het)

In [None]:
# Look up a single sample's entry by sample name.
print(record.genotype('HG00107'))

# Entries returned by each genotype-group accessor; only the first three of
# get_hom_refs() are shown.
print(record.get_hom_refs()[:3])
print(record.get_hets())
print(record.get_hom_alts())
print(record.get_unknowns())

# The record's num_* counters, printed on one line.
genotype_counts = (
    record.num_called,
    record.num_hom_ref,
    record.num_het,
    record.num_hom_alt,
    record.num_unknown,
)
print(*genotype_counts)

# Example: building DNA substitution matrix

In [None]:
from itertools import islice

BASES = list('ACGT')

# Accumulator indexed [reference base, alternate base]. It is created as float (0.0)
# because allele frequencies are added to it; an integer frame would need an implicit
# int -> float upcast on the first addition, which recent pandas versions warn about
# and plan to reject.
dna_sub_matrix = pd.DataFrame(0.0, index = BASES, columns = BASES)

# Only the first N_RECORDS records of the file are scanned.
N_RECORDS = 5000
progress_bar = FloatProgress(max = N_RECORDS)
display(progress_bar)

# A fresh reader is opened so the scan starts at the beginning of the file,
# independently of how far `vcf_reader` has already been advanced.
for i, record in enumerate(islice(vcf.Reader(filename = vcf_file_path, compressed = True), N_RECORDS)):

    # i is zero-based; +1 lets the bar reach `max` once the last record is processed.
    progress_bar.value = i + 1

    # Skip non-SNP records, and records whose reference base has no row in the
    # matrix (anything other than A/C/G/T would raise KeyError on lookup).
    if not record.is_snp or record.REF not in BASES:
        continue

    # Pair each alternate allele with the matching entry of INFO['AF'] and add
    # that frequency to the REF -> ALT cell.
    for alt_allele, frequency in zip(record.ALT, record.INFO['AF']):
        alt_base = alt_allele.sequence
        if alt_base in BASES:
            dna_sub_matrix.loc[record.REF, alt_base] += frequency

# Normalise every row to sum to 1, so each row reads as the distribution of
# substitutions away from that reference base. A row with no observations becomes NaN.
dna_sub_matrix = dna_sub_matrix.div(dna_sub_matrix.sum(axis = 1), axis = 0)
display(dna_sub_matrix)

In [None]:
# Heatmap of the substitution matrix: rows (reference base) run along the y axis,
# columns (alternate base) along the x axis.
fig, ax = plt.subplots(figsize = (8, 6))
heatmap = ax.pcolor(dna_sub_matrix, cmap = 'cool', vmin = 0)
fig.colorbar(heatmap)

# Ticks sit at the centre of each unit-sized cell, hence the 0.5 offset.
ax.set_xticks(np.arange(len(BASES)) + 0.5)
ax.set_xticklabels(BASES)
ax.set_yticks(np.arange(len(BASES)) + 0.5)
ax.set_yticklabels(BASES)
ax.set_xlabel('Alternate allele')
ax.set_ylabel('Reference allele')

# Annotate each cell with its value as a percentage. pcolor draws element
# [row, col] at x = col, y = row, so the text position must use (col, row);
# swapping them would label every cell with the transposed entry.
for row in range(len(BASES)):
    for col in range(len(BASES)):
        share = dna_sub_matrix.iloc[row, col]
        # '%d' cannot format NaN (a reference base with no observed substitutions).
        label = 'n/a' if np.isnan(share) else '%d%%' % (100 * share)
        ax.text(col + 0.5, row + 0.5, label, ha = 'center', va = 'center', color = 'black',
                fontsize = 16)

_ = ax.set_title('DNA Nucleotide Substitution Matrix')

Transitions, i.e. substitutions within the purines (adenine <-> guanine) or within the pyrimidines (cytosine <-> thymine), are more common than transversions, which replace a purine with a pyrimidine or vice versa.