In [1]:
import glob

# Input VCF locations (relative to the notebook's working directory) for the
# CPCT02130085T vs CPCT02130085R sample pair.
manta_germ_vcf_path = "data/other/CPCT02130085T_vs_CPCT02130085R.manta.germline.vep.vcf"
manta_som_vcf_path = "data/other/CPCT02130085T_vs_CPCT02130085R.manta.somatic.vep.vcf"
strelka_som_indels_path = "data/other/CPCT02130085T_vs_CPCT02130085R.strelka.somatic.indels.norm.vcf.gz"
# glob.glob() returns matches in arbitrary, filesystem-dependent order; sort so
# that positional access (e.g. ascat_vep_vcf_paths[0]) selects the same file on
# every run and on every machine.
ascat_vep_vcf_paths = sorted(glob.glob("data/ascat_vep/*.vcf"))


In [3]:
import time

import glow
# NOTE(review): `spark` is not created anywhere in the visible notebook; it is
# presumably the SparkSession injected by the kernel (e.g. a PySpark shell) —
# confirm. The name is rebound to the value returned by glow.register().
spark = glow.register(spark)
import pyspark.sql.functions as fx
# NOTE(review): the two wildcard imports below add many names to the notebook
# namespace; pyspark.sql.functions in particular exports names such as `sum`,
# `min` and `max`, which would shadow the Python builtins — verify before use.
from pyspark.sql.types import *
from pyspark.sql.functions import *
from random import sample
# `expr` is imported explicitly here, after the wildcard import of the same module above.
from pyspark.sql.functions import expr

In [4]:
def load_vcf(path, includeSampleIds=True):
    """Read a VCF into a Spark DataFrame through the "vcf" data source.

    Args:
        path: Location of the VCF, forwarded unchanged to the reader's ``load``.
        includeSampleIds: Forwarded to ``load`` as the reader option of the
            same name. Defaults to True.

    Returns:
        Whatever ``spark.read.format("vcf").load(...)`` returns.
    """
    vcf_reader = spark.read.format("vcf")
    return vcf_reader.load(path, includeSampleIds=includeSampleIds)


## ASCAT VEP-annotated VCF

In [12]:
# Explore only the first ASCAT VEP VCF found by the glob and print its column layout.
first_ascat_vcf_path = ascat_vep_vcf_paths[0]
ascat = load_vcf(first_ascat_vcf_path)
ascat.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_END: integer (nullable = true)
 |-- INFO_CSQ: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Allele: string (nullable = true)
 |    |    |-- Consequence: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- IMPACT: string (nullable = true)
 |    |    |-- SYMBOL: string (nullable = true)
 |    |    |-- Gene: string (nullable = true)
 |    |    |-- Feature_type: string (nullable = true)
 

In [13]:
# Row count per distinct INFO_SVTYPE value in the ASCAT frame.
ascat_svtype_counts = ascat.groupBy("INFO_SVTYPE").count()
ascat_svtype_counts.collect()

                                                                                

[Row(INFO_SVTYPE='CNV', count=284)]

## Strelka somatic indels

In [14]:
# Load the Strelka somatic indel calls and print their column layout.
ss_indel = load_vcf(path=strelka_som_indels_path)
ss_indel.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_RU: string (nullable = true)
 |-- INFO_SomaticEVS: double (nullable = true)
 |-- INFO_QSI_NT: integer (nullable = true)
 |-- INFO_MQ0: integer (nullable = true)
 |-- INFO_QSI: integer (nullable = true)
 |-- INFO_RC: integer (nullable = true)
 |-- INFO_OVERLAP: boolean (nullable = true)
 |-- INFO_TQSI_NT: integer (nullable = true)
 |-- INFO_TQSI: integer (nullable = true)
 |-- INFO_IHP: integer (nullable = true)
 |-- INFO_MQ: double (nullable = true)
 |-- INFO_SGT: stri



In [16]:
# Row count per distinct value of the `filters` column in the Strelka indel frame.
ss_indel_filter_counts = ss_indel.groupBy('filters').count()
ss_indel_filter_counts.collect()

                                                                                

[Row(filters=['LowEVS'], count=128489),
 Row(filters=['PASS'], count=3654),
 Row(filters=['HighDepth;LowEVS'], count=78),
 Row(filters=['HighDepth'], count=3),
 Row(filters=['LowDepth'], count=1),
 Row(filters=['LowEVS;LowDepth'], count=1)]

## Manta somatic (VEP-annotated)

In [17]:
# Load the Manta somatic calls and print their column layout.
manta_som = load_vcf(path=manta_som_vcf_path)
manta_som.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_LEFT_SVINSSEQ: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_CIEND: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- INFO_CIGAR: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_MATE_BND_DEPTH: integer (nullable = true)
 |-- INFO_EVENT: string (nullable = true)
 |-- INFO_SVINSSEQ: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_BND_DEPTH: 

In [18]:
# Row count per distinct INFO_SVTYPE value in the Manta somatic frame.
manta_som_svtype_counts = manta_som.groupBy('INFO_SVTYPE').count()
manta_som_svtype_counts.collect()

                                                                                

[Row(INFO_SVTYPE='DUP', count=16),
 Row(INFO_SVTYPE='INS', count=2),
 Row(INFO_SVTYPE='DEL', count=35),
 Row(INFO_SVTYPE='BND', count=196)]

## Manta germline (VEP-annotated)

In [19]:
# Load the Manta germline calls and print their column layout.
manta_germ = load_vcf(path=manta_germ_vcf_path)
manta_germ.printSchema()

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_LEFT_SVINSSEQ: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_JUNCTION_QUAL: integer (nullable = true)
 |-- INFO_CIEND: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- INFO_CIGAR: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- INFO_MATE_BND_DEPTH: integer (nullable = true)
 |-- INFO_EVENT: string (nullable = true)
 |-- INFO_SVINSSEQ: array (nullable = true)
 |    |-- element:

In [21]:
# Row count per distinct INFO_SVTYPE value in the Manta germline frame.
manta_germ_svtype_counts = manta_germ.groupBy('INFO_SVTYPE').count()
manta_germ_svtype_counts.collect()

                                                                                

[Row(INFO_SVTYPE='DUP', count=505),
 Row(INFO_SVTYPE='INS', count=3309),
 Row(INFO_SVTYPE='DEL', count=5587),
 Row(INFO_SVTYPE='BND', count=3984)]