## https://glow.readthedocs.io/_/downloads/en/latest/pdf/

In [1]:
import glob
import os
import time
from random import sample

import glow
import pyspark.sql.functions as fx
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = glow.register(spark)

In [2]:
# Glob pattern selecting the Strelka somatic SNV VCF files under data/.
vcf_path_glob = 'data/*R.strelka.somatic.snvs.vcf.gz'
# Every matching file; glob.glob does not guarantee any particular order.
vcf_paths = glob.glob(vcf_path_glob)
# First match, used by the single-file examples (IndexError if nothing matched).
vcf_path = vcf_paths[0]

def load_vcf(path, includeSampleIds=True):
    """Read a VCF file into a Spark DataFrame.

    Args:
        path: Location of the VCF file handed to the Spark reader.
        includeSampleIds: Forwarded unchanged to the ``vcf`` reader as the
            ``includeSampleIds`` option.

    Returns:
        The DataFrame produced by the ``vcf`` data source.
    """
    # The "vcf" format is presumably provided by Glow (registered on the
    # module-level ``spark`` session) -- verify against the Glow docs.
    reader = spark.read.format("vcf")
    return reader.load(path, includeSampleIds=includeSampleIds)

def filter_pass(vcf):
    """Return only the rows of *vcf* whose ``filters`` array contains 'PASS'."""
    has_pass_filter = array_contains(col("filters"), 'PASS')
    # DataFrame.where is an alias of DataFrame.filter.
    return vcf.where(has_pass_filter)

def get_only_tumor_df(vcf, cols2keep=None):
    """Flatten one genotype entry of *vcf* into top-level columns.

    Args:
        vcf: DataFrame with a ``genotypes`` array-of-struct column.
        cols2keep: List of column names to carry over. Defaults to every
            column of *vcf* except ``genotypes``.

    Returns:
        A DataFrame with the ``cols2keep`` columns plus the fields of the
        selected genotype struct, without the ``sampleId`` column.
    """
    if cols2keep is None:
        cols2keep = [name for name in vcf.columns if name != 'genotypes']

    # NOTE: the expand_struct-based variant below was reported not to work
    # here (a bug? / Delta API changes?), hence the manual two-step select.
    # NOTE(review): element_at is 1-based, so index 1 there addresses the first
    # genotype, whereas getItem(1) below addresses the second -- confirm.
    #vcf.select(element_at(col("genotypes"),1).alias('tumor')).select(glow.expand_struct("tumor"))

    # getItem(1) is the second genotype entry; presumably the TUMOR sample of
    # a Strelka somatic VCF -- verify against the sample column order.
    tumor_genotype = vcf.genotypes.getItem(1).alias('tumor')
    with_tumor = vcf.select(cols2keep + [tumor_genotype])
    flattened = with_tumor.select(cols2keep + ["tumor.*"])
    return flattened.drop('sampleId')
    
def get_tumor_and_normal_id_from_strelka_somatic_vcf_path(path):
    """Infer the tumor and normal sample IDs from a Strelka VCF file name.

    The file name (directory part ignored) must have the form
    ``TUMOR_vs_NORMAL.strelka.somatic.snvs.vcf.gz``.

    Args:
        path: Path to the Strelka somatic SNV VCF file.

    Returns:
        A ``(tumor_sample_id, normal_sample_id)`` tuple of strings.

    Raises:
        ValueError: If the file name lacks the expected suffix or does not
            consist of exactly two non-empty IDs joined by ``_vs_``.
    """
    extension = '.strelka.somatic.snvs.vcf.gz'
    filename = os.path.basename(path)
    error_message = (
        "For tumor and normal ID inference the filename of Strelka somatic VCF "
        "is expected to be [TUMOR_vs_NORMAL" + extension + "].\nIt was " + filename
    )

    # Check the suffix explicitly: blindly slicing len(extension) characters
    # off an unrelated file name would yield garbage IDs.
    if not filename.endswith(extension):
        raise ValueError(error_message)

    sample_ids = filename[:-len(extension)].split('_vs_')
    # Fail loudly rather than print and return None, which made callers that
    # unpack the result die with an unrelated TypeError.
    if len(sample_ids) != 2 or not all(sample_ids):
        raise ValueError(error_message)

    tsid, nsid = sample_ids
    return (tsid, nsid)
    
def load_strelka_somatic_vcf(path, tumor_sample_id=None, normal_sample_id=None):
    """Load a Strelka somatic VCF, keep PASS records and flatten the tumor genotype.

    Args:
        path: Path to the Strelka somatic SNV VCF file.
        tumor_sample_id: Tumor sample ID to report. When ``None`` it is
            inferred from the file name.
        normal_sample_id: Accepted for interface compatibility; it does not
            influence the result.

    Returns:
        A ``(tumor_sample_id, dataframe)`` tuple, where ``dataframe`` is the
        PASS-filtered, tumor-only DataFrame.
    """
    # Only parse the file name when the caller did not supply the tumor ID, so
    # files with non-conforming names can still be loaded with explicit IDs.
    if tumor_sample_id is None:
        tumor_sample_id, _ = get_tumor_and_normal_id_from_strelka_somatic_vcf_path(path)

    vcf = load_vcf(path)
    fvcf = filter_pass(vcf)
    tvcf = get_only_tumor_df(fvcf)
    return tumor_sample_id, tvcf


def get_df_for_vcfs(vcf_paths):
    """Load several Strelka somatic VCFs into one DataFrame tagged by sample.

    Each file is loaded with ``load_strelka_somatic_vcf`` and given a
    ``sampleId`` column holding its tumor sample ID; the per-file DataFrames
    are then unioned.

    Args:
        vcf_paths: Iterable of VCF file paths.

    Returns:
        The unioned DataFrame, or ``None`` when *vcf_paths* is empty.
    """
    vcf_df = None
    for p in vcf_paths:
        tid, vcf = load_strelka_somatic_vcf(p)
        new_vcf = vcf.withColumn('sampleId', lit(tid))
        # Union by column name rather than by position: a positional union
        # silently misaligns columns if two files expose them in a different
        # order, whereas unionByName either lines them up or fails loudly.
        vcf_df = new_vcf if vcf_df is None else vcf_df.unionByName(new_vcf)
    return vcf_df


## Load VCF

### Load a single VCF

In [3]:
%%time
vcf = load_vcf(vcf_path)
vcf.printSchema()

                                                                                

root
 |-- contigName: string (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- referenceAllele: string (nullable = true)
 |-- alternateAlleles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qual: double (nullable = true)
 |-- filters: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- splitFromMultiAllelic: boolean (nullable = true)
 |-- INFO_SomaticEVS: double (nullable = true)
 |-- INFO_QSS_NT: integer (nullable = true)
 |-- INFO_PNOISE2: double (nullable = true)
 |-- INFO_MQ0: integer (nullable = true)
 |-- INFO_ReadPosRankSum: double (nullable = true)
 |-- INFO_TQSS: integer (nullable = true)
 |-- INFO_SNVSB: double (nullable = true)
 |-- INFO_DP: integer (nullable = true)
 |-- INFO_PNOISE: double (nullable = true)
 |-- INFO_QSS: integer (nullable = true)
 |-- INFO_MQ: double (nullable = true)
 |-- INFO

#### Load and filter

In [4]:
%%time
tid, vcf = load_strelka_somatic_vcf(vcf_path)



CPU times: user 25.8 ms, sys: 15.8 ms, total: 41.6 ms
Wall time: 934 ms


In [5]:
vcf.head(2)

21/08/10 21:33:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Row(contigName='1', start=60790, end=60791, names=None, referenceAllele='A', alternateAlleles=['G'], qual=None, filters=['PASS'], splitFromMultiAllelic=False, INFO_SomaticEVS=8.84, INFO_QSS_NT=72, INFO_PNOISE2=None, INFO_MQ0=5, INFO_ReadPosRankSum=-2.18, INFO_TQSS=1, INFO_SNVSB=0.0, INFO_DP=85, INFO_PNOISE=None, INFO_QSS=72, INFO_MQ=52.57, INFO_SGT='AA->AG', INFO_SOMATIC=True, INFO_NT='ref', INFO_TQSS_NT=1, FDP=0, GU=[7, 14], AU=[34, 35], depth=41, SUBDP=0, TU=[0, 0], SDP=0, CU=[0, 0]),
 Row(contigName='1', start=701235, end=701236, names=None, referenceAllele='C', alternateAlleles=['T'], qual=None, filters=['PASS'], splitFromMultiAllelic=False, INFO_SomaticEVS=7.26, INFO_QSS_NT=75, INFO_PNOISE2=None, INFO_MQ0=24, INFO_ReadPosRankSum=-3.81, INFO_TQSS=1, INFO_SNVSB=0.0, INFO_DP=154, INFO_PNOISE=None, INFO_QSS=75, INFO_MQ=37.2, INFO_SGT='CC->CT', INFO_SOMATIC=True, INFO_NT='ref', INFO_TQSS_NT=1, FDP=1, GU=[0, 0], AU=[0, 0], depth=78, SUBDP=0, TU=[11, 19], SDP=0, CU=[66, 83])]

In [6]:
%%time
vcf.count()



CPU times: user 4.64 ms, sys: 3.35 ms, total: 7.99 ms
Wall time: 3.04 s


                                                                                

21840

### Load multiple VCFs

In [7]:
%%time
vcf5_df = get_df_for_vcfs(vcf_paths[0:5])
vcf_df = vcf5_df



CPU times: user 209 ms, sys: 44.1 ms, total: 253 ms
Wall time: 2.78 s


In [8]:
%%time
vcf30_df = get_df_for_vcfs(vcf_paths[0:30])



CPU times: user 1.07 s, sys: 184 ms, total: 1.25 s
Wall time: 10.4 s


In [9]:
%%time
vcf133_df = get_df_for_vcfs(vcf_paths)









CPU times: user 3.87 s, sys: 718 ms, total: 4.59 s
Wall time: 31.9 s


## Make data lake

In [10]:
def save_datalake(vcf_obj, path):
    """Write *vcf_obj* to *path* in Delta format, overwriting existing data.

    Args:
        vcf_obj: DataFrame to persist.
        path: Destination passed to the writer's ``save``.
    """
    delta_writer = vcf_obj.write.format("delta").mode("overwrite")
    delta_writer.save(path)

def load_datalake(path):
    """Return the DataFrame read from the Delta table stored at *path*."""
    return spark.read.format("delta").load(path)

# Delta table locations; presumably one per multi-sample union
# (5, 30 and 133 VCFs) -- see the save_datalake calls in the cells below.
multi_lake5_path = "delta/test-delta-multi5"
multi_lake30_path = "delta/test-delta-multi30"
multi_lake133_path = "delta/test-delta-multi133"


### Multisample

In [11]:
%%time
#save_datalake(vcf5_df, multi_lake5_path)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 11.7 µs


In [12]:
%%time
#save_datalake(vcf30_df, multi_lake30_path)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.06 µs


In [13]:
%%time
#save_datalake(vcf133_df, multi_lake133_path)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.11 µs


## Load datalake

In [14]:
%%time
delta_vcf30 = load_datalake(multi_lake30_path)
delta_vcf30.show(n=2)

                                                                                

+----------+--------+--------+-----+---------------+----------------+----+-------+---------------------+---------------+-----------+------------+--------+-------------------+---------+----------+-------+-----------+--------+-------+--------+------------+-------+------------+---+------+--------+-----+-----+------+---+--------+-------------+
|contigName|   start|     end|names|referenceAllele|alternateAlleles|qual|filters|splitFromMultiAllelic|INFO_SomaticEVS|INFO_QSS_NT|INFO_PNOISE2|INFO_MQ0|INFO_ReadPosRankSum|INFO_TQSS|INFO_SNVSB|INFO_DP|INFO_PNOISE|INFO_QSS|INFO_MQ|INFO_SGT|INFO_SOMATIC|INFO_NT|INFO_TQSS_NT|FDP|    GU|      AU|depth|SUBDP|    TU|SDP|      CU|     sampleId|
+----------+--------+--------+-----+---------------+----------------+----+-------+---------------------+---------------+-----------+------------+--------+-------------------+---------+----------+-------+-----------+--------+-------+--------+------------+-------+------------+---+------+--------+-----+-----+------+--

[Stage 348:>                                                        (0 + 1) / 1]                                                                                

In [15]:
%%time
delta_vcf30.count()

CPU times: user 0 ns, sys: 2.8 ms, total: 2.8 ms
Wall time: 1.02 s


448250

## Timings

#### Count variants per sample

In [17]:
def get_variants_per_sample(df):
    """Collect the number of rows of *df* per ``sampleId``.

    Returns:
        A list of Rows with ``sampleId`` and ``count`` fields.
    """
    per_sample = df.groupBy('sampleId')
    row_counts = per_sample.count()
    return row_counts.collect()

In [18]:
%%time  
get_variants_per_sample(vcf30_df)

21/08/10 21:35:03 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB




21/08/10 21:36:04 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB

CPU times: user 195 ms, sys: 56.4 ms, total: 251 ms
Wall time: 1min 16s




[Row(sampleId='CPCT02290028T', count=9735),
 Row(sampleId='CPCT02030278T', count=19379),
 Row(sampleId='CPCT02300026T', count=23464),
 Row(sampleId='CPCT02330084T', count=9693),
 Row(sampleId='CPCT02030498T', count=11081),
 Row(sampleId='CPCT02080198T', count=6591),
 Row(sampleId='CPCT02020593T', count=21695),
 Row(sampleId='CPCT02020466T', count=18889),
 Row(sampleId='CPCT02030509T', count=13717),
 Row(sampleId='CPCT02030455T', count=21840),
 Row(sampleId='CPCT02130056T', count=12887),
 Row(sampleId='CPCT02020516T', count=18339),
 Row(sampleId='CPCT02160017T', count=34899),
 Row(sampleId='CPCT02120120T', count=5911),
 Row(sampleId='CPCT02300049T', count=8729),
 Row(sampleId='CPCT02270031T', count=8630),
 Row(sampleId='CPCT02080191T', count=26262),
 Row(sampleId='CPCT02170020T', count=18549),
 Row(sampleId='CPCT02070190T', count=19832),
 Row(sampleId='CPCT02060168T', count=7746),
 Row(sampleId='CPCT02040223T', count=11975),
 Row(sampleId='CPCT02100112T', count=10975),
 Row(sampleId='CP

In [19]:
%%time  
get_variants_per_sample(delta_vcf30)

CPU times: user 3.5 ms, sys: 6.53 ms, total: 10 ms
Wall time: 1.85 s




[Row(sampleId='CPCT02290028T', count=9735),
 Row(sampleId='CPCT02030278T', count=19379),
 Row(sampleId='CPCT02300026T', count=23464),
 Row(sampleId='CPCT02330084T', count=9693),
 Row(sampleId='CPCT02030498T', count=11081),
 Row(sampleId='CPCT02080198T', count=6591),
 Row(sampleId='CPCT02020593T', count=21695),
 Row(sampleId='CPCT02020466T', count=18889),
 Row(sampleId='CPCT02030509T', count=13717),
 Row(sampleId='CPCT02030455T', count=21840),
 Row(sampleId='CPCT02130056T', count=12887),
 Row(sampleId='CPCT02020516T', count=18339),
 Row(sampleId='CPCT02160017T', count=34899),
 Row(sampleId='CPCT02120120T', count=5911),
 Row(sampleId='CPCT02300049T', count=8729),
 Row(sampleId='CPCT02270031T', count=8630),
 Row(sampleId='CPCT02080191T', count=26262),
 Row(sampleId='CPCT02170020T', count=18549),
 Row(sampleId='CPCT02070190T', count=19832),
 Row(sampleId='CPCT02060168T', count=7746),
 Row(sampleId='CPCT02040223T', count=11975),
 Row(sampleId='CPCT02100112T', count=10975),
 Row(sampleId='CP

#### Get min POS for each sample and each chromosome

In [21]:
def count_min_pos_per_chromosome(df):
    """Collect the smallest ``start`` for every (sampleId, contigName) pair.

    Despite the name, this computes a minimum, not a count.

    Returns:
        A list of Rows with ``sampleId``, ``contigName`` and ``min(start)``.
    """
    grouped = df.groupBy('sampleId', 'contigName')
    smallest_start = fx.min(df.start)
    return grouped.agg(smallest_start).collect()

In [22]:
%%time
count_min_pos_per_chromosome(vcf30_df)

21/08/10 21:36:24 WARN DAGScheduler: Broadcasting large task binary with size 4.4 MiB




21/08/10 21:37:37 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB

CPU times: user 257 ms, sys: 36.5 ms, total: 294 ms
Wall time: 1min 29s


                                                                                

[Row(sampleId='CPCT02140079T', contigName='20', min(start)=198358),
 Row(sampleId='CPCT02020466T', contigName='MT', min(start)=7886),
 Row(sampleId='CPCT02020466T', contigName='GL000205.1', min(start)=97409),
 Row(sampleId='CPCT02020516T', contigName='17', min(start)=91847),
 Row(sampleId='CPCT02130056T', contigName='GL000199.1', min(start)=13862),
 Row(sampleId='CPCT02270031T', contigName='X', min(start)=327714),
 Row(sampleId='CPCT02080191T', contigName='GL000233.1', min(start)=17578),
 Row(sampleId='CPCT02160017T', contigName='GL000198.1', min(start)=64788),
 Row(sampleId='CPCT02080198T', contigName='hs37d5', min(start)=88978),
 Row(sampleId='CPCT02030498T', contigName='3', min(start)=123774),
 Row(sampleId='CPCT02040223T', contigName='MT', min(start)=250),
 Row(sampleId='CPCT02060226T', contigName='8', min(start)=579302),
 Row(sampleId='CPCT02080191T', contigName='15', min(start)=20047187),
 Row(sampleId='CPCT02030355T', contigName='5', min(start)=176369),
 Row(sampleId='CPCT020303

In [23]:
%%time
count_min_pos_per_chromosome(delta_vcf30)

CPU times: user 12.3 ms, sys: 4.12 ms, total: 16.5 ms
Wall time: 1.53 s


[Row(sampleId='CPCT02020466T', contigName='MT', min(start)=7886),
 Row(sampleId='CPCT02020516T', contigName='17', min(start)=91847),
 Row(sampleId='CPCT02140079T', contigName='20', min(start)=198358),
 Row(sampleId='CPCT02020466T', contigName='GL000205.1', min(start)=97409),
 Row(sampleId='CPCT02160017T', contigName='GL000198.1', min(start)=64788),
 Row(sampleId='CPCT02030498T', contigName='3', min(start)=123774),
 Row(sampleId='CPCT02080191T', contigName='GL000233.1', min(start)=17578),
 Row(sampleId='CPCT02270031T', contigName='X', min(start)=327714),
 Row(sampleId='CPCT02080198T', contigName='hs37d5', min(start)=88978),
 Row(sampleId='CPCT02130056T', contigName='GL000199.1', min(start)=13862),
 Row(sampleId='CPCT02080191T', contigName='15', min(start)=20047187),
 Row(sampleId='CPCT02060226T', contigName='8', min(start)=579302),
 Row(sampleId='CPCT02030355T', contigName='5', min(start)=176369),
 Row(sampleId='CPCT02040223T', contigName='MT', min(start)=250),
 Row(sampleId='CPCT020304

### Filtering

In [25]:
def get_to_G_subs(df):
    """Return the rows of *df* whose ``alternateAlleles`` array contains 'G'."""
    alt_is_g = array_contains(col("alternateAlleles"), 'G')
    # DataFrame.where is an alias of DataFrame.filter.
    return df.where(alt_is_g)
def get_to_G_subs_count_per_sample(df):
    """Collect, per ``sampleId``, the number of rows with 'G' among the alternate alleles.

    Returns:
        A list of Rows with ``sampleId`` and ``count`` fields.
    """
    to_g_rows = get_to_G_subs(df)
    per_sample = to_g_rows.groupBy('sampleId')
    return per_sample.count().collect()

In [26]:
%%time
#get_to_G_subs(vcf30_df).count()
get_to_G_subs_count_per_sample(vcf30_df)

21/08/10 21:37:57 WARN DAGScheduler: Broadcasting large task binary with size 4.4 MiB




21/08/10 21:38:56 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB

CPU times: user 212 ms, sys: 37.3 ms, total: 249 ms
Wall time: 1min 13s




[Row(sampleId='CPCT02290028T', count=1885),
 Row(sampleId='CPCT02030278T', count=4014),
 Row(sampleId='CPCT02300026T', count=5181),
 Row(sampleId='CPCT02330084T', count=2005),
 Row(sampleId='CPCT02030498T', count=1872),
 Row(sampleId='CPCT02080198T', count=1244),
 Row(sampleId='CPCT02020593T', count=4641),
 Row(sampleId='CPCT02020466T', count=3606),
 Row(sampleId='CPCT02030509T', count=3156),
 Row(sampleId='CPCT02030455T', count=5270),
 Row(sampleId='CPCT02130056T', count=2565),
 Row(sampleId='CPCT02020516T', count=3829),
 Row(sampleId='CPCT02160017T', count=7460),
 Row(sampleId='CPCT02120120T', count=1324),
 Row(sampleId='CPCT02300049T', count=1970),
 Row(sampleId='CPCT02270031T', count=1692),
 Row(sampleId='CPCT02080191T', count=6019),
 Row(sampleId='CPCT02170020T', count=4365),
 Row(sampleId='CPCT02070190T', count=5090),
 Row(sampleId='CPCT02060168T', count=1540),
 Row(sampleId='CPCT02040223T', count=2345),
 Row(sampleId='CPCT02100112T', count=2177),
 Row(sampleId='CPCT02050058T', c

In [27]:
%%time
#get_to_G_subs(delta_vcf30).count()
get_to_G_subs_count_per_sample(delta_vcf30)



CPU times: user 7 ms, sys: 4.68 ms, total: 11.7 ms
Wall time: 2.13 s




[Row(sampleId='CPCT02290028T', count=1885),
 Row(sampleId='CPCT02030278T', count=4014),
 Row(sampleId='CPCT02300026T', count=5181),
 Row(sampleId='CPCT02330084T', count=2005),
 Row(sampleId='CPCT02030498T', count=1872),
 Row(sampleId='CPCT02080198T', count=1244),
 Row(sampleId='CPCT02020593T', count=4641),
 Row(sampleId='CPCT02020466T', count=3606),
 Row(sampleId='CPCT02030509T', count=3156),
 Row(sampleId='CPCT02030455T', count=5270),
 Row(sampleId='CPCT02130056T', count=2565),
 Row(sampleId='CPCT02020516T', count=3829),
 Row(sampleId='CPCT02160017T', count=7460),
 Row(sampleId='CPCT02120120T', count=1324),
 Row(sampleId='CPCT02300049T', count=1970),
 Row(sampleId='CPCT02270031T', count=1692),
 Row(sampleId='CPCT02080191T', count=6019),
 Row(sampleId='CPCT02170020T', count=4365),
 Row(sampleId='CPCT02070190T', count=5090),
 Row(sampleId='CPCT02060168T', count=1540),
 Row(sampleId='CPCT02040223T', count=2345),
 Row(sampleId='CPCT02100112T', count=2177),
 Row(sampleId='CPCT02050058T', c

### Filter and select columns

In [29]:
%%time
get_to_G_subs(vcf30_df).select(vcf30_df.start).collect()[0:10]

21/08/10 21:39:15 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB




                                                                                

CPU times: user 430 ms, sys: 40.7 ms, total: 471 ms
Wall time: 57.4 s


[Row(start=60790),
 Row(start=966539),
 Row(start=1532055),
 Row(start=1740663),
 Row(start=1851187),
 Row(start=1938818),
 Row(start=4406928),
 Row(start=4536171),
 Row(start=4691166),
 Row(start=4797853)]

In [30]:
%%time
get_to_G_subs(delta_vcf30).select(delta_vcf30.start).collect()[0:10]

CPU times: user 312 ms, sys: 0 ns, total: 312 ms
Wall time: 1.19 s


[Row(start=91429494),
 Row(start=92986881),
 Row(start=93475944),
 Row(start=95102291),
 Row(start=95836214),
 Row(start=96296397),
 Row(start=96431842),
 Row(start=97007874),
 Row(start=97353581),
 Row(start=97376714)]

### Select columns on the full dataset

In [32]:
%%time
vcf30_df.select(col("filters"), col("qual")).collect()[0:10]

21/08/10 21:40:17 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB




                                                                                

CPU times: user 1.99 s, sys: 98.4 ms, total: 2.09 s
Wall time: 56.7 s


[Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None)]

In [33]:
%%time
delta_vcf30.select(col("filters"), col("qual")).collect()[0:10]



CPU times: user 2.1 s, sys: 87.2 ms, total: 2.19 s
Wall time: 3.56 s


[Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None),
 Row(filters=['PASS'], qual=None)]

### Count records (on the full dataset)

In [34]:
%%time
vcf30_df.count()

21/08/10 21:41:16 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB






CPU times: user 148 ms, sys: 27.5 ms, total: 176 ms
Wall time: 46.3 s


                                                                                

448250

In [35]:
%%time
delta_vcf30.count()

CPU times: user 2.27 ms, sys: 0 ns, total: 2.27 ms
Wall time: 464 ms


448250

## ------

In [None]:
#
# code examples from the manual
#

#phenotypes_path = '/databricks-datasets/genomics/1000G/phenotypes.normalized'
#reference_genome_path = "/dbfs/databricks-datasets/genomics/grch37/data/human_g1k_v37.fa"
#vcf_output_path = "dbfs:/home/genomics/vcf/subset.vcf"


#genotype = delta_vcf.where((fx.col("contigName") == '22') & 
#                           (fx.col("start") == 1234567)). \
#                     selectExpr("contigName", "start", "filter(genotypes, g -> g.sampleId = '{0}') as genotypes".format(sample_id))

## Class implementation that does not work
#### (problematic inheritance from DataFrame)

In [None]:
# Narrower glob than the one in the first cell: only files matching '*20R...'.
# NOTE: this reassigns the module-level vcf_path_glob / vcf_paths / vcf_path.
vcf_path_glob = 'data/*20R.strelka.somatic.snvs.vcf.gz'
vcf_paths = glob.glob(vcf_path_glob)
# First match (IndexError if nothing matched).
vcf_path = vcf_paths[0]

class VCF(DataFrame):
    """DataFrame loaded from a VCF file that remembers where it came from.

    The instance itself is initialised as a view of the loaded VCF, while
    ``self.vcf`` holds the working DataFrame that the fluent methods
    (e.g. ``filter_PASS``) replace step by step.
    """

    # Path the VCF was loaded from; set per instance in __init__.
    path = None
    # Working DataFrame, replaced by each fluent transformation.
    vcf = None

    def __init__(self, path):
        """Load the VCF at *path* and initialise the instance from it."""
        loaded = self.__load_vcf(path)
        # Assigning to ``self`` would only rebind a local name and leave the
        # instance untouched, so initialise the DataFrame base explicitly
        # from the loaded frame's JVM handle and context.
        # NOTE(review): relies on DataFrame.__init__(jdf, sql_ctx) as in
        # PySpark 3.x -- confirm for the PySpark version in use.
        super().__init__(loaded._jdf, loaded.sql_ctx)
        self.vcf = loaded
        self.path = path

    def __load_vcf(self, path, includeSampleIds=True):
        """Read *path* with the ``vcf`` reader and return the DataFrame."""
        vcf = (
          spark
          .read
          .format("vcf")
          .load(path, includeSampleIds=includeSampleIds)
        )
        return vcf

    def filter_PASS(self):
        """Restrict ``self.vcf`` to rows whose ``filters`` contain 'PASS'.

        Returns:
            ``self``, to allow method chaining.
        """
        self.vcf = self.vcf.filter(array_contains(col("filters"), 'PASS'))
        return self
        
        
class StrelkaSomaticSnvVCF(VCF):
    """VCF specialisation for Strelka somatic SNV files.

    File names are expected to look like ``TUMOR_vs_NORMAL`` followed by
    ``extension``.
    """

    # File-name suffix stripped before splitting the sample IDs.
    extension = '.strelka.somatic.snvs.vcf.gz'

    def __init__(self, path):
        super().__init__(path)

    def get_only_tumor_df(self, cols2keep=None):
        """Flatten one genotype entry of ``self.vcf`` into top-level columns.

        Args:
            cols2keep: List of column names to carry over. Defaults to every
                column of ``self.vcf`` except ``genotypes``.

        Returns:
            ``self``, to allow method chaining; ``self.vcf`` is replaced by
            the flattened DataFrame without the ``sampleId`` column.
        """
        if cols2keep is None:
            cols2keep = [e for e in self.vcf.columns if e not in ['genotypes']]

        # NOTE: the expand_struct-based variant below was reported not to
        # work here (a bug? / Delta API changes?).
        # NOTE(review): element_at is 1-based whereas getItem is 0-based on
        # arrays, so the two lines address different genotypes -- confirm.
        #vcf.select(element_at(col("genotypes"),1).alias('tumor')).select(glow.expand_struct("tumor"))

        # getItem(1) is the second genotype entry; presumably the TUMOR
        # sample -- verify against the sample column order of the VCF.
        tvcf = self.vcf.select(cols2keep + [self.vcf.genotypes.getItem(1).alias('tumor')])
        self.vcf = tvcf.select(cols2keep + ["tumor.*"]).drop('sampleId')
        return self

    def get_tumor_and_normal_ids_from_path(self):
        """Infer the tumor and normal sample IDs from the file name in ``self.path``.

        Returns:
            A ``(tumor_sample_id, normal_sample_id)`` tuple of strings.

        Raises:
            ValueError: If the file name lacks ``extension`` or does not
                consist of exactly two non-empty IDs joined by ``_vs_``.
        """
        filename = os.path.basename(self.path)
        error_message = (
            "For tumor and normal ID inference the filename of Strelka somatic VCF "
            "is expected to be [TUMOR_vs_NORMAL" + self.extension + "].\n"
            "It was " + filename
        )
        if not filename.endswith(self.extension):
            raise ValueError(error_message)

        sample_ids = filename[:-len(self.extension)].split('_vs_')
        # Raise instead of printing and returning None, so a malformed name
        # cannot propagate silently as a missing result.
        if len(sample_ids) != 2 or not all(sample_ids):
            raise ValueError(error_message)

        tsid, nsid = sample_ids
        return (tsid, nsid)

    def get_tumor_PASS_variants(self):
        """Apply the PASS filter, then flatten the tumor genotype.

        Returns:
            ``self``, with ``self.vcf`` holding the PASS-only, tumor-only data.
        """
        # The inherited method is spelled filter_PASS (upper-case PASS).
        return self.filter_PASS().get_only_tumor_df()
        
