# Slice vcf with Various Modules and Prototype Designer for VCF

- pybedtools
- pyvcf

HTSeq
-----

- Slicing is quite complex and we dont get easy access to call details

pybedtools
----------

- quite flaky with access to large vcf.gz tabixed files
- just use it for interval algebra, if necessary

pycvf
-----

- works a treat
- issues with running fetch in Python 2 dues to some Cython bugginess. Tests only pass in Py3

In [7]:
from pyfaidx import Fasta
from pybedtools import BedTool
import vcf

In [14]:
!pwd
!ls  ../test/test-data/AcCHR1_test*

/Users/cfljam/Documents/pcr_marker_design/docs
../test/test-data/AcCHR1_test.fasta
../test/test-data/AcCHR1_test.phased.vcf.gz
../test/test-data/AcCHR1_test.phased.vcf.gz.tbi
../test/test-data/AcCHR1_test.vcf.gz
../test/test-data/AcCHR1_test.vcf.gz.tbi


### Load up an indexed reference and vcf

In [16]:
myfa= Fasta("../test/test-data/AcCHR1_test.fasta")
myvcf=vcf.Reader(filename='../test/test-data/AcCHR1_test.vcf.gz')

Can access a slice using [fetch](http://pyvcf.readthedocs.org/en/latest/API.html#vcf-reader)

Pythonic intervals are start/end, POS is 1-based vcf index

In [17]:
 for X in myvcf.fetch("CHR1",200,900):
        print(X.POS,X.start,X.end,X.var_type,X.alleles)

361 360 394 indel ['GGTAAGGCTAACCACAGGTACATCCAAGCTTCTC', GC]
436 435 436 snp ['A', C]
499 498 501 indel ['CGC', CGGC]
535 534 535 snp ['C', T]
542 541 542 snp ['C', T]
558 557 558 snp ['T', G]
803 802 803 snp ['A', T]


To get length can just use end - start  

Fetch arg has our chromosome/contig ID

In [19]:
for  X in myvcf.fetch("CHR1",1000,1900):
    if (X.var_type=='indel'):
        print(X.POS,X.var_subtype,X.start,X.end,X.affected_end,X.REF,X.ALT,X.alleles)

1051 del 1050 1053 1053 TAC [TC] ['TAC', TC]
1793 unknown 1792 1806 1806 TAAAAAAAAAAAAT [TAAAAAAAAAAAAAT, TAAAAAAAAAAAT, TAAAAAAAAAAAAAAT] ['TAAAAAAAAAAAAT', TAAAAAAAAAAAAAT, TAAAAAAAAAAAT, TAAAAAAAAAAAAAAT]


Probaby simplest to return a dict from comprehension, so we can the exclude our target.

**However** this would be less flexible than making a bed tool

would need to 

- pass in a target in Python Primer 3 target format

i.e. 0-based start  length

- exclude using 

In [22]:
myintdict={X.start:X.end-X.start for X  in myvcf.fetch("CHR1",2000,2400)}
myintdict

{2084: 1,
 2142: 1,
 2151: 3,
 2173: 1,
 2207: 1,
 2216: 1,
 2217: 1,
 2240: 1,
 2257: 1,
 2295: 1,
 2317: 1,
 2333: 1,
 2348: 12,
 2371: 1,
 2391: 1}

### Create  a BedTool from Vcf interval

In [24]:
["CHR1" + " " + str(X.start)+ " " +  str(X.end) for X  in myvcf.fetch("CHR1",3000,3500)]

['CHR1 2997 3015',
 'CHR1 3039 3040',
 'CHR1 3161 3162',
 'CHR1 3222 3223',
 'CHR1 3306 3309',
 'CHR1 3423 3425']

need to join with newline into a string

In [25]:
testint=["CHR1" + " " + str(X.start)+ " " +  str(X.end) for X  in myvcf.fetch("CHR1",3000,3500)]
"\n".join(testint)

'CHR1 2997 3015\nCHR1 3039 3040\nCHR1 3161 3162\nCHR1 3222 3223\nCHR1 3306 3309\nCHR1 3423 3425'

In [26]:
testbed=BedTool("\n".join(testint),from_string=True)

In [27]:
print(testbed)

CHR1	2997	3015
CHR1	3039	3040
CHR1	3161	3162
CHR1	3222	3223
CHR1	3306	3309
CHR1	3423	3425



## Write a class

In [34]:
class VcfPrimerDesign:
    """A primer design object that is primed
    with genome reference and vcf variant data
    """
    def __init__(self,reference,vcf_file,desc):
        """
        Usage:  PrimerDesign(reference, vcf.gz, description)
        Initialise a design object with a  reference assembly and
        variant file(s)
        """
        self.reference = Fasta(reference)
        self.vcf=vcf.Reader(filename=vcf_file)
        self.desc=desc
        self.genome=self.reference.filename.replace("fasta","fasta.fai")
    
    
    def getseqslicedict(self,target,max_size):
        """Pass a bed target to a designer and get a dictionary
        slice that we can pass to P3
        """
        target_int=target.slop(b=max_size,g=self.genome)
        target_chrom=target[0].chrom
        target_start=target_int[0].start
        target_end=target_int[0].end
        offset=target_int[0].start
        sldic=dict(SEQUENCE_ID=self.desc)
        sldic['TARGET_ID']=target_chrom + "_" + str(target_start) +"_" + str(target_end)
        sldic['SEQUENCE_TEMPLATE']=str(self.reference[target_chrom][target_start:target_end].seq)
        #slice_annot=[(X.start -offset,X.length) for X in (self.annotations - target) if (X.chrom==target[0].chrom) & \
         #           (X.start > target_int[0].start) & (X.end < target_int[0].end)]
        slice_vars=[target_chrom + " " + str(X.start)+ " " +str(X.end) for X in self.vcf.fetch(target_chrom,target_start,target_end)]
        slice_annot=BedTool("\n".join(slice_vars),from_string=True)
        slice_annot=slice_annot-target
        sldic['SEQUENCE_EXCLUDED_REGION']=[(X.start,X.length) for X in slice_annot]
        sldic['SEQUENCE_TARGET']= (target[0].start -offset,target[0].length)
        return sldic



-----------------

Try Out 

In [47]:
mydesigner=VcfPrimerDesign("../test/test-data/AcCHR1_test.fasta",
                            '../test/test-data/AcCHR1_test.vcf.gz','TestCHR1')
myphaseddesigner=VcfPrimerDesign("../test/test-data/AcCHR1_test.fasta",
                            '../test/test-data/AcCHR1_test.phased.vcf.gz','TestCHR1')

In [48]:
my_target=BedTool('CHR1 3000 3001',from_string=True)
mybdesigner.getseqslicedict(my_target,200)

{'SEQUENCE_EXCLUDED_REGION': [(2872, 1),
  (2907, 1),
  (2926, 1),
  (2965, 1),
  (2993, 1),
  (3039, 1),
  (3161, 1)],
 'SEQUENCE_ID': 'TestCHR1',
 'SEQUENCE_TARGET': (200, 1),
 'SEQUENCE_TEMPLATE': 'CAGTCGCAACTGTTCCTAGCTTTGCAGAACCATTCCCACTCAAGGAATTCACGGACTCTTTATGTGCCTTCAGAACCAACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGCTGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACCACATATAAAAGGGACAGCAAACATTTTAACATGAGCAAATCAGTGACACTAGGTAGGTGTTAGCACAAAAATGAACCTTGTTTACATCTGTTCACCACATCCTAGAACATCTTAGACACACACTGCAATAACATATGAGGTGGAGCATGGCACAGTGATACTGCAACAGTAGGATTCCC',
 'TARGET_ID': 'CHR1_2800_3201'}

In [67]:
myphaseddesigner.getseqslicedict(my_target,200)

{'SEQUENCE_EXCLUDED_REGION': [(2872, 1),
  (2907, 1),
  (2926, 1),
  (2965, 1),
  (2993, 1),
  (3039, 1),
  (3161, 1)],
 'SEQUENCE_ID': 'TestCHR1',
 'SEQUENCE_TARGET': (200, 1),
 'SEQUENCE_TEMPLATE': 'CAGTCGCAACTGTTCCTAGCTTTGCAGAACCATTCCCACTCAAGGAATTCACGGACTCTTTATGTGCCTTCAGAACCAACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGCTGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACCACATATAAAAGGGACAGCAAACATTTTAACATGAGCAAATCAGTGACACTAGGTAGGTGTTAGCACAAAAATGAACCTTGTTTACATCTGTTCACCACATCCTAGAACATCTTAGACACACACTGCAATAACATATGAGGTGGAGCATGGCACAGTGATACTGCAACAGTAGGATTCCC',
 'TARGET_ID': 'CHR1_2800_3201'}

### Test with some other vcf

In [42]:
from pcr_marker_design import design as d
from pybedtools import BedTool
import vcf

In [60]:
mybdesign=d.VcfPrimerDesign('../test/test-data/CHR9.1.68.5.fasta',
                                '../test/test-data/Chr9_Myb210.vcf.gz','MybTest')

In [63]:
mybdesign.annot.fetch('CHR9',1390500,1390600)

ValueError: could not create iterator for region 'CHR9:1390501-1390600'

In [64]:
mytarget=BedTool("CHR9 1390622 1390623",from_string=True)
mybdesign.getseqslicedict(mytarget,100)

ValueError: could not create iterator for region 'CHR9:1390523-1390723'

In [7]:
import pysam
pysam.__version__

'0.8.4'

**This ony passes in Python3**

-----------

Try to Design
----------

In [65]:
from pcr_marker_design import run_p3 as P3

In [66]:
p3_test_globals={
        'PRIMER_OPT_SIZE': 20,
        'PRIMER_PICK_INTERNAL_OLIGO': 1,
        'PRIMER_INTERNAL_MAX_SELF_END': 8,
        'PRIMER_MIN_SIZE': 18,
        'PRIMER_MAX_SIZE': 25,
        'PRIMER_OPT_TM': 60.0,
        'PRIMER_MIN_TM': 57.0,
        'PRIMER_MAX_TM': 63.0,
        'PRIMER_MIN_GC': 20.0,
        'PRIMER_MAX_GC': 80.0,
        'PRIMER_MAX_POLY_X': 100,
        'PRIMER_INTERNAL_MAX_POLY_X': 100,
        'PRIMER_SALT_MONOVALENT': 50.0,
        'PRIMER_DNA_CONC': 50.0,
        'PRIMER_MAX_NS_ACCEPTED': 0,
        'PRIMER_MAX_SELF_ANY': 12,
        'PRIMER_MAX_SELF_END': 8,
        'PRIMER_PAIR_MAX_COMPL_ANY': 12,
        'PRIMER_PAIR_MAX_COMPL_END': 8,
        'PRIMER_PRODUCT_SIZE_RANGE': [[75,100],[100,125],[125,150],[150,175],[175,200],[200,225]],
    }