# Prototyping Usage for VCF Design

### John McCallum Feb 2017

Use autoreload for development to reload modules automagically

In [1]:
%load_ext autoreload
%autoreload 2

In [89]:
from pcr_marker_design import design as d 
from pcr_marker_design import run_p3
from pybedtools import BedTool, Interval
from pyfaidx import Fasta , FastaVariant
import pandas as pd

In [18]:
ls ../test/test-data

384um_251453690362217.txt      Chr9_Myb210.vcf
AcCHR1_test.fasta              Chr9_Myb210.vcf.gz
AcCHR1_test.fasta.fai          targets
AcCHR1_test.phased.vcf.gz      targets.fasta
AcCHR1_test.phased.vcf.gz.tbi  targets.fasta.fai
AcCHR1_test.vcf.gz             targets.gff
AcCHR1_test.vcf.gz.tbi         targets.snps.bed
CHR9.1.68.5.fasta.gz


In [19]:
test_seq = "../test/test-data/AcCHR1_test.fasta"
vcffile = "../test/test-data/AcCHR1_test.vcf.gz"
designer = d.VcfPrimerDesign(test_seq, vcffile, "TestCHR1")



In [20]:
designer.annot.samples

['CK51_02',
 'CK51_05',
 'CK51_06',
 'CK51_09',
 'CK51_11',
 'Hort16A',
 'Hort22d',
 'Russell',
 'T03.51-11-28f',
 'T94.30-03-10f',
 'T94.30-04-08b',
 'T94.30-04-08c',
 'T94.30-04-08d']

### Create a BedTool Target

In [77]:
target = BedTool('CHR1 435 436', from_string=True)
target2 = BedTool('CHR1 365 366', from_string=True)

In [91]:
target.to_dataframe()

Unnamed: 0,chrom,start,end
0,CHR1,435,436


In [92]:
target = BedTool('CHR1 435 436', from_string=True)

In [None]:
target.shuffle

In [88]:
dir(target)

['TEMPFILES',
 '__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__weakref__',
 '_bam_header',
 '_bed_to_bam',
 '_collapse',
 '_file_type',
 '_hascounts',
 '_isbam',
 '_log_to_history',
 '_randomintersection',
 '_tabixed',
 '_tag',
 '_tmp',
 'absolute_distance',
 'all_hits',
 'annotate',
 'any_hits',
 'as_intervalfile',
 'at',
 'bam_to_bed',
 'bam_to_fastq',
 'bed6',
 'bedpe_to_bam',
 'bgzip',
 'cat',
 'check_genome',
 'closest',
 'cluster',
 'colormap_normalize',
 'complement',
 'count',
 'count_hits',
 'coverage',
 'cut',
 'delete_temporary_history',
 'each',
 'expand',
 'features',
 'field_count',
 'file_type',
 'filter',
 'fisher',
 'flank'

In [87]:
print(target.shift(20))

AttributeError: 'BedTool' object has no attribute 'shift'

In [75]:
temp=BedTool(vcffile)
len(temp)

156

In [93]:
target

<BedTool(/var/folders/_q/cdp6zkkn2f73g2v531vnnsxx_xjprv/T/pybedtools.6cpgu5cg.tmp)>

In [76]:
len(temp.subtract(target))

155

In [80]:
?temp.subtract

In [83]:
len(temp.subtract(target2,A=True))

155

In [70]:
len((temp.subtract(target)

SyntaxError: unexpected EOF while parsing (<ipython-input-70-1d020cfbac17>, line 1)

In [66]:
print(temp.subtract(target))

CHR1	130	.	GAAAAAAAG	GAAAAAAAAG,GAAAAAAAAAG	14300	.	AB=0.504,0.512195;ABP=3.06242,3.06326;AC=16,1;AF=0.615385,0.0384615;AN=26;AO=477,21;CIGAR=1M1I8M,1M2I8M;DP=769;DPB=941;DPRA=1.01212,0.696884;EPP=9.24247,3.1137;EPPR=6.0605;GTI=0;LEN=1,2;MEANALT=2.09091,1;MQM=60,60;MQMR=60;NS=13;NUMALT=2;ODDS=32.4789;PAIRED=0.926625,0.428571;PAIREDR=0.968872;PAO=49,49;PQA=1555,1555;PQR=1555;PRO=49;QA=16747,740;QR=9544;RO=257;RPP=11.4276,3.1137;RPPR=7.47998;RUN=1,1;SAF=248,9;SAP=4.6537,3.94093;SAR=229,12;SRF=123;SRP=4.03267;SRR=134;TYPE=ins,ins;technology.Illumina=1,1	GT:GQ:DP:RO:QR:AO:QA:GL	0/2:0:41:20:732:0,21:0,740:-10,-10,-10,0,-10,-10	1/1:0:58:0:0:58,0:2031,0:-10,-10,0,-10,-10,-10	1/1:0:71:0:0:71,0:2614,0:-10,-10,0,-10,-10,-10	0/1:0:73:37:1399:33,0:1228,0:-10,0,-10,-10,-10,-10	0/1:0:72:31:1165:38,0:1303,0:-10,0,-10,-10,-10,-10	1/1:0:64:0:0:61,0:2060,0:-10,-10,0,-10,-10,-10	0/1:0:69:36:1326:32,0:1108,0:-10,0,-10,-10,-10,-10	1/1:99:42:0:0:41,0:1557,0:-10,-10,0,-10,-10,-10	0/1:0:35:10:318:23,0:738,0:-

### Create a target Interval

In [130]:
target_interval=Interval('CHR1',3000,3001)

In [23]:
target_interval.chrom

'CHR1'

In [24]:
target_interval.start

3000

In [95]:
target_interval.length

1

at 2c01be2662fe6466c3826b5a06ec81b9472f9f4c this needs a Bedtool length 1

In [131]:
designer.getseqslicedict(target,100)

AttributeError: 'BedTool' object has no attribute 'chrom'

Doesn't work with INterval

In [132]:
designer.getseqslicedict(target_interval,100)

{'REF_OFFSET': 2900,
 'SEQUENCE_EXCLUDED_REGION': [(7, 1), (26, 1), (65, 1), (93, 1), (139, 1)],
 'SEQUENCE_ID': 'CHR1:2900-3101',
 'SEQUENCE_TARGET': (100, 1),
 'SEQUENCE_TEMPLATE': 'AGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGCTGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACCACATATAAAAGGGACAGCAAACATTTTAACATGAGCAAATCAGTGACACTAGGTAGGTGTTAGCACAAAAATGAACCTT',
 'TARGET_ID': 'CHR1:3001-3001'}

### Design

-set globals defaults

In [133]:
p3_globals={
        'PRIMER_OPT_SIZE': 20,
        'PRIMER_PICK_INTERNAL_OLIGO': 0,
        'PRIMER_INTERNAL_MAX_SELF_END': 8,
        'PRIMER_MIN_SIZE': 18,
        'PRIMER_MAX_SIZE': 25,
        'PRIMER_OPT_TM': 60.0,
        'PRIMER_MIN_TM': 57.0,
        'PRIMER_MAX_TM': 63.0,
        'PRIMER_MIN_GC': 20.0,
        'PRIMER_MAX_GC': 80.0,
        'PRIMER_MAX_POLY_X': 100,
        'PRIMER_INTERNAL_MAX_POLY_X': 100,
        'PRIMER_SALT_MONOVALENT': 50.0,
        'PRIMER_DNA_CONC': 50.0,
        'PRIMER_MAX_NS_ACCEPTED': 0,
        'PRIMER_MAX_SELF_ANY': 12,
        'PRIMER_MAX_SELF_END': 8,
        'PRIMER_PAIR_MAX_COMPL_ANY': 12,
        'PRIMER_PAIR_MAX_COMPL_END': 8,
        'PRIMER_PRODUCT_SIZE_RANGE': [60,250],
    }

In [135]:
designer.getseqslicedict(target_interval,250)

{'REF_OFFSET': 2750,
 'SEQUENCE_EXCLUDED_REGION': [(43, 1),
  (122, 1),
  (157, 1),
  (176, 1),
  (215, 1),
  (243, 1),
  (289, 1),
  (411, 1),
  (472, 1)],
 'SEQUENCE_ID': 'CHR1:2750-3251',
 'SEQUENCE_TARGET': (250, 1),
 'SEQUENCE_TEMPLATE': 'CTCAATTTCTTTAGAAGCTTCCAGAGTTGTTGAATTGGCAGCGGCAACTACAGTCGCAACTGTTCCTAGCTTTGCAGAACCATTCCCACTCAAGGAATTCACGGACTCTTTATGTGCCTTCAGAACCAACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGCTGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACCACATATAAAAGGGACAGCAAACATTTTAACATGAGCAAATCAGTGACACTAGGTAGGTGTTAGCACAAAAATGAACCTTGTTTACATCTGTTCACCACATCCTAGAACATCTTAGACACACACTGCAATAACATATGAGGTGGAGCATGGCACAGTGATACTGCAACAGTAGGATTCCCTGTAACTCTAATGCAACTTTTCATGTACTCAGCCTCTCAAATGATATCGC',
 'TARGET_ID': 'CHR1:3001-3001'}

In [136]:
run_p3.run_P3(global_dict=p3_globals,target_dict=designer.getseqslicedict(target_interval,250))

[{'AMPLICON_REGION': 'CHR1:2879-3022',
  'PRIMER_LEFT': (2878, 20),
  'PRIMER_LEFT_SEQUENCE': 'ACTGTGTCGCACTGGGTTTT',
  'PRIMER_RIGHT': (3021, 20),
  'PRIMER_RIGHT_SEQUENCE': 'GGTAGCGCCTCTCTCTCTCT',
  'SEQUENCE_ID': 'CHR1:2750-3251',
  'TARGET_ID': 'CHR1:3001-3001'},
 {'AMPLICON_REGION': 'CHR1:2878-3022',
  'PRIMER_LEFT': (2877, 20),
  'PRIMER_LEFT_SEQUENCE': 'AACTGTGTCGCACTGGGTTT',
  'PRIMER_RIGHT': (3021, 20),
  'PRIMER_RIGHT_SEQUENCE': 'GGTAGCGCCTCTCTCTCTCT',
  'SEQUENCE_ID': 'CHR1:2750-3251',
  'TARGET_ID': 'CHR1:3001-3001'},
 {'AMPLICON_REGION': 'CHR1:2879-3026',
  'PRIMER_LEFT': (2878, 20),
  'PRIMER_LEFT_SEQUENCE': 'ACTGTGTCGCACTGGGTTTT',
  'PRIMER_RIGHT': (3025, 20),
  'PRIMER_RIGHT_SEQUENCE': 'ATGTGGTAGCGCCTCTCTCT',
  'SEQUENCE_ID': 'CHR1:2750-3251',
  'TARGET_ID': 'CHR1:3001-3001'},
 {'AMPLICON_REGION': 'CHR1:2878-3026',
  'PRIMER_LEFT': (2877, 20),
  'PRIMER_LEFT_SEQUENCE': 'AACTGTGTCGCACTGGGTTT',
  'PRIMER_RIGHT': (3025, 20),
  'PRIMER_RIGHT_SEQUENCE': 'ATGTGGTAGCGCCTCTCTCT

### Check out this Built-in Function!

eg

```
faidx ../test/test-data/AcCHR1_test.fasta CHR1:2879-3022 CHR1:2878-3022 CHR1:2879-3026 CHR1:2878-3026
```


### Try out Variant Features in PyFaidx

In [50]:
from pyfaidx import FastaVariant, Fasta
ref=Fasta(test_seq,as_raw=True)
alt=FastaVariant(test_seq,vcffile,het=True, hom=True,sample=None, as_raw=True)
consensus_1=FastaVariant(test_seq,vcffile,het=True, hom=True,sample='CK51_02')
consensus_2=FastaVariant(test_seq,vcffile,het=True, hom=True,sample='CK51_09')



In [51]:
alt['CHR1'].variant_sites[:10]

(436, 542, 1024, 1218, 1604, 1634, 1893, 2085, 2143, 2241)

In [52]:
ref['CHR1'][430:450]

'TCCTAGCTTCTGTTGCTGGA'

In [53]:
alt['CHR1'][430:450]

'TCCTACCTTCTGTTGCTGGA'

### Melt the Ref and Alt consensus sequences at uMelt

In [54]:
from pcr_marker_design import umelt_service as um

In [26]:
## !export https_proxy=

In [27]:
umelt=um.UmeltService()
refmelt= um.MeltSeq(ref['CHR1'][430:450])
altmelt= um.MeltSeq(ref['CHR1'][430:450])

In [28]:
ref_melt_Tm=umelt.get_helicity_info(umelt.get_response(refmelt)).get_melting_temp()

In [29]:
alt_melt_Tm=umelt.get_helicity_info(umelt.get_response(altmelt)).get_melting_temp()

In [30]:
ref_melt_Tm

68.905500705218614

In [31]:
alt_melt_Tm

68.905500705218614

In [32]:
run_p3.run_P3(global_dict=p3_globals,
              target_dict=designer.getseqslicedict(BedTool('CHR1 2492 2493', from_string=True),250))

[{'AMPLICON_REGION': 'CHR1:2465-2699',
  'PRIMER_LEFT': (2464, 21),
  'PRIMER_LEFT_SEQUENCE': 'AGAGTTCTGAGTTCCTCGTGT',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTCCAATGGA',
  'SEQUENCE_ID': 'CHR1:2242-2743',
  'TARGET_ID': 'CHR1:2493-2493'},
 {'AMPLICON_REGION': 'CHR1:2464-2699',
  'PRIMER_LEFT': (2463, 22),
  'PRIMER_LEFT_SEQUENCE': 'AAGAGTTCTGAGTTCCTCGTGT',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTCCAATGGA',
  'SEQUENCE_ID': 'CHR1:2242-2743',
  'TARGET_ID': 'CHR1:2493-2493'},
 {'AMPLICON_REGION': 'CHR1:2465-2688',
  'PRIMER_LEFT': (2464, 21),
  'PRIMER_LEFT_SEQUENCE': 'AGAGTTCTGAGTTCCTCGTGT',
  'PRIMER_RIGHT': (2687, 20),
  'PRIMER_RIGHT_SEQUENCE': 'TCCAATGGATGACCTCACGG',
  'SEQUENCE_ID': 'CHR1:2242-2743',
  'TARGET_ID': 'CHR1:2493-2493'},
 {'AMPLICON_REGION': 'CHR1:2463-2699',
  'PRIMER_LEFT': (2462, 23),
  'PRIMER_LEFT_SEQUENCE': 'AAAGAGTTCTGAGTTCCTCGTGT',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTC

### Try Flanking= False option and Melt Amplicons

In [55]:
import pandas as pd

In [56]:
test=run_p3.run_P3(global_dict=p3_globals,
              target_dict=designer.getseqslicedict(BedTool('CHR1 2400 2700', from_string=True),250,flanking= False))

In [57]:
test

[{'AMPLICON_REGION': 'CHR1:2547-2699',
  'PRIMER_LEFT': (2546, 20),
  'PRIMER_LEFT_SEQUENCE': 'TGATGTAGGTCGTTTGGCCT',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTCCAATGGA',
  'SEQUENCE_ID': 'CHR1:2400-2700',
  'TARGET_ID': 'CHR1:2401-2700'},
 {'AMPLICON_REGION': 'CHR1:2554-2699',
  'PRIMER_LEFT': (2553, 20),
  'PRIMER_LEFT_SEQUENCE': 'GGTCGTTTGGCCTAATTCCC',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTCCAATGGA',
  'SEQUENCE_ID': 'CHR1:2400-2700',
  'TARGET_ID': 'CHR1:2401-2700'},
 {'AMPLICON_REGION': 'CHR1:2544-2699',
  'PRIMER_LEFT': (2543, 21),
  'PRIMER_LEFT_SEQUENCE': 'GCTTGATGTAGGTCGTTTGGC',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTCCAATGGA',
  'SEQUENCE_ID': 'CHR1:2400-2700',
  'TARGET_ID': 'CHR1:2401-2700'},
 {'AMPLICON_REGION': 'CHR1:2553-2699',
  'PRIMER_LEFT': (2552, 21),
  'PRIMER_LEFT_SEQUENCE': 'AGGTCGTTTGGCCTAATTCCC',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTCCAATG

In [58]:
test_df=pd.DataFrame.from_dict(test)
test_df

Unnamed: 0,AMPLICON_REGION,PRIMER_LEFT,PRIMER_LEFT_SEQUENCE,PRIMER_RIGHT,PRIMER_RIGHT_SEQUENCE,SEQUENCE_ID,TARGET_ID
0,CHR1:2547-2699,"(2546, 20)",TGATGTAGGTCGTTTGGCCT,"(2698, 20)",CCAACCGATGGTCCAATGGA,CHR1:2400-2700,CHR1:2401-2700
1,CHR1:2554-2699,"(2553, 20)",GGTCGTTTGGCCTAATTCCC,"(2698, 20)",CCAACCGATGGTCCAATGGA,CHR1:2400-2700,CHR1:2401-2700
2,CHR1:2544-2699,"(2543, 21)",GCTTGATGTAGGTCGTTTGGC,"(2698, 20)",CCAACCGATGGTCCAATGGA,CHR1:2400-2700,CHR1:2401-2700
3,CHR1:2553-2699,"(2552, 21)",AGGTCGTTTGGCCTAATTCCC,"(2698, 20)",CCAACCGATGGTCCAATGGA,CHR1:2400-2700,CHR1:2401-2700


In [59]:
test_df.AMPLICON_REGION

0    CHR1:2547-2699
1    CHR1:2554-2699
2    CHR1:2544-2699
3    CHR1:2553-2699
Name: AMPLICON_REGION, dtype: object

In [60]:
designer.meltSlice(test_df.AMPLICON_REGION[0])

(86.380112834978846, 86.680535966149506)

In [61]:
test_df['Melt']=test_df['AMPLICON_REGION'].apply(designer.meltSlice)

In [62]:
test_df['TmDiff']=test_df.Melt.apply(lambda X : abs(X[0]-X[1]))
test_df

Unnamed: 0,AMPLICON_REGION,PRIMER_LEFT,PRIMER_LEFT_SEQUENCE,PRIMER_RIGHT,PRIMER_RIGHT_SEQUENCE,SEQUENCE_ID,TARGET_ID,Melt,TmDiff
0,CHR1:2547-2699,"(2546, 20)",TGATGTAGGTCGTTTGGCCT,"(2698, 20)",CCAACCGATGGTCCAATGGA,CHR1:2400-2700,CHR1:2401-2700,"(86.380112835, 86.6805359661)",0.300423
1,CHR1:2554-2699,"(2553, 20)",GGTCGTTTGGCCTAATTCCC,"(2698, 20)",CCAACCGATGGTCCAATGGA,CHR1:2400-2700,CHR1:2401-2700,"(86.2299012694, 86.3300423131)",0.100141
2,CHR1:2544-2699,"(2543, 21)",GCTTGATGTAGGTCGTTTGGC,"(2698, 20)",CCAACCGATGGTCCAATGGA,CHR1:2400-2700,CHR1:2401-2700,"(86.4802538787, 86.730606488)",0.250353
3,CHR1:2553-2699,"(2552, 21)",AGGTCGTTTGGCCTAATTCCC,"(2698, 20)",CCAACCGATGGTCCAATGGA,CHR1:2400-2700,CHR1:2401-2700,"(86.2799717913, 86.380112835)",0.100141


### Check on Prototyped melt method

In [41]:
designer.meltSlice('CHR1:2879-3022')

(87.381523272214395, 86.980959097320167)

In [42]:
!/Users/johnmccallum/miniconda3/envs/Py3PCR/bin/faidx ../test/test-data/AcCHR1_test.fasta CHR1:2879-3022

>CHR1:2879-3022
ACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGC
TGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACC


In [94]:
Interval