# Prototyping Usage for VCF Design

Use autoreload for develp[ment to reload modules automagically

In [1]:
%load_ext autoreload
%autoreload 2

In [18]:
from pcr_marker_design import design as d 
from pcr_marker_design import run_p3
from pybedtools import BedTool

In [3]:
ls ../test/test-data

384um_251453690362217.txt      Chr9_Myb210.vcf
AcCHR1_test.fasta              Chr9_Myb210.vcf.gz
AcCHR1_test.fasta.fai          targets
AcCHR1_test.phased.vcf.gz      targets.fasta
AcCHR1_test.phased.vcf.gz.tbi  targets.fasta.fai
AcCHR1_test.vcf.gz             targets.gff
AcCHR1_test.vcf.gz.tbi         targets.snps.bed
CHR9.1.68.5.fasta.gz


In [4]:
test_seq = "../test/test-data/AcCHR1_test.fasta"
vcffile = "../test/test-data/AcCHR1_test.vcf.gz"
designer = d.VcfPrimerDesign(test_seq, vcffile, "TestCHR1")

In [62]:
designer.annot.samples

['CK51_02',
 'CK51_05',
 'CK51_06',
 'CK51_09',
 'CK51_11',
 'Hort16A',
 'Hort22d',
 'Russell',
 'T03.51-11-28f',
 'T94.30-03-10f',
 'T94.30-04-08b',
 'T94.30-04-08c',
 'T94.30-04-08d']

In [9]:
target = BedTool('CHR1 3000 3001', from_string=True)

In [43]:
designer.getseqslicedict(target,100)

{'REF_OFFSET': 2900,
 'SEQUENCE_EXCLUDED_REGION': [(7, 1), (26, 1), (65, 1), (93, 1), (139, 1)],
 'SEQUENCE_ID': 'CHR1:2900-3101',
 'SEQUENCE_TARGET': (100, 1),
 'SEQUENCE_TEMPLATE': 'AGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGCTGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACCACATATAAAAGGGACAGCAAACATTTTAACATGAGCAAATCAGTGACACTAGGTAGGTGTTAGCACAAAAATGAACCTT',
 'TARGET_ID': 'CHR1:3000-3001'}

### Design

-set globals defaults

In [22]:
p3_globals={
        'PRIMER_OPT_SIZE': 20,
        'PRIMER_PICK_INTERNAL_OLIGO': 0,
        'PRIMER_INTERNAL_MAX_SELF_END': 8,
        'PRIMER_MIN_SIZE': 18,
        'PRIMER_MAX_SIZE': 25,
        'PRIMER_OPT_TM': 60.0,
        'PRIMER_MIN_TM': 57.0,
        'PRIMER_MAX_TM': 63.0,
        'PRIMER_MIN_GC': 20.0,
        'PRIMER_MAX_GC': 80.0,
        'PRIMER_MAX_POLY_X': 100,
        'PRIMER_INTERNAL_MAX_POLY_X': 100,
        'PRIMER_SALT_MONOVALENT': 50.0,
        'PRIMER_DNA_CONC': 50.0,
        'PRIMER_MAX_NS_ACCEPTED': 0,
        'PRIMER_MAX_SELF_ANY': 12,
        'PRIMER_MAX_SELF_END': 8,
        'PRIMER_PAIR_MAX_COMPL_ANY': 12,
        'PRIMER_PAIR_MAX_COMPL_END': 8,
        'PRIMER_PRODUCT_SIZE_RANGE': [60,250],
    }

In [36]:
designer.getseqslicedict(target,250)

{'REF_OFFSET': 2750,
 'SEQUENCE_EXCLUDED_REGION': [(43, 1),
  (122, 1),
  (157, 1),
  (176, 1),
  (215, 1),
  (243, 1),
  (289, 1),
  (411, 1),
  (472, 1)],
 'SEQUENCE_ID': 'CHR1:2750-3251',
 'SEQUENCE_TARGET': (250, 1),
 'SEQUENCE_TEMPLATE': 'CTCAATTTCTTTAGAAGCTTCCAGAGTTGTTGAATTGGCAGCGGCAACTACAGTCGCAACTGTTCCTAGCTTTGCAGAACCATTCCCACTCAAGGAATTCACGGACTCTTTATGTGCCTTCAGAACCAACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGCTGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACCACATATAAAAGGGACAGCAAACATTTTAACATGAGCAAATCAGTGACACTAGGTAGGTGTTAGCACAAAAATGAACCTTGTTTACATCTGTTCACCACATCCTAGAACATCTTAGACACACACTGCAATAACATATGAGGTGGAGCATGGCACAGTGATACTGCAACAGTAGGATTCCCTGTAACTCTAATGCAACTTTTCATGTACTCAGCCTCTCAAATGATATCGC',
 'TARGET_ID': 'CHR1:3000-3001'}

In [40]:
run_p3.run_P3(global_dict=p3_globals,target_dict=designer.getseqslicedict(target,250))

[{'AMPLICON_REGION': 'CHR1:2879-3022',
  'PRIMER_LEFT': (2878, 20),
  'PRIMER_LEFT_SEQUENCE': 'ACTGTGTCGCACTGGGTTTT',
  'PRIMER_RIGHT': (3021, 20),
  'PRIMER_RIGHT_SEQUENCE': 'GGTAGCGCCTCTCTCTCTCT',
  'SEQUENCE_ID': 'CHR1:2750-3251',
  'TARGET_ID': 'CHR1:3000-3001'},
 {'AMPLICON_REGION': 'CHR1:2878-3022',
  'PRIMER_LEFT': (2877, 20),
  'PRIMER_LEFT_SEQUENCE': 'AACTGTGTCGCACTGGGTTT',
  'PRIMER_RIGHT': (3021, 20),
  'PRIMER_RIGHT_SEQUENCE': 'GGTAGCGCCTCTCTCTCTCT',
  'SEQUENCE_ID': 'CHR1:2750-3251',
  'TARGET_ID': 'CHR1:3000-3001'},
 {'AMPLICON_REGION': 'CHR1:2879-3026',
  'PRIMER_LEFT': (2878, 20),
  'PRIMER_LEFT_SEQUENCE': 'ACTGTGTCGCACTGGGTTTT',
  'PRIMER_RIGHT': (3025, 20),
  'PRIMER_RIGHT_SEQUENCE': 'ATGTGGTAGCGCCTCTCTCT',
  'SEQUENCE_ID': 'CHR1:2750-3251',
  'TARGET_ID': 'CHR1:3000-3001'},
 {'AMPLICON_REGION': 'CHR1:2878-3026',
  'PRIMER_LEFT': (2877, 20),
  'PRIMER_LEFT_SEQUENCE': 'AACTGTGTCGCACTGGGTTT',
  'PRIMER_RIGHT': (3025, 20),
  'PRIMER_RIGHT_SEQUENCE': 'ATGTGGTAGCGCCTCTCTCT

### Check out this Built-in Function!



In [42]:
!faidx ../test/test-data/AcCHR1_test.fasta CHR1:2879-3022 CHR1:2878-3022 CHR1:2879-3026 CHR1:2878-3026

>CHR1:2879-3022
ACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGC
TGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACC
>CHR1:2878-3022
AACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAG
CTGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACC
>CHR1:2879-3026
ACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAGC
TGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACCACAT
>CHR1:2878-3026
AACTGTGTCGCACTGGGTTTTAAAGGAAATAAATAAATATGGAATAAAACATTGATATTACAAATAAAGGGTGCTTCTAG
CTGAGTAGTCCTCCGATAAAGCACACGCATACAAAGGAATGAGAGAGAGAGAGAGAGGCGCTACCACAT


### Try out Variant Features in PyFaidx

In [80]:
from pyfaidx import FastaVariant
consensus_1=FastaVariant(test_seq,vcffile,het=True, hom=True,sample='CK51_02')
consensus_2=FastaVariant(test_seq,vcffile,het=True, hom=True,sample='CK51_09')

In [75]:
consensus_1['CHR1'].variant_sites[10:20]

(2296, 2318, 2493, 2567, 2908, 2927, 2966, 2994, 3040, 3162)

In [76]:
consensus_2['CHR1'].variant_sites[10:20]

(2318, 2416, 2567, 2908, 2927, 2966, 2994, 3040, 3162, 3223)

In [83]:
import difflib

In [82]:
consensus_1['CHR1'][2465:2699]

>CHR1:2466-2699
GAGTTCTGAGTTCCTCGTGTATTTCAAGAAAACAAATCTTTAAACTACATGTGTGATAAGACCCGTATATGTGGAATAGCTTGATGTAGGTCGTTTGGCCTCATTCCCAACAGTCTCCATACTCTACCTACTGCTATCACTCCAAAGCTAAATTAAGTTTGTATAGACATATATACTAGAAAGTAACAGTCCTACCTTCATGACCGTGAGGTCATCCATTGGACCATCGGTTGG

In [79]:
consensus_2['CHR1'][2465:2699]

>CHR1:2466-2699
GAGTTCTGAGTTCCTCGTGTATTTCAATAAAACAAATCTTTAAACTACATGTGTGATAAGACCCGTATATGTGGAATAGCTTGATGTAGGTCGTTTGGCCTCATTCCCAACAGTCTCCATACTCTACCTACTGCTATCACTCCAAAGCTAAATTAAGTTTGTATAGACATATATACTAGAAAGTAACAGTCCTACCTTCATGACCGTGAGGTCATCCATTGGACCATCGGTTGG

In [89]:
diff=difflib.ndiff(consensus_1['CHR1'][2465:2699].seq,consensus_2['CHR1'][2465:2699].seq)

In [90]:
print("".join(diff))

  G  A  G  T  T  C  T  G  A  G  T  T  C  C  T  C  G  T  G  T  A  T  T  T  C  A  A- G- A- A- A- A- C- A- A- A  T- C- T- T- T  A  A  A- C- T  A  C  A- T- G- T- G- T- G  A- T  A- A- G- A- C- C- C- G  T- A- T- A- T- G- T- G- G- A- A- T- A- G  C  T  T- G- A  T- G- T  A- G- G- T- C- G- T- T- T- G- G- C- C- T- C  A- T- T- C- C- C  A- A  C- A- G  T- C- T- C- C  A- T- A  C- T- C- T  A- C- C  T- A- C- T  G- C  T- A- T- C- A- C- T- C- C- A- A- A  G- C  T- A- A- A- T- T- A- A  G- T- T- T- G- T  A  T  A- G  A- C- A- T- A- T- A- T- A- C- T- A  G  A- A- A- G- T- A- A  C- A- G- T  C  C- T- A- C- C- T- T- C- A- T  G- A- C- C- G  T- G  A- G- G  T- C  A  T- C- C- A- T- T  G- G- A- C- C- A  T- C  G  G+ A+ A  T- T+ A  G+ C+ T+ T  G+ A+ T+ G+ T+ A+ G+ G+ T+ C+ G+ T+ T+ T+ G+ G+ C+ C+ T+ C+ A+ T+ T+ C+ C+ C+ A+ A+ C+ A+ G+ T+ C+ T+ C+ C+ A+ T+ A+ C+ T+ C+ T+ A+ C+ C+ T+ A+ C+ T+ G+ C+ T+ A+ T+ C+ A+ C+ T+ C+ C+ A+ A+ A+ G+ C+ T+ A+ A+ A+ T+ T+ A+ A+ G+ T+ T+ T+ G+ T+ A+ T+ A+ G+ A+ C+ A+ T+ A+ T+ A+ T+ A+ C+

In [77]:
run_p3.run_P3(global_dict=p3_globals,
              target_dict=designer.getseqslicedict(BedTool('CHR1 2492 2493', from_string=True),250))

[{'AMPLICON_REGION': 'CHR1:2465-2699',
  'PRIMER_LEFT': (2464, 21),
  'PRIMER_LEFT_SEQUENCE': 'AGAGTTCTGAGTTCCTCGTGT',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTCCAATGGA',
  'SEQUENCE_ID': 'CHR1:2242-2743',
  'TARGET_ID': 'CHR1:2492-2493'},
 {'AMPLICON_REGION': 'CHR1:2464-2699',
  'PRIMER_LEFT': (2463, 22),
  'PRIMER_LEFT_SEQUENCE': 'AAGAGTTCTGAGTTCCTCGTGT',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTCCAATGGA',
  'SEQUENCE_ID': 'CHR1:2242-2743',
  'TARGET_ID': 'CHR1:2492-2493'},
 {'AMPLICON_REGION': 'CHR1:2465-2688',
  'PRIMER_LEFT': (2464, 21),
  'PRIMER_LEFT_SEQUENCE': 'AGAGTTCTGAGTTCCTCGTGT',
  'PRIMER_RIGHT': (2687, 20),
  'PRIMER_RIGHT_SEQUENCE': 'TCCAATGGATGACCTCACGG',
  'SEQUENCE_ID': 'CHR1:2242-2743',
  'TARGET_ID': 'CHR1:2492-2493'},
 {'AMPLICON_REGION': 'CHR1:2463-2699',
  'PRIMER_LEFT': (2462, 23),
  'PRIMER_LEFT_SEQUENCE': 'AAAGAGTTCTGAGTTCCTCGTGT',
  'PRIMER_RIGHT': (2698, 20),
  'PRIMER_RIGHT_SEQUENCE': 'CCAACCGATGGTC