# Prototyping Design_primers for Design from VCF

### John McCallum Feb 2017

Use autoreload for development to reload modules automagically

In [30]:
%load_ext autoreload
%autoreload 2
import random

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load up Module

In [31]:
from pybedtools import BedTool, Interval
from pyfaidx import Fasta , FastaVariant
import pandas as pd

In [32]:
from pcr_marker_design import design as d 
from pcr_marker_design import run_p3 as P3


In [33]:
ls ../test/test-data

384um_251453690362217.txt      Chr9_Myb210.vcf
AcCHR1_test.fasta              Chr9_Myb210.vcf.gz
AcCHR1_test.fasta.fai          targets
AcCHR1_test.phased.vcf.gz      targets.fasta
AcCHR1_test.phased.vcf.gz.tbi  targets.fasta.fai
AcCHR1_test.vcf.gz             targets.gff
AcCHR1_test.vcf.gz.tbi         targets.snps.bed
CHR9.1.68.5.fasta.gz


## Initialise Designer

In [92]:
# Reference FASTA and VCF taken from the bundled test data directory
vcffile = "../test/test-data/AcCHR1_test.vcf.gz"
test_seq = "../test/test-data/AcCHR1_test.fasta"

# Designer built from the reference, the VCF and the label "TestCHR1"
designer = d.VcfPrimerDesign(test_seq, vcffile, "TestCHR1")

### Create a BedTool Target

In [115]:
import numpy


array([1, 2, 3, 4, 5, 6, 7])

In [117]:
# Seed the sampler so a fresh Restart & Run All picks the same targets each time
random.seed(2017)
# Read the VCF once instead of rebuilding the BedTool for every draw
vcf_records = list(BedTool(vcffile))
# Draw 7 records (with replacement, as before) and wrap them in a BedTool
targetBed = BedTool([random.choice(vcf_records) for _ in range(7)])

In [137]:
# Keep fields 0, 1, 3 and 4 of each record (chromosome, position, REF, ALT in the
# printed VCF lines). Select by position so this works whatever the column labels
# are, and copy so the assignments below act on an independent frame, not a slice.
Tdf = targetBed.to_dataframe().iloc[:, [0, 1, 3, 4]].copy()
Tdf.columns = ['CHR', 'POS', 'REF', 'ALT']
# Target label of the form CHR:POS-POS, built column-wise rather than row by row
pos_text = Tdf['POS'].astype(str)
Tdf['TARGET_ID'] = Tdf['CHR'] + ":" + pos_text + "-" + pos_text
Tdf

Unnamed: 0,CHR,POS,REF,ALT,TARGET_ID
0,CHR1,7577,C,T,CHR1:7577-7577
1,CHR1,2998,TGAGAGAGAGAGAGAGAG,"TGAGAGAGAGAGAGAGAGAGAG,TGAGAGAGAGAGAGAGAGAGAGA...",CHR1:2998-2998
2,CHR1,6568,G,T,CHR1:6568-6568
3,CHR1,6287,A,G,CHR1:6287-6287
4,CHR1,130,GAAAAAAAG,"GAAAAAAAAG,GAAAAAAAAAG",CHR1:130-130
5,CHR1,5910,G,C,CHR1:5910-5910
6,CHR1,7022,A,G,CHR1:7022-7022


In [119]:
# Print the sampled targets as plain text (one full record line per target)
print(targetBed)

CHR1	7577	.	C	T	926.572	.	AB=0.584615;ABP=7.05258;AC=1;AF=0.0384615;AN=26;AO=39;CIGAR=1X;DP=867;DPB=867;DPRA=0.97894;EPP=4.40227;EPPR=42.7349;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=13;NUMALT=1;ODDS=36.2947;PAIRED=1;PAIREDR=0.997582;PAO=0;PQA=0;PQR=0;PRO=0;QA=1311;QR=29486;RO=827;RPP=3.06598;RPPR=5.53362;RUN=1;SAF=23;SAP=5.73856;SAR=16;SRF=382;SRP=13.4318;SRR=445;TYPE=snp;technology.Illumina=1	GT:GQ:DP:RO:QR:AO:QA:GL	0/1:0:65:27:942:38:1303:-10,0,-10	0/0:0:59:59:2061:0:0:0,-10,-10	0/0:0:96:96:3438:0:0:0,-10,-10	0/0:0:72:72:2691:0:0:0,-10,-10	0/0:0:92:92:3177:0:0:0,-10,-10	0/0:0:72:71:2552:0:0:0,-10,-10	0/0:0:73:73:2528:0:0:0,-10,-10	0/0:99:47:47:1692:0:0:0,-10,-10	0/0:99:42:42:1408:0:0:0,-10,-10	0/0:0:65:65:2376:0:0:0,-10,-10	0/0:0:66:66:2405:0:0:0,-10,-10	0/0:0:52:52:1876:0:0:0,-10,-10	0/0:0:66:65:2340:1:8:0,-10,-10
CHR1	2998	.	TGAGAGAGAGAGAGAGAG	TGAGAGAGAGAGAGAGAGAGAG,TGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAG,TGAGAGAGAGAGAGAGAGAG,TGAGAATGAGAGAGAGAGAGAGAGAGAG,TGAGAGAGAGAGAGAGAGAGAGAGAGAGAG	473

### Indexing into fasta reference

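Note the start and end: slicing the reference with `[6669:6670]` returns the single base that is labelled `CHR1:6670-6670` in the output.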

In [96]:
# Slice [6669:6670] of CHR1 from the designer's reference; the displayed record
# is labelled CHR1:6670-6670
designer.reference['CHR1'][6669:6670]

>CHR1:6670-6670
G

In [97]:
# First interval in the sampled target set
targetBed[0]

Interval(CHR1:6929-6930)

In [98]:
# Start coordinate of the first target interval
targetBed[0].start

6929

In [99]:
# End coordinate of the first target interval
targetBed[0].end

6930

## Generate Primer3 Dict

In [121]:
# Build the Primer3 sequence dictionary for the first target.
# NOTE(review): 120 is presumably the flank length on each side of the target
# (the output places SEQUENCE_TARGET at offset 120) - confirm against design.py.
designer.getseqslicedict(targetBed[0],120)

{'REF_OFFSET': 7457,
 'SEQUENCE_EXCLUDED_REGION': [(45, 1), (113, 1), (119, 1), (176, 1)],
 'SEQUENCE_ID': 'CHR1:7457-7698',
 'SEQUENCE_TARGET': (120, 1),
 'SEQUENCE_TEMPLATE': 'ATGCATGATTTACATAAAACAGAATCTACTCAGGTGTATTTCACACATGATAAAGAAAATGCAGTTGCAGTGGTTTTAATTACACCTCTATTAGAAATGCATTAAATTAAATGCTTGGCAAATCCTTTAATTGCCTCGCTTGCTGCAGGACCAATTTGTAGACCTAAAAGGCAGCGTTTGGATAGCGTATTTGTGTATACAAAAAAGGAAGAAAAAAAATATAATTAGACAACCAAGTTAT',
 'TARGET_ID': 'CHR1:7577-7577'}

In [122]:
# Primer3 global settings (PRIMER_* tags); passed to P3.run_P3 as ``global_dict``.
# Keyword form of the same mapping: identical keys, values and insertion order.
p3_globals = dict(
    PRIMER_OPT_SIZE=20,
    PRIMER_PICK_INTERNAL_OLIGO=0,
    PRIMER_INTERNAL_MAX_SELF_END=8,
    PRIMER_MIN_SIZE=18,
    PRIMER_MAX_SIZE=25,
    PRIMER_OPT_TM=60.0,
    PRIMER_MIN_TM=57.0,
    PRIMER_MAX_TM=63.0,
    PRIMER_MIN_GC=20.0,
    PRIMER_MAX_GC=80.0,
    PRIMER_MAX_POLY_X=100,
    PRIMER_INTERNAL_MAX_POLY_X=100,
    PRIMER_SALT_MONOVALENT=50.0,
    PRIMER_DNA_CONC=50.0,
    PRIMER_MAX_NS_ACCEPTED=0,
    PRIMER_MAX_SELF_ANY=12,
    PRIMER_MAX_SELF_END=8,
    PRIMER_PAIR_MAX_COMPL_ANY=12,
    PRIMER_PAIR_MAX_COMPL_END=8,
    PRIMER_PRODUCT_SIZE_RANGE=[60, 250],
    PRIMER_NUM_RETURN=2,
)

### Design a single target

In [123]:
# Build the Primer3 input for the second sampled target, then run the design
second_target = designer.getseqslicedict(targetBed[1], 120)
P3.run_P3(global_dict=p3_globals, target_dict=second_target)

[{'AMPLICON_REGION': 'CHR1:2879-3022',
  'PRIMER_LEFT': (2878, 20),
  'PRIMER_LEFT_SEQUENCE': 'ACTGTGTCGCACTGGGTTTT',
  'PRIMER_RIGHT': (3021, 20),
  'PRIMER_RIGHT_SEQUENCE': 'GGTAGCGCCTCTCTCTCTCT',
  'SEQUENCE_ID': 'CHR1:2878-3119',
  'TARGET_ID': 'CHR1:2998-2998'}]

### Design to all targets

In [124]:
# Start coordinate of every sampled target interval
[interval.start for interval in targetBed]

[7577, 2998, 6568, 6287, 130, 5910, 7022]

In [147]:
def f(x):
    """Design primers for one target interval.

    Builds the Primer3 sequence dictionary for ``x`` (second argument 120, as
    elsewhere in this notebook) and returns the result of ``P3.run_P3`` - a
    list of amplicon records, one dict per primer pair.
    """
    return P3.run_P3(global_dict=p3_globals,
                     target_dict=designer.getseqslicedict(x, 120))


# Each target yields a list of amplicon dicts; flatten them into one list
amps = [amplicon for target in targetBed for amplicon in f(target)]
TAmp = pd.DataFrame(amps)
# Select columns by name: the frame's columns are the dict keys (strings), so
# integer selection depends on a positional fallback and on column ordering.
TAmp[['TARGET_ID', 'AMPLICON_REGION', 'PRIMER_LEFT_SEQUENCE', 'PRIMER_RIGHT_SEQUENCE']]

Unnamed: 0,TARGET_ID,AMPLICON_REGION,PRIMER_LEFT_SEQUENCE,PRIMER_RIGHT_SEQUENCE
0,CHR1:7577-7577,CHR1:7518-7603,GCAGTTGCAGTGGTTTTAATTACAC,GCAGCAAGCGAGGCAATTAA
1,CHR1:2998-2998,CHR1:2879-3022,ACTGTGTCGCACTGGGTTTT,GGTAGCGCCTCTCTCTCTCT
2,CHR1:6568-6568,CHR1:6490-6659,GGTCAATGTTATTTTGGCTACAACT,GCCTCCGTAAGAAGCCCTAA
3,CHR1:6287-6287,CHR1:6191-6329,TGGGGTTATTTTTTGGGGTCT,TGGAGAGCTATTATTGCTTAGTGGA
4,CHR1:130-130,CHR1:12-234,CGAGCTCTGAAGGCTGATCA,TGTGAACCTTTTTGATGTGCCT
5,CHR1:5910-5910,CHR1:5874-5958,GCAGAAGCTCCATTGGAAGC,ACCACCCGTCAACTCACATC


Generate a list of amplicons for every target.

NB: the nested comprehension is needed to flatten the per-target lists of amplicons into a single list.

In [126]:
%%bash
# Set the proxy variables and show them.
# NOTE(review): %%bash runs this cell in its own shell process, so these exports
# presumably do not reach the notebook kernel's environment - confirm.
export https_proxy=https://proxy.pfr.co.nz:8080
export http_proxy=http://proxy.pfr.co.nz:8080
env | grep proxy

http_proxy=http://proxy.pfr.co.nz:8080
https_proxy=https://proxy.pfr.co.nz:8080


In [127]:
%%bash
# Blank the proxy variables and show them.
# NOTE(review): as with any %%bash cell, this presumably only affects the
# shell process started for this cell, not the kernel - confirm.
export https_proxy=
export http_proxy=
env | grep proxy

http_proxy=
https_proxy=


### Melt a Single Amplicon

Melts the reference and consensus amplicons, returning a tuple of the two predicted melting temperatures.

In [128]:
# Melt one amplicon given as a 'CHR:start-end' region string; the result shown
# below is a 2-tuple of floats
designer.meltSlice('CHR1:6640-6755')

(85.629055007052187, 88.733427362482374)

### Run Prediction on All Amplicons in our results set

In [151]:
# Melt every amplicon region; each cell of 'Tm' holds the value returned by meltSlice
TAmp['Tm'] = TAmp['AMPLICON_REGION'].apply(designer.meltSlice)

In [163]:
# Absolute difference between the two values in each Tm tuple
TAmp['TmDiff'] = TAmp['Tm'].apply(lambda tm_pair: abs(tm_pair[0] - tm_pair[1]))
# Select by column name; integer selection on string-labelled columns is fragile
TAmp[['TARGET_ID', 'AMPLICON_REGION', 'TmDiff']]

Unnamed: 0,TARGET_ID,AMPLICON_REGION,TmDiff
0,CHR1:7577-7577,CHR1:7518-7603,0.0
1,CHR1:2998-2998,CHR1:2879-3022,0.400564
2,CHR1:6568-6568,CHR1:6490-6659,0.050071
3,CHR1:6287-6287,CHR1:6191-6329,0.0
4,CHR1:130-130,CHR1:12-234,0.0
5,CHR1:5910-5910,CHR1:5874-5958,0.700987


In [164]:
# Show the target table again
Tdf

Unnamed: 0,CHR,POS,REF,ALT,TARGET_ID
0,CHR1,7577,C,T,CHR1:7577-7577
1,CHR1,2998,TGAGAGAGAGAGAGAGAG,"TGAGAGAGAGAGAGAGAGAGAG,TGAGAGAGAGAGAGAGAGAGAGA...",CHR1:2998-2998
2,CHR1,6568,G,T,CHR1:6568-6568
3,CHR1,6287,A,G,CHR1:6287-6287
4,CHR1,130,GAAAAAAAG,"GAAAAAAAAG,GAAAAAAAAAG",CHR1:130-130
5,CHR1,5910,G,C,CHR1:5910-5910
6,CHR1,7022,A,G,CHR1:7022-7022
