forked from jjti/repp
/
enzymes.go
366 lines (307 loc) · 9.42 KB
/
enzymes.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
package repp
import (
"fmt"
"os"
"regexp"
"sort"
"strings"
"text/tabwriter"
"github.com/Lattice-Automation/repp/internal/config"
"github.com/spf13/cobra"
)
// enzyme is a single enzyme that can be used to linearize a backbone before
// inserting a sequence.
type enzyme struct {
name string
recog string
seqCutIndex int
compCutIndex int
}
// cut is a binding index and the length of the overhang after digestion
type cut struct {
index int
strand bool
enzyme enzyme
}
// Backbone is for information on a linearized backbone in the output payload
type Backbone struct {
// URL of the backbone fragment's source
URL string `json:"url"`
// Seq is the sequence of the backbone (unlinearized)
Seq string `json:"seq"`
// Enzymes is the list of enzymes names used to linearize the backbone
Enzymes []string `json:"enzymes"`
// cutsites are the indexes where the backbone was cleaved
Cutsites []int `json:"recognitionIndex"`
// Strands of each cut direction. True if fwd, False if rev direction
Strands []bool `json:"strands"`
}
// parses a recognition sequence into a hangInd, cutInd for overhang calculation.
func newEnzyme(name, recogSeq string) enzyme {
cutIndex := strings.Index(recogSeq, "^")
hangIndex := strings.Index(recogSeq, "_")
if cutIndex < hangIndex {
hangIndex--
} else {
cutIndex--
}
recogSeq = strings.Replace(recogSeq, "^", "", -1)
recogSeq = strings.Replace(recogSeq, "_", "", -1)
return enzyme{
name: name,
recog: recogSeq,
seqCutIndex: cutIndex,
compCutIndex: hangIndex,
}
}
// digest a Frag (backbone) with an enzyme's first recogition site
//
// remove the 5' end of the fragment post-cleaving. it will be degraded.
// keep exposed 3' ends. good visual explanation:
// https://warwick.ac.uk/study/csde/gsp/eportfolio/directory/pg/lsujcw/gibsonguide/
func digest(frag *Frag, enzymes []enzyme) (digested *Frag, backbone *Backbone, err error) {
wrappedBp := 38 // largest current recognition site in the list of enzymes
if len(frag.Seq) < wrappedBp {
return &Frag{}, &Backbone{}, fmt.Errorf("%s is too short for digestion", frag.ID)
}
frag.Seq = strings.ToUpper(frag.Seq)
firstHalf := frag.Seq[:len(frag.Seq)/2]
secondHalf := frag.Seq[len(frag.Seq)/2:]
if firstHalf == secondHalf {
// it's a circular fragment that's doubled in the database
frag.Seq = frag.Seq[:len(frag.Seq)/2] // undo the doubling of sequence for circular parts
}
// find all the cutsites
cuts, lengths := cutsites(frag.Seq, enzymes)
// none found
if len(cuts) == 0 {
enzymeNames := []string{}
for _, enzyme := range enzymes {
enzymeNames = append(enzymeNames, enzyme.name)
}
return &Frag{}, &Backbone{}, fmt.Errorf("no %s cutsites found in %s", strings.Join(enzymeNames, ","), frag.ID)
}
// only one cutsite
if len(cuts) == 1 {
cut := cuts[0]
overhangLength := cut.enzyme.seqCutIndex - cut.enzyme.compCutIndex
digestedSeq := ""
if overhangLength >= 0 {
cutIndex := (cut.index + cut.enzyme.seqCutIndex) % len(frag.Seq)
digestedSeq = frag.Seq[cutIndex:] + frag.Seq[:cutIndex]
} else {
bottomIndex := (cut.index + cut.enzyme.seqCutIndex) % len(frag.Seq)
topIndex := (cut.index + cut.enzyme.compCutIndex) % len(frag.Seq)
digestedSeq = frag.Seq[topIndex:] + frag.Seq[:bottomIndex]
}
return &Frag{
ID: frag.ID,
uniqueID: "backbone",
Seq: digestedSeq,
fragType: linear,
db: frag.db,
},
&Backbone{
Seq: frag.Seq,
Enzymes: []string{cut.enzyme.name},
Cutsites: []int{cut.index},
Strands: []bool{cut.strand},
},
nil
}
// find the largest band
largestBand := 0
for i, bandLength := range lengths {
if bandLength > lengths[largestBand] {
largestBand = i
}
}
// find the enzyme from the start and end of the largest band
cut1 := cuts[largestBand]
cut2 := cuts[(largestBand+1)%len(lengths)]
doubled := frag.Seq + frag.Seq
cut1Index := cut1.index + cut1.enzyme.compCutIndex
if cut1.enzyme.seqCutIndex-cut1.enzyme.compCutIndex < 0 {
cut1Index = cut1.index + cut1.enzyme.seqCutIndex
}
cut2Index := cut2.index + cut2.enzyme.compCutIndex
if cut2.enzyme.seqCutIndex-cut2.enzyme.compCutIndex < 0 {
cut2Index = cut2.index + cut2.enzyme.seqCutIndex
}
if cut2Index < cut1Index {
cut2Index += len(frag.Seq)
}
digestedSeq := doubled[cut1Index:cut2Index]
return &Frag{
ID: frag.ID,
uniqueID: "backbone",
Seq: digestedSeq,
fragType: linear,
db: frag.db,
},
&Backbone{
Seq: frag.Seq,
Enzymes: []string{cut1.enzyme.name, cut2.enzyme.name},
Cutsites: []int{cut1Index, cut2Index},
Strands: []bool{cut1.strand, cut2.strand},
},
nil
}
// cutsites finds all the cutsites of a list of enzymes against a target sequence
// also returns the lengths of each "band" of DNA after digestion. Each band length
// corresponds to the band formed with the start of the enzyme at the same index in cuts
func cutsites(seq string, enzymes []enzyme) (cuts []cut, lengths []int) {
s := seq
rcs := reverseComplement(seq)
for _, enzyme := range enzymes {
regexRecognition := recogRegex(enzyme.recog)
reg := regexp.MustCompile(regexRecognition)
for _, submatch := range reg.FindAllStringSubmatchIndex(s, -1) {
index := submatch[0]
cuts = append(cuts, cut{index: index, enzyme: enzyme, strand: true})
}
// if it's a palindrome enzyme, don't scan over it again
if reverseComplement(regexRecognition) == regexRecognition {
continue
}
for _, submatch := range reg.FindAllStringSubmatchIndex(rcs, -1) {
index := submatch[0]
index = len(seq) - index - len(enzyme.recog)
cuts = append(cuts, cut{index: index, enzyme: enzyme, strand: false})
}
}
sort.Slice(cuts, func(i, j int) bool {
return cuts[i].index < cuts[j].index
})
for i, c := range cuts {
next := (i + 1) % len(cuts)
bandLength := (cuts[next].index - c.index + len(seq)) % len(seq)
lengths = append(lengths, bandLength)
}
return
}
// recogRegex turns a recognition sequence into a regex sequence for searching
// sequence for searching the sequence for digestion sites.
func recogRegex(recog string) (decoded string) {
regexDecode := map[rune]string{
'A': "A",
'C': "C",
'G': "G",
'T': "T",
'M': "(A|C)",
'R': "(A|G)",
'W': "(A|T)",
'Y': "(C|T)",
'S': "(C|G)",
'K': "(G|T)",
'H': "(A|C|T)",
'D': "(A|G|T)",
'V': "(A|C|G)",
'B': "(C|G|T)",
'N': "(A|C|G|T)",
'X': "(A|C|G|T)",
}
var regexDecoder strings.Builder
for _, c := range recog {
regexDecoder.WriteString(regexDecode[c])
}
return regexDecoder.String()
}
// NewEnzymeDB returns a new copy of the enzymes db.
func NewEnzymeDB() *kv {
return newKV(config.EnzymeDB)
}
// EnzymesReadCmd writes enzymes that are similar in queried name to stdout.
// if multiple enzyme names include the enzyme name, they are all returned.
// otherwise a list of enzyme names are returned (those beneath a levenshtein distance cutoff).
func EnzymesReadCmd(cmd *cobra.Command, args []string) {
f := NewEnzymeDB()
// from https://golang.org/pkg/text/tabwriter/
w := tabwriter.NewWriter(os.Stdout, 0, 0, 3, ' ', tabwriter.TabIndent)
if len(args) < 1 {
enzymeNames := make([]string, len(f.contents))
i := 0
for name := range f.contents {
enzymeNames[i] = name
i++
}
sort.Strings(enzymeNames)
for _, name := range enzymeNames {
fmt.Fprintf(w, "%s\t%s\n", name, f.contents[name])
}
w.Flush()
return
}
name := args[0]
// if there's an exact match, just log that one
if seq, exists := f.contents[name]; exists {
fmt.Printf("%s %s\n", name, seq)
return
}
ldCutoff := 2
containing := []string{}
lowDistance := []string{}
for fName, fSeq := range f.contents {
if strings.Contains(fName, name) {
containing = append(containing, fName+"\t"+fSeq)
} else if len(fName) > ldCutoff && ld(name, fName, true) <= ldCutoff {
lowDistance = append(lowDistance, fName+"\t"+fSeq)
}
}
if len(containing) < 3 {
lowDistance = append(lowDistance, containing...)
containing = []string{} // clear
}
if len(containing) > 0 {
fmt.Fprint(w, strings.Join(containing, "\n"))
} else if len(lowDistance) > 0 {
fmt.Fprint(w, strings.Join(lowDistance, "\n"))
} else {
fmt.Fprintf(w, "failed to find any enzymes for %s", name)
}
if _, err := w.Write([]byte("\n")); err != nil {
rlog.Fatal(err)
}
w.Flush()
}
// EnzymesAddCmd the enzyme's seq in the database (or create if it isn't in the enzyme db).
func EnzymesAddCmd(cmd *cobra.Command, args []string) {
f := NewEnzymeDB()
name := args[0]
seq := args[1]
if len(args) > 2 {
name = strings.Join(args[:len(args)-1], " ")
seq = args[len(args)-1]
}
seq = strings.ToUpper(seq)
invalidChars := regexp.MustCompile(`[^ATGCMRWYSKHDVBNX_\^]`)
seq = invalidChars.ReplaceAllString(seq, "")
if strings.Count(seq, "^") != 1 || strings.Count(seq, "_") != 1 {
rlog.Fatal("%s is not a valid enzyme recognition sequence. see 'repp find enzyme --help'\n", seq)
}
f.contents[name] = seq
if err := f.save(); err != nil {
rlog.Fatal(err)
}
}
// EnzymesDeleteCmd deletes an enzyme from the database
func EnzymesDeleteCmd(cmd *cobra.Command, args []string) {
f := NewEnzymeDB()
if len(args) < 1 {
if helperr := cmd.Help(); helperr != nil {
rlog.Fatal(helperr)
}
rlog.Fatal("\nexpected an enzyme name")
}
name := args[0]
if len(args) > 1 {
name = strings.Join(args, " ")
}
if _, contained := f.contents[name]; !contained {
fmt.Printf("failed to find %s in the enzymes database\n", name)
}
delete(f.contents, name)
if err := f.save(); err != nil {
rlog.Fatal(err)
}
}