Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add sequencing functions #49

Merged
merged 10 commits into from
Jan 19, 2024
Merged

add sequencing functions #49

merged 10 commits into from
Jan 19, 2024

Conversation

Koeng101
Copy link
Owner

@Koeng101 Koeng101 commented Jan 2, 2024

This PR adds some functions for handling / analyzing DNA sequencing.

@Koeng101
Copy link
Owner Author

Koeng101 commented Jan 2, 2024

Channels should be passing values, not pointers. This needs to be fixed.

@Koeng101
Copy link
Owner Author

Koeng101 commented Jan 4, 2024

Things to do:

  • Make all concurrent functions implement context closing
  • Replace all channel operations with values instead of pointers
  • Update non-err functions with nil return for consistent styling
  • Make generic function for workflows (RunWorkflow) which takes in the errorGroup functions and the number of workers for each
  • Fix the damn linter

@Koeng101
Copy link
Owner Author

func TrimBarcodeWithChannels(barcodeName string, fastqInput <-chan fastq.Read, fastqOutput chan<- fastq.Read) error {
	for {
		select {
		case data, ok := <-fastqInput:
			if !ok {
				// If the input channel is closed, we close the output channel and return
				close(fastqOutput)
				return nil
			}
			trimmedRead, err := TrimBarcode(barcodeName, *data)
			if err != nil {
				close(fastqOutput)
				return err
			}
			fastqOutput <- &trimmedRead
		}
	}

}

// TrimBarcode takes a barcodeName and a fastqSequence and returns a trimmed
// barcode.
func TrimBarcode(barcodeName string, fastqRead fastq.Read) (fastq.Read, error) {
	// Copy into new fastq read
	var newFastqRead fastq.Read
	newFastqRead.Identifier = fastqRead.Identifier
	newFastqRead.Optionals = make(map[string]string)
	for key, value := range fastqRead.Optionals {
		newFastqRead.Optionals[key] = value
	}

	// Get the barcode
	barcode, ok := NativeBarcodeMap[barcodeName]
	if !ok {
		return newFastqRead, fmt.Errorf("barcode %s not found in NativeBarcodeMap.", barcodeName)
	}

	// Trim in the forward direction
	var sequence string
	var quality string
	sequence = fastqRead.Sequence
	quality = fastqRead.Quality
	if len(sequence) > 80 {
		score, alignment, err := Align(sequence[:80], barcode.Forward)
		if err != nil {
			return newFastqRead, err
		}
		// Empirically from looking around, it seems to me that approximately a
		// score of 21 looks like a barcode match. This is almost by definition a
		// magic number, so it is defined above as so.
		if score >= ScoreMagicNumber {
			modifiedAlignment := strings.ReplaceAll(alignment, "-", "")
			index := strings.Index(sequence, modifiedAlignment)
			// The index needs to be within the first 80 base pairs. Usually the
			// native adapter + native barcode is within the first ~70 bp, but we
			// put a small buffer.
			if index != -1 {
				// 7 here is a magic number between the native barcode and your
				// target sequence. It's just a number that exists irl, so it is
				// not defined above.
				newStart := index + 7
				if newStart < len(sequence) {
					sequence = sequence[newStart:]
					quality = quality[newStart:]
				}
			}
		}
	}

	// Now do the same thing with the reverse strand.
	if len(sequence) > 80 {
		score, alignment, err := Align(sequence[len(sequence)-80:], barcode.Reverse)
		if err != nil {
			return newFastqRead, err
		}
		if score >= ScoreMagicNumber {
			modifiedAlignment := strings.ReplaceAll(alignment, "-", "")
			index := strings.Index(sequence, modifiedAlignment)
			// This time we need to check within the last 80 base pairs.
			if index != -1 {
				newEnd := index - 7
				if newEnd < len(sequence) {
					sequence = sequence[:newEnd]
					quality = quality[:newEnd]
				}
			}
		}
	}
	newFastqRead.Sequence = sequence
	newFastqRead.Quality = quality
	return newFastqRead, nil
}

// Align uses the SmithWaterman algorithm to align a barcode to a sequence.
// It is used because it is a rather simple algorithm, and since barcodes are
// sufficiently small, fast enough.
func Align(sequence string, barcodeSequence string) (int, string, error) {
	m := [][]int{
		/*       A C G T U */
		/* A */ {1, -1, -1, -1, -1},
		/* C */ {-1, 1, -1, -1, -1},
		/* G */ {-1, -1, 1, -1, -1},
		/* T */ {-1, -1, -1, 1, -1},
		/* U */ {-1, -1, -1, -1, 1},
	}

	alphabet := alphabet.NewAlphabet([]string{"A", "C", "G", "T", "U"})
	subMatrix, err := matrix.NewSubstitutionMatrix(alphabet, alphabet, m)
	if err != nil {
		return 0, "", err
	}
	scoring, err := align.NewScoring(subMatrix, -1)
	if err != nil {
		return 0, "", err
	}
	score, alignSequence, _, err := align.SmithWaterman(sequence, barcodeSequence, scoring)
	if err != nil {
		return 0, "", err
	}
	return score, alignSequence, nil
}

I don't think I need trim barcodes right now, with megamash now being a thing. So I'm deleting this block of code, but want to maintain it here for posterity.

@Koeng101
Copy link
Owner Author

Mostly, I just want to add docs for megamash, then this is ready.

@Koeng101 Koeng101 mentioned this pull request Jan 15, 2024
@Koeng101 Koeng101 merged commit 3aab61c into main Jan 19, 2024
5 checks passed
@Koeng101 Koeng101 deleted the sequencing branch January 19, 2024 23:45
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

1 participant