In [1]:
# ========== Step One -- Don't Modify Anything in This Cell
# Import Libraries and Modules
from Bio import SeqIO
from Bio.Seq import Seq
import subprocess
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import NGSanalysis_module as ngs

In [2]:
# ========== Step Three -- Modify Parameters for Your Use Case
# Every value in this cell is a user-editable setting read by the analysis cell below.
# The sequence settings are currently set to the AcCAST values listed in the
# commented alternatives next to each of them.

# Parameters to Change

# Locations of the BBMap and samtools installations; both are handed to ngs.process_reads.
# NOTE(review): presumably these must be directories ending in "/" -- confirm against NGSanalysis_module.
bbmap_bin = '/Users/ekello73/src/bbmap/'
samtools_bin = '/Users/ekello73/src/samtools-1.22.1/bin/bin/'

# Absolute filepath to your merged paired-end reads in .fastq format
fastq_filepath = "/Users/ekello73/projects/NGS/datasets/PrashantTagmentation/PD2_merged.fastq"

# Primer sequence up to the end of the LE or RE (currently the AcCAST LE listed below)
primer = "CTGAAAAACAACCACCACGACATTAATTTGCGAATAACGACACTAAATTGCGAAAAGCGACATTTAATTTGCGAATGTACA"
# AcCAST LE CTGAAAAACAACCACCACGACATTAATTTGCGAATAACGACACTAAATTGCGAAAAGCGACATTTAATTTGCGAATGTACA
# PmcCAST LE CCCTTTATTGGAAGCATAAGCTTGCCGTTGCGGCAAAGTTATGGGTAAAGTCACA

# Absolute filepath to the CJP003 genome .fasta file
referencegenome_filepath = "/Users/ekello73/projects/NGS/genomes/cJP003_assembly.fasta"

# The first 10 bases in your donor plasmid immediately following the LE or RE --
# used to throw away non-enzymatic integrations/recombinations
first_ten_in_donor = "TATATAATGG"
# AcCAST TATATAATGG
# PmcCAST ATCCCAATGG

# Sequence of the targeting portion of your sgRNA (currently AcCAST Spacer 2 from the list below)
sgRNA_sequence = "TTTTTGGGCTAGCGATTGAAAACG"

# PmcCAST De novo sgRNAs
# Spacer 1 TGAAAGCTGGCGCATGATGACCACCGATA
# Spacer 2 TGAACTTTACCCGGTGGTGCATATCGGGGA
# Spacer 3 ACTTTATCTGACAGCAGACGTGCACTGGCC
# Spacer 4 AGGTTAATGGCGTTTTTGATGTCATTTTCG

# AcCAST sgRNAs
# Spacer 1 TCTCCATACCCGTTTTTTTGGGCT
# Spacer 2 TTTTTGGGCTAGCGATTGAAAACG
# Spacer 3 TACACCTATAAAAGAGAGAGCCGT

# Length of the CAST's PAM sequence (number of bases; passed to ngs.sgRNA_finder)
PAM_len = 3

# Is this a naturally occurring CAST system?
natural_system = True # Set to False for a novel CAST -- the integration site relative to the PAM differs from natural systems

# Directory for the output files -- change to the absolute filepath where you want them exported.
# It is joined to the file names below by plain string concatenation, so it must end with "/".
path_to_output_dir = "./test/"

# Prefix added to the beginning of each output file name for simplicity
sample_abbreviation = "PD2"

# Names of files of interest -- change to your liking
# NOTE(review): trimmed_fastq is not referenced by the other cells shown here -- confirm it is still needed.
trimmed_fastq = f"{sample_abbreviation}_trimmed_reads.fastq"
ForwardInsertions = f"{sample_abbreviation}_forward_insertions.txt"
ReverseInsertions = f"{sample_abbreviation}_reverse_insertions.txt"
genome_wide_integrations = f"{sample_abbreviation}_genome_insertion_histogram.png"
ccdb_integrations = f"{sample_abbreviation}_CCDB_insertion_histogram.png"
integration_distances = f"{sample_abbreviation}_Integration_profile.png"
integration_report = f"{sample_abbreviation}_integration_report.txt"

# Useful variables -- only change if necessary
# Everything below is derived from the parameters above and the reference genome.
genome_sequence = ngs.fasta_to_string(referencegenome_filepath)
ccdb_gene = "ATGCAGTTTAAGGTTTACACCTATAAAAGAGAGAGCCGTTATCGTCTGTTTGTGGATGTACAGAGTGATATTATTGACACGCCCGGGCGACGGATGGTGATCCCCCTGGCCAGTGCACGTCTGCTGTCAGATAAAGTCTCCCGTGAACTTTACCCGGTGGTGCATATCGGGGATGAAAGCTGGCGCATGATGACCACCGATATGGCCAGTGTGCCGGTCTCCGTTATCGGGGAAGAAGTGGCTGATCTCAGCCACCGCGAAAATGACATCAAAAACGCCATTAACCTGATGTTTTGGGGAATATAA"

# Position of the ccdB gene in the genome. `find` returns -1 when the sequence is
# absent; that value would silently shift the ccdB plot window and the on-target
# statistics to the wrong coordinates, so stop here with a clear message instead.
ccdb_start_site = genome_sequence.find(ccdb_gene)
if ccdb_start_site == -1:
    raise ValueError(
        f"ccdB gene sequence not found in reference genome '{referencegenome_filepath}'. "
        "Check that the correct genome file is used and that its sequence is upper-case "
        "and in the same orientation as ccdb_gene."
    )

# Locate the sgRNA target and its PAM in the genome; is_rc is forwarded to
# ngs.integration_finder in the analysis cell.
# NOTE(review): is_rc presumably flags a reverse-complement match -- confirm in NGSanalysis_module.
sgRNA_start_site, PAM_start_site, PAM_end_site, is_rc = ngs.sgRNA_finder(sgRNA_sequence, genome_sequence, PAM_len, natural_system)

# (start, end, colour) regions drawn on the insertion histograms: sgRNA target in red, PAM in black
highlight = [
    (sgRNA_start_site, sgRNA_start_site + len(sgRNA_sequence), 'red'), (PAM_start_site, PAM_end_site, 'black')
]


In [3]:
# Run the read-processing pipeline and save the resulting insertion positions.
# bbmap_bin and samtools_bin come from the parameters cell; they are deliberately
# NOT re-assigned here so that edits made in that cell are actually used.

# Create the output directory before anything is written into it; the
# write_text calls below fail with FileNotFoundError on a fresh directory otherwise.
Path(path_to_output_dir).mkdir(parents=True, exist_ok=True)

forward_insertions, reverse_insertions = ngs.process_reads(fastq_filepath,
                                                           primer,
                                                           first_ten_in_donor,
                                                           referencegenome_filepath,
                                                           bbmap_bin,
                                                           samtools_bin,
                                                           path_to_output_dir)

# Save positions, one per line
Path(f"{path_to_output_dir}{ForwardInsertions}").write_text("\n".join(map(str, forward_insertions)))
Path(f"{path_to_output_dir}{ReverseInsertions}").write_text("\n".join(map(str, reverse_insertions)))

# Insertion histograms: one across the whole genome, one zoomed on the ccdB gene
# (the window starts 50 bases before the gene and stops at its last base).
ngs.plot_insertion_histogram(
    forward_insertions,
    reverse_insertions,
    region_start=0,
    region_end=len(genome_sequence),
    title="Genome",
    highlight_regions=highlight,
    filename=f"{path_to_output_dir}{genome_wide_integrations}",
)
ngs.plot_insertion_histogram(
    forward_insertions,
    reverse_insertions,
    region_start=ccdb_start_site-50,
    region_end=ccdb_start_site+len(ccdb_gene),
    title="CCDB Gene",
    highlight_regions=highlight,
    filename=f"{path_to_output_dir}{ccdb_integrations}",
)

# Integration profile: run integration_finder on each strand (forward first,
# then reverse), then plot both result sets together.
modified_forward_insertions, modified_reverse_insertions = (
    ngs.integration_finder(strand_insertions, ccdb_gene, ccdb_start_site, PAM_start_site, PAM_end_site, natural_system, is_rc)
    for strand_insertions in (forward_insertions, reverse_insertions)
)

combined_integrations = modified_forward_insertions + modified_reverse_insertions

ngs.plot_integration_profile(
    combined_integrations,
    filename=f"{path_to_output_dir}{integration_distances}",
)

# --- Calculate integration stats ---
def _percent(part, whole):
    """Return `part` as a percentage of `whole`, or 0 when `whole` is zero."""
    return 100 * part / whole if whole else 0


# Raw counts per strand, and the subset returned by integration_finder
fwd_total, rev_total = len(forward_insertions), len(reverse_insertions)
fwd_on_target, rev_on_target = len(modified_forward_insertions), len(modified_reverse_insertions)
total = fwd_total + rev_total
on_target_total = fwd_on_target + rev_on_target

fwd_pct = _percent(fwd_on_target, fwd_total)
rev_pct = _percent(rev_on_target, rev_total)
total_pct = _percent(on_target_total, total)

# Summary statistics over both strands; fall back to 0 when nothing was found
all_lengths = modified_forward_insertions + modified_reverse_insertions
if all_lengths:
    mean_len, median_len, std_len = np.mean(all_lengths), np.median(all_lengths), np.std(all_lengths)
else:
    mean_len = median_len = std_len = 0

# --- Format and write report ---
# Plain-text summary of the counts, on-target percentages and summary statistics
report_text = f"""Integration Analysis Report
==================================
Total insertions:
  Forward strand: {fwd_total}
  Reverse strand: {rev_total}
  Combined total: {total}

On-target insertions (within CCDB region):
  Forward strand: {fwd_on_target} ({fwd_pct:.2f}%)
  Reverse strand: {rev_on_target} ({rev_pct:.2f}%)
  Combined on-target: {on_target_total} ({total_pct:.2f}%)

Integration length statistics (bp):
  Mean length:   {mean_len:.2f}
  Median length: {median_len:.2f}
  Std deviation: {std_len:.2f}
"""

# Make sure the destination directory exists, then write the report into it
output_report_path = Path(f"{path_to_output_dir}{integration_report}")
report_dir = output_report_path.parent
report_dir.mkdir(parents=True, exist_ok=True)
with output_report_path.open("w") as report_file:
    report_file.write(report_text)

print(f"Integration report written to '{output_report_path.resolve()}'")

Total reads before trimming: 84463
Total reads after primer trimming: 11509
Total reads after donor removal: 5275


Max memory cannot be determined.  Attempting to use 3200 MB.
If this fails, please add the -Xmx flag (e.g. -Xmx24g) to your command, 
or run this program qsubbed or from a qlogin session on Genepool, or set ulimit to an appropriate value.
java -ea -Xmx3200m -Xms3200m -cp /Users/ekello73/src/bbmap/current/ align2.BBMap build=1 overwrite=true fastareadlen=500 ref=/Users/ekello73/projects/NGS/genomes/cJP003_assembly.fasta
Executing align2.BBMap [build=1, overwrite=true, fastareadlen=500, ref=/Users/ekello73/projects/NGS/genomes/cJP003_assembly.fasta]
Version 39.11

No output file.
NOTE:	Deleting contents of ref/genome/1 because reference is specified and overwrite=true
NOTE:	Deleting contents of ref/index/1 because reference is specified and overwrite=true
Writing reference.
Executing dna.FastaToChromArrays2 [/Users/ekello73/projects/NGS/genomes/cJP003_assembly.fasta, 1, writeinthread=false, genscaffoldinfo=true, retain, waitforwriting=false, gz=true, maxlen=536670912, writechroms=true, m

Integration report written to '/Users/ekello73/projects/NGS/TagmentationAnalysis/test/PD2_integration_report.txt'
