## De Bruijn assembly workflow

In [1]:
import os
import sys

# Make the project modules under ../src (relative to the notebook's working
# directory) importable by the cells below.
script_dir = os.getcwd()
src_dir = os.path.join(script_dir, '../src')
sys.path.append(src_dir)

In [2]:
# my modules
import dbg
import greedy_method as greedy
import mapping as map
import consensus as cons
import alignment as align
import clustering as clus
import preprocessing as prep
import compute_statistics as comp_stat

# import libraries
from tqdm import tqdm
from tempfile import mkdtemp
from itertools import combinations
from collections import defaultdict, Counter
from Bio import SeqIO

import json
import re
import Bio
import shutil
import logging
import importlib
import statistics
import subprocess
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [3]:
def get_sample_metadata(run, chain="", json_path="../json/sample_metadata.json"):
    """Return the metadata entry for ``run`` whose ``"chain"`` equals ``chain``.

    The JSON file at ``json_path`` is expected to map run names to lists of
    entry dicts, each carrying a ``"chain"`` key.

    Raises:
        ValueError: if ``run`` is not a key of the file, or if none of its
            entries has the requested chain.
    """
    with open(json_path, "r") as handle:
        metadata_by_run = json.load(handle)

    if run not in metadata_by_run:
        raise ValueError(f"Run '{run}' not found in metadata.")

    # First entry with a matching chain wins; the sentinel marks "no match".
    not_found = object()
    match = next(
        (record for record in metadata_by_run[run] if record["chain"] == chain),
        not_found,
    )
    if match is not_found:
        raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")
    return match

In [4]:
def get_colors_from_run(cat, is_scaffold=False, json_path="../json/colors.json"):
    """Look up the plot color configured for a sample category.

    Args:
        cat: Category label; only the part before the first underscore is
            used, lower-cased (e.g. ``"BSA_run1"`` -> ``"bsa"``).
        is_scaffold: Select the ``"scaffold"`` entry instead of ``"contig"``.
        json_path: JSON file mapping category -> {"contig": ..., "scaffold": ...}.

    Returns:
        The value stored under ``colors[category][key]``.

    Raises:
        FileNotFoundError: if ``json_path`` does not exist.
        ValueError: if the category or key is missing from the file; the
            original ``KeyError`` is attached as ``__cause__``.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Missing color file: {json_path}")

    with open(json_path, "r") as f:
        colors = json.load(f)

    category = cat.split("_")[0].lower()
    key = "scaffold" if is_scaffold else "contig"

    try:
        return colors[category][key]
    except KeyError as err:
        # Chain explicitly so the traceback shows which lookup failed rather
        # than "during handling of the above exception, another occurred".
        raise ValueError(f"Color not defined for category '{category}' and key '{key}'") from err


In [5]:
def get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap, min_identity, max_mismatches):
    """Build the ``comb_...`` identifier for one parameter combination.

    The ``ks<kmer_size>`` field is included only when ``ass_method`` is
    ``"dbg"``; every other method gets the same name without it.
    """
    fields = [f"comb_{ass_method}", f"c{conf}"]
    if ass_method == "dbg":
        fields.append(f"ks{kmer_size}")
    fields.extend([
        f"ts{size_threshold}",
        f"mo{min_overlap}",
        f"mi{min_identity}",
        f"mm{max_mismatches}",
    ])
    return "_".join(fields)


In [6]:
# Select the run and chain to process, then pull out the fields used below.
run = "ma1"
meta = get_sample_metadata(run, chain="heavy")

protein, chain, proteases = meta["protein"], meta["chain"], meta["proteases"]

In [7]:
# antibodies,ma3heavy,dbg,comb_dbg_c0.9_ks7_ts0_mo4_mi0.8_mm12,39,0.968,72,0.251,30,0.144,0.939,0.907,0.619
# nb6 comb_dbg_c0.86_ks6_ts0_mo4_mi0.9_mm12
# bd17 comb_dbg_c0.88_ks7_ts10_mo4_mi0.9_mm12
# bd15 comb_dbg_c0.92_ks6_ts0_mo4_mi0.9_mm12

# best contig dbg results

# ma1 heavy comb_dbg_c0.88_ks7_ts0_mo4_mi0.9_mm14
# ma2 heavy comb_dbg_c0.86_ks6_ts0_mo3_mi0.9_mm10
# ma3 heavy comb_dbg_c0.88_ks6_ts0_mo4_mi0.8_mm8

# ma1 light comb_dbg_c0.86_ks7_ts0_mo4_mi0.9_mm8
# ma2 light comb_dbg_c0.92_ks7_ts5_mo3_mi0.9_mm10
# ma3 light comb_dbg_c0.92_ks6_ts10_mo3_mi0.8_mm14

In [8]:
# Assembly configuration for this notebook run.
ass_method = "dbg"     # method tag used in output names
kmer_size = 7          # k passed to dbg.get_kmers
conf = 0.9             # rows are kept when df['conf'] > conf
size_threshold = 10    # sequences are kept when len(seq) > size_threshold
min_overlap = 3        # passed to dbg.create_scaffolds
min_identity = 0.9     # passed to map.process_protein_contigs_scaffold
max_mismatches = 10    # passed to map.process_protein_contigs_scaffold

In [9]:
# Identifier of this parameter combination; printed for reference.
comb = get_combination_name(
    ass_method=ass_method,
    conf=conf,
    kmer_size=kmer_size,
    size_threshold=size_threshold,
    min_overlap=min_overlap,
    min_identity=min_identity,
    max_mismatches=max_mismatches,
)

print(comb)

comb_dbg_c0.9_ks7_ts10_mo3_mi0.9_mm10


In [10]:
# Parameter record forwarded (via **params) to the statistics helper.
# ass_method is taken from the configuration variable rather than a literal,
# so the record always matches the method encoded in the output folder name.
params = {"ass_method": ass_method,
          "conf": conf,
          "kmer_size": kmer_size,
          "min_overlap": min_overlap,
          "min_identity": min_identity,
          "max_mismatches": max_mismatches,
          "size_threshold": size_threshold
          }

In [11]:
# Per-sample output root, e.g. ../outputs/<run><chain>.
folder_outputs = f"../outputs/{run}{chain}"

prep.create_directory(folder_outputs)

# Reuse the combination name computed above instead of re-building it by hand:
# the hand-written pattern always included the ks<k> field, which
# get_combination_name omits for non-dbg methods, so the two could diverge.
combination_folder_out = os.path.join(folder_outputs, comb)

prep.create_subdirectories_outputs(combination_folder_out)

### Data cleaning

In [12]:
# Normalize the reference protein sequence with the project helper
# (the exact normalization lives in the preprocessing module, not shown here).
protein_norm = prep.normalize_sequence(protein)

# Load this run's input table; later cells read its 'experiment_name',
# 'preds' and 'conf' columns.
df = pd.read_csv(f"../input/{run}.csv")

In [13]:
# Derive a 'protease' column from each experiment name, given the list of
# proteases declared in the run metadata.
df['protease'] = df['experiment_name'].apply(lambda name: prep.extract_protease(name, proteases))

# Project-level cleaning step.
# NOTE(review): what it drops or renames is defined in preprocessing — not visible here.
df = prep.clean_dataframe(df)

In [14]:
# Apply prep.remove_modifications to every 'preds' value and store the result
# in 'cleaned_preds' (presumably strips modification annotations — confirm in preprocessing).
df['cleaned_preds'] = df['preds'].apply(prep.remove_modifications)

In [15]:
# Plain Python list of the cleaned predictions, in row order.
cleaned_psms = df['cleaned_preds'].tolist()

In [16]:
# Filter the peptide list against ../fasta/contaminants.fasta.
# Presumably returns the peptides that are NOT contaminants (the next cell
# keeps only rows found in this result) — TODO confirm in preprocessing.
filtered_psms = prep.filter_contaminants(cleaned_psms, run, "../fasta/contaminants.fasta")

In [17]:
# Keep only rows whose cleaned peptide survived the contaminant filter.
# .copy() makes the result an independent frame, so the column assignment in
# the following cell writes to this frame and not to a slice of the previous
# one (which pandas flags with SettingWithCopyWarning).
df = df[df['cleaned_preds'].isin(filtered_psms)].copy()

In [18]:
# Flag peptides that occur verbatim as a substring of the normalized reference.
# NOTE(review): the flag is stored as the strings "True"/"False", not booleans;
# both strings are truthy, so downstream code must compare against the string.
df["mapped"] = df["cleaned_preds"].apply(lambda x: "True" if x in protein_norm else "False")

In [19]:
# Keep rows whose confidence is strictly above the configured threshold.
df = df.loc[df['conf'].gt(conf)]

In [20]:
# Renumber rows 0..n-1 after filtering, discarding the old index.
df = df.reset_index(drop=True)

In [21]:
# Peptides that passed every filter above; these are the assembly input.
final_psms = df['cleaned_preds'].tolist()

In [22]:
# Map the filtered peptides onto the reference with the same mismatch/identity
# limits later used for contigs and scaffolds.
# NOTE(review): `map` here is the project's mapping module (shadows the builtin),
# and mapped_psms is not used by any cell visible in this notebook section.
mapped_psms = map.process_protein_contigs_scaffold(final_psms, protein_norm, max_mismatches, min_identity)

### Assembly

In [23]:
# Split the filtered peptides into k-mers of the configured size.
kmers = dbg.get_kmers(final_psms, kmer_size=kmer_size)

In [24]:
# Build the De Bruijn graph edges from the k-mers.
edges = dbg.get_debruijn_edges_from_kmers(kmers)

In [25]:
# Assemble contigs from the graph edges (shows a progress bar while running).
assembled_contigs = dbg.assemble_contigs(edges)

Traversing nodes: 100%|██████████| 209/209 [00:00<00:00, 52617.62it/s]


In [26]:
# Longest contigs first; sorted() is stable, so equal-length contigs keep their relative order.
assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

In [27]:
# Drop duplicate contigs while keeping first-seen order.
# list(set(...)) iterates in hash order, which changes between kernel sessions
# for strings; the later stable length sort then leaves equal-length contigs
# in that arbitrary order, so contig IDs were not reproducible.
assembled_contigs = list(dict.fromkeys(assembled_contigs))

In [28]:
# Discard contigs that are not strictly longer than the size threshold.
assembled_contigs = [contig for contig in assembled_contigs if size_threshold < len(contig)]

In [29]:
# Re-establish longest-first order after deduplication and filtering.
assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

In [30]:
# assembled_contigs = list(dict.fromkeys(assembled_contigs))

# assembled_contigs = [seq for seq in assembled_contigs if len(seq) > size_threshold]

# assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

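# NOTE: set() does not preserve order, so the set-based dedup may be what makes the contig order differ between runs; dict.fromkeys() (above) keeps first-seen order.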

In [31]:
# One FASTA record per contig, numbered from 1 in the current (longest-first) order.
records = [
    Bio.SeqRecord.SeqRecord(
        Bio.Seq.Seq(contig),
        id=f"contig_{number}",
        description=f"length: {len(contig)}",
    )
    for number, contig in enumerate(assembled_contigs, start=1)
]

In [32]:
# Write the contig records as FASTA into the combination's contigs/ folder;
# the cell output is SeqIO.write's return value.
Bio.SeqIO.write(records, f"{combination_folder_out}/contigs/{ass_method}_contig_{conf}_{run}.fasta", "fasta")

311

In [33]:
# Map the assembled contigs onto the normalized reference using the configured
# mismatch and identity limits (`map` is the project's mapping module).
mapped_contigs = map.process_protein_contigs_scaffold(assembled_contigs, protein_norm, max_mismatches, min_identity)

In [34]:
# Tabulate the contig mapping results for the statistics step.
df_contigs_mapped = map.create_dataframe_from_mapped_sequences(data = mapped_contigs)

In [35]:
# Contig-level assembly statistics; the returned summary is shown as the cell output.
comp_stat.compute_assembly_statistics(
    df=df_contigs_mapped,
    sequence_type='contigs',
    output_folder=f'{combination_folder_out}/statistics',
    reference=protein_norm,
    **params,
)


{'ass_method': 'dbg',
 'conf': 0.9,
 'kmer_size': 7,
 'min_overlap': 3,
 'min_identity': 0.9,
 'max_mismatches': 10,
 'size_threshold': 10,
 'reference_start': 0,
 'reference_end': 459,
 'total_sequences': 48,
 'average_length': 38.895833333333336,
 'min_length': 14,
 'max_length': 73,
 'coverage': 0.9629629629629629,
 'mean_identity': 0.9600039990804534,
 'median_identity': 0.9578900709219859,
 'perfect_matches': 13,
 'total_mismatches': 21,
 'N50': 49,
 'N90': 22}

In [36]:
# Plot the contigs against the reference and save the figure as SVG in the
# current directory.
# The title and file name now carry the chain, matching the scaffold plot
# below; without it, heavy- and light-chain runs of the same sample wrote to
# the same file and overwrote each other.
# NOTE(review): the color categories ("nanobodies", "bsa") are hard-coded and
# do not depend on `run` — confirm they are the intended palette for this sample.
map.mapping_substitutions(mapped_sequences = mapped_contigs,
                          prot_seq = protein_norm,
                          title= f"Contig mapping to reference sequence, {run} {chain}",
                          contig_colors = get_colors_from_run("nanobodies", is_scaffold=False),
                          match_color = get_colors_from_run("bsa", is_scaffold=False),
                          output_file=f"fig_X_{run}{chain}_substitution_map_contigs_dbg.svg",
                          output_folder="."
                          )

Plotting contigs: 100%|██████████| 48/48 [00:02<00:00, 23.25it/s]


Plotted 48 sequences.


If `scaffold_iterative` is too slow or does not work, it is recommended to run only the few iterations in the following raw cell.

In [37]:
def _dedup_sort_filter(sequences, min_length):
    """Return unique sequences, longest first, keeping only len(seq) > min_length.

    Duplicates are removed with dict.fromkeys so first-seen order is kept;
    a set would iterate in hash order and make the order of equal-length
    sequences (and anything computed from it) vary between kernel sessions.
    """
    unique = list(dict.fromkeys(sequences))
    longest_first = sorted(unique, key=len, reverse=True)
    return [seq for seq in longest_first if len(seq) > min_length]


# Step 1: join contigs into scaffolds using the configured minimum overlap, then tidy up.
assembled_scaffolds = dbg.create_scaffolds(assembled_contigs, min_overlap)
assembled_scaffolds = _dedup_sort_filter(assembled_scaffolds, size_threshold)

# Step 2: merge the scaffolds and tidy up again.
assembled_scaffolds = dbg.merge_sequences(assembled_scaffolds)
assembled_scaffolds = _dedup_sort_filter(assembled_scaffolds, size_threshold)

Finding overlaps: 100%|██████████| 48205/48205 [00:00<00:00, 95651.32it/s] 
Merging overlaps: 100%|██████████| 122/122 [00:00<00:00, 581483.05it/s]
Merging contigs: 100%|██████████| 433/433 [00:00<00:00, 20767.68it/s]


In [38]:
# One FASTA record per scaffold, numbered from 1 in the current order.
records = []
for i, seq in enumerate(assembled_scaffolds):
    record = Bio.SeqRecord.SeqRecord(
        Bio.Seq.Seq(seq),
        id=f"scaffold_{i+1}",
        description=f"length: {len(seq)}",
    )
    records += [record]

In [39]:
# Write the scaffold records as FASTA into the combination's scaffolds/ folder;
# the clustering step below reads every .fasta file from that folder.
Bio.SeqIO.write(records, f"{combination_folder_out}/scaffolds/{ass_method}_scaffold_{conf}_{kmer_size}_{run}.fasta", "fasta")

354

In [40]:
# Map the scaffolds onto the normalized reference with the same limits used for contigs.
mapped_scaffolds = map.process_protein_contigs_scaffold(assembled_scaffolds, protein_norm, max_mismatches, min_identity)

In [41]:
# Tabulate the scaffold mapping results for the statistics step.
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(data = mapped_scaffolds)

In [42]:
# Scaffold-level assembly statistics; the returned summary is shown as the cell output.
comp_stat.compute_assembly_statistics(
    df=df_scaffolds_mapped,
    sequence_type='scaffolds',
    output_folder=f"{combination_folder_out}/statistics",
    reference=protein_norm,
    **params,
)

{'ass_method': 'dbg',
 'conf': 0.9,
 'kmer_size': 7,
 'min_overlap': 3,
 'min_identity': 0.9,
 'max_mismatches': 10,
 'size_threshold': 10,
 'reference_start': 0,
 'reference_end': 459,
 'total_sequences': 51,
 'average_length': 51.372549019607845,
 'min_length': 14,
 'max_length': 103,
 'coverage': 0.9629629629629629,
 'mean_identity': 0.960236085596241,
 'median_identity': 0.9583333333333334,
 'perfect_matches': 9,
 'total_mismatches': 31,
 'N50': 59,
 'N90': 33}

In [43]:
# Plot the scaffolds against the reference and save the figure as SVG in the
# current directory. Both color arguments use the "bsa" scaffold color.
map.mapping_substitutions(
    mapped_sequences=mapped_scaffolds,
    prot_seq=protein_norm,
    title=f"Scaffold mapping to reference sequence, {run} {chain}",
    contig_colors=get_colors_from_run("bsa", is_scaffold=True),
    match_color=get_colors_from_run("bsa", is_scaffold=True),
    output_file=f"fig_X_{run}{chain}_substitution_map_scaffolds_dbg.svg",
    output_folder=".",
)


Plotting contigs: 100%|██████████| 51/51 [00:03<00:00, 12.90it/s]


Plotted 51 sequences.


### Clustering

In [44]:
# Derive the scaffolds folder from the combination folder that was actually
# created earlier, instead of re-assembling the path from run/chain/comb;
# this way the clustering input can never point at a folder that was not created.
scaffolds_folder_out = os.path.join(combination_folder_out, "scaffolds")
print(f"scaffolds_folder_out: {scaffolds_folder_out}")

scaffolds_folder_out: ../outputs/ma1heavy/comb_dbg_c0.9_ks7_ts10_mo3_mi0.9_mm10/scaffolds


In [45]:
# Cluster every FASTA file in the scaffolds folder; per the printed log, the
# results are stored under <scaffolds folder>/cluster/.
clus.cluster_fasta_files(input_folder = scaffolds_folder_out)

the current fasta path is: ../outputs/ma1heavy/comb_dbg_c0.9_ks7_ts10_mo3_mi0.9_mm10/scaffolds/dbg_scaffold_0.9_7_ma1.fasta
Clustering dbg_scaffold_0.9_7_ma1.fasta...
Clustering completed for dbg_scaffold_0.9_7_ma1.fasta, results stored with prefix ../outputs/ma1heavy/comb_dbg_c0.9_ks7_ts10_mo3_mi0.9_mm10/scaffolds/cluster/dbg_scaffold_0.9_7_ma1
All clustering tasks completed.


In [46]:
# Folder holding the clustering results written by the previous step.
cluster_folder_out = os.path.join(scaffolds_folder_out, "cluster")
print(cluster_folder_out)

../outputs/ma1heavy/comb_dbg_c0.9_ks7_ts10_mo3_mi0.9_mm10/scaffolds/cluster


In [47]:
# Inputs: clustering results; outputs: one folder of per-cluster FASTA files.
cluster_tsv_folder = os.path.join(scaffolds_folder_out, "cluster")
output_base_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")

# Process each scaffold FASTA file found directly in the scaffolds folder.
for fasta_file in os.listdir(scaffolds_folder_out):
    if not fasta_file.endswith('.fasta'):
        continue
    fasta_path = os.path.join(scaffolds_folder_out, fasta_file)
    clus.process_fasta_and_clusters(fasta_path, cluster_tsv_folder, output_base_folder)

Processing clusters for dbg_scaffold_0.9_7_ma1: 100%|██████████| 106/106 [00:00<00:00, 169.28it/s]


### Alignment

In [48]:
# Alignment output folder, next to the per-cluster FASTA folder it reads from.
align_folder = os.path.join(scaffolds_folder_out, "align")
cluster_fasta_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")
prep.create_directory(align_folder)

In [49]:
# Mirror the cluster_fasta/ tree into align/: every <name>.fasta in a cluster
# sub-folder produces <name>_out.afa in the matching output sub-folder.
for cluster_folder in os.listdir(cluster_fasta_folder):
    cluster_folder_path = os.path.join(cluster_fasta_folder, cluster_folder)
    if not os.path.isdir(cluster_folder_path):
        continue  # skip stray files next to the cluster folders

    output_cluster_folder = os.path.join(align_folder, cluster_folder)
    os.makedirs(output_cluster_folder, exist_ok=True)

    for fasta_file in os.listdir(cluster_folder_path):
        if not fasta_file.endswith('.fasta'):
            continue
        fasta_file_path = os.path.join(cluster_folder_path, fasta_file)
        base_filename = os.path.splitext(fasta_file)[0]
        output_file = os.path.join(output_cluster_folder, f"{base_filename}_out.afa")

        align.align_or_copy_fasta(fasta_file_path, output_file)

print("All alignment tasks completed.")

All alignment tasks completed.


### Consensus

In [50]:
# Destination folder for the consensus sequences.
consensus_folder = os.path.join(scaffolds_folder_out, "consensus")

In [51]:
# Process the alignment files in align/ and write results to consensus/.
# This is the slowest step of the recorded run (about 8 minutes for 106 items).
cons.process_alignment_files(align_folder, consensus_folder)

Processing ../outputs/ma1heavy/comb_dbg_c0.9_ks7_ts10_mo3_mi0.9_mm10/scaffolds/align/dbg_scaffold_0.9_7_ma1_cluster_fasta


100%|██████████| 106/106 [08:18<00:00,  4.70s/it]


In [52]:
# Load the consensus sequences back from the consensus folder.
# NOTE(review): the return type is defined in the consensus module — not visible here.
all_sequences = cons.load_all_consensus_sequences(consensus_folder)