## De Bruijn assembly workflow

In [1]:
import os
import sys

# Make the project's local modules importable: append ../src, resolved
# against the current working directory, to the module search path.
script_dir = os.getcwd()
sys.path.append(os.path.join(script_dir, '../src'))

In [2]:
# my modules
import dbg
import greedy_method as greedy
import mapping as map
import consensus as cons
import alignment as align
import clustering as clus
import preprocessing as prep
import compute_statistics as comp_stat

# import libraries
from tqdm import tqdm
from tempfile import mkdtemp
from itertools import combinations
from collections import defaultdict, Counter
from Bio import SeqIO

import json
import re
import Bio
import shutil
import logging
import importlib
import statistics
import subprocess
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [3]:
def get_sample_metadata(run, chain="", json_path="../json/sample_metadata.json"):
    """Look up the metadata entry for a run and chain.

    The JSON file at ``json_path`` is read as a mapping from run name to a
    list of entries; each entry is compared through its ``"chain"`` value.

    Args:
        run: Run name used as the top-level key of the metadata file.
        chain: Chain label an entry's ``"chain"`` value must equal.
        json_path: Location of the metadata JSON file.

    Returns:
        The first entry listed under ``run`` whose ``"chain"`` equals
        ``chain``.

    Raises:
        ValueError: If ``run`` is not in the file, or none of its entries
            has the requested chain.
    """
    with open(json_path, "r") as handle:
        metadata_by_run = json.load(handle)

    if run not in metadata_by_run:
        raise ValueError(f"Run '{run}' not found in metadata.")

    # Lazily scan the run's entries and stop at the first chain match.
    candidates = (item for item in metadata_by_run[run] if item["chain"] == chain)
    found = next(candidates, None)
    if found is None:
        raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")
    return found

In [4]:
def get_colors_from_run(cat, is_scaffold=False, json_path="../json/colors.json"):
    """Return the color configured for a sample category.

    The category is the part of ``cat`` before the first underscore,
    lower-cased. The color file is read as a mapping from category to a
    mapping with ``"contig"`` and ``"scaffold"`` entries.

    Args:
        cat: Category or run label, e.g. ``"bsa"`` or ``"BSA_run1"``.
        is_scaffold: Select the ``"scaffold"`` color instead of ``"contig"``.
        json_path: Location of the color JSON file.

    Returns:
        The value stored under ``colors[category][key]``.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
        ValueError: If the category or key is missing from the color file;
            the original ``KeyError`` is attached as the cause.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Missing color file: {json_path}")

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        colors = json.load(f)

    category = cat.split("_")[0].lower()
    key = "scaffold" if is_scaffold else "contig"

    try:
        return colors[category][key]
    except KeyError as err:
        # Chain explicitly so the traceback shows which lookup failed.
        raise ValueError(
            f"Color not defined for category '{category}' and key '{key}'"
        ) from err


In [5]:
def get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap, min_identity, max_mismatches):
    """Build the identifier of a parameter combination.

    The k-mer size segment (``_ks<kmer_size>``) is included only when
    ``ass_method`` is ``"dbg"``; every other method omits it.

    Returns:
        A string of the form
        ``comb_<method>_c<conf>[_ks<k>]_ts<..>_mo<..>_mi<..>_mm<..>``.
    """
    kmer_segment = f"_ks{kmer_size}" if ass_method == "dbg" else ""
    prefix = f"comb_{ass_method}_c{conf}"
    suffix = f"_ts{size_threshold}_mo{min_overlap}_mi{min_identity}_mm{max_mismatches}"
    return prefix + kmer_segment + suffix


In [6]:
# Run to process; it selects the metadata entry and the input CSV name.
run = "bsa"

# Metadata entry for this run (chain defaults to "").
meta = get_sample_metadata(run)

# Fields of the metadata entry used by the rest of the notebook.
protein = meta["protein"]
chain = meta["chain"]
proteases = meta["proteases"]

In [7]:
# antibodies,ma3heavy,dbg,comb_dbg_c0.9_ks7_ts0_mo4_mi0.8_mm12,39,0.968,72,0.251,30,0.144,0.939,0.907,0.619
# nb6 comb_dbg_c0.86_ks6_ts0_mo4_mi0.9_mm12
# bd17 comb_dbg_c0.88_ks7_ts10_mo4_mi0.9_mm12
# bd15 comb_dbg_c0.92_ks6_ts0_mo4_mi0.9_mm12

# best contig dbg results

# ma1 heavy comb_dbg_c0.88_ks7_ts0_mo4_mi0.9_mm14
# ma2 heavy comb_dbg_c0.86_ks6_ts0_mo3_mi0.9_mm10
# ma3 heavy comb_dbg_c0.88_ks6_ts0_mo4_mi0.8_mm8

# ma1 light comb_dbg_c0.86_ks7_ts0_mo4_mi0.9_mm8
# ma2 light comb_dbg_c0.92_ks7_ts5_mo3_mi0.9_mm10
# ma3 light comb_dbg_c0.92_ks6_ts10_mo3_mi0.8_mm14

In [8]:
# Settings of this run; together they form the combination name.
ass_method = 'dbg'
kmer_size = 6  # passed to dbg.get_kmers
conf = 0.86  # rows are kept only when df['conf'] is above this value
size_threshold = 0  # contigs must be longer than this to be kept
min_overlap = 3  # passed to dbg.scaffold_iterative
min_identity = 0.9  # passed to map.process_protein_contigs_scaffold
max_mismatches = 12  # passed to map.process_protein_contigs_scaffold

In [9]:
# Identifier of the current parameter combination; printed for reference.
comb = get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap, min_identity, max_mismatches)

print(comb)

comb_dbg_c0.86_ks6_ts0_mo3_mi0.9_mm12


In [10]:
# Keyword arguments later expanded into
# comp_stat.compute_assembly_statistics(**params).
params = {
    # Reuse the configured method so this dict cannot drift from ass_method.
    "ass_method": ass_method,
    "conf": conf,
    "kmer_size": kmer_size,
    "min_overlap": min_overlap,
    "min_identity": min_identity,
    "max_mismatches": max_mismatches,
    "size_threshold": size_threshold,
}

In [11]:
# Root output folder of this run/chain.
folder_outputs = f"../outputs/{run}{chain}"

prep.create_directory(folder_outputs)

# Derive the folder name from get_combination_name instead of repeating the
# format string, so it follows the same naming rule for every assembly method
# (the k-mer segment is only present for "dbg").
combination_folder_out = os.path.join(
    folder_outputs,
    get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap, min_identity, max_mismatches),
)

prep.create_subdirectories_outputs(combination_folder_out)

### Data cleaning

In [12]:
# Reference protein after prep.normalize_sequence.
protein_norm = prep.normalize_sequence(protein)

# Input table of this run, read from ../input/<run>.csv.
df = pd.read_csv(f"../input/{run}.csv")

In [13]:
# Reload prep so the latest version of the module is used.
importlib.reload(prep)
# Call prep.missing_values_barplot for this run, passing ../figures as folder.
prep.missing_values_barplot(run, df, "../figures")

Valid PSMs: 93649 (85.64%)
Missing PSMs: 15709 (14.36%)
Total PSMs: 109358


In [14]:
# Label each row with the result of prep.extract_protease applied to its
# experiment name and the run's protease list.
df['protease'] = df['experiment_name'].apply(lambda name: prep.extract_protease(name, proteases))

# Replace the table with the output of prep.clean_dataframe.
df = prep.clean_dataframe(df)

In [15]:
# Apply prep.remove_modifications to every entry of 'preds' and store the
# result in a new 'cleaned_preds' column.
df['cleaned_preds'] = df['preds'].apply(prep.remove_modifications)

In [16]:
# 'cleaned_preds' column as a plain Python list.
cleaned_psms = df['cleaned_preds'].tolist()

In [17]:
# Pass the cleaned predictions through prep.filter_contaminants together with
# the run name and ../fasta/contaminants.fasta.
filtered_psms = prep.filter_contaminants(cleaned_psms, run, "../fasta/contaminants.fasta")

In [18]:
# Keep only rows whose cleaned prediction is among the filtered PSMs.
# .copy() makes the result an independent frame, so adding columns to it
# afterwards does not write into a slice of the previous table
# (avoids pandas' SettingWithCopyWarning).
df = df[df['cleaned_preds'].isin(filtered_psms)].copy()

In [19]:
# Flag each cleaned prediction by whether `x in protein_norm` holds (a
# substring test, assuming protein_norm is a str). The flag is stored as the
# strings "True"/"False", not as booleans.
df["mapped"] = df["cleaned_preds"].apply(lambda x: "True" if x in protein_norm else "False")

In [20]:
# Reload prep so the latest version of the module is used.
importlib.reload(prep)

# Call prep.plot_map_unmap_distribution with conf_lim=0 and title disabled;
# ../figures is passed as folder.
prep.plot_map_unmap_distribution(df, protein_norm, run, folder = "../figures", conf_lim=0, title = False)

In [23]:
# Reload prep so the latest version of the module is used.
importlib.reload(prep)
# Call prep.fdr_ratio_mapped_unmapped for this run; ../figures is passed as folder.
prep.fdr_ratio_mapped_unmapped(run, df, folder = "../figures")

In [30]:
# Reload prep so the latest version of the module is used.
importlib.reload(prep)

# Call prep.plot_relative_map_distribution for this run; ../figures is passed as folder.
prep.plot_relative_map_distribution(run, df, protein_norm, folder = "../figures")









In [32]:
# Reload prep so the latest version of the module is used.
importlib.reload(prep)

# NOTE(review): threshold is the literal 0.88, not the `conf` setting used for
# filtering — confirm this is intended.
prep.plot_map_distribution(run, df, protein_norm, threshold=0.88, folder = "../figures")

In [37]:
# Keep only rows whose 'conf' value is strictly greater than the threshold.
df = df[df['conf'] > conf]

In [38]:
# Renumber the remaining rows from 0 and discard the old index.
df.reset_index(drop=True, inplace=True)

In [39]:
# Predictions left after all filters; input to mapping and k-mer extraction.
final_psms = df['cleaned_preds'].tolist()

In [40]:
# Run map.process_protein_contigs_scaffold on the filtered predictions against
# the normalized protein, with the configured max_mismatches and min_identity.
mapped_psms = map.process_protein_contigs_scaffold(final_psms, protein_norm, max_mismatches, min_identity)

In [43]:
# Display the protease list of this run.
proteases

['Chymotrypsin',
 'Legumain',
 'Krakatoa',
 'Elastase',
 'Trypsin',
 'Papain',
 'Thermo',
 'ProtK',
 'GluC',
 'LysC']

In [33]:
# Color of each protease, keyed by name. Pairing a color list with
# `proteases` by position gives the wrong colors whenever the metadata lists
# the proteases in another order, so the palette is looked up by name.
protease_colors = {
    "Thermo": "#8dd3c7",
    "Krakatoa": "#ffffb3",
    "Chymotrypsin": "#bebada",
    "Elastase": "#fb8072",
    "GluC": "#80b1d3",
    "Trypsin": "#fdb462",
    "Papain": "#b3de69",
    "ProtK": "#fccde5",
    "Legumain": "#d9d9d9",
    "LysC": "#bc80bd",
}

# Same color sequence as before, kept for code that reads custom_colors.
custom_colors = list(protease_colors.values())

# Colors whose protease is absent from this run are handed, in order, to
# protease names that have no entry above.
spare_colors = [color for name, color in protease_colors.items() if name not in proteases]

palette = {}
for name in proteases:
    if name in protease_colors:
        palette[name] = protease_colors[name]
    elif spare_colors:
        palette[name] = spare_colors.pop(0)

In [44]:
# Display the mapping result for the filtered predictions.
mapped_psms

[('DDKEACFA', (584, 592, [], 1.0)),
 ('ECFLSHKDD', (123, 132, [], 1.0)),
 ('KYLCDNQDTLSSKL', (284, 298, [], 1.0)),
 ('KYNGVFQ', (182, 189, [], 1.0)),
 ('KYNGVFQ', (182, 189, [], 1.0)),
 ('ECFLSHKDD', (123, 132, [], 1.0)),
 ('VMENFVAF', (569, 577, [], 1.0)),
 ('CFLSHKDD', (124, 132, [], 1.0)),
 ('RETYGDM', (104, 111, [], 1.0)),
 ('NGVFQECC', (184, 192, [], 1.0)),
 ('MENFVAF', (570, 577, [], 1.0)),
 ('CDNQDTL', (287, 294, [], 1.0)),
 ('ECFLSH', (123, 129, [], 1.0)),
 ('ELTEFAKTC', (68, 77, [], 1.0)),
 ('KHLVDEPQNL', (400, 410, [], 1.0)),
 ('LHTLFGDELCKV', (89, 101, [], 1.0)),
 ('GPKLVV', (594, 600, [], 1.0)),
 ('LPPLTAD', (324, 331, [], 1.0)),
 ('FSALTPDET', (510, 519, [], 1.0)),
 ('YLCDNQDT', (285, 293, [], 1.0)),
 ('LPPLTAD', (324, 331, [], 1.0)),
 ('SLRETYGD', (102, 110, [], 1.0)),
 ('KHLVDEPQNL', (400, 410, [], 1.0)),
 ('ECFLSH', (123, 129, [], 1.0)),
 ('LRETYGDM', (103, 111, [], 1.0)),
 ('KYNGVF', (182, 188, [], 1.0)),
 ('CFLSHKDD', (124, 132, [], 1.0)),
 ('NKYNGVFQ', (181, 189, [],

In [42]:
# Reload the mapping module so the latest version is used.
importlib.reload(map)
# NOTE(review): the recorded run of this cell ended with
# "IndexError: list index out of range"; the cause is not visible from this
# cell — check what mapping_psms_protease_associated expects for
# mapped_sequences and labels.
map.mapping_psms_protease_associated(mapped_sequences = mapped_psms,
                                     prot_seq = protein_norm,
                                     labels = proteases,
                                     title = "",
                                     palette = palette,
                                     output_folder = "../figures",
                                     output_file = "psms_per_proteases.svg"
                                     )

Plotting contigs: 10it [00:00, 365.75it/s]


IndexError: list index out of range

### Assembly

In [None]:
# Build the k-mers from the filtered predictions using the configured kmer_size.
kmers = dbg.get_kmers(final_psms, kmer_size=kmer_size)

In [None]:
# Derive the De Bruijn graph edges from the k-mers.
edges = dbg.get_debruijn_edges_from_kmers(kmers)

In [None]:
# Assemble contigs from the graph edges.
assembled_contigs = dbg.assemble_contigs(edges)

In [None]:
# Order the contigs from longest to shortest.
assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

In [None]:
# Drop duplicate contigs while keeping first-seen order. A set would return
# the contigs in an arbitrary order that can change between interpreter runs,
# which makes the order of equal-length contigs (and the contig ids derived
# from their positions) non-reproducible.
assembled_contigs = list(dict.fromkeys(assembled_contigs))

In [None]:
# Keep only contigs strictly longer than size_threshold.
assembled_contigs = [seq for seq in assembled_contigs if len(seq) > size_threshold]

In [None]:
# Order the remaining contigs from longest to shortest; the sort is stable,
# so contigs of equal length keep their relative order.
assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

In [None]:
# assembled_contigs = list(dict.fromkeys(assembled_contigs))

# assembled_contigs = [seq for seq in assembled_contigs if len(seq) > size_threshold]

# assembled_contigs = sorted(assembled_contigs, key=len, reverse=True)

# set could be the problem

In [None]:
# One record per contig, numbered from 1 in list order; the description
# carries the contig length.
records = [
    Bio.SeqRecord.SeqRecord(
        Bio.Seq.Seq(sequence),
        id=f"contig_{number}",
        description=f"length: {len(sequence)}",
    )
    for number, sequence in enumerate(assembled_contigs, start=1)
]

In [None]:
# Write the contig records as FASTA into the combination's contigs folder.
Bio.SeqIO.write(records, f"{combination_folder_out}/contigs/{ass_method}_contig_{conf}_{run}.fasta", "fasta")

In [None]:
# Run map.process_protein_contigs_scaffold on the contigs against the
# normalized protein, with the configured max_mismatches and min_identity.
mapped_contigs = map.process_protein_contigs_scaffold(assembled_contigs, protein_norm, max_mismatches, min_identity)

In [None]:
# Build a table from the contig mapping result.
df_contigs_mapped = map.create_dataframe_from_mapped_sequences(data = mapped_contigs)

In [None]:
# Call comp_stat.compute_assembly_statistics for the contigs; the statistics
# folder of this combination is passed as output_folder and the run settings
# are forwarded through **params.
comp_stat.compute_assembly_statistics(df = df_contigs_mapped, sequence_type='contigs', output_folder = f'{combination_folder_out}/statistics', reference = protein_norm, **params)


In [None]:
# Call map.mapping_substitutions for the mapped contigs against the reference.
# NOTE(review): contig_colors is looked up under "nanobodies" while
# match_color uses "bsa", and both are literals rather than `run` — confirm
# this is intended.
map.mapping_substitutions(mapped_sequences = mapped_contigs,
                          prot_seq = protein_norm,
                          title= f"Contig mapping to reference sequence, {run}",
                          contig_colors = get_colors_from_run("nanobodies", is_scaffold=False),
                          match_color = get_colors_from_run("bsa", is_scaffold=False),
                          output_file=f"fig_X_{run}_substitution_map_contigs_dbg.svg",
                          output_folder="."
                          )

In [None]:
# Build scaffolds from the contigs with dbg.scaffold_iterative, using the
# configured min_overlap and size_threshold.
assembled_scaffolds = dbg.scaffold_iterative(assembled_contigs, min_overlap, size_threshold)

If scaffold_iterative is too slow or does not work, it is recommended to run only a few iterations in the following raw cell.

In [None]:
# One record per scaffold, numbered from 1 in list order; the description
# carries the scaffold length. This replaces the contig records held in
# `records`.
records = []
for i, seq in enumerate(assembled_scaffolds):
    record = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(seq), id=f"scaffold_{i+1}", description=f"length: {len(seq)}")
    records.append(record)

In [None]:
# Write the scaffold records as FASTA into the combination's scaffolds folder.
Bio.SeqIO.write(records, f"{combination_folder_out}/scaffolds/{ass_method}_scaffold_{conf}_{kmer_size}_{run}.fasta", "fasta")

In [None]:
# Run map.process_protein_contigs_scaffold on the scaffolds against the
# normalized protein, with the configured max_mismatches and min_identity.
mapped_scaffolds = map.process_protein_contigs_scaffold(assembled_scaffolds, protein_norm, max_mismatches, min_identity)

In [None]:
# Build a table from the scaffold mapping result.
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(data = mapped_scaffolds)

In [None]:
# Call comp_stat.compute_assembly_statistics for the scaffolds; the statistics
# folder of this combination is passed as output_folder and the run settings
# are forwarded through **params.
comp_stat.compute_assembly_statistics(df = df_scaffolds_mapped, sequence_type='scaffolds', output_folder = f"{combination_folder_out}/statistics", reference = protein_norm, **params)

In [None]:
# Call map.mapping_substitutions for the mapped scaffolds against the
# reference; both colors come from the "bsa" scaffold entry of the color file.
map.mapping_substitutions(
    mapped_sequences=mapped_scaffolds,
    prot_seq=protein_norm,
    title=f"Scaffold mapping to reference sequence, {run} {chain}",
    contig_colors=get_colors_from_run("bsa", is_scaffold=True),
    match_color=get_colors_from_run("bsa", is_scaffold=True),
    output_file=f"fig_X_{run}{chain}_substitution_map_scaffolds_dbg.svg",
    output_folder=".",
)


### Clustering

In [None]:
# Scaffolds folder of the current combination; printed for reference.
scaffolds_folder_out = f"../outputs/{run}{chain}/{comb}/scaffolds"
print(f"scaffolds_folder_out: {scaffolds_folder_out}")

In [None]:
# Run clus.cluster_fasta_files on the scaffolds folder.
clus.cluster_fasta_files(input_folder = scaffolds_folder_out)

In [None]:
# "cluster" subfolder of the scaffolds folder; printed for reference.
cluster_folder_out = os.path.join(scaffolds_folder_out, "cluster")
print(cluster_folder_out)

In [None]:
# Folder names passed to clus.process_fasta_and_clusters: the "cluster"
# subfolder and the "cluster_fasta" subfolder of the scaffolds folder.
cluster_tsv_folder = os.path.join(scaffolds_folder_out, "cluster")
output_base_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")

# Process every .fasta file found directly in the scaffolds folder.
for fasta_file in os.listdir(scaffolds_folder_out):
    if not fasta_file.endswith('.fasta'):
        continue
    fasta_path = os.path.join(scaffolds_folder_out, fasta_file)
    clus.process_fasta_and_clusters(fasta_path, cluster_tsv_folder, output_base_folder)

### Alignment

In [None]:
# Input folder (per-cluster FASTA files) and output folder of the alignment
# step; the output folder is set up through prep.create_directory.
cluster_fasta_folder = os.path.join(scaffolds_folder_out, "cluster_fasta") 
align_folder = os.path.join(scaffolds_folder_out, "align")
prep.create_directory(align_folder)

In [None]:
# For every cluster subfolder, mirror it under align_folder and hand each of
# its .fasta files to align.align_or_copy_fasta, writing "<name>_out.afa".
for cluster_folder in os.listdir(cluster_fasta_folder):
    cluster_folder_path = os.path.join(cluster_fasta_folder, cluster_folder)
    if not os.path.isdir(cluster_folder_path):
        continue

    output_cluster_folder = os.path.join(align_folder, cluster_folder)
    os.makedirs(output_cluster_folder, exist_ok=True)

    for fasta_file in os.listdir(cluster_folder_path):
        if not fasta_file.endswith('.fasta'):
            continue
        fasta_file_path = os.path.join(cluster_folder_path, fasta_file)
        base_filename = os.path.splitext(fasta_file)[0]
        output_file = os.path.join(output_cluster_folder, f"{base_filename}_out.afa")

        align.align_or_copy_fasta(fasta_file_path, output_file)

print("All alignment tasks completed.")

### Consensus

In [None]:
# "consensus" subfolder of the scaffolds folder, used by the consensus step.
consensus_folder = os.path.join(scaffolds_folder_out, "consensus")

In [None]:
# Run cons.process_alignment_files with the alignment folder and the
# consensus folder.
cons.process_alignment_files(align_folder, consensus_folder)

In [None]:
# Load the consensus sequences from the consensus folder.
all_sequences = cons.load_all_consensus_sequences(consensus_folder)