## Greedy assembly workflow

In [None]:
import os
import sys

# Make the project's ``src`` directory importable from this notebook.
# The location is resolved relative to the current working directory, so the
# notebook has to be started from its own folder.
script_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(script_dir, '../src'))
# Re-running the cell must not keep appending the same entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
# my modules
import greedy_method as greedy
import mapping as map
import consensus as cons
import alignment as align
import clustering as clus
import preprocessing as prep
import compute_statistics as comp_stat

# import libraries
from tqdm import tqdm
from Bio import SeqIO
from tempfile import mkdtemp
from itertools import combinations
from collections import defaultdict, Counter
from matplotlib.patches import Rectangle


import json
import re
import Bio
import shutil
import logging
import importlib
import statistics
import subprocess
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go


In [None]:
def get_sample_metadata(run, chain="", json_path="../json/sample_metadata.json"):
    """Return the metadata entry for a given run and chain.

    The JSON file is read as a mapping from run name to a list of entries;
    entries are compared on their ``"chain"`` field.

    Args:
        run: Run identifier, used as a top-level key of the JSON file.
        chain: Chain label to select. The default ``""`` selects an entry
            whose ``"chain"`` value is the empty string.
        json_path: Path to the metadata JSON file.

    Returns:
        The first entry listed under ``run`` whose ``"chain"`` equals
        ``chain``.

    Raises:
        ValueError: If ``run`` is not a key of the file, or none of its
            entries has the requested chain.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, "r", encoding="utf-8") as f:
        all_meta = json.load(f)

    if run not in all_meta:
        raise ValueError(f"Run '{run}' not found in metadata.")

    for entry in all_meta[run]:
        # An entry without a "chain" field is skipped so the caller gets the
        # ValueError below rather than an unrelated KeyError.
        if "chain" in entry and entry["chain"] == chain:
            return entry

    raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")

In [None]:
def get_colors_from_run(cat, is_scaffold=False, json_path="../json/colors.json"):
    """Look up the plot color configured for a category.

    The category is the lower-cased part of ``cat`` before the first
    underscore. Within that category the ``"scaffold"`` or ``"contig"``
    value is returned, depending on ``is_scaffold``.

    Args:
        cat: Category string, e.g. a run name; only the text before the
            first ``_`` is used.
        is_scaffold: Select the ``"scaffold"`` color instead of ``"contig"``.
        json_path: Path to the color JSON file.

    Returns:
        The value stored at ``colors[category][key]``.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
        ValueError: If the category or the key is missing from the file.
    """
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Missing color file: {json_path}")

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, "r", encoding="utf-8") as f:
        colors = json.load(f)

    category = cat.split("_")[0].lower()
    key = "scaffold" if is_scaffold else "contig"

    try:
        return colors[category][key]
    except KeyError as err:
        # Chain explicitly so the traceback shows which lookup failed.
        raise ValueError(f"Color not defined for category '{category}' and key '{key}'") from err


In [None]:
def get_combination_name(ass_method, conf, size_threshold, min_overlap, min_identity, max_mismatches):
    """Build the label that identifies one parameter combination.

    The label has the form
    ``comb_<method>_c<conf>_ts<size>_mo<overlap>_mi<identity>_mm<mismatches>``.
    """
    parts = (
        "comb",
        f"{ass_method}",
        f"c{conf}",
        f"ts{size_threshold}",
        f"mo{min_overlap}",
        f"mi{min_identity}",
        f"mm{max_mismatches}",
    )
    return "_".join(parts)

In [None]:
# Select the run to process and fetch its entry from the sample metadata
# (default chain "").
run = "NB1"
meta = get_sample_metadata(run)

# Pull out the three fields the rest of the notebook works with.
protein, chain, proteases = (meta[field] for field in ("protein", "chain", "proteases"))

In [None]:
# nb6 comb_greedy_c0.9_ts10_mo3_mi0.9_mm10
# nb3 comb_greedy_c0.92_ts10_mo3_mi0.9_mm10
# nb13 comb_greedy_c0.86_ts10_mo3_mi0.8_mm12
# bd17 comb_greedy_c0.92_ts10_mo3_mi0.6_mm14

# best contig greedy results
# ma1 heavy comb_greedy_c0.92_ts10_mo3_mi0.9_mm12
# ma2 heavy comb_greedy_c0.86_ts10_mo3_mi0.8_mm14
# ma3 heavy comb_greedy_c0.9_ts10_mo3_mi0.6_mm12

# ma1 light comb_greedy_c0.88_ts10_mo3_mi0.9_mm10
# ma2 light comb_greedy_c0.9_ts10_mo3_mi0.9_mm10
# ma3 light comb_greedy_c0.92_ts10_mo3_mi0.9_mm8

In [None]:
# Parameters of the combination being run.
ass_method = 'greedy'  # method label; appears in folder and file names
conf = 0.92  # rows are kept only if their 'conf' value is strictly greater
size_threshold = 10  # contigs must be strictly longer than this to be kept
min_overlap = 3  # passed to the greedy assembly and scaffolding calls
min_identity = 0.6  # passed to the mapping against the reference
max_mismatches = 14  # passed to the mapping against the reference

In [None]:
# Label of this parameter combination; reused below as the output sub-folder name.
comb = get_combination_name(ass_method, conf, size_threshold, min_overlap, min_identity, max_mismatches)

print(comb)

In [None]:
# Keyword arguments forwarded to the statistics calls via ``**params``.
# Every value comes from the parameter variables above so that the method
# label cannot drift away from ``ass_method``.
params = {
    "ass_method": ass_method,
    "conf": conf,
    "size_threshold": size_threshold,
    "min_overlap": min_overlap,
    "max_mismatches": max_mismatches,
    "min_identity": min_identity,
}

In [None]:
# Output root for this run/chain and the sub-folder of the current combination.
folder_outputs = f"../outputs/{run}{chain}"
combination_folder_out = os.path.join(folder_outputs, comb)

# Create the run folder first, then the combination's sub-directory layout.
prep.create_directory(folder_outputs)
prep.create_subdirectories_outputs(combination_folder_out)


### Data cleaning

In [None]:
# Reference protein after prep.normalize_sequence; used below for the
# containment check and as the mapping target.
protein_norm = prep.normalize_sequence(protein)

# Input table of the selected run, read from ../input/<run>.csv.
df = pd.read_csv(f"../input/{run}.csv")

In [None]:
# Derive a 'protease' column from each row's 'experiment_name', using the
# protease list taken from the sample metadata.
df['protease'] = df['experiment_name'].apply(lambda name: prep.extract_protease(name, proteases))

# Project-specific cleaning step; see prep.clean_dataframe for what it changes.
df = prep.clean_dataframe(df)

In [None]:
# New column: each value of 'preds' passed through prep.remove_modifications.
df['cleaned_preds'] = df['preds'].apply(prep.remove_modifications)

In [None]:
# Cleaned predictions as a plain Python list, in row order.
cleaned_psms = df['cleaned_preds'].tolist()

In [None]:
# Filter the predictions against the contaminants FASTA.
# Presumably returns the sequences to keep (the next cell keeps rows found in
# this result) — verify in prep.filter_contaminants.
filtered_psms = prep.filter_contaminants(cleaned_psms, run, "../fasta/contaminants.fasta")

In [None]:
# Keep only the rows whose cleaned prediction is present in filtered_psms.
df = df[df['cleaned_preds'].isin(filtered_psms)]

In [None]:
# Flag predictions contained in the reference (a substring test, assuming
# protein_norm is a str — TODO confirm).
# NOTE(review): the flag is stored as the strings "True"/"False", not booleans.
df["mapped"] = df["cleaned_preds"].apply(lambda x: "True" if x in protein_norm else "False")

In [None]:
# Confidence filter: keep rows whose 'conf' is strictly greater than the threshold.
df = df[df['conf'] > conf]

In [None]:
# Renumber the remaining rows from 0 and discard the old index.
df.reset_index(drop=True, inplace=True)

In [None]:
# Cleaned predictions that passed all filters; input to mapping and assembly.
final_psms = df['cleaned_preds'].tolist()

In [None]:
# Map the filtered sequences onto the reference.
# NOTE(review): mapped_psms is not referenced again in the visible part of
# this notebook — confirm whether the result is still needed.
mapped_psms = map.process_protein_contigs_scaffold(final_psms, protein_norm, max_mismatches, min_identity)

### Assembly

In [None]:
# Assemble the filtered sequences into contigs with the project's greedy
# method, using min_overlap.
assembled_contigs = greedy.assemble_contigs(final_psms, min_overlap)

In [None]:
# Remove duplicate contigs while keeping first-occurrence order.
# ``dict.fromkeys`` replaces ``set`` because the iteration order of a set of
# strings can change between interpreter runs; after the length sort below,
# equal-length contigs (and the ids written to the FASTA file) would then not
# be reproducible.
assembled_contigs = list(dict.fromkeys(assembled_contigs))

In [None]:
# Drop short contigs: only those strictly longer than size_threshold are kept.
assembled_contigs = [contig for contig in assembled_contigs if len(contig) > size_threshold]

In [None]:
# Longest contigs first. The sort is stable, so equal-length contigs keep
# their relative order.
assembled_contigs.sort(key=len, reverse=True)

In [None]:
# One FASTA record per contig, numbered from 1 in the current (sorted) order.
records = [
    Bio.SeqRecord.SeqRecord(
        Bio.Seq.Seq(sequence),
        id=f"contig_{number}",
        description=f"length: {len(sequence)}",
    )
    for number, sequence in enumerate(assembled_contigs, start=1)
]

In [None]:
# Write the contig records in FASTA format into the combination's "contigs" folder.
Bio.SeqIO.write(records, f"{combination_folder_out}/contigs/{ass_method}_contig_{conf}_{run}.fasta", "fasta")

In [None]:
# Map the assembled contigs onto the reference with the same
# max_mismatches / min_identity settings used for the input sequences.
mapped_contigs = map.process_protein_contigs_scaffold(assembled_contigs, protein_norm, max_mismatches, min_identity)

In [None]:
# Convert the contig mapping result into a DataFrame for the statistics step.
df_contigs_mapped = map.create_dataframe_from_mapped_sequences(data = mapped_contigs)

In [None]:
# Contig-level assembly statistics; output_folder points at the combination's
# "statistics" folder and the run parameters are forwarded via **params.
comp_stat.compute_assembly_statistics(df = df_contigs_mapped, sequence_type='contigs', output_folder = f"{combination_folder_out}/statistics", reference = protein_norm, **params)

In [None]:
# The same color is used for the contig bars and for matching residues, so it
# is looked up once instead of opening and parsing the color file twice.
# NOTE(review): the category is hard-coded to "bsa" regardless of ``run`` —
# confirm this is intended.
contig_color = get_colors_from_run("bsa", is_scaffold=False)

# The figure is written to the current directory (output_folder=".").
map.mapping_substitutions(
    mapped_sequences=mapped_contigs,
    prot_seq=protein_norm,
    title=f"Contig mapping to reference sequence, {run} {chain}",
    contig_colors=contig_color,
    match_color=contig_color,
    output_file=f"fig_X_{run}_substitution_map_contigs_greedy.svg",
    output_folder=".",
)

In [None]:
# Build scaffolds from the sorted contigs with the project's iterative greedy
# scaffolder, using the same min_overlap and size_threshold.
assembled_scaffolds = greedy.scaffold_iterative_greedy(assembled_contigs, min_overlap, size_threshold)

In [None]:
# One FASTA record per scaffold, numbered from 1 in the order returned by the
# scaffolder.
records = [
    Bio.SeqRecord.SeqRecord(
        Bio.Seq.Seq(scaffold),
        id=f"scaffold_{number}",
        description=f"length: {len(scaffold)}",
    )
    for number, scaffold in enumerate(assembled_scaffolds, start=1)
]

In [None]:
# Write the scaffold records in FASTA format into the combination's "scaffolds" folder.
Bio.SeqIO.write(records, f"{combination_folder_out}/scaffolds/{ass_method}_scaffold_{conf}_{run}.fasta", "fasta")

In [None]:
# Map the scaffolds onto the reference with the same settings used for the contigs.
mapped_scaffolds = map.process_protein_contigs_scaffold(assembled_scaffolds, protein_norm, max_mismatches, min_identity)

In [None]:
# Convert the scaffold mapping result into a DataFrame for the statistics step.
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(data = mapped_scaffolds)

In [None]:
# Scaffold-level assembly statistics; output_folder is the same "statistics"
# folder used for the contig statistics.
comp_stat.compute_assembly_statistics(df = df_scaffolds_mapped, sequence_type='scaffolds', output_folder = f"{combination_folder_out}/statistics", reference = protein_norm, **params)

In [None]:
# The same color is used for the scaffold bars and for matching residues, so
# it is looked up once instead of opening and parsing the color file twice.
# NOTE(review): the category is hard-coded to "bsa" regardless of ``run`` —
# confirm this is intended.
scaffold_color = get_colors_from_run("bsa", is_scaffold=True)

# The figure is written to the current directory (output_folder=".").
map.mapping_substitutions(
    mapped_sequences=mapped_scaffolds,
    prot_seq=protein_norm,
    title=f"Scaffold mapping to reference sequence, {run} {chain}",
    contig_colors=scaffold_color,
    match_color=scaffold_color,
    output_file=f"fig_X_{run}_substitution_map_scaffolds_greedy.svg",
    output_folder=".",
)

### Clustering

In [None]:
# Folder the scaffold FASTA was written to. It is derived from
# combination_folder_out rather than rebuilt from run/chain/comb, so the two
# paths cannot drift apart.
scaffolds_folder_out = os.path.join(combination_folder_out, "scaffolds")
print(f"scaffolds_folder_out: {scaffolds_folder_out}")

In [None]:
# Run the project's clustering step on the scaffolds folder.
# Presumably writes its results into a "cluster" sub-folder (the cells below
# read from there) — verify in clus.cluster_fasta_files.
clus.cluster_fasta_files(input_folder = scaffolds_folder_out)

In [None]:
# Path of the "cluster" sub-folder inside the scaffolds folder, printed for inspection.
# NOTE(review): the next cell builds the same path as cluster_tsv_folder, and
# this variable is not used again in the visible part of the notebook.
cluster_folder_out = os.path.join(scaffolds_folder_out, "cluster")
print(cluster_folder_out)

In [None]:
# Input and output locations for the per-cluster FASTA split, both inside the
# scaffolds folder.
cluster_tsv_folder = os.path.join(scaffolds_folder_out, "cluster")
output_base_folder = os.path.join(scaffolds_folder_out, "cluster_fasta")

# Only entries directly inside the scaffolds folder whose name ends in
# ".fasta" are processed.
for fasta_file in os.listdir(scaffolds_folder_out):
    if not fasta_file.endswith('.fasta'):
        continue
    clus.process_fasta_and_clusters(
        os.path.join(scaffolds_folder_out, fasta_file),
        cluster_tsv_folder,
        output_base_folder,
    )

### Alignment

In [None]:
# Per-cluster FASTA files are read from "cluster_fasta"; alignments are written
# under "align", which is created here.
cluster_fasta_folder = os.path.join(scaffolds_folder_out, "cluster_fasta") 
align_folder = os.path.join(scaffolds_folder_out, "align")
prep.create_directory(align_folder)

In [None]:
# Process every FASTA file of every cluster sub-folder, mirroring the
# sub-folder layout under ``align_folder``.
for cluster_folder in os.listdir(cluster_fasta_folder):
    cluster_folder_path = os.path.join(cluster_fasta_folder, cluster_folder)
    # Plain files directly inside the cluster_fasta folder are ignored.
    if not os.path.isdir(cluster_folder_path):
        continue

    output_cluster_folder = os.path.join(align_folder, cluster_folder)
    os.makedirs(output_cluster_folder, exist_ok=True)

    for fasta_file in os.listdir(cluster_folder_path):
        if not fasta_file.endswith('.fasta'):
            continue
        # Output name: <input name without extension>_out.afa
        base_filename = os.path.splitext(fasta_file)[0]
        align.align_or_copy_fasta(
            os.path.join(cluster_folder_path, fasta_file),
            os.path.join(output_cluster_folder, f"{base_filename}_out.afa"),
        )

print("All alignment tasks completed.")

### Consensus

In [None]:
# Location of the consensus output, inside the scaffolds folder.
# NOTE(review): the folder is not created in this notebook; presumably
# cons.process_alignment_files creates it — TODO confirm.
consensus_folder = os.path.join(scaffolds_folder_out, "consensus")

In [None]:
# Run the project's consensus step with the alignment folder as input and the
# consensus folder as output location.
cons.process_alignment_files(align_folder, consensus_folder)

In [None]:
# Load the consensus sequences back from the consensus folder.
all_sequences = cons.load_all_consensus_sequences(consensus_folder)