In [None]:
import pandas as pd
import glob
import os
import hashlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.ticker as ticker


In [None]:
def get_md5sum(x):
    """Return the first ten hex characters of the MD5 digest of ``x``.

    ``x`` is encoded as UTF-8 before hashing, so it must provide ``.encode``
    (i.e. be a ``str``).
    """
    hasher = hashlib.md5(x.encode("utf-8"))
    hex_digest = hasher.hexdigest()
    return hex_digest[:10]

In [None]:
def map_assembler(cell):
    """Translate a string mentioning an assembler into that assembler's display name.

    The substrings are tested in a fixed order (carpedeam, penguin, megahit,
    spades) and the first one found in ``cell`` decides the result. When none
    of them occurs, ``cell`` is handed back unchanged.
    """
    display_names = (
        ("carpedeam", "CarpeDeam"),
        ("penguin", "PenguiN"),
        ("megahit", "MEGAHIT"),
        ("spades", "metaSPAdes"),
    )
    for needle, pretty in display_names:
        if needle in cell:
            return pretty
    # Nothing recognised: pass the input through untouched.
    return cell

def adjust_assemblerconfig(row):
    """Return the assembler label for ``row``, refined by mode for CarpeDeam rows.

    ``row`` must support ``row["assembler"]`` and, for CarpeDeam rows,
    ``row["file"]``. Any non-CarpeDeam assembler name is returned as is.
    For CarpeDeam the file name selects the label: "Safe" is checked before
    "Unsafe", and when neither occurs the fallback label (which contains a
    line break) is returned.
    """
    assembler = row["assembler"]
    if assembler != "CarpeDeam":
        return assembler

    source_file = row["file"]
    if "Safe" in source_file:
        return "CarpeDeam (safe mode)"
    if "Unsafe" in source_file:
        return "CarpeDeam (unsafe mode)"
    return "CarpeDeam\n(safe mode)"

In [None]:
def map_type(cell):
    """Classify ``cell`` as correct assemblies, misassemblies, or "none".

    "correct" is looked for first, then "error"; the first substring present
    in ``cell`` determines the label, and "none" is returned when neither is.
    """
    for needle, label in (("correct", "correct assemblies"), ("error", "misassemblies")):
        if needle in cell:
            return label
    return "none"

In [None]:
def map_extension(cell):
    """Label ``cell`` as "selected_candidates", "candidates", or "none".

    The substring "ext" wins over "all" when both are present; "none" is
    returned when neither occurs in ``cell``.
    """
    if "ext" in cell:
        return "selected_candidates"
    return "candidates" if "all" in cell else "none"

In [None]:
def map_dataset(mapping_dict, cell):
    """Return the value of the first key of ``mapping_dict`` that occurs inside ``cell``.

    Keys are tried in the dictionary's iteration order. ``None`` is returned
    when no key is a substring of ``cell``.
    """
    hits = (value for key, value in mapping_dict.items() if key in cell)
    return next(hits, None)

In [None]:
# Raw dataset identifiers and their display names, listed in matching order.
labels = ["synth_EMN001", "synth_GDN001"]
labels_clean = ["EMN001", "GDN001"]

# label -> 10-character MD5 prefix, and the reverse lookup.
labels_dict = dict(zip(labels, map(get_md5sum, labels)))
labels_dict_inv = dict(zip(labels_dict.values(), labels_dict.keys()))
print(labels_dict_inv)

# label -> display name, paired by position in the two lists above.
labels_dict_clean = dict(zip(labels, labels_clean))


In [None]:
def curate_df(path_tsv):
    """Load the "ext" TSV files matching a glob pattern into one DataFrame.

    Every file matched by ``path_tsv`` contributes a time token (third
    "_"-separated field of its base name, cut at the first "."). The sorted,
    de-duplicated tokens are printed and define the 1-based ``iteration``
    number of each file. Only files whose path contains "ext" are read
    (tab-separated, no header row); each gets the constant columns
    ``assembler`` ("CarpeDeam"), ``iteration`` and ``extension_type``.

    Parameters
    ----------
    path_tsv : str
        Glob pattern passed to ``glob.glob``.

    Returns
    -------
    pandas.DataFrame
        All selected rows concatenated and sorted by ``iteration``.

    Raises
    ------
    ValueError
        If no file containing "ext" matched the pattern.
    IndexError
        If a matched file name has fewer than three "_"-separated fields.
    """
    def time_token(path):
        # Third "_"-separated field of the file name, up to its first ".".
        return os.path.basename(path).split("_")[2].split(".")[0]

    files = glob.glob(path_tsv)

    # NOTE(review): tokens are ordered as strings, so "10" sorts before "2";
    # confirm the tokens are zero-padded or otherwise string-sortable.
    times = sorted({time_token(file) for file in files})
    print(times)

    dfs = []
    for file in files:
        if "ext" not in file:
            continue
        df = pd.read_csv(file, sep='\t', header=None)
        df["assembler"] = "CarpeDeam"
        df["iteration"] = times.index(time_token(file)) + 1
        df["extension_type"] = map_extension(file)
        dfs.append(df[df["extension_type"] == "selected_candidates"])

    if not dfs:
        # pd.concat([]) would raise an opaque ValueError; keep the type, say why.
        raise ValueError(
            f"No file containing 'ext' matched {path_tsv!r}; nothing to concatenate"
        )

    main_df = pd.concat(dfs, ignore_index=True)
    return main_df.sort_values(by=["iteration"])

In [None]:
def simplify_filenames(df, filename_column, index=3):
    """Add a ``simplified_filename`` column with one dot-separated piece of each file name.

    Each value of ``df[filename_column]`` is split on "." and the piece at
    position ``index`` is kept (the original note describes the result as
    the 'configXXXX' part). The column is written onto ``df`` itself, and
    that same object is returned.
    """
    def pick_piece(name):
        return name.split('.')[index]

    df['simplified_filename'] = df[filename_column].apply(pick_piece)
    return df

def plot_data(df, x, y, title, ylabel, color):
    """Show a bar chart of ``df[y]`` over ``df[x]``.

    The x axis is always labelled 'Configuration'; ``ylabel`` and ``title``
    label the y axis and the figure, and ``color`` is passed to the bars.
    Nothing is returned; the figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(df[x], df[y], color=color)
    ax.set_xlabel('Configuration')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    # Slanted tick labels on the x axis of the current (only) axes.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()


## Translocations Misassembly Analysis

In [None]:
# Path to the interspecies-translocation table for the CarpeDeam assembly
# (presumably QUAST output, judging by the directory name - TODO confirm).
translocations="assembly-evaluation-quast/combined_reference/contigs_reports/interspecies_translocations_by_refs_CarpeDeam.info"

In [None]:
def curate_misasm(path):
    """Parse a whitespace-delimited translocation report into a labelled matrix.

    The first line of the file is skipped and the second is used as the
    header. Every row from the first one whose first column contains
    'References:' onwards is discarded. The remaining columns are then
    renamed to 'References' followed by the values of the 'References'
    column, so that row and column labels agree.

    Parameters
    ----------
    path : str
        Path of the report file.

    Returns
    -------
    pandas.DataFrame
        One row per reference, first column 'References'.

    Raises
    ------
    KeyError
        If the header has no 'References' column.
    ValueError
        If the number of remaining rows does not match the number of
        value columns, so the relabelling is impossible.
    """
    data = pd.read_csv(path, delimiter=r'\s+', skiprows=1, engine='python')

    # Cast to str so the search also works when the first column was parsed
    # as numbers or contains NaN (``.str`` raises on non-string columns).
    first_column = data.iloc[:, 0].astype(str)
    is_footer = first_column.str.contains('References:', regex=False).to_numpy()

    if is_footer.any():
        # argmax gives the *position* of the first footer row, which is what
        # iloc expects regardless of the index labels.
        data = data.iloc[:int(is_footer.argmax())].copy()

    reference_names = data['References'].tolist()
    if len(reference_names) + 1 != data.shape[1]:
        raise ValueError(
            f"Cannot relabel columns of {path!r}: found {len(reference_names)} "
            f"reference rows but {data.shape[1] - 1} value columns"
        )

    data.columns = ['References'] + reference_names
    return data


In [None]:
def heatmap_misasm(df_orig, title):
    """Plot the upper triangle of a misassembly matrix and list its non-zero pairs.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Table with a 'References' column plus one value column per reference.
        It is not modified. The pair listing labels both members of a pair
        from the row index, so rows and columns are expected to be in the
        same order.
    title : str
        Currently only referenced by the commented-out ``savefig`` call.

    Returns
    -------
    pandas.DataFrame
        Columns 'Species1', 'Species2', '# Misassemblies'; one row per
        strictly-upper-triangle cell with a value above zero, sorted by
        count in descending order with a fresh 0..n-1 index.
    """
    from matplotlib.colors import LinearSegmentedColormap

    df = df_orig.set_index("References")

    # Non-numeric entries become NaN and are then counted as 0.
    numeric = df.apply(pd.to_numeric, errors='coerce').fillna(0)

    # Keep only the strict upper triangle; diagonal and lower half become 0.
    upper = np.triu(numeric.to_numpy(), k=1)
    df_numeric = pd.DataFrame(upper, index=numeric.index, columns=numeric.columns)
    total_misassemblies = upper.sum()

    # Only the diagonal is hidden; the zeroed lower half is still drawn.
    mask = np.zeros_like(upper, dtype=bool)
    np.fill_diagonal(mask, True)

    # Light grey first entry followed by the seaborn "dark:salmon_r" ramp.
    colors = [(0.9, 0.9, 0.9)] + sns.color_palette("dark:salmon_r", as_cmap=True)(np.linspace(0, 1, 256)).tolist()
    custom_cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=256)

    plt.figure(figsize=(20, 18))
    ax = sns.heatmap(df_numeric, mask=mask, annot=False, cmap=custom_cmap, cbar=True,
                     linewidths=.5, linecolor='grey')
    ax.set_title(f'Total Number of Interspecies Misassemblies: {int(total_misassemblies)}')
    ax.set_xlabel('References')
    ax.set_ylabel('References')

    # Heatmap cell i spans [i, i + 1], so its centre is i + 0.5. Fixing one
    # tick per cell centre both aligns the labels with their cells and
    # guarantees that every label is shown.
    ax.set_xticks(np.arange(len(df_numeric.columns)) + 0.5)
    ax.set_yticks(np.arange(len(df_numeric.index)) + 0.5)
    ax.set_xticklabels(df_numeric.columns, rotation=90, fontsize=7)
    ax.set_yticklabels(df_numeric.index, fontsize=7)

    # Save the heatmap
    #plt.savefig(f'misasm_heatmaps/heatmap_misasm_{title}.png', dpi=300, bbox_inches="tight")
    plt.show()

    # Positions of the positive upper-triangle cells, in row-major order.
    row_pos, col_pos = np.nonzero(upper > 0)
    row_labels = df_numeric.index.to_numpy()
    nonzero_mis_df = pd.DataFrame({
        'Species1': row_labels[row_pos],
        'Species2': row_labels[col_pos],
        '# Misassemblies': upper[row_pos, col_pos].astype(int),
    })
    nonzero_mis_df = nonzero_mis_df.sort_values(by='# Misassemblies', ascending=False)
    return nonzero_mis_df.reset_index(drop=True)


In [None]:
# Parse the translocation report into a table whose columns are relabelled
# with the reference names.
data = curate_misasm(translocations)

In [None]:
# Draw the heatmap and keep the returned table of non-zero reference pairs.
# The second argument is only referenced by the commented-out savefig call
# inside heatmap_misasm.
misasms_info = heatmap_misasm(data, "CarpeDeamUnsafe")

In [None]:
# Emit the first ten rows of the pair table (sorted by count, highest first)
# as LaTeX source, without the index column.
print(misasms_info.head(10).to_latex(index=False))

## Intraspecies Misassembly Analysis


In [None]:
def collect_metric_data(directory, metric):
    """Collect one integer metric from every ``report.tsv`` below ``directory``.

    The tree is walked recursively. In each folder that holds a
    ``report.tsv`` the first line containing ``metric`` (substring match) is
    taken, and its last whitespace-separated token is converted to ``int``.
    The folder's base name is used as the species name. Any exception while
    reading or parsing a file is printed and that file is skipped.

    Returns a DataFrame with columns 'Species' and ``metric``, sorted by the
    metric in descending order and re-indexed from 0.
    """
    records = []

    for folder, _subfolders, filenames in os.walk(directory):
        if 'report.tsv' not in filenames:
            continue

        file_path = os.path.join(folder, 'report.tsv')
        try:
            with open(file_path, 'r') as handle:
                # Stop at the first matching line; later ones are ignored.
                hit = next((row for row in handle if metric in row), None)
                if hit is not None:
                    raw_count = hit.strip().split()[-1]
                    records.append([os.path.basename(folder), int(raw_count)])
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    table = pd.DataFrame(records, columns=['Species', metric])
    table = table.sort_values(by=metric, ascending=False)
    return table.reset_index(drop=True)

In [None]:
def plot_metric(df, metric, output_path='miasm_intraspecies/intraspecies_miasm.png'):
    """Plot ``metric`` per species as a bar chart, save it as PNG and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Species' column and a ``metric`` column. The frame is
        not modified; conversion and sorting happen on a copy.
    metric : str
        Name of the column to plot; also used in the axis label and title.
    output_path : str, optional
        Where the figure is written. The parent directory is created when
        it does not exist yet. Defaults to the path this notebook has
        always used.
    """
    # Work on a copy so the caller's DataFrame keeps its original dtype.
    plot_df = df.copy()
    # Values that cannot be parsed as numbers become NaN.
    plot_df[metric] = pd.to_numeric(plot_df[metric], errors='coerce')
    df_sorted = plot_df.sort_values(by=metric, ascending=True)

    plt.figure(figsize=(10, 6))
    plt.bar(df_sorted['Species'], df_sorted[metric], color='skyblue')
    plt.xlabel('Species')
    plt.ylabel(f'Intraspecies {metric}')
    plt.title(f'Intraspecies {metric} per Species')
    plt.xticks(rotation=90, fontsize=6)  # Rotate the species names for better readability
    plt.tight_layout()  # Adjust layout to make room for label rotation

    # savefig does not create missing directories and would raise otherwise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.show()


In [None]:
def match_coverage_info(coverage_path, misassembly_df):
    """Attach a 'Coverage' column to ``misassembly_df`` from a coverage table.

    The whitespace-delimited file at ``coverage_path`` must provide 'Taxon'
    and 'Coverage' columns. Taxon names are normalised before matching:
    every '----' becomes '____' and a trailing '.1' is removed. The result
    is a left join on ``misassembly_df['Species']``, so species without a
    matching taxon keep their row and get NaN coverage.

    Parameters
    ----------
    coverage_path : str
        Path of the coverage table.
    misassembly_df : pandas.DataFrame
        Frame with a 'Species' column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``misassembly_df`` plus a 'Coverage' column (the helper 'Taxon'
        column is dropped again).
    """
    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
    coverage = pd.read_csv(coverage_path, sep=r'\s+')

    # regex must be stated explicitly: since pandas 2.0 str.replace treats
    # the pattern as a literal by default, so r'\.1$' would never match.
    coverage['Taxon'] = (
        coverage['Taxon']
        .str.replace('----', '____', regex=False)
        .str.replace(r'\.1$', '', regex=True)
    )

    merged = misassembly_df.merge(
        coverage[['Taxon', 'Coverage']],
        left_on='Species',
        right_on='Taxon',
        how='left',
    )
    return merged.drop(columns='Taxon')

In [None]:
# Example usage
# Root directory that is walked recursively for report.tsv files.
directory_path = "assembly-evaluation-quast/runs_per_reference"
# Whitespace-delimited coverage table; read later by match_coverage_info.
genome_comp="genome-compositions.tsv"
# One row per folder whose report.tsv mentions the metric, highest count first.
df = collect_metric_data(directory_path, "# misassemblies")
print(df)
# Bar chart of the metric per species; plot_metric also writes a PNG to disk.
plot_metric(df,"# misassemblies")


In [None]:
# Add the per-taxon coverage to the misassembly counts (left join on species name).
df_cov = match_coverage_info(genome_comp, df)

In [None]:

# Emit the first ten rows of the merged table as LaTeX source, without the index column.
print(df_cov.head(10).to_latex(index=False))