In [2]:
#imports
import pandas as pd
import os
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

### Metabat2 

In [1]:
metabatPath = "/lustre/shared/wfsr-mcfa/projects/internships/luka/viral_metagenomics_pipeline/results/metabat2/host_removed_checkv_filtered"

In [63]:
#count how many bins for each sample.

# Initialize an empty DataFrame to store binning statistics
binning_df  = pd.DataFrame()

# Iterate through directories in the 'metabatPath' directory
for index,sample in enumerate(os.listdir(metabatPath)):
    #print(sample)
    # Initialize counters for the number of bins and unbinned contigs
    num_bins = 0
    num_unbinned = 0

    # Create a DataFrame to store statistics for the current sample
    sample_binning_df = pd.DataFrame()
    
    for bin in os.listdir(os.path.join(metabatPath, sample)):
        
         # Skip files starting with '.snakemake'
        if bin.startswith('.snakemake'):
            continue
        #print(bin)
        
        #count num binned (it is only a bin if 2nd partion has a digit)
        if bin.split(".")[1].isdigit():
            num_bins += 1
            #print(bin.split(".")[1])
        #count many contigs were unbinned
        if bin.split(".")[1] == "unbinned":            
            with open(os.path.join(metabatPath,sample,bin), 'r') as fn:
                for line in fn.readlines():
                    if line.startswith(">"):
                        num_unbinned += 1
    #print(num_unbinned, sample)
    #break

    # Create a DataFrame for the current sample's statistics
    sample_binning_df.at[index,'sample'] = sample
    sample_binning_df.at[index,'num_bins'] = str(num_bins)
    sample_binning_df.at[index,'num_unbinned_contigs'] = str(num_unbinned)


    # Concatenate the sample DataFrame with the overall binning DataFrame
    binning_df = pd.concat([binning_df, sample_binning_df])



# Save the binning statistics DataFrame to a CSV file
binning_df.to_csv("../supplimentary_data/metabat2_binning_stats.csv", columns=binning_df.columns)

In [65]:
binning_df.head(20)

Unnamed: 0,sample,num_bins,num_unbinned_contigs
0,HCFKNDSX3_104762-001-002_47_GAAGGAAG-CCAGGATG_...,6,2
1,HCFKNDSX3_104762-001-002_64_AACGCATT-CAACCTGC_...,16,7
2,HCFKNDSX3_104762-001-002_57_ATTATCAA-GAGAGTCG_...,9,0
3,HCFKNDSX3_104762-001-002_44_TTAATCAG-CAGATTGG_...,13,1
4,HCFKNDSX3_104762-001-002_53_TCTGCAAG-AAGGTGAA_...,24,3
5,HCFKNDSX3_104762-001-002_09_CATGATCG-GGAGAGTA_...,35,8
6,H7NKYDSX3_104762-001-003_56_TTAATCAG-CAGATTGG_...,40,4
7,HCFKNDSX3_104762-001-002_35_AGGTGCGA-GCCAGAAG_...,25,4
8,HCFKNDSX3_104762-001-002_27_CTGTGGCG-GGCTAGTG_...,11,2
9,H7NKYDSX3_104762-001-003_31_TCGCCTTG-GAATCAGC_...,26,6


### VAMB

In [66]:
host_exlusionPath ="/lustre/shared/wfsr-mcfa/projects/internships/luka/viral_metagenomics_pipeline/results/vamb/host_removed/bin_size1000/clusters.tsv"

In [208]:
#read in the cluster file
def read_vamb_bins(filePath):
    """
    Read VAMB bin information from a TSV file.

    Parameters:
    filePath (str): The path to the VAMB bin file in TSV format.

    Returns:
    pd.DataFrame: A DataFrame containing bin information with columns 'bin_id' and 'contig_id'.
    """

    vamb_df = pd.read_csv(filePath, sep="\t", header=None)

    # Rename the columns for clarity
    vamb_df.rename(columns={
        0:'bin_id',
        1:"contig_id"
    },inplace=True)

    return vamb_df

# add sample ids to vamb bins
def add_sample(x):
    """
    Extract and add sample IDs to contig IDs.

    Parameters:
    x (str): A contig ID with a sample identifier.

    Returns:
    str: A modified contig ID with 'sample_' prefix.
    """
    
    sample_id = x.split("_")[1]
    
    return f"sample_{sample_id}"




#count the number of bins per sample
def count_vamb_bin_per_sample(vamb_df):
    """
    Count the number of bins per sample in a VAMB DataFrame.

    Parameters:
    vamb_df (pd.DataFrame): A DataFrame containing bin information with 'bin_id' and 'contig_id' columns.

    Returns:
    pd.DataFrame: A DataFrame with the number of bins per sample, with columns 'sample' and 'num_bins'.
    """

    bins_df = pd.DataFrame(vamb_df.groupby(['sample'])['bin_id'].count())
    bins_df.reset_index(inplace=True)
    bins_df.rename(columns={    
        "bin_id":"num_bins"
    }, inplace=True)

    return bins_df

In [209]:
#read host exclusion bins
host_exlusion_df = read_vamb_bins(host_exlusionPath)
#add sample id
host_exlusion_df['sample'] = host_exlusion_df['contig_id'].apply(add_sample)
host_exlusion_df.tail()



Unnamed: 0,bin_id,contig_id,sample
451582,351250,sample_8_NODE_5741_length_441_cov_9.831210,sample_8
451583,351251,sample_18_NODE_3310_length_436_cov_8.132686,sample_18
451584,351252,sample_15_NODE_22368_length_308_cov_8.823204,sample_15
451585,351253,sample_14_NODE_773_length_1179_cov_13.676806,sample_14
451586,351254,sample_11_NODE_5313_length_557_cov_8.783721,sample_11


In [210]:

host_exl_bin = count_vamb_bin_per_sample(host_exlusion_df)
host_exl_bin .head()

Unnamed: 0,sample,num_bins
0,sample_1,5471
1,sample_10,18851
2,sample_11,46995
3,sample_12,29873
4,sample_13,17474


In [211]:
#same steps above for viruses-only
viruses_onlyPath = "/lustre/shared/wfsr-mcfa/projects/internships/luka/viral_metagenomics_pipeline/results/vamb/virus_only/bin_size1000/clusters.tsv"

In [213]:
#read host exclusion bins
viruses_only_df = read_vamb_bins(viruses_onlyPath)
#add sample id
viruses_only_df['sample'] = viruses_only_df['contig_id'].apply(add_sample)
viruses_only_df.tail()

Unnamed: 0,bin_id,contig_id,sample
7708,5072,sample_4_NODE_279_length_282_cov_11.077419,sample_4
7709,5073,sample_4_NODE_282_length_282_cov_0.361290,sample_4
7710,5074,sample_14_NODE_304_length_341_cov_3.761682,sample_14
7711,5075,sample_4_NODE_62_length_513_cov_26.054404,sample_4
7712,5076,sample_14_NODE_466_length_293_cov_10.198795,sample_14


In [216]:
viruses_only_bin = count_vamb_bin_per_sample(viruses_only_df)
viruses_only_bin.head(20)

Unnamed: 0,sample,num_bins
0,sample_1,77
1,sample_10,156
2,sample_11,941
3,sample_12,924
4,sample_13,287
5,sample_14,746
6,sample_15,413
7,sample_16,574
8,sample_17,74
9,sample_18,86


In [224]:
# merge the two counts
merged_bins = pd.DataFrame(viruses_only_bin['sample'])
merged_bins['host_exclusion_bins'] = host_exl_bin['num_bins']
merged_bins['viruses-_only_bins'] = viruses_only_bin['num_bins']


merged_bins.to_csv("../supplimentary_data/vamb_bins_per_sample.csv", columns=merged_bins.columns)
merged_bins.head(20)

Unnamed: 0,sample,host_exclusion_bins,viruses-_only_bins
0,sample_1,5471,77
1,sample_10,18851,156
2,sample_11,46995,941
3,sample_12,29873,924
4,sample_13,17474,287
5,sample_14,40316,746
6,sample_15,39804,413
7,sample_16,19624,574
8,sample_17,10772,74
9,sample_18,15964,86
