In [None]:
import pandas as pd
from itertools import combinations
import multiprocessing
from multiprocessing import Manager
import csv
import time

# Wall-clock reference used by the per-target timing report further down
startTime = time.time()

# Donors whose benchmark tables are processed
donor_list = ['Pr1', 'Pr2', 'Pr3', 'Pr5', 'Pr6', 'Pr7']

# Targets are taken from the first column of the tab-separated target file
target_list = list(pd.read_csv('target_taxo_list.csv', sep="\t").iloc[:, 0])

# Function to generate all possible media combinations
def media_combinations_list_generation(list_of_conditions, min_comb, max_comb):
    """Enumerate every combination of the given conditions for a range of sizes.

    Parameters
    ----------
    list_of_conditions : iterable
        Items to combine.
    min_comb : int
        Smallest combination size (inclusive).
    max_comb : int
        Upper bound of the combination size. It is passed straight to
        ``range`` and is therefore EXCLUSIVE: the largest size produced is
        ``max_comb - 1``.

    Returns
    -------
    list of tuple
        All combinations, grouped by increasing size, in the order produced
        by ``itertools.combinations``.
    """
    return [
        combo
        for size in range(min_comb, max_comb)
        for combo in combinations(list_of_conditions, size)
    ]

# Function to compute metrics for each media combination
def media_compute(mix):
    """Compute the metrics of one media combination and append them to ``record``.

    The function is meant to be mapped over combinations by a worker pool and
    reads three module-level names set by the calling loop:

    * ``target_taxo_benchmark_filtered`` - DataFrame indexed by OTU whose
      columns are the candidate conditions;
    * ``target`` - index label of the OTU of interest;
    * ``record`` - shared list that collects one row per combination.

    Parameters
    ----------
    mix : iterable
        Column labels (conditions) forming the combination.

    Returns
    -------
    None
        When at least one OTU is present in every condition of ``mix``, the row
        ``[total_OTU, target_mean_RA, ratio_percent, [joined_mix], top]`` is
        appended to ``record``; otherwise nothing is recorded.
    """
    # Restrict the benchmark table to the conditions of this combination
    marker_for_mix = target_taxo_benchmark_filtered[list(mix)]

    # Keep only OTUs present (non-zero) in every condition. ``axis`` must be
    # passed by keyword: pandas >= 2.0 rejects the positional form. The copy
    # makes the column assignment below act on an independent frame.
    marker_for_mix = marker_for_mix[~(marker_for_mix == 0).any(axis=1)].copy()

    total_OTU = len(marker_for_mix)
    if total_OTU == 0:
        return

    # Mean relative abundance (RA) per OTU, highest first
    marker_for_mix['Mean_RA_OTU'] = marker_for_mix.mean(axis=1, numeric_only=True).round(3)
    marker_for_mix = marker_for_mix.sort_values(by='Mean_RA_OTU', ascending=False)

    # Share of the target in the summed mean RA of the combination
    target_mean_RA = marker_for_mix.loc[target, 'Mean_RA_OTU']
    sum_mean_RA = marker_for_mix['Mean_RA_OTU'].sum()
    ratio_target_total_mean_RA_in_percent = (target_mean_RA / sum_mean_RA) * 100

    # Three most abundant OTUs, when that many are present
    if total_OTU >= 3:
        # 'Mean_RA_OTU' was added last, so it is the last column (-1)
        top = {
            'Top1': marker_for_mix.index[0], 'Top1_RA': marker_for_mix.iloc[0, -1],
            'Top2': marker_for_mix.index[1], 'Top2_RA': marker_for_mix.iloc[1, -1],
            'Top3': marker_for_mix.index[2], 'Top3_RA': marker_for_mix.iloc[2, -1]
        }
    else:
        top = 'Less than 3 OTUs detected in combos'

    # The combination is stored as a one-element list holding the
    # comma-joined condition names (the output format is kept unchanged).
    row = [total_OTU, target_mean_RA, ratio_target_total_mean_RA_in_percent,
           [str(','.join(str(e) for e in mix))], top]
    record.append(row)

# Main loop to process each donor and target.
# For every donor/target pair one tab-separated file
# ``output_<target>_<donor>_combinations.csv`` is written (empty when the
# target has no usable data for that donor).
for donor in donor_list:
    # Load the target taxo benchmark dataset for the donor
    target_taxo_benchmark = pd.read_csv(f'target_taxo_benchmark_df_{donor}.csv', sep="\t")

    for target in target_list:
        # Rows of the benchmark table that belong to the target OTU
        species_to_target = target_taxo_benchmark[target_taxo_benchmark['OTU'] == target]
        target_name_reformated = target.replace('/', '-')

        # The manager process backing the shared list is shut down when the
        # ``with`` block exits, instead of lingering once per target.
        with Manager() as manager:
            # ``record`` is read as a global by media_compute in the workers
            record = manager.list()

            # Only proceed when the target is present and there are enough columns
            if not (species_to_target.empty or len(species_to_target.columns) <= 2):
                # Keep columns where the target is non-zero and not missing.
                # ``axis`` is passed by keyword (required by pandas >= 2.0).
                species_to_target = species_to_target.loc[:, (species_to_target != 0).any(axis=0)].dropna(axis=1)
                conditions_to_combo = species_to_target.columns

                # Benchmark table restricted to those columns, indexed by OTU
                # (read as a global by media_compute)
                target_taxo_benchmark_filtered = target_taxo_benchmark[conditions_to_combo].set_index('OTU')

                # Every column after the first is treated as a condition.
                # NOTE(review): media_combinations_list_generation treats its
                # last argument as exclusive, so (1, 6) yields sizes 1-5, not
                # "up to 6" - confirm which is intended.
                media_combinations_list = media_combinations_list_generation(conditions_to_combo[1:], 1, 6)

                # Use multiprocessing to process combinations in parallel
                if __name__ == '__main__':
                    # The context manager releases the 12 workers after the
                    # (blocking) map call instead of leaking a pool per target.
                    with multiprocessing.Pool(12) as pool:
                        pool.map(media_compute, media_combinations_list)

            # Copy the shared list before the manager shuts down
            rows = list(record)

        # Write the results; newline='' is what the csv module expects for
        # files it writes to.
        with open(f"output_{target_name_reformated}_{donor}_combinations.csv", 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file, delimiter="\t")
            csv_writer.writerows(rows)

        # Print the time elapsed since the start of the run for each target
        executionTime = (time.time() - startTime)
        print(f'Execution time in seconds: {executionTime} for target {target}')


In [None]:
import pandas as pd
import gc
import time
import os

# Start time tracking
startTime = time.time()

# List of donors
donor_list = ['Pr1', 'Pr2', 'Pr3', 'Pr5', 'Pr6', 'Pr7']

# Column names assigned to the header-less per-target combination files
combination_columns = ['Total OTUs in combo', 'Target mean RA', 'Ratio Target over Total mean RA percentage',
                       'Media combination', 'Top 3 OTUs in the combo']

# Ranking criteria, most important first (all sorted in descending order)
ranking_columns = ['Ratio Target over Total mean RA percentage', 'Total OTUs in combo', 'Target mean RA']

# Load metadata
metadata_df = pd.read_csv("metadata_benchmark_2.csv", usecols=['Sample_ID', 'Modification'], sep="\t")
modification = metadata_df.set_index('Sample_ID')['Modification'].to_dict()

# Load taxonomic data
taxo_df_melted = pd.read_csv("taxo_RA_v2_df_melted.csv", sep="\t", index_col=[0])
target_taxo_level_df = taxo_df_melted[taxo_df_melted['Count'] != 0]
target_taxo_level_df = target_taxo_level_df[['Phylum', 'Family', 'Genus', 'OTU']].drop_duplicates().reset_index(drop=True)
target_taxo_level_df.columns = ['Phylum', 'Family', 'Genus', 'Target']

# Curated top-100 tables, one per donor/target; concatenated once at the end
curated_frames = []

# Iterate over each donor
for chosen_donor in donor_list:
    target_benchmark = pd.read_csv(f'target_taxo_benchmark_df_{chosen_donor}.csv', sep="\t")
    # File names use '-' where the OTU name contains '/'
    target_list = [t.replace('/', '-') for t in target_benchmark['OTU'].unique()]

    # Process each target
    for target in target_list:
        combinations_path = f'output_{target}_{chosen_donor}_combinations.csv'

        # Skip targets without a combination file or with an empty one
        if not os.path.isfile(combinations_path) or os.path.getsize(combinations_path) == 0:
            continue

        # Load combinations data for the target
        combinations_df = pd.read_csv(combinations_path, sep='\t', names=combination_columns)

        # Keep the 150 best-ranked combinations. The index is reset so that it
        # lines up with the per-media table built below; without the reset the
        # index-based join pairs rows with the wrong media names.
        combinations_df = (
            combinations_df
            .sort_values(by=ranking_columns, ascending=False)
            .head(150)
            .reset_index(drop=True)
        )

        # Turn the stored "['a,b,c']" text into a sorted list of unique media.
        # regex=False makes the bracket replacements literal on every pandas
        # version; sorting makes the 'Media N' column assignment reproducible.
        media_lists = (
            combinations_df['Media combination']
            .str.replace("'", '', regex=False)
            .str.replace("[", '', regex=False)
            .str.replace("]", '', regex=False)
            .apply(lambda x: sorted(set(x.split(","))))
        )
        combinations_df['Number of media in combo'] = media_lists.apply(len)
        combinations_df['Target'] = target
        combinations_df['Donor'] = chosen_donor

        # One column per medium ('Media 1', 'Media 2', ...), padded with 'None'
        combin_name = (
            pd.DataFrame(media_lists.tolist(), index=combinations_df.index)
            .rename(columns=lambda x: f'Media {x + 1}')
            .fillna('None')
        )
        combinations_df = combinations_df.join(combin_name).drop(columns='Media combination')

        # Number of "pH" / "Temp" occurrences among the media of each combination
        result_curated_ph_counts = combin_name.apply(lambda x: x.str.count("pH")).sum(axis=1)
        result_curated_temp_counts = combin_name.apply(lambda x: x.str.count("Temp")).sum(axis=1)

        # Keep combinations with fewer than two "pH" and fewer than two "Temp"
        # occurrences, then the 100 best of those
        filtered_combinations = combinations_df[(result_curated_ph_counts < 2) & (result_curated_temp_counts < 2)].head(100)
        filtered_combinations.to_csv(f'output_taxo_{target}_{chosen_donor}_combinations_top100.csv', sep='\t', index=False)

        # Remember the curated table so it ends up in the combined dataset
        if not filtered_combinations.empty:
            curated_frames.append(filtered_combinations)

# After processing, combine all results into one dataset
if curated_frames:
    all_donor_set = pd.concat(curated_frames, ignore_index=True)
    # Targets with fewer media leave gaps in the higher 'Media N' columns
    media_columns = [c for c in all_donor_set.columns
                     if c.startswith('Media ') and c[len('Media '):].isdigit()]
    all_donor_set = all_donor_set.fillna({c: 'None' for c in media_columns})
else:
    all_donor_set = pd.DataFrame()
all_donor_set.to_csv('output_taxo_all_compound_combinations_curated_all_donors.csv', sep='\t', index=False)

# Completion time
executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))


In [None]:
import pandas as pd
import plotly.express as px

# Load the combined, curated combinations of all donors
species_level_all_donor_all_combo = pd.read_csv('output_taxo_all_compound_combinations_curated_all_donors.csv', sep='\t')

# Filter data for Collinsella aerofaciens (copy: a column may be added below)
selected_target = 'Collinsella aerofaciens'
filtered_df = species_level_all_donor_all_combo[species_level_all_donor_all_combo['Target'] == selected_target].copy()

# 'Media <n>' columns actually present in the file (their number depends on
# the largest combination that was generated)
media_columns = [c for c in filtered_df.columns
                 if c.startswith('Media ') and c[len('Media '):].isdigit()]

# The marker size column is derived here when the file does not provide it:
# number of media slots that are neither missing nor the 'None' placeholder
if 'Number of unique media in combo' not in filtered_df.columns:
    filtered_df['Number of unique media in combo'] = (
        filtered_df[media_columns].fillna('None').ne('None').sum(axis=1)
    )

# Plotting the results
fig = px.scatter(filtered_df, y='Total OTUs in combo', x='Target mean RA', color='Donor',
                 size='Number of unique media in combo',
                 hover_data=['Target', 'Donor', 'Total OTUs in combo', 'Target mean RA',
                             'Ratio Target over Total mean RA percentage',
                             *media_columns,
                             'Number of unique media in combo'])

fig.update_layout(title={'text': f'Total OTU recovery and relative abundance for {selected_target} per media combination',
                         'y': 0.99, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
                  title_font_size=20)

fig.show()


In [None]:
import pandas as pd
import base64
import dash
from dash.dependencies import Input, Output
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px

# Load the processed dataset for all combinations across all donors
species_level_all_donor_all_combo = pd.read_csv('output_taxo_all_compound_combinations_curated_all_donors.csv', sep='\t')

# 'Media <n>' columns present in the file; using them instead of a fixed
# count of 8 keeps the computation correct for any maximum combination size
media_columns = [c for c in species_level_all_donor_all_combo.columns
                 if c.startswith('Media ') and c[len('Media '):].isdigit()]

# Calculate the number of media used in each combination.
# Only the media columns receive the 'None' placeholder, so numeric columns
# keep their dtype and are never counted as empty media slots.
species_level_all_donor_all_combo = species_level_all_donor_all_combo.fillna({c: 'None' for c in media_columns})
empty_media_cell = species_level_all_donor_all_combo[media_columns].eq('None').sum(axis='columns')
species_level_all_donor_all_combo['Number of unique media in combo'] = len(media_columns) - empty_media_cell

# Keep only real combinations (more than one medium)
species_level_all_donor_all_combo = species_level_all_donor_all_combo[species_level_all_donor_all_combo['Number of unique media in combo'] > 1]

# List all available targets in the dataset
list_target = species_level_all_donor_all_combo['Target'].sort_values().unique()

# Dash app setup
app = dash.Dash(__name__)
app.config.suppress_callback_exceptions = True

# App layout: title, target selector, then the scatter plot and the image
# placeholders that the callback fills in
app.layout = html.Div(children=[
    html.H1("Analysis app for selecting targeted enrichment combinations"),

    dcc.Dropdown(id='dropdown_target_select',
                 options=[{'label': i, 'value': i} for i in list_target],
                 value='Collinsella aerofaciens', multi=False,
                 placeholder='Filter by target'),

    html.Div([
        html.Div(id='graph'),
        html.Div(id='network-heatmap'),
    ], className="row")
])

@app.callback([Output('graph', 'children'), Output('network-heatmap', 'children')],
              [Input('dropdown_target_select', 'value')])
def select_graph(value):
    """Build the scatter plot and the network/heatmap image for one target.

    Parameters
    ----------
    value : str or None
        Target chosen in the dropdown (``None`` when the selection is cleared).

    Returns
    -------
    list
        Two components, in the order of the callback outputs: a ``dcc.Graph``
        with the scatter plot, and either an ``html.Img`` holding the target's
        PNG (embedded as base64) or a short message when no such file exists.
    """
    selected_target_all_compound_combo = species_level_all_donor_all_combo[
        species_level_all_donor_all_combo['Target'] == value]

    # Show whichever 'Media <n>' columns the dataset really has, so the plot
    # does not fail when fewer than eight media columns were generated
    media_columns = [c for c in selected_target_all_compound_combo.columns
                     if c.startswith('Media ') and c[len('Media '):].isdigit()]

    fig = px.scatter(selected_target_all_compound_combo,
                     y='Total OTUs in combo',
                     x='Target mean RA',
                     color='Donor',
                     size='Number of unique media in combo',
                     hover_data=['Target', 'Donor', 'Total OTUs in combo', 'Target mean RA',
                                 'Ratio Target over Total mean RA percentage',
                                 *media_columns,
                                 'Number of unique media in combo'],
                     template='plotly_white')

    fig.update_layout(title={'text': f'Total OTU recovery and relative abundance (%) for {value} per media combination',
                             'y': .99, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
                      title_font_size=30)
    fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)',
                      paper_bgcolor='rgba(0, 0, 0, 0)',
                      autosize=False, width=1500, height=400,
                      margin=dict(l=20, r=20, b=20, t=60, pad=4))

    fig.update_layout(legend=dict(title='Sample donor:', orientation="h",
                                  yanchor="bottom", y=0.99,
                                  xanchor="left", x=0.1))

    image_path = f'{value} - Network graph and heatmap of the mean relative abundance for {value} markers in most prevalent media.png'
    try:
        # ``with`` closes the file handle once the bytes are read
        with open(image_path, 'rb') as image_file:
            encoded_image = base64.b64encode(image_file.read())
    except FileNotFoundError:
        # No picture for this target (or no target selected): keep the
        # callback alive and say so instead of raising inside Dash
        heatmap = html.P(f'No network graph / heatmap image available for {value}.')
    else:
        source = 'data:image/png;base64,{}'.format(encoded_image.decode())
        heatmap = html.Img(src=source, style={'height': '70%', 'width': '50%', 'display': 'inline-block',
                                              'vertical-align': 'middle', 'horizontal-align': 'right',
                                              'margin-left': '100px'})

    return [dcc.Graph(figure=fig, style={'display': 'inline-block', 'vertical-align': 'middle'}),
            heatmap]

# Start the Dash server only when this code runs as the main program
# (not when it is imported); debug mode is switched off.
if __name__ == "__main__":
    app.run_server(debug=False)
