In [1]:
# default_exp multi_condition_analysis

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import alphaquant.multicond.median_condition_creation as aq_median_cond_creation
import pandas as pd
import numpy as np

#test aq_median_cond_creation.MedianConditionCreator


def run_tests():
    """Run the ``MedianConditionCreator`` checks on simulated data.

    The simulation uses three conditions with 3, 2 and 3 replicates, 100 quant
    ids with a NaN fraction of 0.3, and two extra sample columns that are not
    listed in the sample map. The expected number of median replicates is the
    smallest replicate count across conditions.

    Raises:
        AssertionError: if any of the individual checks fails.
    """
    replicates_per_condition = [3, 2, 3]
    samplemap = generate_samplemap_df(replicates_per_condition)
    simulated_input = generate_normalized_input_df(
        samplemap,
        num_quant_ids=100,
        nan_fraction=0.3,
        add_additional_samples=True,
    )
    display(simulated_input)

    creator = aq_median_cond_creation.MedianConditionCreator(samplemap, simulated_input)

    # Expectations are derived from the simulated inputs only.
    num_replicates = min(replicates_per_condition)
    num_ions = len(simulated_input.index)
    num_samples = len(set(samplemap['sample']))

    test_determine_number_replicates(creator, num_replicates)
    display(creator.extended_input_df)
    test_median_dataframe_shape(creator, simulated_input, num_replicates, num_ions, num_samples)
    test_extended_samplemap_shape(creator, samplemap, num_replicates)

    print("All tests passed!")



def generate_samplemap_df(replicate_numbers):
    """Build a simulated sample-to-condition mapping table.

    Args:
        replicate_numbers: sequence of ints; entry ``i`` is the number of
            replicates simulated for condition ``i + 1``.

    Returns:
        pd.DataFrame with the columns 'sample' (``sample<cond>_<rep>``) and
        'condition' (``condition<cond>``); both numberings start at 1.
    """
    sample_condition_pairs = [
        (f'sample{cond_number}_{rep_number}', f'condition{cond_number}')
        for cond_number, num_replicates in enumerate(replicate_numbers, start=1)
        for rep_number in range(1, num_replicates + 1)
    ]
    return pd.DataFrame({
        'sample': [sample for sample, _ in sample_condition_pairs],
        'condition': [condition for _, condition in sample_condition_pairs],
    })

def generate_normalized_input_df(samplemap_df, num_quant_ids =10, nan_fraction = 0.1, add_additional_samples = False):
    """Simulate an input table with one random intensity column per sample.

    The global NumPy random state is re-seeded with 0 on every call, so the
    same arguments always produce the same table.

    Args:
        samplemap_df: table with a 'sample' column naming the sample columns to create.
        num_quant_ids: number of rows (one 'quant_id' and one 'protein' label per row).
        nan_fraction: fraction of each sample column that is set to NaN
            (rounded down to a whole number of rows); no NaNs if not positive.
        add_additional_samples: forwarded to ``get_sample_list``.

    Returns:
        pd.DataFrame with the columns 'quant_id', 'protein' and one column per sample.
    """
    np.random.seed(0)  # For reproducibility
    columns = {
        'quant_id': [f"quant_{row}" for row in range(num_quant_ids)],
        'protein': [f"protein_{row}" for row in range(num_quant_ids)],
    }

    for sample_name in get_sample_list(samplemap_df, add_additional_samples):
        intensities = np.random.rand(num_quant_ids)
        if nan_fraction > 0:
            num_missing = int(np.floor(nan_fraction * num_quant_ids))
            # Sampling without replacement blanks exactly num_missing distinct rows.
            missing_rows = np.random.choice(num_quant_ids, num_missing, replace=False)
            intensities[missing_rows] = np.nan
        columns[sample_name] = intensities

    return pd.DataFrame(columns)

def get_sample_list(samplemap_df, add_additional_samples):
    """Return the sample names of the sample map as a new list.

    Args:
        samplemap_df: table with a 'sample' column.
        add_additional_samples: if truthy, 'additional_sample1' and
            'additional_sample2' are appended after the mapped samples.

    Returns:
        list of sample names.
    """
    mapped_samples = samplemap_df['sample'].to_list()
    if not add_additional_samples:
        return mapped_samples
    return mapped_samples + ['additional_sample1', 'additional_sample2']


def test_determine_number_replicates(mediancreator, expected_number):
    print(mediancreator._determine_number_replicates())
    assert mediancreator._determine_number_replicates() == expected_number, "Number of replicates is incorrect"

def test_median_dataframe_shape(mediancreator, input_df_aqformat,expected_number_replicates, expected_number_ions, expected_number_samples):
    print(input_df_aqformat.columns)
    expected_shape = (expected_number_ions, expected_number_samples+expected_number_replicates)
    assert mediancreator.extended_input_df.shape == expected_shape, "Median DataFrame shape is incorrect"

def test_extended_samplemap_shape(mediancreator, samplemap_df, expected_number_replicates):
    expected_shape = (len(samplemap_df.index) + expected_number_replicates, 2)
    assert mediancreator.extended_samplemap_df.shape == expected_shape, "Extended samplemap shape is incorrect"


# Execute the MedianConditionCreator checks; an AssertionError is raised on the first failing check.
run_tests()


Unnamed: 0,quant_id,protein,sample1_1,sample1_2,sample1_3,sample2_1,sample2_2,sample3_1,sample3_2,sample3_3,additional_sample1,additional_sample2
0,quant_0,protein_0,,0.179490,,0.944707,0.561577,,,0.342921,,
1,quant_1,protein_1,0.715189,0.170987,0.044612,0.892862,,,0.471457,0.603649,0.866292,0.487799
2,quant_2,protein_2,0.602763,,0.799796,0.677114,,0.127689,,,0.927490,0.771406
3,quant_3,protein_3,0.544883,0.874573,0.076956,0.639027,0.085233,,0.943851,0.298614,,
4,quant_4,protein_4,0.423655,0.944120,0.518835,0.548361,0.665678,0.239337,0.964925,,0.480421,
...,...,...,...,...,...,...,...,...,...,...,...,...
95,quant_95,protein_95,0.183191,0.181631,0.287052,0.928219,,,0.630832,0.958146,,0.951102
96,quant_96,protein_96,0.586513,,0.706575,0.592081,,,0.997994,,0.607624,0.034989
97,quant_97,protein_97,0.020108,,0.414857,0.431785,0.437814,0.700928,0.987889,0.921060,0.369941,0.538944
98,quant_98,protein_98,0.828940,,,0.592780,0.974990,,,,,0.945636


2


Unnamed: 0_level_0,Unnamed: 1_level_0,sample1_1,sample1_2,sample1_3,sample2_1,sample2_2,sample3_1,sample3_2,sample3_3,median_rep0,median_rep1
protein,quant_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
protein_0,quant_0,,0.179490,,0.944707,0.561577,,,0.342921,0.944707,0.370534
protein_1,quant_1,0.715189,0.170987,0.044612,0.892862,,,0.471457,0.603649,0.804026,0.321222
protein_2,quant_2,0.602763,,0.799796,0.677114,,0.127689,,,0.602763,
protein_3,quant_3,0.544883,0.874573,0.076956,0.639027,0.085233,,0.943851,0.298614,0.591955,0.874573
protein_4,quant_4,0.423655,0.944120,0.518835,0.548361,0.665678,0.239337,0.964925,,0.423655,0.944120
...,...,...,...,...,...,...,...,...,...,...,...
protein_95,quant_95,0.183191,0.181631,0.287052,0.928219,,,0.630832,0.958146,0.555705,0.406232
protein_96,quant_96,0.586513,,0.706575,0.592081,,,0.997994,,0.589297,0.997994
protein_97,quant_97,0.020108,,0.414857,0.431785,0.437814,0.700928,0.987889,0.921060,0.431785,0.712852
protein_98,quant_98,0.828940,,,0.592780,0.974990,,,,0.710860,0.974990


Index(['quant_id', 'protein', 'sample1_1', 'sample1_2', 'sample1_3',
       'sample2_1', 'sample2_2', 'sample3_1', 'sample3_2', 'sample3_3',
       'additional_sample1', 'additional_sample2'],
      dtype='object')
All tests passed!


In [4]:
import os
import tempfile

def test_add_and_save_median_condition():
    """Compare the files produced via ``MedianConditionManager`` with ``MedianConditionCreator`` output.

    Simulated input and sample-map tables are written as tab-separated files
    into a temporary directory and handed to the manager. The files found at
    the manager's adapted paths are then read back and compared with the
    tables a creator computes directly from the same in-memory data.

    Raises:
        AssertionError: if an adapted file is missing or its content differs.
    """
    with tempfile.TemporaryDirectory() as workdir:
        # Sample map: three conditions with 3, 2 and 3 replicates.
        samplemap_path = os.path.join(workdir, 'samplemap.tsv')
        samplemap = generate_samplemap_df([3, 2, 3])
        samplemap.to_csv(samplemap_path, sep="\t", index=False)

        input_path = os.path.join(workdir, 'input.tsv')
        simulated_input = generate_normalized_input_df(samplemap, num_quant_ids=100, nan_fraction=0.3)
        simulated_input.to_csv(input_path, sep="\t", index=False)

        # Object under test; the adapted files are expected to exist once it is constructed.
        manager = aq_median_cond_creation.MedianConditionManager(input_path, samplemap_path)
        adapted_input_path = manager.input_filename_adapted
        adapted_samplemap_path = manager._samplemap_filename_adapted

        assert os.path.exists(adapted_input_path), "Input file was not created."
        assert os.path.exists(adapted_samplemap_path), "Samplemap file was not created."

        saved_input = pd.read_csv(adapted_input_path, sep="\t")
        saved_samplemap = pd.read_csv(adapted_samplemap_path, sep="\t")

        # Reference tables computed directly from the in-memory data.
        reference = aq_median_cond_creation.MedianConditionCreator(samplemap, simulated_input)
        pd.testing.assert_frame_equal(saved_input, reference.extended_input_df.reset_index())
        pd.testing.assert_frame_equal(
            saved_samplemap.reset_index(drop=True),
            reference.extended_samplemap_df.reset_index(drop=True),
        )

        display(saved_input)
        display(saved_samplemap)

        print("Test for add_and_save_median_condition passed.")

# Run the save-and-reload check; an AssertionError is raised if an adapted file is missing or differs.
test_add_and_save_median_condition()


Unnamed: 0,protein,quant_id,sample1_1,sample1_2,sample1_3,sample2_1,sample2_2,sample3_1,sample3_2,sample3_3,median_rep0,median_rep1
0,protein_0,quant_0,,0.179490,,0.944707,0.561577,,,0.342921,0.944707,0.370534
1,protein_1,quant_1,0.715189,0.170987,0.044612,0.892862,,,0.471457,0.603649,0.804026,0.321222
2,protein_2,quant_2,0.602763,,0.799796,0.677114,,0.127689,,,0.602763,
3,protein_3,quant_3,0.544883,0.874573,0.076956,0.639027,0.085233,,0.943851,0.298614,0.591955,0.874573
4,protein_4,quant_4,0.423655,0.944120,0.518835,0.548361,0.665678,0.239337,0.964925,,0.423655,0.944120
...,...,...,...,...,...,...,...,...,...,...,...,...
95,protein_95,quant_95,0.183191,0.181631,0.287052,0.928219,,,0.630832,0.958146,0.555705,0.406232
96,protein_96,quant_96,0.586513,,0.706575,0.592081,,,0.997994,,0.589297,0.997994
97,protein_97,quant_97,0.020108,,0.414857,0.431785,0.437814,0.700928,0.987889,0.921060,0.431785,0.712852
98,protein_98,quant_98,0.828940,,,0.592780,0.974990,,,,0.710860,0.974990


Unnamed: 0,sample,condition
0,sample1_1,condition1
1,sample1_2,condition1
2,sample1_3,condition1
3,sample2_1,condition2
4,sample2_2,condition2
5,sample3_1,condition3
6,sample3_2,condition3
7,sample3_3,condition3
8,median_rep0,median_reference
9,median_rep1,median_reference


Test for add_and_save_median_condition passed.


### Test ProteoformConditionAligner

In [5]:
import alphaquant.multicond.median_condition_analysis as aqmca
import anytree



def test_ProteoformConditionAligner(cluster_matrix, expected_number_of_proteoforms, expected_peptide_groups):
    """Run ``ProteoformConditionAligner`` on simulated nodes and check its proteoform table.

    Args:
        cluster_matrix: list of rows, one per peptide, holding that peptide's
            cluster index in every condition.
        expected_number_of_proteoforms: expected number of distinct proteoform ids.
        expected_peptide_groups: expected values of the 'peptides' column.

    Raises:
        AssertionError: if either check fails.
    """
    aligner = aqmca.ProteoformConditionAligner(
        simulate_list_of_protein_nodes_from_cluster_matrix(cluster_matrix)
    )
    checks = (
        (test_that_num_proteoforms_is_as_expected, expected_number_of_proteoforms),
        (test_that_expected_peptides_are_in_groups, expected_peptide_groups),
    )
    for check, expectation in checks:
        check(aligner.proteoform_df, expectation)


def simulate_list_of_protein_nodes_from_cluster_matrix(cluster_matrix):
    """Create one simulated protein node per condition (column) of the cluster matrix.

    Args:
        cluster_matrix: non-empty list of rows, one per peptide; the number of
            conditions is taken from the length of the first row.

    Returns:
        list of protein nodes, one per condition, named ``cond0``, ``cond1``, ...
    """
    num_conditions = len(cluster_matrix[0])
    return [
        simulate_protein_node_w_clustermatrix_subset(
            [peptide_row[cond_idx] for peptide_row in cluster_matrix],
            f"cond{cond_idx}",
        )
        for cond_idx in range(num_conditions)
    ]
        

def simulate_protein_node_w_clustermatrix_subset(list_of_cluster_idxs_of_every_peptide, condition):
    """Build a small anytree hierarchy: condition pair -> protein -> peptides.

    Args:
        list_of_cluster_idxs_of_every_peptide: cluster index of each peptide,
            in peptide order.
        condition: condition name; the root node is named
            ``[condition, "median_reference"]``.

    Returns:
        The protein node. Its children are named ``pep_<position>`` and carry
        the attributes ``cluster`` and ``fc``.
    """
    condition_pair_node = anytree.Node([condition, "median_reference"])
    protein_node = anytree.Node("protein", parent=condition_pair_node)
    for pep_number, cluster_idx in enumerate(list_of_cluster_idxs_of_every_peptide):
        peptide_node = anytree.Node(f"pep_{pep_number}", parent=protein_node)
        # The cluster index doubles as the simulated fold change.
        peptide_node.cluster = peptide_node.fc = cluster_idx
    return protein_node


def test_that_num_proteoforms_is_as_expected(peptide_cluster_df, num_proteoforms_exptected):
    assert len(set(peptide_cluster_df["proteoform_id"])) == num_proteoforms_exptected
    print("Test passed: Number of proteoforms is as expected.")


def test_that_expected_peptides_are_in_groups(peptide_cluster_df, list_of_expected_peptide_groups):
    expected_peptides_derived_from_peptide_groups = set(peptide_cluster_df["peptides"])
    assert expected_peptides_derived_from_peptide_groups == set(list_of_expected_peptide_groups)
    print("Test passed: Expected peptides are in groups.")





# Scenario 1: five peptides in four conditions.
# Layout: one row per peptide, one column per condition; each entry is that peptide's
# cluster index in that condition (the simulation also uses it as the fold change).
cluster_matrix = [[0, 0, 1, 0],
                  [1, 1, 0, 1],
                  [0, 0, 1, 1],
                  [1, 1, 0, 1],
                  [1, 2, 3, 4]
                  ]
expected_number_of_proteoforms = 3
expected_peptide_groups = ["pep_0;pep_1;pep_3", "pep_2", "pep_4"]

test_ProteoformConditionAligner(cluster_matrix, expected_number_of_proteoforms, expected_peptide_groups)


# Scenario 2: four peptides in four conditions (same row/column layout as above).
cluster_matrix = [[0, 1, 1, 0], 
                  [1, 1, 0, 1], 
                  [0, 0, 1, 0], 
                  [1, 1, 0, 1]]
expected_number_of_proteoforms = 2
expected_peptide_groups = ["pep_0", "pep_1;pep_2;pep_3"]

test_ProteoformConditionAligner(cluster_matrix, expected_number_of_proteoforms, expected_peptide_groups)






Test passed: Number of proteoforms is as expected.
Test passed: Expected peptides are in groups.
Test passed: Number of proteoforms is as expected.
Test passed: Expected peptides are in groups.


### Test ProteoformPeptideDfCreator

In [6]:
import alphaquant.multicond.median_condition_analysis as aqmca

def test_ProtoformPeptideDfCreator(list_of_cluster_and_fc, list_of_numpep_per_cluster, number_conditions):
    """Run all ``ProteoformPeptideDfCreator`` checks on one simulated configuration.

    Args:
        list_of_cluster_and_fc: fold change assigned to the peptides of each cluster.
        list_of_numpep_per_cluster: number of peptides simulated per cluster.
        number_conditions: number of simulated conditions.

    Raises:
        AssertionError: if the dimension, cluster or fold-change check fails.
    """
    creator = get_ProtoformPeptideDfCreator_on_specified_simulated_input(
        list_of_cluster_and_fc,
        list_of_numpep_per_cluster,
        number_conditions,
    )

    test_dataframe_dimensions(creator, list_of_numpep_per_cluster, number_conditions)
    test_cluster_values(creator, list_of_numpep_per_cluster)
    test_fc_values(creator, list_of_numpep_per_cluster, list_of_cluster_and_fc)

def get_ProtoformPeptideDfCreator_on_specified_simulated_input(list_of_cluster_and_fc = [1.1, -1.1], list_of_numpep_per_cluster = [2, 4], number_conditions = 5):
    """Create a ``ProteoformPeptideDfCreator`` from simulated protein nodes.

    Args:
        list_of_cluster_and_fc: fold change assigned to the peptides of each cluster.
        list_of_numpep_per_cluster: number of peptides simulated per cluster.
        number_conditions: number of simulated conditions (one node each).

    Returns:
        The creator built from the simulated nodes.
    """
    # The list defaults are only passed on here, not modified.
    simulated_nodes = simulate_nodes_same_protein_different_conditions(
        list_of_cluster_and_fc,
        list_of_numpep_per_cluster,
        number_conditions,
    )
    return aqmca.ProteoformPeptideDfCreator(simulated_nodes)


def simulate_nodes_same_protein_different_conditions(list_of_cluster_and_fc, list_of_numpep_per_cluster, number_conditions):
    """Simulate the same protein once per condition.

    Args:
        list_of_cluster_and_fc: fold change assigned to the peptides of each cluster.
        list_of_numpep_per_cluster: number of peptides simulated per cluster.
        number_conditions: number of conditions; they are named ``cond_0``, ``cond_1``, ...

    Returns:
        list with one simulated protein node per condition.
    """
    protein_nodes = []
    for cond_idx in range(number_conditions):
        protein_nodes.append(
            simulate_protein_node(list_of_cluster_and_fc, list_of_numpep_per_cluster, f"cond_{cond_idx}")
        )
    return protein_nodes


def simulate_protein_node(list_of_cluster_and_fc = [1.1, -1.1], list_of_numpep_per_cluster = [2, 2], condition = "cond1"):
    """Build a small anytree hierarchy: condition pair -> protein -> clustered peptides.

    Args:
        list_of_cluster_and_fc: fold change given to every peptide of cluster ``i``.
        list_of_numpep_per_cluster: number of peptides created for cluster ``i``.
        condition: condition name; the root node is named
            ``[condition, "median_reference"]``.

    Returns:
        The protein node. Its children are named ``pep_<cluster>_<position>``
        and carry the attributes ``cluster`` and ``fc``.
    """
    condition_pair_node = anytree.Node([condition, "median_reference"])
    protein_node = anytree.Node("protein", parent=condition_pair_node)
    for cluster_idx, numpep in enumerate(list_of_numpep_per_cluster):
        for pep_idx in range(numpep):
            peptide_node = anytree.Node(f"pep_{cluster_idx}_{pep_idx}", parent=protein_node)
            peptide_node.cluster = cluster_idx
            peptide_node.fc = list_of_cluster_and_fc[cluster_idx]
    return protein_node
        

def test_dataframe_dimensions(creator, list_of_numpep_per_cluster, num_conditions):
    num_peptides = sum(list_of_numpep_per_cluster)
    assert creator.peptide_cluster_df.shape == (num_peptides, num_conditions), "Incorrect dimensions for peptide_cluster_df"
    assert creator.peptide_fc_df.shape == (num_peptides, num_conditions), "Incorrect dimensions for peptide_fc_df"
    print("dimensions test passed")

def test_cluster_values(creator, list_of_numpep_per_cluster):
    """Check that every column of ``peptide_cluster_df`` holds the simulated cluster indices.

    Args:
        creator: object exposing ``peptide_cluster_df``.
        list_of_numpep_per_cluster: number of peptides simulated per cluster.

    Raises:
        AssertionError: if any column deviates from the expected cluster indices.
    """
    expected_clusters = get_expected_cluster_values(list_of_numpep_per_cluster)
    cluster_df = creator.peptide_cluster_df
    for condition_column in cluster_df.columns:
        column_matches = cluster_df[condition_column] == expected_clusters
        assert all(column_matches), f"Mismatch in clusters for {condition_column}"
    print("clusters test passed")

def get_expected_cluster_values(list_of_numpep_per_cluster):
    """Return the cluster index of every simulated peptide, in peptide order.

    Args:
        list_of_numpep_per_cluster: number of peptides per cluster.

    Returns:
        list in which cluster index ``i`` is repeated once per peptide of cluster ``i``.
    """
    return [
        cluster_idx
        for cluster_idx, numpep in enumerate(list_of_numpep_per_cluster)
        for _ in range(numpep)
    ]

def test_fc_values(creator, list_of_numpeps_per_cluster, list_of_cluster_and_fc):
    """Check that every column of ``peptide_fc_df`` holds the simulated fold changes.

    Args:
        creator: object exposing ``peptide_fc_df``.
        list_of_numpeps_per_cluster: number of peptides simulated per cluster.
        list_of_cluster_and_fc: fold change assigned to the peptides of each cluster.

    Raises:
        AssertionError: if any column deviates from the expected fold changes.
    """
    expected_fold_changes = get_expected_fc_values(list_of_numpeps_per_cluster, list_of_cluster_and_fc)
    fc_df = creator.peptide_fc_df
    for condition_column in fc_df.columns:
        column_matches = fc_df[condition_column] == expected_fold_changes
        assert all(column_matches), f"Mismatch in fc values for {condition_column}"
    print("fcs test passed")

def get_expected_fc_values(list_of_numpeps_per_cluster, list_of_cluster_and_fc):
    """Return the fold change of every simulated peptide, in peptide order.

    Args:
        list_of_numpeps_per_cluster: number of peptides per cluster.
        list_of_cluster_and_fc: fold change of cluster ``i`` at position ``i``.

    Returns:
        list in which the fold change of cluster ``i`` is repeated once per
        peptide of that cluster.
    """
    fold_changes = []
    for cluster_idx, numpeps in enumerate(list_of_numpeps_per_cluster):
        cluster_fc = list_of_cluster_and_fc[cluster_idx]
        fold_changes += [cluster_fc] * numpeps
    return fold_changes




# Three configurations: a small one (2 clusters, 5 conditions), a large one
# (4 clusters with up to 200 peptides, 60 conditions) and the minimal one (1 peptide, 2 conditions).
test_ProtoformPeptideDfCreator(list_of_cluster_and_fc = [1.1, -1.1], list_of_numpep_per_cluster = [2, 4], number_conditions = 5)
test_ProtoformPeptideDfCreator(list_of_cluster_and_fc = [1.1, -1.1, 10, 3], list_of_numpep_per_cluster = [2, 4, 12, 200], number_conditions = 60)
test_ProtoformPeptideDfCreator(list_of_cluster_and_fc = [1.1], list_of_numpep_per_cluster = [1], number_conditions = 2)


dimensions test passed
clusters test passed
fcs test passed
dimensions test passed
clusters test passed
fcs test passed
dimensions test passed
clusters test passed
fcs test passed


### Test ProteoformDfCreator

In [7]:
import alphaquant.multicond.median_condition_analysis as aqmca
import numpy as np
import numpy.random

def test_ProteoformDfCreator(groups_of_peptide_clusters, peptide_fc_df, protein_name):
    """Build a ``ProteoformDfCreator`` and run all checks on its proteoform table.

    Args:
        groups_of_peptide_clusters: list of peptide-name lists, one per proteoform.
        peptide_fc_df: fold-change table indexed by peptide, one column per condition.
        protein_name: protein name handed to the creator.

    Raises:
        AssertionError: if any of the individual checks fails.
    """
    proteoform_df_creator = aqmca.ProteoformDfCreator(groups_of_peptide_clusters, peptide_fc_df, protein_name)

    test_initialization(proteoform_df_creator)
    test_proteoform_grouping(proteoform_df_creator, groups_of_peptide_clusters, protein_name)
    test_fold_change_data(proteoform_df_creator, peptide_fc_df)
    test_dataframe_structure(proteoform_df_creator, peptide_fc_df)

def test_initialization(creator):
    """Display the creator's proteoform table and check that it is set.

    Args:
        creator: object exposing ``proteoform_df``.

    Raises:
        AssertionError: if ``proteoform_df`` is None.
    """
    proteoform_df = creator.proteoform_df
    display(proteoform_df)
    assert proteoform_df is not None, "proteoform_df should not be None"
    print("Initialization test passed")

def test_proteoform_grouping(creator, groups_of_peptide_clusters, protein_name):
    for idx, group in enumerate(groups_of_peptide_clusters):
        proteoform_id = f"{protein_name}_{idx}"
        group_df = creator.proteoform_df[creator.proteoform_df['proteoform_id'] == proteoform_id]
        assert all(group_df['peptides'] == ';'.join(group)), "Incorrect peptides in proteoform"
        assert all(group_df['protein'] == protein_name), "Incorrect protein name"
    print("Proteoform grouping test passed")

def test_fold_change_data(creator, peptide_fc_df):
    for peptide in peptide_fc_df.index:
        for cond in peptide_fc_df.columns:
            assert np.isclose(creator.proteoform_df.loc[peptide, cond], peptide_fc_df.loc[peptide, cond]), f"Mismatch in fold change data for {peptide}, {cond}"
    print("Fold change data test passed")

def test_dataframe_structure(creator, peptide_fc_df):
    expected_columns = ['peptides', 'protein', 'proteoform_id'] + list(peptide_fc_df.columns)
    assert set(creator.proteoform_df.columns) == set(expected_columns), "Incorrect dataframe columns"
    expected_rows = sum(len(group) for group in groups_of_peptide_clusters)
    assert len(creator.proteoform_df) == expected_rows, "Incorrect number of rows in dataframe"
    print("Dataframe structure test passed")

def generate_peptide_fc_df(groups_of_peptide_clusters, number_conditions):
    """Create a table of random fold changes, one row per peptide.

    Values are drawn with ``numpy.random.random()`` from the global NumPy
    random state, peptide by peptide and condition by condition.

    Args:
        groups_of_peptide_clusters: list of peptide-name lists.
        number_conditions: number of condition columns to create.

    Returns:
        pd.DataFrame indexed by peptide name with the columns
        ``cond_0`` ... ``cond_<number_conditions - 1>``.
    """
    fold_changes_per_peptide = {
        peptide: [numpy.random.random() for _ in range(number_conditions)]
        for peptide_group in groups_of_peptide_clusters
        for peptide in peptide_group
    }

    # Peptides are the dict keys (columns), so transpose to get one row per peptide.
    fc_table = pd.DataFrame(fold_changes_per_peptide).T
    fc_table.columns = [f"cond_{cond_idx}" for cond_idx in range(number_conditions)]
    return fc_table


# Example usage
# Case 1: two peptide groups (3 and 2 peptides) with random fold changes for 3 conditions.
groups_of_peptide_clusters = [["A", "B", "C"], ["D", "E"]]
peptide_fc_df = generate_peptide_fc_df(groups_of_peptide_clusters, 3)
protein_name = "protein"

test_ProteoformDfCreator(groups_of_peptide_clusters, peptide_fc_df, protein_name)

# Case 2: four peptide groups, including a single-peptide group, again with 3 conditions.
groups_of_peptide_clusters = [["A", "B"], ["D", "E"], ["C"], ["F", "G", "H"]]
peptide_fc_df = generate_peptide_fc_df(groups_of_peptide_clusters, 3)
protein_name = "protein"

test_ProteoformDfCreator(groups_of_peptide_clusters, peptide_fc_df, protein_name)


Unnamed: 0,peptides,protein,proteoform_id,cond_0,cond_1,cond_2
A,A;B;C,protein,protein_0,0.743531,0.866292,0.92749
B,A;B;C,protein,protein_0,0.013638,0.480421,0.917152
C,A;B;C,protein,protein_0,0.169935,0.039222,0.430146
D,D;E,protein,protein_1,0.431253,0.531195,0.470647
E,D;E,protein,protein_1,0.697994,0.541822,0.54234


Initialization test passed
Proteoform grouping test passed
Fold change data test passed
Dataframe structure test passed


Unnamed: 0,peptides,protein,proteoform_id,cond_0,cond_1,cond_2
A,A;B,protein,protein_0,0.667213,0.52239,0.062131
B,A;B,protein,protein_0,0.811259,0.286454,0.90165
D,D;E,protein,protein_1,0.642234,0.763869,0.312511
E,D;E,protein,protein_1,0.156098,0.651467,0.337081
C,C,protein,protein_2,0.430235,0.849336,0.022332
F,F;G;H,protein,protein_3,0.794306,0.924292,0.50625
G,F;G;H,protein,protein_3,0.144179,0.396234,0.148975
H,F;G;H,protein,protein_3,0.122745,0.963913,0.303421


Initialization test passed
Proteoform grouping test passed
Fold change data test passed
Dataframe structure test passed
