In [1]:
# first we used text mining to find the biomarkers from the litreature
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  
from scipy.stats import f_oneway, ttest_ind
from statsmodels.stats.multitest import fdrcorrection
import numpy as np
import math

def do_t_test_on_whole_data_frame(df:pd.DataFrame,group_of_interest:str='CHDM') -> pd.DataFrame:
    """
    :df: a dataframe with at least 4 columns 'patient_group' and 'intensity' and 'type' and 'normalization_scenario'
     type could be before_normalization adn after_normalization

    Returns
    A pandas data frame after doing all t_Tests across all the biomarkers

    """
    final_results = pd.DataFrame([do_t_test_per_biomarker(df,x,group_of_interest=group_of_interest) for x in df.biomarker.unique().tolist()])
    final_results['p_value_before_normalization'] = final_results['before_normalization'].apply(lambda x:x[1])
    final_results['p_value_after_normalization'] = final_results['after_normalization'].apply(lambda x:x[1])
    final_results['fdr_before_normalization'] = fdrcorrection(final_results.p_value_before_normalization.to_numpy(), alpha=0.01, method='indep', is_sorted=False)[1]
    final_results['fdr_after_normalization'] = fdrcorrection(final_results.p_value_after_normalization.to_numpy(), alpha=0.01, method='indep', is_sorted=False)[1]
    final_results['P_padjusted_before_normalization'] = final_results['fdr_before_normalization'].apply(lambda x:math.log10(1/x))
    final_results['P_padjusted_after_normalization'] = final_results['fdr_after_normalization'].apply(lambda x:math.log10(1/x))
    final_results['normalization_scenario'] = df['normalization_scenario'].unique().tolist()[0]
    return final_results



def t_test(x: tuple, y: tuple) -> tuple:
    """ 
    Performs t_test from two numerical tuples as the two groups
    :returns : F_statistics and P_values as a tuple
    
    """
    t_test = ttest_ind(x, y)
    F = list(t_test)[0]
    p = list(t_test)[1]
    return F, p




def do_t_test_per_entity(df:pd.DataFrame,group_of_interest:str='CHDM') -> tuple:
    """
    :df: a datafrme with at least two columns 'patient_group' and 'intensity'
    Returns
    Tuple with F-statitistics and P_values between two disease state like CHDM and non CHDM

    """
    df = df.dropna(subset=['intensity'])
    g1=tuple(df['intensity'][df.patient_group==group_of_interest])
    g2=tuple(df['intensity'][df.patient_group!=group_of_interest])
    return t_test(g1,g2)



def do_t_test_per_biomarker(df:pd.DataFrame,biomarker_name:str,group_of_interest:str='CHDM') ->dict:
    """
    :df: a datafrme with at least three columns 'patient_group' and 'intensity' and 'type'
    Returns
    A dictionary with the keys entity, before_normalization, after_normalization, biomarker

    """
    df = df[df.biomarker == biomarker_name]
    df_before_normalization = df[df.type=='before_normalization']
    df_after_normalization = df[df.type == 'after_normalization']
    before_normalization_results = do_t_test_per_entity(df_before_normalization,group_of_interest)
    after_normalization_results = do_t_test_per_entity(df_after_normalization,group_of_interest)
    return {'before_normalization':before_normalization_results,
            'after_normalization':after_normalization_results,
            'entity':group_of_interest,
            'biomarker':biomarker_name}


def process_dataframe(df:pd.DataFrame) -> pd.DataFrame:
    df = df.set_index('Gene names')
    df = df.loc[:,~df.columns.str.contains('Identification')]
    return df
    

def make_swarm_df_per_biomarker(after_treatment_df,before_treatment_df,onco_tree,protein_name,patients_interest):
    common_samples = [x for x in after_treatment_df.columns if x in before_treatment_df.columns]
    after_normalization = pd.DataFrame(after_treatment_df.loc[protein_name,common_samples])
    after_normalization['type'] = 'after_normalization'
    after_normalization['patients'] = after_normalization.index
    before_normalization = pd.DataFrame(before_treatment_df.loc[protein_name,common_samples])
    before_normalization['type'] = 'before_normalization'
    before_normalization['patients'] = before_normalization.index    
    final_df = pd.concat([before_normalization,after_normalization])
    final_df.columns = ['intensity','type','patients']
    final_df['patient_group'] = 'others'
    final_df['patient_group'][final_df['patients'].isin(patients_interest)] = onco_tree
    final_df['biomarker'] = protein_name
    return final_df


def get_non_normalized_df(report_dir_before_norm:str,intensity_file_name:str) -> pd.DataFrame:
    fp_before_treatment_path = report_dir_before_norm + intensity_file_name
    before_treatment_df = pd.read_csv(fp_before_treatment_path)
    before_treatment_df = process_dataframe(before_treatment_df)
    return before_treatment_df 


    

# parameters

In [2]:
main_targets = [
    'ACAN',
    'KRT8',
    'RAB3A',
    'CD109',
    'S100A1',
    'VIM',
    'ACE',
    'CDKN2A-p16;CDKN2B',
    'MTAP',
    'RHEB'
 ]

onco_tree = 'CHDM'
general_directory_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
meta_data_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/METADATA_PAN CANCER_Batch155_AS_231116.xlsx'
text_mining_result = 'publications_frequency_chordoma_14NOV2023.xlsx'
intensity_file_name = 'preprocessed_fp.csv'


after_normalization_path = {
    'ms1Redist':'/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2023.11.29_AhS_PANCANCER_157_non_normalized_except_redist_of_ms1_only/',
    'ms1MNCN+ms1Redist':'/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2023.11.29_AhS_PANCANCER_157_non_normalized+redist_ms1+ms1/',
    'ms3MNCN+ms1Redist':'/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2023.11.29_AhS_PANCANCER_157_non_normalized_ms3_only+redist_ms1/',
    'AllNormalizations':'/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2023.11.27_CJ_PANCANCER_157/',
    'ms3MNCN': '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2023.11.29_AhS_PANCANCER_157_non_normalized_ms3_only/'
    }

report_dir_before_norm =  general_directory_path + '2023.11.29_AhS_PANCANCER_157_non_normalized_all_levels/'


# reading the files

In [3]:
fp_before_treatment_path = report_dir_before_norm + intensity_file_name
befpre_treatment_df = pd.read_csv(fp_before_treatment_path)
before_treatment_df = process_dataframe(befpre_treatment_df)

meta_data_df = pd.read_excel(meta_data_path)
patients_interest = meta_data_df['Sample name'][meta_data_df.code_oncotree == onco_tree]
pubmed_results_df = pd.read_excel(text_mining_result)
all_non_normalized_dfs = [get_non_normalized_df(x,intensity_file_name) for x in after_normalization_path.values()]

# Biomarker selection

In [4]:
all_swarm_df = []
for label,after_treatment_df in zip(after_normalization_path.keys(),all_non_normalized_dfs):
    main_targets = [x for x in before_treatment_df.index.tolist() if x in main_targets]
    swarm_dfs = [make_swarm_df_per_biomarker(after_treatment_df,before_treatment_df,onco_tree,x,patients_interest) for x in main_targets]
    final_swarm_df = pd.concat(swarm_dfs)
    final_swarm_df['normalization_scenario'] = label
    all_swarm_df.append(final_swarm_df)

cocatenated_swarm_dfs = pd.concat(all_swarm_df)
cocatenated_swarm_dfs.to_csv('concatanated_swarm_inputs.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['patient_group'][final_df['patients'].isin(patients_interest)] = onco_tree
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['patient_group'][final_df['patients'].isin(patients_interest)] = onco_tree
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['patient_group'][final_df['patients'].isin(patients_interest)] = onco_tree
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pyda

# T-test between the group  of the patients with/whitout entity

In [7]:
res_df = [do_t_test_on_whole_data_frame(x) for x in all_swarm_df]
t_test_res = pd.concat(res_df)
t_test_res.to_csv('t_test_results.csv')

In [22]:
normalization_df= t_test_res[['biomarker', 'P_padjusted_after_normalization','normalization_scenario']]
normalization_df.columns = ['biomarker', 'value','normalization_scenario']

In [23]:
non_normalized_df = t_test_res[['biomarker', 'P_padjusted_before_normalization','normalization_scenario']]
non_normalized_df.columns = ['biomarker', 'value','normalization_scenario']
non_normalized_df = non_normalized_df.drop_duplicates(subset='biomarker')
non_normalized_df.normalization_scenario = 'Not_normalized'

In [24]:
df = pd.concat([non_normalized_df,normalization_df])

In [25]:
df.to_csv('t_statistics_long.csv')

# Boxplot: visualizations moved to R: boxplot.R

In [None]:
sns.catplot(
    data=df, kind="box",
    x="bio", y="intensity", hue="patient_group", col="biomarker",
    aspect=2
)


In [6]:
"""
all_swarm_df[0].reset_index(inplace=True)
sns.catplot(
    data=all_swarm_df, kind="box",
    x="type", y="intensity", hue="patient_group", col="biomarker",
    aspect=2
)

plt.savefig('swarm.svg',dpi=300)
"""

"""
g = sns.FacetGrid(all_swarm_df, col="biomarker", height=4, aspect=2)
g.map_dataframe(
                sns.stripplot, x="type", y="intensity", hue="patient_group", 
                dodge=True)

g.map_dataframe(
                sns.boxplot, x="type", y="intensity", hue="patient_group"
                )
# Note: the default legend is not resulting in the correct entries.
#       Some fix-up step is required here...
# g.add_legend()
plt.show()
"""

'\ng = sns.FacetGrid(all_swarm_df, col="biomarker", height=4, aspect=2)\ng.map_dataframe(\n                sns.stripplot, x="type", y="intensity", hue="patient_group", \n                dodge=True)\n\ng.map_dataframe(\n                sns.boxplot, x="type", y="intensity", hue="patient_group"\n                )\n# Note: the default legend is not resulting in the correct entries.\n#       Some fix-up step is required here...\n# g.add_legend()\nplt.show()\n'