# The Neurogenomics Database: Dotplot of entire dataset predictions
Author: Nienke Mekkes <br>
Date: 11-10-2022. <br>
Correspond: n.j.mekkes@umcg.nl <br>

## Script: Dotplot of entire dataset predictions
Builds Dot Plots for each diagnosis category. <br>
Why: to give an overview of what symptoms are frequently observed in different diagnosis groups

### Input files:
- prediction file (donors as row names, observations as columns)
- General information: to assign metadata to donors (e.g. diagnosis, age)
- Optional: attribute metadata to cluster observations
- Optional: metadata to highlight expected findings in the plot

- also needs scattermap.py, code to create the plot
- also needs helper_functions, which contains code to run permutation test and how to select donors


### Output:
- dotplot, file with p values for permutation test



#### Minimal requirements
- to do

## IMPORTANT

This script works with a clinical trajectory dictionary pickle. This pickle can be a rules-of-thumb pickle or an original pickle, and was generated by the script proces_predictions. That processing script removed short sentences etc. and the attributes that performed poorly, but it did not remove any donors. Donors that you wish to exclude can be excluded in two ways: <br>
1. In this script, manually. For example, remove donors younger than 21 or donors with the NAD diagnosis, or reassign a diagnosis (e.g. a SSA, CON donor NBB xxx needs to become HIV).
2. With an input file, for example the general information file, which minimally contains one column with donor IDs and one column that states which donors should get a changed diagnosis or be excluded.

## PATHS

In [None]:
# pandas is imported here because this cell runs before the IMPORTS cell;
# without it, a top-to-bottom run fails with a NameError on `pd`.
import pandas as pd

# Show up to 500 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 500)

In [None]:
# Input/output locations and run switches used by the cells below.
# Alternative prediction pickles are kept as commented-out options; exactly one
# `path_to_predictions` assignment should be active.
# path_to_predictions = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/final_predictions/ALL_clinical_trajectories_dictionary_2023-01-31.pkl"
path_to_predictions = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/final_predictions/ALL_clinical_trajectories_dictionary_2023-08-14.pkl"
# path_to_predictions = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/final_predictions/ALL_clinical_trajectories_dictionary_rules_of_thumb_visit_2023-07-11.pkl"
path_to_attribute_grouping = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/input_data/sup3.xlsx" ## for rules of thumb
# Folder that receives the dotplots and the per-table output files.
figure_folder = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/final_predictions/figures/dotplots"
expected_attributes_path = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/input_data/Clustercategories_and_expected_symptoms_03_may_2023.xlsx"
# Excel file with donor metadata; read below to exclude donors and assign the 'paper diagnosis'.
general_information = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/input_data/General_information_11-08-2023.xlsx"
# Only read when the clinical-diagnosis analysis (cd == True) is switched on.
# path_clinical_diagnosis = '/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_diagnosis/output/selected_diagnoses_overview.xlsx'
path_clinical_diagnosis = '/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/grud_clin_subset_overview_both.xlsx'
# '/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/grud_clin_subset_overview_both.xlsx'
# NOTE(review): path_to_cleaned_training_data is commented out here, but the training-data
# loading cell opens it when train_plot is True; uncomment it before enabling train_plot.
# path_to_cleaned_training_data = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/training_data/cleaned_training_data.pkl"
# path_to_diag = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_diagnosis/clinical_diagnosis_02_may_2023.xlsx"
# False: build the dotplot from the prediction pickle; True: build it from the training data.
train_plot = False

### IMPORTS

In [None]:
import seaborn as sns; sns.set()
import matplotlib
import numpy as np; np.random.seed(0)
from matplotlib import pyplot as plt 
import xlsxwriter
import pandas as pd
import os
import numpy as np
import scattermap
from scattermap import scattermap
import pickle
import multiprocessing
import statsmodels
from functools import partial
from multiprocessing import Pool
import sys

import scipy
from helper_functions import permutation_of_individual_test, table_selector
import datetime

In [None]:
# Make sure the figure output folder is available, reporting which case applied.
if os.path.exists(figure_folder):
    print(f"Output folder {figure_folder} already exists")
else:
    print(f"Creating output folder {figure_folder}")
    os.makedirs(figure_folder)

### Load data

We can make a dotplot on the full predicted set. However, a dotplot can also be made on the training data. The difference is that the training data contains more attributes (90 instead of 80) and has a slightly different format.

In [None]:
# Load the prediction pickle (a dictionary keyed by donor ID) and stack it into one long table.
if train_plot == False:
    with open(path_to_predictions, "rb") as file:
        predictions_pickle = pickle.load(file)

    # One frame per donor: the inner dictionary's keys become the rows and are
    # also stored in the 'Age' column, next to the donor's ID.
    donor_frames = [
        pd.DataFrame.from_dict(donor_records, orient="index").assign(
            DonorID=donor_id,
            Age=lambda frame: frame.index,
        )
        for donor_id, donor_records in predictions_pickle.items()
    ]

    predictions_df = pd.concat(donor_frames, ignore_index=True)
    display(predictions_df)
    print(f"there are {len(list(predictions_df['DonorID'].unique()))} unique donor IDs")
    print(predictions_df.shape)

In [None]:
# Alternative input: build the table from the cleaned training data instead of the predictions.
if train_plot == True:
    # NOTE(review): path_to_cleaned_training_data is commented out in the PATHS cell;
    # it has to be defined before this branch can run.
    with open(path_to_cleaned_training_data, "rb") as file:
        predictions_pickle = pickle.load(file)

    # Use the same donor column name as the prediction set and drop the
    # sentence counter column when it is present.
    predictions_df = (
        pd.DataFrame(predictions_pickle)
        .rename(columns={"NBB_nr": "DonorID"})
        .drop(columns=['Year_Sentence_nr'], errors='ignore')
    )
    display(predictions_df)

In [None]:
# Inspection only: show the column names of the loaded table as cell output.
predictions_df.columns

### exclude/change donors for the paper, using general info
- read in the general information
- make a list of donors to remove
- remove donors from our predictions
- change column neuropathological diagnosis to the neuropathological diagnosis from the general information

In [None]:
# Donor metadata; the 'paper diagnosis' column is compared against 'exclude' below
# and is otherwise used as the diagnosis label for each donor.
general_information_df = pd.read_excel(general_information, engine='openpyxl', sheet_name="Sheet1")
donors_to_remove = list(general_information_df[general_information_df['paper diagnosis']=='exclude'].DonorID)

# .copy() gives an independent frame, so the column assignment further down does not
# write into a slice of the unfiltered table (avoids pandas' SettingWithCopyWarning).
predictions_df = predictions_df[~predictions_df['DonorID'].isin(donors_to_remove)].copy()
print(f"there are {len(list(predictions_df['DonorID'].unique()))} unique donor IDs")
print(len(donors_to_remove))

# Replace the diagnosis with the one from the general information, looked up by donor ID.
# Donor IDs that are missing from the general information end up as NaN.
predictions_df['neuropathological_diagnosis'] = predictions_df['DonorID'].map(general_information_df.set_index('DonorID')['paper diagnosis'])
display(predictions_df.head())
print(sorted(predictions_df['neuropathological_diagnosis'].unique()))
print(f"there are {len(list(predictions_df['DonorID'].unique()))} unique donor IDs")


In [None]:
## Split the columns into donor information and the 80 (or 90 in case of training) signs and symptoms
if train_plot == False:
    non_attribute_columns = ['DonorID', 'Year', 'age_at_death', 'sex',
                             'neuropathological_diagnosis', 'Age']
else:
    non_attribute_columns = ['DonorID', 'neuropathological_diagnosis', 'Sentence']

# Everything that is not donor information is treated as a sign/symptom column.
information_column_set = set(non_attribute_columns)
attributes = [col for col in predictions_df.columns if col not in information_column_set]
print(f"there are {predictions_df.shape[0]} rows and {len(attributes)} attributes")
print(f"there are {len(list(predictions_df['DonorID'].unique()))} unique donor IDs")

## setting future dotplot colors based on domain or grouping
The attributes all belong to different 'domains' and subgroups. We color them based on these groupings. <br>

We make two dictionaries with the attributes (values) grouped by their domains (keys). One uses the finalised names; this dictionary is used to plot the attribute names (e.g. no "_", with capitals). The other uses the 'pc_friendly' names; these are used for selection.

In [None]:
# Read the attribute overview; the first column of the sheet becomes the index.
# The cells below read its "AttributeUpdated", "Grouping" and "ITname" columns.
attribute_grouping = pd.read_excel(path_to_attribute_grouping, engine='openpyxl', index_col=[0])#,header=3, sheet_name='S3. 90 signs and symptoms')
# Inspection only: show the table as cell output.
attribute_grouping

### assign a custom color to each grouping

In [None]:
import collections

# Fixed hex colour per domain; a domain found in the "Grouping" column that is
# missing here raises a KeyError in the loop below.
colors = {
    'Aspecific symptoms': '#ce6dbd',
    'Autonomic dysfunction': '#b5cf6b',
    'Cerebellar & vestibular system dysfunction': '#6b6ecf',
    'Changes in consciousness, awareness & orientation': '#d6616b',
    'Cognitive and memory impairment': '#e7ba52',
    'Signs of (dis)inhibition': '#bd9e39',
    'Disturbances in mood & behaviour': '#ad494a',
    'Extrapyramidal symptoms': '#9c9ede',
    'General decline': '#a55194',
    'Signs of impaired mobility': '#393b79',
    'Motor deficits': '#5254a3',
    'Other signs & symptoms of cortical dysfunction': '#e7cb94',
    'Other psychiatric signs & symptoms': '#e7969c',
    'Sensory deficits': '#8ca252',
}

group_dict_fancy = {}   # domain -> attribute display names
color_dict_fancy = {}   # domain -> hex colour
count = 0               # number of distinct domains encountered

for attribute_name, domain in zip(attribute_grouping["AttributeUpdated"], attribute_grouping["Grouping"]):
    if domain in group_dict_fancy:
        group_dict_fancy[domain].append(attribute_name)
        continue
    # A float here means the Grouping cell was empty (NaN): skip the attribute.
    if isinstance(domain, float):
        continue
    group_dict_fancy[domain] = [attribute_name]
    color_dict_fancy[domain] = colors[domain]
    count += 1

# Alphabetical order of the attributes within each domain
group_dict_fancy = {domain: sorted(members) for domain, members in group_dict_fancy.items()}

print(group_dict_fancy, '\n')
print(color_dict_fancy)

#### Define order of displaying groupings/domains

In [None]:
# Order in which the domains are displayed.
group_order = [
    'Aspecific symptoms',
    'General decline',
    'Extrapyramidal symptoms',
    'Cerebellar & vestibular system dysfunction',
    'Motor deficits',
    'Signs of impaired mobility',
    'Autonomic dysfunction',
    'Sensory deficits',
    'Other signs & symptoms of cortical dysfunction',
    'Cognitive and memory impairment',
    'Signs of (dis)inhibition',
    'Other psychiatric signs & symptoms',
    'Changes in consciousness, awareness & orientation',
    'Disturbances in mood & behaviour',
]

# Concatenate the attributes domain by domain, then flip the complete list.
new_order_fancy = [
    attribute_name
    for domain in group_order
    for attribute_name in group_dict_fancy[domain]
]
new_order_fancy.reverse()
print(new_order_fancy)

#### remove 10 poorly performing attributes & synonyms
While the 10 poorly performing signs and symptoms are not present in the predictions, they are still present in the dictionaries created above. We remove them here.

In [None]:
if train_plot == False:
    # Attributes taken out of the display order for the prediction set.
    # Older names for two of them were 'Unspecified disturbed gait patterns'
    # and 'Impaired comprehension'.
    for dropped_attribute in (
        'Disturbed gait',
        'Loss of sympathy / empathy',
        'Headturning sign',
        'Limited language comprehension',
        'Changed behavior/personality',
        'Frontal release signs',
    ):
        new_order_fancy.remove(dropped_attribute)

    # The 4 synonyms ('Ataxia', 'Lack of initiative',
    # 'Lack of planning / organization / overview', 'Cognitive decline')
    # are currently NOT removed.

# Inspection only: number of attributes that remain in the display order.
len(new_order_fancy)

#### We want to display the attributes using the official names from google drive

In [None]:
# Map each "ITname" to its "AttributeUpdated" display name; rows whose ITname
# is a float (empty cell / NaN) are left out.
correct_names = {
    real_name: attr
    for attr, real_name in zip(attribute_grouping["AttributeUpdated"], attribute_grouping["ITname"])
    if not isinstance(real_name, float)
}

#### rename columns

In [None]:
# Give the attribute columns their display names; columns without an entry keep their name.
predictions_df = predictions_df.rename(columns=correct_names)

In [None]:
# Inspection only: check the column names after renaming.
predictions_df.columns

#### change the attribute order to the one we want to display

In [None]:
# Inspection only: preview the first rows before the columns are reordered.
predictions_df.head()

In [None]:
# Donor information columns first ...
information_from_symptoms_df = predictions_df[non_attribute_columns]
# ... followed by the attribute columns in the display order defined in new_order_fancy.
attribute_columns_to_sort = predictions_df.drop(columns=non_attribute_columns)[new_order_fancy]

# Rebuild the table from both parts so the columns have the order we want to plot.
predictions_df = pd.concat([information_from_symptoms_df, attribute_columns_to_sort], axis=1)
display(predictions_df)
predictions_df.columns

## ONLY FOR ANALYSIS OF CLINICAL DIAGNOSIS!
- This selects a specific subset of diagnoses and donors
- these are: wantedx = ['CON','AD', 'PD', 'VD', 'FTD', 'DLB', 'AD-DLB', 'ATAXIA', 'MND', 'PSP', 'MS', 'MSA']

In [None]:
# Switch for the clinical-diagnosis analysis. When False this cell only sets the flag;
# the "(FOR ALL DONORS)" cells further down do the selection instead.
cd = False
if cd == True:
    cd_df = pd.read_excel(path_clinical_diagnosis, engine='openpyxl')
    # Inner merge: only donors present in the clinical-diagnosis overview are kept.
    # Both frames carry 'neuropathological_diagnosis', so pandas suffixes them with _x/_y.
    merged_df = predictions_df.merge(cd_df[['DonorID', 'diagnosis_info','pred_info','neuropathological_diagnosis']], on='DonorID', how='inner')

    grouper = 'neuropathological_diagnosis'
    # Keep the diagnosis from the clinical-diagnosis overview (_y) and drop the original (_x).
    merged_df.rename(columns={'neuropathological_diagnosis_y': 'neuropathological_diagnosis'}, inplace=True)
    merged_df.drop(['Year','age_at_death','sex','Age','neuropathological_diagnosis_x'], axis=1,inplace=True, errors='ignore')

    def determine_judge(row):
        """Return 'non-coherent' or 'coherent' for one merged row.

        A row is 'non-coherent' only when diagnosis_info is 'non-coherent' and
        pred_info is 'non-coherent' or 'ambiguous'; every other combination is
        'coherent'.
        """
        if row['diagnosis_info'] == 'non-coherent' and row['pred_info'] in ('non-coherent', 'ambiguous'):
            return 'non-coherent'
        return 'coherent'

    def _suffix_diagnosis_with_judge(per_donor):
        """Return a copy labelled '<diagnosis>_<judge>', with that label as the second column."""
        labelled = per_donor.copy()
        labelled['new_neuropathological_diagnosis'] = labelled['neuropathological_diagnosis'] + '_' + labelled['judge']
        labelled.drop(columns=['neuropathological_diagnosis', 'judge'], inplace=True)
        labelled.rename(columns={'new_neuropathological_diagnosis': 'neuropathological_diagnosis'}, inplace=True)
        # The new label was appended last; move it directly behind the first column.
        columns = labelled.columns.tolist()
        columns.insert(1, columns.pop())
        return labelled[columns]

    # Apply the function to create the 'judge' column
    merged_df['judge'] = merged_df.apply(determine_judge, axis=1)
    display(merged_df['diagnosis_info'].value_counts())
    display(merged_df['pred_info'].value_counts())
    display(merged_df['judge'].value_counts())

    # Alternatives tried earlier (not active): take 'judge' straight from diagnosis_info and
    # either relabel 'ambiguous' as 'coherent' or drop the 'ambiguous' rows altogether.

    # Sum the observations per donor. In this branch the background (flattened_t1) and the
    # selection (flattened) are built from the same merged table.
    flattened_t1 = merged_df.groupby(['DonorID','neuropathological_diagnosis','judge'], as_index=False).sum()
    flattened_t1_forp = _suffix_diagnosis_with_judge(flattened_t1)
    flattened = flattened_t1.copy()
    flattened_forp = _suffix_diagnosis_with_judge(flattened)
    ordered_diagnoses = flattened_forp['neuropathological_diagnosis'].unique()

    # These displays live inside the branch: flattened_forp only exists when cd is True,
    # so showing it unconditionally raised a NameError with cd = False.
    display(flattened_forp['neuropathological_diagnosis'].value_counts())
    display(flattened_forp.head(20))

In [None]:
# Inspection only: show the grouping column name.
# NOTE(review): with cd == False, `grouper` is first assigned in the later preprocessing
# cell, so on a fresh top-to-bottom run this line raises a NameError.
grouper

#### intermezzo: alyses clusters
comment out 

In [None]:
# ## for MS analyses we have to load the MS cluster IDs
# alyse_df = selected_diagnoses.copy()
# alyse_clusters = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/input_data/alyse_ms_clusters.xlsx"
# alyse_clusters = pd.read_excel(alyse_clusters)

# ## this is the order we want to display
# ordered_diagnoses =  ['CON','Cluster1','Cluster2','Cluster3','Cluster4']

# ## we add the cluster ID to our predictions and keep only these donors and the controls
# alyse_df['alyse_clusters'] = alyse_df['DonorID'].map(alyse_clusters.set_index('DonorID')['Cluster'])
# alyse_df.alyse_clusters.fillna(alyse_df.neuropathological_diagnosis, inplace=True)
# alyse_df= alyse_df[alyse_df['alyse_clusters'].isin(ordered_diagnoses)]
# alyse_df['neuropathological_diagnosis'] = alyse_df['alyse_clusters']
# alyse_df = alyse_df.drop('alyse_clusters',axis=1)
# display(alyse_df['neuropathological_diagnosis'].value_counts())
# selected_diagnoses = alyse_df.copy()
# display(alyse_df)

#### for ms analysis, the background is all donors that are NOT MS. to do this, select table1_ms. for standard uses, select table1 as background


## (FOR ALL DONORS) Selecting groups of diagnoses to display in dotplot
We do not want to print all hundreds of diagnoses, but make a selection. here we use a dictionary approach to select diagnoses and give them an appropriate abbreviation in one go. The function also returns a default order of diagnoses. for example, all FTD subtypes are grouped into 'FTD'. <br>

#### selecting a subset

In [None]:
# Standard route (no clinical-diagnosis analysis): pick the set of diagnoses to display.
if cd == False:
    table_of_choice = 'table1_p' #fig 4a table3_with_con_p #table2_p #fig 3a table1_P fig sup 5a:table2_p
    selected_diagnoses, ordered_diagnoses = table_selector(table_of_choice, predictions_df)

    selected_labels = selected_diagnoses['neuropathological_diagnosis'].unique()
    selected_donor_count = selected_diagnoses['DonorID'].nunique()
    print('After selecting for {}, we have {} donors'.format(selected_labels, selected_donor_count))

    # Quick sanity check on the rows labelled AD
    ad_rows = selected_diagnoses[selected_diagnoses['neuropathological_diagnosis']=='AD']
    display(ad_rows.head(5))

### selecting background
For the permutation analysis, we want to compare a finding within a diagnosis group to a random background. The default random background is formed by the diagnoses as defined here in 'table 1'. For example, 90% of the 500 donors with AD experience symptom x. If we randomly select 500 donors from the random background, what is their observation? repeat this random sampling n times.

### preprocessing
- We do not need all information columns, clean them up
- flatten the data: we sum the nr of observations per donor. 
- create an overview of the number of observations within each diagnosis group

In [None]:
# Standard route: build the permutation background (table1) and flatten both the
# background and the selected diagnoses to one row of summed observations per donor.
if cd == False:
    table1, _ = table_selector('table1_p', predictions_df)
    print('After selecting for {}, we have {} donors'.format(table1['neuropathological_diagnosis'].unique(),
                                                                                        table1['DonorID'].nunique()) )
    grouper = 'neuropathological_diagnosis'

    # Information columns that must not be summed; which ones exist depends on the input set.
    if train_plot == False:
        info_columns = ['Year','age_at_death','sex','Age']
    else:
        info_columns = ['Sentence'] ## for training data
    # Reassign instead of dropping in place: the frames come from table_selector and may be
    # slices of predictions_df, where an in-place drop can warn or touch the parent frame.
    table1 = table1.drop(columns=info_columns, errors='ignore')
    selected_diagnoses = selected_diagnoses.drop(columns=info_columns, errors='ignore')

    flattened_t1 = table1.groupby(['DonorID',grouper], as_index=False).sum()
    flattened = selected_diagnoses.groupby(['DonorID',grouper], as_index=False).sum()

    selected_donorcountpredict = table1['DonorID'].nunique()
    # Number of donors per diagnosis, in a column named after `grouper` with an unnamed index.
    # pandas >= 2.0 names the value_counts() result 'count' and moves the original name to the
    # index, so both are set explicitly to keep later `disease_counts.loc[d][grouper]` lookups working.
    disease_counts = flattened[grouper].value_counts().rename(grouper).to_frame()
    disease_counts.index.name = None
    disease_counts = disease_counts.reindex(ordered_diagnoses)
    display(disease_counts)
    


## from here, code works for all sets. calculate proportions
goal: if 500 donors have AD, and 250 of those donors experience symptom x, then the proportion of symptom x equals 0.5
To calculate this we perform the following steps:

In [None]:
## endgoal: per diagnosis group, the fraction of donors that show attribute x
## step 1: make boolean df: a donor either has an attribute or not, the number of times is irrelevant.
## by capping every count at 1, the group mean equals the fraction of donors with the attribute.
## NOTE(review): the mean (a 0-1 fraction) is multiplied by 300 while the result is saved as
## percentages.xlsx - confirm whether 300 is a deliberate scaling or should be 100.
if cd == False:
    group_ready = flattened.copy()
    group_ready.loc[:, new_order_fancy] = group_ready.loc[:, new_order_fancy].apply(lambda x: [y if y <= 1 else 1 for y in x])
    grouped_df = group_ready.groupby([grouper])
    # Per group: number of donors ('total') and the scaled mean of every attribute
    proportion_df = grouped_df.apply(lambda x: pd.Series({
        'total': len(x),
        **{column: x[column].mean() * 300 for column in new_order_fancy}
    }))

if cd == True:
    group_ready = flattened_forp.copy()
    group_ready.loc[:, new_order_fancy] = group_ready.loc[:, new_order_fancy].apply(lambda x: [y if y <= 1 else 1 for y in x])

    grouped_df = group_ready.groupby(['neuropathological_diagnosis'])
    proportion_df = grouped_df.apply(lambda x: pd.Series({
        'donorcount': len(x),
        **{column: x[column].mean() * 300 for column in new_order_fancy}
    }))
    proportion_df['total'] = proportion_df.groupby(['neuropathological_diagnosis']).transform('sum')['donorcount']


# Put the diagnosis groups in display order
proportion_df = proportion_df.reindex(ordered_diagnoses, level='neuropathological_diagnosis')
if cd == True:
    # Same layout as in the standard route: counts in a column named after the diagnosis
    # column, index without a name.
    disease_counts = pd.DataFrame(proportion_df['donorcount']) ## if you have cd
    disease_counts.rename(columns={'donorcount': 'neuropathological_diagnosis'}, inplace=True)
    disease_counts.index.name = None
proportion_df = proportion_df.drop(['total'],axis=1)
if cd == True:
    proportion_df = proportion_df.drop(['donorcount'],axis=1) ## if you have cd
    percentages_folder = f"{figure_folder}/clindiags" ## if you have cd
if cd == False:
    percentages_folder = f"{figure_folder}/{table_of_choice}"
# The per-table folder is otherwise only created in the permutation-testing cells further
# down; create it here so the first run does not fail when writing the Excel file.
os.makedirs(percentages_folder, exist_ok=True)
proportion_df.to_excel(f"{percentages_folder}/percentages.xlsx")

display(disease_counts)
## results:
## - flattened_t1_forp
## - flattened_forp
## grouper
## ordered_diagnoses
## disease_counts
## proportion_df

### MS intermezzo

In [None]:
# diag = 'MS'

# # Filter by rows with the diagnosis and select columns with values greater than 10
# symptoms = proportion_df.loc[diag] 
# symptoms = list(symptoms[symptoms >= 25].index)
# # symptoms

In [None]:
# from scipy.stats import zscore

# flattened_ms = flattened[flattened['neuropathological_diagnosis']==diag]
# flattened_ms = flattened_ms.drop('neuropathological_diagnosis',axis=1)
# flattened_ms.set_index('DonorID',drop=True,inplace=True)

# # check which columns have all zeros
# zeros = (flattened_ms == 0).all(axis=0)

# # drop the columns with all zeros
# flattened_ms = flattened_ms.loc[:, ~zeros]

# # calculate the total number of observations for each donor
# totals = flattened_ms.sum(axis=1)
# coltotals = flattened_ms.sum(axis=0)
# # divide each row by its corresponding total
# flattened_ms = flattened_ms.div(totals, axis=0)
# # flattened_ms = flattened_ms.div(coltotals, axis=1)
# # flattened_ms = flattened_ms.apply(zscore, axis=1)

# flattened_ms.fillna(0, inplace=True)
# # display(flattened_ms.sort_values('Fasciculations')['Fasciculations'])
# display(flattened_ms['Dementia'].describe())
# # display(flattened_ms['Fasciculations'].describe())
# # display(totals)
# # display(flattened_ms)

In [None]:
# from scipy.cluster import hierarchy


# MS_interesting_dict = {}
# MS_interesting_dict['expob'] = ['Decreased (fine) motor skills','Fatigue','Spasticity','Declined / deteriorated health','Urinary incontinence','Muscular weakness']
# MS_interesting_dict['expnob'] = ['Concentration problems','Dementia','Depressed mood']
# MS_interesting_dict['nexpob'] = ['Vertigo','Headache / migraine','Nystagmus','Help in ADL','Balance problems','Swallowing problems / dysphagia',
#                     'Dysarthria','Constipation','Urinary problems (other)','Visual problems','Negative sensory symptoms','Positive sensory symptoms',
#                     'Hyperreflexia and other pathological reflexes','Loss of coordination','Mobility problems'                   
#                    ]


# # # Create a list of all the interesting symptom names
# interesting_symptoms = [symptom for symptom_list in MS_interesting_dict.values() for symptom in symptom_list]

# # # Select the interesting columns from the DataFrame
# flattened_ms = flattened_ms.loc[:, interesting_symptoms]
# # flattened_ms = flattened_ms.loc[:, symptoms]

# # Set the figure size
# plt.figure(figsize=(30, 30))
# sns.set(font_scale=1.5)


# corr_matrix = flattened_ms.corr()
# sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, fmt='.2f',
#             xticklabels=corr_matrix.columns.values, yticklabels=corr_matrix.columns.values,
#             # annot_kws={"size": 5}
#            )

In [None]:
# # plt.figure(figsize=(10, 12))
# sns.set(font_scale=0.8)

# sns.clustermap(corr_matrix, method='average', metric='cosine',#,method='ward',metric='euclidean',
#                cmap='coolwarm',
#                cbar_pos=(1.05, 0.2, 0.05, 0.6),
#                figsize=(10, 10)
              
#               )
# plt.savefig(f"/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/final_predictions_extrasyny/figures/cooccurence/{diag}_cooc.png",bbox_inches='tight',dpi=300)

### we also calculate the mean number of observations
Goal: if 250 donors with AD observe symptom x 2 times, and 250 never observe it, then the mean equals 1

In [None]:
# Mean number of observations per diagnosis group, with the groups in display order.
if cd == False:
    mean_per_group = flattened.groupby(grouper).mean()
    general_mean_multi = mean_per_group.reindex(ordered_diagnoses)
else:
    # Clinical-diagnosis route: groups are the '<diagnosis>_<judge>' labels of flattened_forp.
    mean_per_group = flattened_forp.groupby(['neuropathological_diagnosis']).mean()
    general_mean_multi = mean_per_group.reindex(ordered_diagnoses, level='neuropathological_diagnosis')
    display(general_mean_multi.head())
    print(general_mean_multi.max())

### Permutation testing
Only run this step once; it takes a lot of time if you do not have the resources. It can be skipped if you do not need significance, but then do not plot significance in the visualization or you will get an error. It can also be faster if you run fewer permutations. The standard is set to 100.000, which is very high! (Note: `perms` in the code below is currently set to 10000.) <br>

main concept <br>
- loop over each diagnosis
- how many donors have diagnosis x?
- run permutation test, which needs:
    - the name of the current diagnosis
    - the flattened dataframe (for every donor, summed observations)
    - the mean dataframe (for every diagnosis, how many mean observations)
    - the number of donors
    - the background flattened dataframe (for every donor, summed observations)

In [None]:
# Sub-folder for this run: training plots get a '_training' suffix, the clinical-diagnosis
# analysis uses 'clindiags', everything else uses the chosen table name.
if train_plot == True:
    table_subfolder = f"{table_of_choice}_training"
elif cd == True:
    table_subfolder = 'clindiags'
else:
    table_subfolder = table_of_choice
table_folder = f"{figure_folder}/{table_subfolder}"
print(table_folder)

if not os.path.exists(table_folder):
    print('Creating output folder....')
    os.makedirs(table_folder)

# Excel files for the raw and the corrected permutation p-values
save_permutation = f"{table_folder}/p_values.xlsx"
save_permutation_cor = f"{table_folder}/cor_p_values.xlsx"
print(save_permutation)

In [None]:
import math
def identify_diagnosis_Pvalues(flattened, general_mean_multi, disease_counts, flattened_t1, perms=10000):
    """Run the permutation test for every diagnosis and collect the p-values.

    Wrapper around ``permutation_of_individual_test``: for each diagnosis in
    ``disease_counts.index`` the test is evaluated once per attribute column
    of ``flattened`` (column positions 2 and up), spread over a process pool.

    Parameters
    ----------
    flattened : DataFrame
        Per-donor table; columns from position 2 onwards are tested.
        (Presumably the first two columns hold donor id and diagnosis -
        TODO confirm against the cell that builds ``flattened``.)
    general_mean_multi : DataFrame
        Per-diagnosis mean table, passed through to the permutation test.
    disease_counts : DataFrame
        Indexed by diagnosis; the column named by the global ``grouper``
        holds the number of donors with that diagnosis.
    flattened_t1 : DataFrame
        Background per-donor table, passed through to the permutation test.
    perms : int, optional
        Number of permutations handed to the permutation test (default 10000).

    Returns
    -------
    dict
        ``{diagnosis: [p_value, ...]}`` with one entry per tested column
        position, in column order. Diagnoses whose donor count is NaN are
        skipped and do not appear in the dictionary.
    """
    p_values_diagnosis_dictionary = {}

    # Leave one core free, but never request zero workers: on a single-core
    # machine cpu_count() - 1 == 0 and multiprocessing.Pool(0) raises ValueError.
    n_workers = max(1, multiprocessing.cpu_count() - 1)
    attribute_positions = range(2, flattened.shape[1])
    n_diagnoses = len(disease_counts.index)

    for nr, d in enumerate(disease_counts.index, start=1):
        ##Print messages
        message = '--------------------------------------- \n \
                   Working on {primary_diagnosis}, {v} out of {len_pd}'.format(primary_diagnosis=d,v=nr,len_pd=n_diagnoses)
        print(message)

        nr_donors_with_d = disease_counts.loc[d][grouper]
        if math.isnan(nr_donors_with_d):
            print('no instances!')
            continue

        nr_donors_with_d = int(nr_donors_with_d)
        print('diagnosis affects {} donors.'.format(nr_donors_with_d))

        # Everything except the column position is fixed per diagnosis, so it is
        # bound up front and pool.map only supplies the column position.
        func = partial(permutation_of_individual_test, d, flattened, general_mean_multi,
                       nr_donors_with_d, perms, flattened_t1, grouper)

        # The context manager tears the pool down even if a worker raises, so no
        # stray worker processes are left behind in the notebook kernel.
        # (A serial variant would simply call func for each position in a loop.)
        with multiprocessing.Pool(n_workers) as pool:
            res = pool.map(func, attribute_positions)

        p_values_diagnosis_dictionary[d] = res

    return p_values_diagnosis_dictionary


#### running the permutation test

In [None]:
# if cd == True:
#     flattend_forp = flattened.copy()
#     flattend_forp['new_neuropathological_diagnosis'] = flattend_forp['neuropathological_diagnosis'] + '_' + flattend_forp['diagnosis_info']
#     flattend_forp.drop(columns=['neuropathological_diagnosis', 'diagnosis_info'], inplace=True)
#     flattend_forp.rename(columns={'new_neuropathological_diagnosis': 'neuropathological_diagnosis'}, inplace=True)
#     columns = flattend_forp.columns.tolist()
#     columns.insert(1, columns.pop())
#     flattend_forp = flattend_forp[columns]
#     flattened_t1_forp = flattened_t1.copy()
#     flattened_t1_forp['new_neuropathological_diagnosis'] = flattened_t1_forp['neuropathological_diagnosis'] + '_' + flattened_t1_forp['diagnosis_info']
#     flattened_t1_forp.drop(columns=['neuropathological_diagnosis', 'diagnosis_info'], inplace=True)
#     flattened_t1_forp.rename(columns={'new_neuropathological_diagnosis': 'neuropathological_diagnosis'}, inplace=True)
#     columns = flattened_t1_forp.columns.tolist()
#     columns.insert(1, columns.pop())
#     flattened_t1_forp = flattened_t1_forp[columns]
#     display(ordered_diagnoses)
#     # general_mean_multi_forp.index.name = None
#     # display(flattend_forp.head())
#     # display(flattened_t1_forp.head())
# display(disease_counts.head())
#     # display(general_mean_multi_forp.head())


In [None]:
# Run the permutation test on the clinical-diagnosis tables (cd) or on the regular ones.
if cd == True:
    Pvalues_dict = identify_diagnosis_Pvalues(flattened_forp, general_mean_multi, disease_counts, flattened_t1_forp)
elif cd == False:
    Pvalues_dict = identify_diagnosis_Pvalues(flattened, general_mean_multi, disease_counts, flattened_t1)

# The dictionary maps diagnosis -> list of p-values, so the frame is built with one
# column per diagnosis and then transposed to get one row per diagnosis.
prelim = pd.DataFrame(Pvalues_dict)
Pvalues_dataframe = prelim.T
Pvalues_dataframe.columns = list(general_mean_multi.columns)
display(Pvalues_dataframe)

# ExcelWriter.save() was deprecated and removed in pandas 2.0; the context manager
# writes and closes the file on every pandas version.
with pd.ExcelWriter(save_permutation, engine='xlsxwriter') as writer:
    Pvalues_dataframe.to_excel(writer)

#### Loading stored permutation test

In [None]:
# Reload the stored permutation p-values from the spreadsheet written above;
# the first spreadsheet column becomes the index.
Pvalues_dataframe = pd.read_excel(save_permutation, engine='openpyxl', index_col=[0])


In [None]:

# Inspect the loaded p-value table and its dimensions.
# Pvalues_dataframe=Pvalues_dataframe.reindex(ordered_diagnoses)
display(Pvalues_dataframe)
print(Pvalues_dataframe.shape)

In [None]:
# Notebook display of the column labels of the p-value table.
Pvalues_dataframe.columns

#### correct for multiple testing

In [None]:
def FDR_conversion(Pvalues_dataframe):
    """Convert a table of p-values into Benjamini-Hochberg adjusted p-values.

    All cells of the table are corrected together as a single family of tests
    (``method='fdr_bh'``).

    Parameters
    ----------
    Pvalues_dataframe : DataFrame
        Table of raw p-values.

    Returns
    -------
    DataFrame
        Adjusted p-values with the same shape, index and columns as the input.
    """
    import statsmodels.stats.multitest as smt

    # Flatten row by row using positions rather than label lookups, so that
    # repeated index labels cannot duplicate rows or produce nested lists.
    flat_pvalues = Pvalues_dataframe.to_numpy(dtype=float).ravel()

    # multipletests returns (reject, pvals_corrected, ...); only the corrected
    # p-values are needed here.
    corrected = smt.multipletests(flat_pvalues, method='fdr_bh', is_sorted=False)[1]

    # Row-major reshape undoes the row-major flatten above.
    FDRvalues_array = np.asarray(corrected).reshape(Pvalues_dataframe.shape)
    FDR_df = pd.DataFrame(FDRvalues_array,
                          columns=Pvalues_dataframe.columns,
                          index=Pvalues_dataframe.index)
    return FDR_df
    

In [None]:
# Cells whose adjusted p-value is below this cut-off count as significant.
FDR_Cutoff = 0.1
FDR_df = FDR_conversion(Pvalues_dataframe)
# Multiplying the boolean table by 1 turns True/False into 1/0.
significance_boolean = FDR_df.lt(FDR_Cutoff) * 1

In [None]:
display(significance_boolean)

# Store the FDR-adjusted p-values. ExcelWriter.save() was deprecated and removed in
# pandas 2.0; the context manager writes and closes the file on every pandas version.
with pd.ExcelWriter(save_permutation_cor, engine='xlsxwriter') as writer:
    FDR_df.to_excel(writer)

#### Our colleagues from the NHB wrote down their expectations for each attribute and diagnosis. We plot these as well.

In [None]:
### Add expected attributes per diagnosis - for plotting
expected_attributes_df = pd.read_excel(expected_attributes_path,
                                       index_col=0,
                                       sheet_name='Updated version 20042022')

# Empty cells become 0 and the whole table is stored as integers.
expected_attributes_df = expected_attributes_df.fillna(0).astype('int')
# Show which distinct values occur in the table.
print(pd.unique(expected_attributes_df.values.ravel('K')))

# Spreadsheet headers that differ from the attribute names used in this notebook.
spreadsheet_name_fixes = {
    'Executive_dysfunction': 'Executive_function_disorder',
    'Lack_of_planning_organisation': 'Lack_of_planning_organis_overv',
    'Unspecified_disturbed_gait_patterns': 'Unspecified_disturbed_gait_patt',
    "Fatique": "Fatigue",
}
expected_attributes_df = (
    expected_attributes_df
    .rename(columns=spreadsheet_name_fixes)
    .rename(correct_names, axis=1)
    .reindex(ordered_diagnoses)
)



In [None]:
# def synonym_merger_new(df, how=None):
#     if how == 'sum':
#         df["Loss of coordination"] = df[["Ataxia", "Loss of coordination"]].sum(axis=1)
#         # df["Apathy / inertia"] = df[["Apathy / inertia", "Lack of initiative"]].sum(axis=1)
#         # df["Dementia"] = df[["Dementia", "Cognitive decline"]].sum(axis=1)
#         # df["Executive function disorders"] = df[["Executive function disorders", "Lack of planning / organization / overview"]].sum(axis=1)
#         # df["Memory_impairment"] = df[["Memory_impairment", "Amnesia","Forgetfulness","Imprinting_disturbances"]].sum(axis=1)
#         # df["Disorientation"] = df[["Disorientation", "Wandering"]].sum(axis=1)
#     elif how == 'max':
#         df["Loss of coordination"] = df[["Ataxia", "Loss of coordination"]].max(axis=1)
#         # df["Apathy / inertia"] = df[["Apathy / inertia", "Lack of initiative"]].max(axis=1)
#         # df["Dementia"] = df[["Dementia", "Cognitive decline"]].max(axis=1)
#         # df["Executive function disorders"] = df[["Executive function disorders", "Lack of planning / organization / overview"]].max(axis=1)
#         # df["Memory impairment"] = df[["Memory impairment", "Amnesia","Forgetfulness","Imprinting disturbances"]].max(axis=1)
#         # df["Disorientation"] = df[["Disorientation", "Wandering"]].max(axis=1)
#     df.drop(['Ataxia',], axis=1,inplace=True)
#     return df

In [None]:
# def synonym_merger(df, how=None):
#     if how == 'sum':
#         df["Loss of coordination"] = df[["Ataxia", "Loss of coordination"]].sum(axis=1)
#         df["Apathy / inertia"] = df[["Apathy / inertia", "Lack of initiative"]].sum(axis=1)
#         df["Dementia"] = df[["Dementia", "Cognitive decline"]].sum(axis=1)
#         df["Executive function disorders"] = df[["Executive function disorders", "Lack of planning / organization / overview"]].sum(axis=1)
#     elif how == 'max':
#         df["Loss of coordination"] = df[["Ataxia", "Loss of coordination"]].max(axis=1)
#         df["Apathy / inertia"] = df[["Apathy / inertia", "Lack of initiative"]].max(axis=1)
#         df["Dementia"] = df[["Dementia", "Cognitive decline"]].max(axis=1)
#         df["Executive function disorders"] = df[["Executive function disorders", "Lack of planning / organization / overview"]].max(axis=1)
#     df.drop(['Lack of initiative','Cognitive decline','Ataxia',"Lack of planning / organization / overview"], axis=1,inplace=True)
#     return df

# if train_plot == False:
#     expected_attributes_df = synonym_merger_new(expected_attributes_df, how='max') ## turn off ## for training data
# Keep only the plotted attributes, in the column order of general_mean_multi,
# and replace any remaining missing values by 0.
plotted_attributes = list(general_mean_multi.columns)
expected_attributes_df = expected_attributes_df[plotted_attributes].fillna(0)
display(expected_attributes_df)
print(expected_attributes_df.columns)

In [None]:
# significance_boolean

In [None]:
# Per diagnosis, compare the expected attributes (value 1 in expected_attributes_df)
# with the significant ones (value 1 in significance_boolean).
outcome_columns = ['expected_and_observed','expected_but_not_observed', 'not_expected_but_observed']
results = pd.DataFrame(index=expected_attributes_df.index, columns=outcome_columns)

for diagnosis in expected_attributes_df.index:
    is_expected = expected_attributes_df.loc[diagnosis] == 1
    is_significant = significance_boolean.loc[diagnosis] == 1
    expected = set(expected_attributes_df.columns[is_expected])
    observed = set(significance_boolean.columns[is_significant])

    expected_and_observed = expected.intersection(observed)
    expected_but_not_observed = expected.difference(observed)
    not_expected_but_observed = observed.difference(expected)
    results.loc[diagnosis] = [expected_and_observed,
                              expected_but_not_observed,
                              not_expected_but_observed]

# Show the overview and store it as a spreadsheet.
display(results)
results.to_excel('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/final_predictions/expect_table.xlsx')

## All dataframes are ready; now we calculate the bar sizes based on these

In [None]:
# Sort all four tables by their index so that their rows line up with each other.
proportion_df = proportion_df.sort_index()
disease_counts = disease_counts.sort_index()
general_mean_multi = general_mean_multi.sort_index()
significance_boolean = significance_boolean.sort_index()
# display(flattened.head())
# display(flattened.head())

In [None]:
# VERTICAL BARPLOT
# Column totals of `flattened` for the plotted attributes, in plotting column order.
attribute_columns = list(general_mean_multi.columns)
attribute_bar = pd.DataFrame(flattened[attribute_columns].sum(),columns=['Attribute'])
# Rescale so the largest total becomes -2; the negative sign makes the bars
# point to the left of x = 0 in the plot.
attribute_bar['freq'] = (attribute_bar['Attribute']*-2)/attribute_bar['Attribute'].max()
print('attributes min max')
print(attribute_bar['Attribute'].max())
print(attribute_bar['Attribute'].min())
print(attribute_bar['freq'].max())
print(attribute_bar['freq'].min())
display(attribute_bar)
freq = attribute_bar['freq'].tolist()

# Get positions for attribute barplots: 0.5, 1.5, ... (one per attribute)
positions = np.arange(start=.5, stop=len(attribute_columns), step=1)
print(len(positions))

# HORIZONTAL BARPLOT
# Donor counts per diagnosis, rescaled so the largest group becomes 2.
# The original had two byte-identical branches for cd == False and cd == True;
# they are merged so prop_freq_diag is always defined.
disease_counts['freq'] = (disease_counts[grouper]*2)/disease_counts[grouper].max()
print('diagnoses max min')
print(disease_counts[grouper].max())
print(disease_counts[grouper].min())
print(disease_counts['freq'].max())
print(disease_counts['freq'].min())
display(disease_counts)
prop_freq_diag = disease_counts['freq'].tolist()
print(prop_freq_diag)

# Positions for the diagnosis bars: 0.5, 1.5, ... (one per diagnosis)
diag_pos = np.arange(start=.5, stop=len(prop_freq_diag), step=1)
print(diag_pos)
print(attribute_bar.index)

In [None]:
# Collect, in column order, the colour of every group that contains the attribute.
added_colors = [
    color_dict_fancy[group_name]
    for attribute_name in list(general_mean_multi.columns)
    for group_name in group_dict_fancy
    if attribute_name in group_dict_fancy[group_name]
]

print(len(added_colors))
print(added_colors)

### Plot the Heatmap

In [None]:
# NOTE(review): this opens a new, empty figure at 1200 dpi; the plotting cells
# below create their own figures, so this one looks unused - confirm and remove.
plt.figure(dpi=1200)

In [None]:
# num = int(len(proportion_df.columns)/2)+2
# Number of attribute rows: 90 for the training data, 84 otherwise.
if train_plot == True:
    num = 90 ## for training data
elif train_plot == False:
    num = 84

In [None]:
# Attributes to plot. Despite the name, all columns of proportion_df are selected
# here; the commented-out alternatives select a subset instead.
# first40_colnames = proportion_df.columns[0:42]
first40_colnames = proportion_df.columns
# first40_colnames = ['Communication impairment', 'Unable to concentrate',
#        'Sleep Disorders, Circadian Rhythm', 'Fatigue', 'Headache', 'Seizures',
#        'Sleep disturbances', 'Stress', 'Vivid dreaming', 'Weight loss',
#        'Admission to nursing home', 'Cachexia', 'Day Care, Medical',
#        'General health deterioration', 'Activities of daily living impaired',
#        'Markedly reduced dietary intake']
# first40_colnames = [ 'Bradykinesia','Expressionless face', 'Parkinsonism', 'Muscle Rigidity', 'Tremor',
#        'Ataxia', 'Equilibration disorder', 'Loss of coordination', 'Nystagmus',
#        'Vertigo', 'Dysarthria', 'Muscular fasciculation',
#        'Hyperreflexia and other pathological reflexes', 'Muscle Weakness',
#        'Muscle Spasticity', 'Swallowing problem',
#        'Decreased (fine) motor skills', 'Frequent falls', 'Impaired mobility']
# first40_colnames = ['Constipation', 'Hypotension, Orthostatic', 'Urinary Incontinence',
#        'Other urinary problems', 'Hearing problem',
#        'Negative sensory symptoms', 'Olfactory and gustatory dysfunction',
#        'Positive sensory symptoms', 'Visual Impairment']
# first40_colnames = ['Aphasia', 'Apraxias',
#        'Executive dysfunction', 'Façade behavior', 'Anosognosia',
#        'Lack of planning / organization / overview', 'Verbal impairment',
#        'Anomia', 'Amnesia', 'Bradyphrenia', 'Cognitive decline',
#        'Confabulation', 'Dementia', 'Forgetful', 'Agnosia',
#        'Poor short-term memory', 'Memory impairment', 'Apathy',
#        'Social disinihibition', 'Hyperorality','Lack of initiative',
#        'Socially inappropriate behavior']
# first40_colnames = ['Admission to psychiatric hospital',
#        'Feeling suicidal', 'Confusion', 'Delirium', 'Delusions',
#        'Disorientation', 'Hallucinations', 'Paranoia', 'Psychosis',
#        'Wandering Behavior', 'Aggressive behavior', 'Agitation', 'Anxiety',
#        'Changed moods or emotions', 'Compulsive behavior', 'Depressed mood',
#        'Manic', 'Restlessness']
    
# Number of plotted attributes; this replaces any value of num set earlier.
num = len(first40_colnames)
# The complete colour list is used (it is not truncated to num entries).
first40_colors = added_colors
print(first40_colnames)
print(len(first40_colnames))
print(num)
# print(proportion_df.columns)

In [None]:
# Notebook display of the first rows of the table used for the dot sizes.
proportion_df.head()

In [None]:
# Notebook display of the per-diagnosis counts (including the 'freq' column added above).
disease_counts#[grouper]

In [None]:
%matplotlib inline
# Dot plot of all selected attributes (rows) against all diagnoses (columns).
# Three scattermap layers are drawn on the same axes (expected, mean/proportion,
# significance), plus bar charts along the left and top edges and two legend panels.
sns.set(style="darkgrid", font_scale=1.5, rc={'axes.facecolor':'#F0E6EB', "grid.linestyle": "-","grid.color": '#b0b0b0'})

##PLOT 
# The first three size computations are overwritten by the fixed 16 x 27 below.
w = disease_counts.shape[0]/1
w = (10+ disease_counts.shape[0]/1)-2
h = 2+num/3
w = 16
h= 27
fig = plt.figure(figsize=(w,h ),dpi=200)
print('figsize:',w,h)
# fig = plt.figure(figsize=(15, 20))
# Main panel: 8 of the 9 grid columns; the last grid column holds the legends.
ax1 = plt.subplot2grid((9,9), (0, 0), colspan=8,rowspan=9)

# #Dots for expected attributes per disease
# Marker size is the proportion times the expected table times 1.3, so it is zero
# wherever the expected table is 0.
ax1 = scattermap(expected_attributes_df[first40_colnames].T,
                marker='o',
                marker_size=proportion_df[first40_colnames].T*expected_attributes_df[first40_colnames].T * 1.3,
                cmap="Oranges",
                cbar=False,ax=ax1)

##Dots for proportions and averages 
# Colour comes from general_mean_multi (colour scale capped at vmax=5),
# marker size from proportion_df.
ax1 = scattermap(general_mean_multi[first40_colnames].T,
                cmap="YlGnBu",
                marker_size=proportion_df[first40_colnames].T,
                ax=ax1,
                vmax=5,
                 linecolor = 'black',
                 linewidths = 0.2,
                 
                cbar_kws={"shrink": .5})#cbar_kws = {"orientation": "horizontal", "pad":0.02}
#                 cbar_kws = dict(use_gridspec=True,location="right"))#,
#                 cbar=False)

#Significance 
# Star markers; size is 100 where significance_boolean is 1 and 0 elsewhere.
ax1 = scattermap(significance_boolean[first40_colnames].T,
                marker='*',
                marker_size=significance_boolean[first40_colnames].T * 100,
                cbar=False,
                ax=ax1,
                 linecolor = 'black',
                 linewidths = 0.2,
                cmap="Wistia")

# x axis on top
ax1.xaxis.tick_top() 
ax1.xaxis.set_label_position('top')
# plt.xticks(rotation=75) (old, no subplot)
ax1.tick_params('x', labelrotation=90)

# Add frequencies of attributes as barplot to y-axis
# freq values are negative, so these bars extend to the left of x = 0.
ax1.barh(list(positions)[:num], freq[0:num], 0.6, alpha=1, color=first40_colors,edgecolor = "none")
# plt.axvline(x=0, color='k') (old, no subplot)
ax1.axvline(x=0, color='k')
ax1.axhline(0, color='k')

# Extend the x-range to the left so the attribute bars fit.
ax1.set_xlim([attribute_bar['freq'].min()-0.1,disease_counts.shape[0]])

# Add frequencies of diagnosis as barplot to x-axis
# bottom=num starts these bars at y = num.
ax1.bar(diag_pos, prop_freq_diag, 0.6,color='#41b6c4',bottom=num,edgecolor = "none")

# Extend the y-range so the diagnosis bars fit.
ax1.set_ylim([0,num+disease_counts['freq'].max()+0.1])

# plt.title('{}: {} to {} diagnoses, {} to {} attributes, for {} donors'.format(table_of_choice,
#                                                                              disease_counts[grouper].min(),
#                                                                              disease_counts[grouper].max(),
#                                                                              attribute_bar['Attribute'].min(),
#                                                                              attribute_bar['Attribute'].max(),
#                                                                              selected_donorcountpredict))

## same size legend
# Dummy scatter used only to generate size-legend handles; the four sizes are
# relabelled 25% .. 100% below.
# NOTE(review): assumes marker size 300 corresponds to a proportion of 100% - confirm
# against how proportion_df is scaled.
ax2 = plt.subplot2grid((9, 9), (3, 8),colspan=1,rowspan=1)
x = [2,3,4,5]
y = [2,3,4,5]
a2 = [75,150,225,300]

sc = ax2.scatter(x, y, s=a2, alpha=0.5,c='white')
L = ax2.legend(*sc.legend_elements("sizes"),loc='center left', bbox_to_anchor=(1, 0.7),frameon=False)
L.get_texts()[0].set_text('25%')
L.get_texts()[1].set_text('50%')
L.get_texts()[2].set_text('75%')
L.get_texts()[3].set_text('100%')

ax2.axis('off')

# Legend for the significance star and the expected circle. The dummy points sit at
# (2, 2), outside the 0-1 axis limits set below, and the axis itself is switched off.
ax3 = plt.subplot2grid((9, 9), (2, 8),colspan=1,rowspan=1)
x2 = [2]
x3 = [2]
y2 = [2]
y3 = [2]
a3 = [300]
t = ['significance','expected']
ax3.scatter(x2, y2, s=a3, alpha=1,c='#E59400', marker='*')
ax3.scatter(x3, y3, s=a3, alpha=1,facecolors='none',edgecolors='#E59400')
ax3.legend(t, loc='center left',bbox_to_anchor=(1, 0.3),frameon=False)
ax3.set_ylim([0,1])
ax3.set_xlim([0,1])
ax3.axis('off')

# print('based on table {}'.format(table_of_choice))
# Summary of the ranges shown in the two bar charts.
print(disease_counts.shape[0])
print('{} to {} diagnosis, {} to {} attributes'.format(disease_counts[grouper].min(),
                                                         disease_counts[grouper].max(),
                                                         attribute_bar['Attribute'].min(),
                                                         attribute_bar['Attribute'].max() ))
plt.tight_layout()
# Saving is switched off for this figure; it is only shown in the notebook.
# plt.savefig("{}/dotplot_{}.png".format(table_folder,table_of_choice), bbox_inches="tight")
# plt.savefig("{}/dotplot_{}.pdf".format(table_folder,table_of_choice), bbox_inches="tight")

plt.show()
plt.close()

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError. Presumably it is placed
# here on purpose to stop "Run All" before the cells below - confirm.
break

In [None]:
# Split point for the second plot: half the number of attribute columns (rounded down) plus 2.
num = len(proportion_df.columns) // 2 + 2

In [None]:
# Select the attributes from position num onwards, together with their colours.
first40_colors = added_colors[num:]
first40_colnames = proportion_df.columns[num:]
num2 = len(first40_colnames)
print(first40_colnames)
print(num2)
# print(proportion_df.columns)

In [None]:
# Reduce the figure height by the per-row height times the difference in row count
# between the two plots, then scale the width to keep the aspect ratio.
height_per_row = (h - 2) / num
row_difference = abs(num2 - num)
h2 = h - height_per_row * row_difference
w2 = (h2 * w) / h
print(w2, h2)

In [None]:
%matplotlib inline
# Dot plot of the second set of attributes (rows) against all diagnoses (columns).
# Three scattermap layers are drawn on the same axes (expected, mean/proportion,
# significance), plus bar charts along the left and top edges and two legend panels.
sns.set(style="darkgrid", font_scale=1.5, rc={'axes.facecolor':'#F0E6EB', "grid.linestyle": "-","grid.color": '#b0b0b0'})

##PLOT
# The width is fixed at 16 (overriding the computed w2); the height h2 comes from
# the cell above.
w2=16
fig = plt.figure(figsize=(w2, h2),dpi=200)
print('figsize:',w2,h2)
# Main panel: 8 of the 9 grid columns; the last grid column holds the legends.
ax1 = plt.subplot2grid((9,9), (0, 0), colspan=8,rowspan=9)

##Dots for expected attributes per disease
# Marker size is the proportion times the expected table times 1.3, so it is zero
# wherever the expected table is 0.
ax1 = scattermap(expected_attributes_df[first40_colnames].T,
                marker='o',
                marker_size=proportion_df[first40_colnames].T*expected_attributes_df[first40_colnames].T * 1.3,
                cmap="Wistia",
                cbar=False,ax=ax1)

##Dots for proportions and averages
# Colour comes from general_mean_multi (colour scale capped at vmax=5),
# marker size from proportion_df.
ax1 = scattermap(general_mean_multi[first40_colnames].T,
                cmap="YlGnBu",
                marker_size=proportion_df[first40_colnames].T,
                ax=ax1,
                vmax=5,
                linecolor = 'black',
                linewidths = 0.2,
                cbar_kws={"shrink": .5})

#Significance
# Star markers; size is 100 where significance_boolean is 1 and 0 elsewhere.
ax1 = scattermap(significance_boolean[first40_colnames].T,
                marker='*',
                marker_size=significance_boolean[first40_colnames].T * 100,
                cbar=False,
                ax=ax1,
                linecolor = 'black',
                linewidths = 0.2,
                cmap="Wistia")

# x axis on top
ax1.xaxis.tick_top()
ax1.xaxis.set_label_position('top')
ax1.tick_params('x', labelrotation=75)

# Add frequencies of attributes as barplot to y-axis
# freq values are negative, so these bars extend to the left of x = 0.
ax1.barh(list(positions)[:num2], freq[num:], 0.6, alpha=1, color=first40_colors,edgecolor = "none")
ax1.axvline(x=0, color='k')
ax1.axhline(0, color='k')

# Extend the x-range to the left so the attribute bars fit.
ax1.set_xlim([attribute_bar['freq'].min()-0.1,disease_counts.shape[0]])

# Add frequencies of diagnosis as barplot to x-axis
# bottom=num2 starts these bars at y = num2.
ax1.bar(diag_pos, prop_freq_diag, 0.6,color='#41b6c4',bottom=num2,edgecolor = "none")

# Extend the y-range so the diagnosis bars fit.
ax1.set_ylim([0,num2+disease_counts['freq'].max()+0.1])

# The donor counts live in the column named by `grouper` (as in the rest of the
# notebook); the hard-coded 'neuropathological_diagnosis' column raised a KeyError
# whenever another grouping column was used.
plt.title('{}: {} to {} diagnoses, {} to {} attributes, for {} donors'.format(table_of_choice,
                                                                             disease_counts[grouper].min(),
                                                                             disease_counts[grouper].max(),
                                                                             attribute_bar['Attribute'].min(),
                                                                             attribute_bar['Attribute'].max(),
                                                                             selected_donorcountpredict))

## same size legend
# Dummy scatter used only to generate size-legend handles; the four sizes are
# relabelled 25% .. 100% below.
# NOTE(review): assumes marker size 300 corresponds to a proportion of 100% - confirm
# against how proportion_df is scaled.
ax2 = plt.subplot2grid((9, 9), (3, 8),colspan=1,rowspan=1)
x = [2,3,4,5]
y = [2,3,4,5]
a2 = [75,150,225,300]

sc = ax2.scatter(x, y, s=a2, alpha=0.5,c='white')
L = ax2.legend(*sc.legend_elements("sizes"),loc='center left', bbox_to_anchor=(1, 0.7),frameon=False)
L.get_texts()[0].set_text('25%')
L.get_texts()[1].set_text('50%')
L.get_texts()[2].set_text('75%')
L.get_texts()[3].set_text('100%')

ax2.axis('off')

# Legend for the significance star and the expected circle. The dummy points sit at
# (2, 2), outside the 0-1 axis limits set below, and the axis itself is switched off.
ax3 = plt.subplot2grid((9, 9), (2, 8),colspan=1,rowspan=1)
x2 = [2]
x3 = [2]
y2 = [2]
y3 = [2]
a3 = [300]
t = ['significance','expected']
ax3.scatter(x2, y2, s=a3, alpha=1,c='#E59400', marker='*')
ax3.scatter(x3, y3, s=a3, alpha=1,facecolors='none',edgecolors='#E59400')
ax3.legend(t, loc='center left',bbox_to_anchor=(1, 0.3),frameon=False)
ax3.set_ylim([0,1])
ax3.set_xlim([0,1])
ax3.axis('off')

# Summary of the ranges shown in the two bar charts.
print('based on table {}'.format(table_of_choice))
print(disease_counts.shape[0])
print('{} to {} diagnosis, {} to {} attributes'.format(disease_counts[grouper].min(),
                                                         disease_counts[grouper].max(),
                                                         attribute_bar['Attribute'].min(),
                                                         attribute_bar['Attribute'].max() ))
plt.tight_layout()
# Save as PNG and PDF next to the p-value tables.
plt.savefig("{}/dotplot_2{}.png".format(table_folder,table_of_choice), bbox_inches="tight")
plt.savefig("{}/dotplot_2{}.pdf".format(table_folder,table_of_choice), bbox_inches="tight")

plt.show()
plt.close()

In [None]:
def chi_square_testing_of_dataframes(expected_attributes_df, significance_boolean_df):
    """Chi-square test of the overlap between the expected and the significant table.

    Every cell is classified by whether it is expected (1/0) and whether it is
    significant (1/0); the four counts form a 2x2 contingency table that is
    passed to ``scipy.stats.chi2_contingency``. The statistic, p-value, degrees
    of freedom, expected frequencies and observed counts are printed.

    Parameters
    ----------
    expected_attributes_df : DataFrame
        Table with 1 where an attribute is expected for a diagnosis, 0 otherwise.
    significance_boolean_df : DataFrame
        Table with 1 where an attribute is significant for a diagnosis, 0 otherwise.

    Returns
    -------
    None
    """
    # `import scipy` alone does not guarantee that the stats submodule is loaded
    # on older SciPy versions, so import it explicitly.
    from scipy import stats

    is_expected = expected_attributes_df == 1
    not_expected = expected_attributes_df == 0
    is_significant = significance_boolean_df == 1
    not_significant = significance_boolean_df == 0

    def _count(mask):
        # Number of True cells in a boolean table.
        return (mask * 1).to_numpy().sum()

    # Rows: significant / not significant; columns: expected / not expected.
    observations = np.array([
        [_count(is_expected & is_significant), _count(not_expected & is_significant)],
        [_count(is_expected & not_significant), _count(not_expected & not_significant)],
    ])

    chi2, p, dof, expected = stats.chi2_contingency(observations)
    print(f"chi2 statistic:   {chi2:.5g}")
    print(f"p-value:      {p:.5g}")
    print(f"degrees of freedom: {dof}")
    print()
    print("expected frequencies:")
    print(expected)
    print()
    print("observed frequencies:")
    print(observations)
    
# Test whether the expected attributes and the significant attributes overlap.
chi_square_testing_of_dataframes(expected_attributes_df, significance_boolean)