# The Neurogenomics Database: Dotplot of entire dataset predictions
Author: Nienke Mekkes <br>
Date: 11-10-2022. <br>
Correspond: n.j.mekkes@umcg.nl <br>

## Script: Dotplot of entire dataset predictions
Builds Dot Plots for each diagnosis category. <br>
Why: to give an overview of what symptoms are frequently observed in different diagnosis groups

### Input files:
- prediction file (donors as row names, observations as columns)
- General information: to assign metadata to donors (e.g. diagnosis, age)
- Optional: attribute metadata to cluster observations
- Optional: metadata to highlight expected findings in the plot

- also needs scattermap.py, code to create the plot
- also needs helper_functions, which contains code to run permutation test and how to select donors


### Output:
- dotplot, file with p values for permutation test



#### Minimal requirements
- to do

## IMPORTANT

This script works with a clinical trajectory dictionary pickle. This pickle can be a rules-of-thumb pickle or an original pickle, and was generated by the script proces_predictions. That processing script removed short sentences etc. and the attributes that performed poorly; it did not remove any donors. Donors that you wish to exclude can be excluded in two ways: <br>
1. in this script, manually. for example remove donors younger than 21. or donors with the NAD diagnosis, or reassign diagnosis (e.g. a SSA, CON donor NBB xxx needs to become HIV).
2. with an input file, for example the general information that contains minimally one column with donorids, and one column that mentions which donors should have a changed diagnosis or should be excluded

## PATHS

In [None]:
# pandas is imported here as well because this cell comes before the main
# import cell; without it `pd` is undefined on a top-to-bottom run.
import pandas as pd

# Show up to 500 columns when displaying wide tables.
pd.set_option('display.max_columns', 500)

In [None]:
# Input locations used by the cells below.
# Earlier predictions pickle (disabled):
# path_to_predictions = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/final_predictions/ALL_clinical_trajectories_dictionary_2023-07-11.pkl"
# Pickled dictionary of predictions, loaded in the "Load data" cell.
path_to_predictions = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/final_predictions/ALL_clinical_trajectories_dictionary_rules_of_thumb_visit_2023-08-14.pkl"
# path_to_attribute_grouping = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/input_data/sup3.xlsx" ## for rules of thumb
# Excel sheet with one row per donor; provides the 'paper diagnosis' column.
general_information = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/input_data/General_information_11-08-2023.xlsx"
# Only referenced by the commented-out "adding in clinical diagnosis" cell.
path_clinical_diagnosis = '/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_diagnosis/output/selected_diagnoses_overview.xlsx'

### IMPORTS

In [None]:
import seaborn as sns; sns.set()
import matplotlib
import numpy as np; np.random.seed(0)
from matplotlib import pyplot as plt 
import xlsxwriter
import pandas as pd
import os
import numpy as np
import scattermap
from scattermap import scattermap
import pickle
import multiprocessing
import statsmodels
from functools import partial
from multiprocessing import Pool
import sys

import scipy
from helper_functions import permutation_of_individual_test, table_selector
import datetime

In [None]:
# Minimum summed observation count a non-control donor needs in order to be
# kept; also embedded in the name of the output folder.
n=5

### Load data


In [None]:
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# prediction pickles that come from a trusted source.
with open(path_to_predictions, "rb") as handle:
    predictions_pickle = pickle.load(handle)

# Build one frame per donor: the keys of the donor's inner dict become the
# rows (and are stored in 'Age'), then stack all donors into a single table.
donor_frames = []
for donor_id, trajectory in predictions_pickle.items():
    frame = pd.DataFrame.from_dict(trajectory, orient="index")
    frame["DonorID"] = donor_id
    frame['Age'] = frame.index
    donor_frames.append(frame)

predictions_df = pd.concat(donor_frames, ignore_index=True)
display(predictions_df)
print(f"there are {len(list(predictions_df['DonorID'].unique()))} unique donor IDs")
print(predictions_df.shape)

### exclude/change donors for the paper, using general info
- read in the general information
- make a list of donors to remove
- remove donors from our predictions
- change column neuropathological diagnosis to the neuropathological diagnosis from the general information

In [None]:
general_information_df = pd.read_excel(general_information, engine='openpyxl', sheet_name="Sheet1")

# Donors flagged with 'exclude' in the 'paper diagnosis' column are dropped.
donors_to_remove = list(general_information_df[general_information_df['paper diagnosis']=='exclude'].DonorID)
# .copy() makes the filtered table independent, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
predictions_df = predictions_df[~predictions_df['DonorID'].isin(donors_to_remove)].copy()
print(f"there are {len(list(predictions_df['DonorID'].unique()))} unique donor IDs")
print(len(donors_to_remove))

# Overwrite the diagnosis with the 'paper diagnosis' from the general information.
predictions_df['neuropathological_diagnosis'] = predictions_df['DonorID'].map(general_information_df.set_index('DonorID')['paper diagnosis'])
display(predictions_df.head())
# Donors missing from the general information map to NaN; drop those before
# sorting, because NaN and str cannot be ordered against each other.
print(sorted(predictions_df['neuropathological_diagnosis'].dropna().unique()))
print(f"there are {len(list(predictions_df['DonorID'].unique()))} unique donor IDs")


In [None]:
# Number of prediction rows per 'Age' value.
predictions_df['Age'].value_counts()

In [None]:
# Metadata columns; every other column of predictions_df is treated as an
# attribute (observation) column.
non_attribute_columns = [
    'DonorID', 'Year', 'age_at_death', 'sex',
    'neuropathological_diagnosis', 'Age',
]  #'birthyear',,'death_year','year_before_death','sex',
_metadata_names = set(non_attribute_columns)
attributes = [name for name in predictions_df.columns if name not in _metadata_names]
# display(attributes)
print(f"there are {predictions_df.shape[0]} rows and {len(attributes)} attributes")
print(f"there are {len(list(predictions_df['DonorID'].unique()))} unique donor IDs")

#### adding in clinical diagnosis

In [None]:
# cd_df = pd.read_excel(path_clinical_diagnosis, engine='openpyxl')
# cd_df
# predictions_df['perfect_diagnosis'] = predictions_df['DonorID'].map(cd_df.set_index('DonorID')['perfect_diagnosis'])
# predictions_df['medium_diagnosis'] = predictions_df['DonorID'].map(cd_df.set_index('DonorID')['medium_diagnosis'])
# predictions_df['wrong_diagnosis'] = predictions_df['DonorID'].map(cd_df.set_index('DonorID')['wrong_diagnosis'])
# def get_diagnosis_info(row):
#     if row['perfect_diagnosis'] == 1:
#         return 'perfect'
#     elif row['medium_diagnosis'] == 1:
#         return 'medium'
#     elif row['wrong_diagnosis'] == 1:
#         return 'wrong'
#     else:
#         return None

# predictions_df['diagnosis_info'] = predictions_df.apply(get_diagnosis_info, axis=1)
# predictions_df = predictions_df.drop(columns=['perfect_diagnosis','medium_diagnosis','wrong_diagnosis'])
# predictions_df.tail()

In [None]:
# Simplified diagnosis -> detailed neuropathological labels that collapse
# into it. Entries that are disabled stay commented out.
_table1_groups = {
    'CON': ['CON'],
    'AD': ['AD'],
    'PD': ['PD'],
    'PDD': ['PDD'],
    'DLB': ['DLB'],
    'VD': ['VD'],
    'FTD': [
        'FTD,FTD-TDP', 'FTD,FTD-TDP-A,PROG', 'FTD,FTD-TDP-B,C9ORF72', 'FTD,FTD-TDP-C',
        'FTD,FTD-TAU,TAU',
        'FTD,FTD-FUS',
        'FTD,FTD-TDP,MND',
        # 'FTD,FTD-UPS',
        'FTD,PID',
        # 'FTD',
        'FTD_undefined',
        'FTD,FTD-TDP_undefined',
    ],
    'MND': ['MND,ALS', 'MND_other'],
    'PSP': ['PSP'],
    'ATAXIA': ['ATAXIA,SCA', 'ATAXIA,ADCA', 'ATAXIA,FA', 'ATAXIA,FXTAS'],
    'MS': [
        'MS,MS-PP',
        'MS,MS-SP',
        # 'MS,MS-UN',
        'MS,MS-RR',
        'MS_undefined',
    ],
    'MSA': ['MSA'],
    'MDD': ['PSYCH,MDD'],
    'BP': ['PSYCH,BP'],
    'SCZ': ['PSYCH,SCZ'],
}

# Flat lookup: detailed label -> simplified label.
table1_dict_paper = {
    detailed: simplified
    for simplified, members in _table1_groups.items()
    for detailed in members
}



In [None]:
# Diagnostic only: number of donors that have at least one row with a
# non-negative Age. The filter itself is NOT applied to `data` (the original
# assigned it to `data` and immediately overwrote it).
display(predictions_df.loc[predictions_df.Age >= 0, 'DonorID'].nunique())
data = predictions_df.copy()
display(data['DonorID'].nunique())

# Keep donors whose ID ("NBB YYYY-NNN") carries a year of 1997 or later. IDs
# that do not match the pattern get NaN and fail the comparison.
data['file_year'] = data['DonorID'].str.extract(r'NBB (\d{4})-\d{3}', expand=False)
data['file_year'] = pd.to_numeric(data['file_year'])
data = data[data['file_year'] >= 1997]
display(data['DonorID'].nunique())
unique_diagnoses = data[['DonorID', 'neuropathological_diagnosis']].drop_duplicates()
display(unique_diagnoses['neuropathological_diagnosis'].value_counts().head(20))

## df showing number of observations per donor:
## 'count' is the sum over all remaining (attribute) columns and all rows,
## 'uniqueage' is the number of distinct Age values of the donor.
data2 = data.drop(columns=['age_at_death','sex','Age','file_year','Year'])#,'diagnosis_info'])
data2 = data2.groupby(['DonorID','neuropathological_diagnosis']).sum()
data2 = pd.DataFrame(data2.sum(axis=1),columns=['count'])
data2 = data2.reset_index().set_index('DonorID')
data2['uniqueage'] = data.groupby('DonorID')['Age'].nunique()
display(data2)

## CON are the exception, they are allowed to have little data
data3 = data2[data2['neuropathological_diagnosis'] != 'CON']
donors_not_enough_data = data3.index[data3['count'] < n].tolist()
# donors_not_enough_data = data3.index[(data3['count'] < 5) | (data3['uniqueage'] < 3)].tolist()

print(donors_not_enough_data)
data = data[~data['DonorID'].isin(donors_not_enough_data)]
data = data.reset_index(drop=True)
display(data['DonorID'].nunique())

# PDD is merged into PD before simplifying. Labels without an entry in
# table1_dict_paper become None, except 'AD,DLB', which is kept verbatim.
# (The original also ran a plain .map(table1_dict_paper) first, whose result
# was overwritten by this line.)
data['neuropathological_diagnosis'] = data['neuropathological_diagnosis'].replace('PDD', 'PD')
data['simplified_diagnosis'] = data['neuropathological_diagnosis'].apply(lambda x: 'AD,DLB' if x == 'AD,DLB' else table1_dict_paper.get(x, None))

# Detailed diagnoses that are pooled into the 'other_dem' label.
other_dems = ['CBD','AD,DLB','AD,CA','AD,ENCEPHA,VE','PD,AD', #,'ILBD','AD,ILBD','ENCEPHA,VE'
              'DLB,SICC','DEM,SICC','DEM,SICC,AGD','DEM,ENCEPHA,VE']
# Detailed diagnoses that are pooled into the 'other_psych' label.
other_psych = ['PSYCH,PTSD','PSYCH,ASD','PSYCH,OCD']


def update_psych(row):
    """Return 'other_psych' if the row's detailed diagnosis is in other_psych.

    Otherwise the row's current 'simplified_diagnosis' is returned unchanged.
    """
    pooled = row['neuropathological_diagnosis'] in other_psych
    return 'other_psych' if pooled else row['simplified_diagnosis']


def update_dem(row):
    """Return 'other_dem' if the row's detailed diagnosis is in other_dems.

    Otherwise the row's current 'simplified_diagnosis' is returned unchanged.
    """
    pooled = row['neuropathological_diagnosis'] in other_dems
    return 'other_dem' if pooled else row['simplified_diagnosis']



# Pool the rarer diagnoses row by row: psychiatric first, then dementias.
for relabel in (update_psych, update_dem):
    data['simplified_diagnosis'] = data.apply(relabel, axis=1)
# Whatever is still unlabelled (None) ends up in 'Other'.
data['simplified_diagnosis'] = data['simplified_diagnosis'].apply(
    lambda label: label if label is not None else 'Other')

display(data.head())
display(data['Age'].drop_duplicates().sort_values())
# One row per donor / label combination, for the overviews below.
unique_diagnoses = data[['DonorID', 'simplified_diagnosis','neuropathological_diagnosis']].drop_duplicates()
display(unique_diagnoses.tail(10))
display(unique_diagnoses['simplified_diagnosis'].value_counts().head(60))
display(data['DonorID'].nunique())

In [None]:
# data[data['simplified_diagnosis']=='Other']

#### for seurat we select certain donors based on neuropathological_diagnosis

In [None]:
# One row per donor / detailed diagnosis combination.
unique_diagnoses = data[['DonorID', 'neuropathological_diagnosis']].drop_duplicates()
# display(unique_diagnoses)
# Every detailed diagnosis that still occurs in the data.
alldiag = unique_diagnoses['neuropathological_diagnosis'].unique().tolist()
print(alldiag)
# Number of donor/diagnosis rows labelled 'VD,ILBD'.
int((unique_diagnoses['neuropathological_diagnosis'] == 'VD,ILBD').sum())

In [None]:
# Detailed diagnoses that are exported for the seurat analysis.
seurat_diagnoses = [## main diagnoses
                    'AD','DLB','VD','CON','PD','PSP','MSA','MND,ALS','MND_other',
                    ## other dementias
                    'CBD','AD,DLB','AD,CA','AD,ENCEPHA,VE','PD,AD', #,'ILBD','AD,ILBD','ENCEPHA,VE'
                    'DLB,SICC','DEM,SICC','DEM,SICC,AGD','DEM,ENCEPHA,VE',
                    ## ataxia subtypes
                    'ATAXIA,SCA','ATAXIA,ADCA','ATAXIA,FXTAS','ATAXIA,FA',
                    ### FTD subtypes
                    'FTD_undefined','FTD,FTD-TAU,TAU','FTD,PID','FTD,FTD-FUS','FTD,FTD-TDP,MND',
                    'FTD,FTD-TDP_undefined','FTD,FTD-TDP-A,PROG','FTD,FTD-TDP-B,C9ORF72','FTD,FTD-TDP-C',
                    ## psych subtypes
                    'PSYCH,MDD','PSYCH,BP','PSYCH,SCZ','PSYCH,PTSD','PSYCH,ASD','PSYCH,OCD',
                    ## MS subtypes
                    'MS,MS-SP','MS,MS-PP','MS_undefined','MS,MS-RR']

# Rows whose detailed diagnosis falls outside the selection; kept only for
# manual inspection (`alldiag` comes from the previous cell).
unused_labels = list(set(alldiag) - set(seurat_diagnoses))
not_used_seurat = data[data['neuropathological_diagnosis'].isin(unused_labels)]
# display(not_used_seurat[['DonorID', 'neuropathological_diagnosis']].drop_duplicates()['neuropathological_diagnosis'].value_counts())

seurat = data[data['neuropathological_diagnosis'].isin(seurat_diagnoses)]
# NOTE(review): the reason for excluding this single donor is not documented
# here (original comment: "this donor is cursed :)") - confirm and record it.
seurat = seurat[seurat['DonorID'] != 'NBB 1999-072']
unique_diagnoses = seurat[['DonorID', 'simplified_diagnosis','neuropathological_diagnosis']].drop_duplicates()
display(unique_diagnoses['simplified_diagnosis'].value_counts())
# display(unique_diagnoses['neuropathological_diagnosis'].value_counts())
# display(unique_diagnoses.tail(20))
# display(unique_diagnoses[unique_diagnoses['neuropathological_diagnosis']=='CBD'])
display(seurat['DonorID'].nunique())

In [None]:
# Number of distinct donor IDs in the current unique_diagnoses table.
display(len(unique_diagnoses['DonorID'].unique()))

In [None]:
## note, now does not have clinical diagnosis! first do analysis, then load the clindiag in seurat itself
# Write the seurat selection to CSV without the DataFrame index.
seurat.to_csv('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/seurat_input.csv', index=False)

In [None]:
# Show all rows of the donor that was removed from the seurat selection.
data[data['DonorID']=='NBB 1999-072']

#### for the GRU-D model we only take the simplified donors, and delete the psychiatric cases

In [None]:
# Start from all retained donors (not only the seurat selection).
gru_d = data.copy()
def update_b(row):
    """Return 'AD,DLB' when the detailed diagnosis contains that text.

    Otherwise the row's current 'simplified_diagnosis' is returned unchanged.
    The match is a substring test on 'neuropathological_diagnosis'.
    """
    diagnosis = row['neuropathological_diagnosis']
    # A missing diagnosis is NaN/None; `in` on those raises TypeError, so only
    # strings are tested.
    if isinstance(diagnosis, str) and 'AD,DLB' in diagnosis:
        return 'AD,DLB'
    return row['simplified_diagnosis']
# Give AD,DLB donors their own label again, then drop the pooled and
# psychiatric groups in a single filter.
gru_d['simplified_diagnosis'] = gru_d.apply(update_b, axis=1)
excluded_labels = ['Other', 'other_dem', 'other_psych', 'MDD', 'BP', 'SCZ']
gru_d = gru_d[~gru_d['simplified_diagnosis'].isin(excluded_labels)]
# Donor IDs left when starting from all retained donors.
datalist = gru_d['DonorID'].unique()
display(len(datalist))
# unique_diagnoses = gru_d[['DonorID', 'simplified_diagnosis','neuropathological_diagnosis']].drop_duplicates()
# display(unique_diagnoses.tail(10))
# display(unique_diagnoses['simplified_diagnosis'].value_counts())

In [None]:
# Rebuild gru_d, this time starting from the seurat selection.
gru_d = seurat.copy()
def update_b(row):
    """Return 'AD,DLB' when the detailed diagnosis contains that text.

    Otherwise the row's current 'simplified_diagnosis' is returned unchanged.
    The match is a substring test on 'neuropathological_diagnosis'.
    """
    diagnosis = row['neuropathological_diagnosis']
    # A missing diagnosis is NaN/None; `in` on those raises TypeError, so only
    # strings are tested.
    if isinstance(diagnosis, str) and 'AD,DLB' in diagnosis:
        return 'AD,DLB'
    return row['simplified_diagnosis']
# Give AD,DLB donors their own label again, then drop the pooled and
# psychiatric groups in a single filter.
gru_d['simplified_diagnosis'] = gru_d.apply(update_b, axis=1)
excluded_labels = ['Other', 'other_dem', 'other_psych', 'MDD', 'BP', 'SCZ']
gru_d = gru_d[~gru_d['simplified_diagnosis'].isin(excluded_labels)]
# Donor IDs left when starting from the seurat selection.
seuratlist = gru_d['DonorID'].unique()
display(len(seuratlist))
# unique_diagnoses = gru_d[['DonorID', 'simplified_diagnosis','neuropathological_diagnosis']].drop_duplicates()
# display(unique_diagnoses.tail(10))
# display(unique_diagnoses['simplified_diagnosis'].value_counts())
# Donors that survive the filter on `data` but not on `seurat`.
print(set(datalist) - set(seuratlist))

In [None]:
# Number of distinct donor IDs in the current unique_diagnoses table.
display(len(unique_diagnoses['DonorID'].unique()))

#### then we need to sort

In [None]:
# gru_d.to_csv('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/donors.csv', index=False)
# Keep only rows with a non-negative Age.
non_negative_age = gru_d['Age'] >= 0
gru_d = gru_d[non_negative_age]
display(gru_d['DonorID'].nunique())

In [None]:
# Order rows per donor and, within a donor, by increasing Age.
gru_d = gru_d.sort_values(by=['DonorID', 'Age'], ascending=True)
display(gru_d)

In [None]:
# Write the sorted GRU-D subset to CSV without the DataFrame index.
gru_d.to_csv('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_analysis/data/grud_clin_subset_input.csv', index=False)

#### INPUT
array of arrays. each array is for a single donor, consisting of shape time x attribute

In [None]:
# Model input table: drop the label/bookkeeping columns and make the
# remaining metadata numeric.
inp = gru_d.copy().drop(
    columns=['neuropathological_diagnosis','file_year','simplified_diagnosis','Year'])#'diagnosis_info',
sex_codes = {'F': 1, 'M': 0}
# astype(int) fails if a 'sex' value is missing or not in sex_codes.
inp['sex'] = inp['sex'].map(sex_codes).astype(int)
for column in ('Age', 'age_at_death'):
    inp[column] = inp[column].astype(int)
def sum_except_donors(df):
    """Return the column-wise sum of *df*.

    NOTE(review): despite the name, no column is excluded here - confirm
    whether a donor column was meant to be skipped.
    """
    return df.sum()

inp = inp.sort_values(by=['DonorID', 'Age'], ascending=True)

inp_with_nan = inp.reset_index(drop=True)
# final_row = inp_with_nan.groupby(['DonorID','sex']).sum().reset_index()
# final_row['Age'] = 150
# display(final_row)
# inp_with_nan = inp_with_nan.drop(['Age'], axis=1)
# inp_with_nan = pd.concat([inp_with_nan, final_row], ignore_index=True)
display(inp_with_nan.head(5))

# One 2-D array (rows x remaining columns) per donor, collected in an object
# array; groupby returns the donors in sorted DonorID order.
per_donor = inp_with_nan.set_index('DonorID').groupby('DonorID')
# display(final_input)
final_input = per_donor.apply(pd.DataFrame.to_numpy).to_numpy()
print(final_input)

In [None]:
# Shape of the outer per-donor array, then of the first donor's own array.
print(final_input.shape)
print(final_input[0].shape)

#### LABEL TASKNAME

array in the shape samples X diagnosis

In [None]:
np.set_printoptions(threshold=30)

# One row per donor with its simplified diagnosis, in gru_d row order.
lt = (gru_d[['DonorID','simplified_diagnosis']]
      # lt = lt[~lt['DonorID'].isin(weirds)]
      .drop_duplicates()
      .reset_index(drop=True))

display(lt)

donorcount = len(lt)
print(donorcount)
print(lt['simplified_diagnosis'].value_counts())
print(lt.simplified_diagnosis.unique())

one_hot = pd.get_dummies(lt.simplified_diagnosis)

# Desired column order of the label matrix.
wanted = ['CON', 'AD', 'PD', 'VD', 'FTD','DLB','AD,DLB','ATAXIA', 'MND', 'PSP', 'MS','MSA'] #'AD,DLB' 'DLB,SICC',

# Keep the wanted labels that actually occur, in the wanted order. A label
# that occurs in the data but is not in `wanted` loses its column.
current_cols = list(one_hot.columns)
present = set(current_cols)
new_cols = [label for label in wanted if label in present]

one_hot = one_hot.reindex(columns=new_cols)
display(one_hot)
final_label_taskname = one_hot.to_numpy()
display(final_label_taskname)

#### MASKING
The mask is an array of the same shape as the input. It indicates which values are present and which are absent. If a row of data were [nan, nan, 0, 0, 3.14, 10], the corresponding mask row would be [0,0,1,1,1,1]. In our case there are two options:
- set every value larger than 1 to 1 and leave every zero at zero, because a zero can mean that the symptom is present but was just not written down that year?
- set every value to 1.

In [None]:
# final_input

In [None]:
import copy
## simplest
# Start the mask as an independent deep copy of the input arrays, so that
# editing the mask below leaves final_input untouched.
final_masking = copy.deepcopy(final_input)

In [None]:
# option 1 (disabled): only cap values above 1
# for i in range(len(final_masking)):
#     final_masking[i][final_masking[i] > 1] = 1

# option 2: set every value that is >= 0 to 1, in place, donor by donor
for donor_mask in final_masking:
    donor_mask[donor_mask >= 0] = 1

print(final_masking)

#### TIMESTAMP
Timestamp is another array of arrays. There is an array for every donor, that consists of the timepoints that are known for that donor. e.g. if donor1 has information from age 34, 61, and 62, then his timestamp would be [34,61,62].

In [None]:
# Per donor: the 'Age' column as its own array (one row per time point),
# collected in an object array in sorted DonorID order.
timestamp_df = inp[['DonorID','Age']].copy()
ages_per_donor = timestamp_df.set_index('DonorID').groupby('DonorID')
final_timestamp = ages_per_donor.apply(pd.DataFrame.to_numpy).to_numpy()
final_timestamp

### SPLITTING BALANCED

In [None]:
## This function takes the created train and test data as input
## it returns a measure of how similar train is to test
## it also shows an overview of the number of cases per attribute
## and a corrected version of this overview
def split_vis(x_train,x_test,y_train, y_test,train_val_size,test_size):
    """Compare the label distribution of a train/val split with a test split.

    Parameters
    ----------
    x_train, x_test : sized
        Only their lengths are used, for the printed split sizes.
    y_train, y_test
        Label matrices passed to ``get_combination_wise_output_matrix``.
    train_val_size, test_size : number
        Test counts are multiplied by ``train_val_size / test_size`` (and
        truncated to int) to make them comparable with the train counts.

    Returns
    -------
    tuple
        ``(dist_split, multi_split_dist, multi_split_dist_corr)``: the mean
        over labels of the standard deviation between the train row and the
        size-corrected test row, the raw count table, and the size-corrected
        count table.

    Raises
    ------
    ZeroDivisionError
        If ``test_size`` is 0.
    """
    # Counter is not imported at file level, so import it locally.
    # NOTE(review): get_combination_wise_output_matrix and natsorted are not
    # imported anywhere in this file either (presumably from
    # skmultilearn.model_selection.measures and natsort) - they must be in
    # scope before this function is called; confirm.
    from collections import Counter

    def _label_counts(labels):
        # Count every order-1 label combination over all rows.
        return Counter(str(combination)
                       for row in get_combination_wise_output_matrix(labels, order=1)
                       for combination in row)

    def _as_table(train_counts, test_counts):
        # Two rows (train/val, test), one naturally sorted column per label.
        table = pd.DataFrame({
            "for_train_and_val": train_counts,
            "test": test_counts
        }).T.fillna(0)
        return table.reindex(natsorted(table.columns), axis=1)

    train_counts = _label_counts(y_train)
    test_counts = _label_counts(y_test)

    # view distributions
    multi_split_dist = _as_table(train_counts, test_counts)

    # View size corrected distributions
    corrected_test_counts = Counter({
        label: int(count * (train_val_size/test_size))
        for label, count in test_counts.items()
    })
    multi_split_dist_corr = _as_table(train_counts, corrected_test_counts)

    print(f"train: {len(x_train)} ({len(x_train)/(len(x_train)+len(x_test)):.2f})\n"
          f"test: {len(x_test)} ({len(x_test)/(len(x_train)+len(x_test)):.2f})")
    dist_split = np.mean(np.std(multi_split_dist_corr.to_numpy(), axis=0))

    return dist_split,multi_split_dist,multi_split_dist_corr

## for figure 5, counts for split are in here

In [None]:
# Filled below with the outer fold in which each donor is part of the test set.
lt['foldinfo'] = None
# display(lt)
import numpy as np
from sklearn.model_selection import StratifiedKFold

# if eighties == True:
#     n_split = 10 # 5 for 60-20-20, 10 for 80-10-10
#     n_split_in = 9 # 4 for 60-20-20, 9 for 80-10-10
# elif eighties == False:
n_split = 5 # 5 for 60-20-20, 10 for 80-10-10
n_split_in = 4 # 4 for 60-20-20, 9 for 80-10-10

# One row per outer fold: [train indices, validation indices, test indices].
# Sized from n_split so that changing the split scheme does not overflow it.
fold_taskname = np.empty(shape=(n_split, 3), dtype=object)

# The diagnosis labels serve both as the "features" and as the stratification
# target: only the generated indices are used.
X = np.array(lt['simplified_diagnosis'].values)
y = np.array(lt['simplified_diagnosis'].values)
print(y)
print(y.shape)


## SET UP SPLIT BETWEEN TEST AND TRAIN/VAL
skf = StratifiedKFold(n_splits=n_split, random_state=1, shuffle=True)
print(skf)
for j, (train_val_index, test_index) in enumerate(skf.split(X, y)):
    ## USE THE GENERATED INDICES TO SELECT DIAGNOSES
    q_train_val, q_test = X[train_val_index], X[test_index]
    r_train_val, r_test = y[train_val_index], y[test_index]

    skf2 = StratifiedKFold(n_splits=n_split_in, random_state=2, shuffle=True)
    print(skf2)

    ## WITHIN EACH FOLD, SPLIT TRAIN/VAL INTO TRAIN AND VAL (ONLY NEEDED ONCE!)
    ## Only the first inner split is used.
    ## example:
    ## [0,1,2,3,4,5,6,7,8,9]  full data ['a','b','c','d','e','f','g','h','i','j']
    ## [0,2,3,5,7,9] indices selected for train/val ['a','c','d','f','h','j']
    ## [0,1,2,3,4,5]
    ## [0,2,3,9] indices points selected for train ['a','c','d','j']
    train_index, val_index = next(skf2.split(q_train_val, r_train_val))

    ## USE THE GENERATED INDICES TO CREATE NEW INDICES THAT WORK ON THE FULL DATA
    true_train = train_val_index[train_index]
    true_val = train_val_index[val_index]

    ## PRINT THE INDICES
    print("TRAIN:", true_train, "\nVAL:", true_val, "\nTEST:", test_index)
    q_train, q_val, q_test = X[true_train], X[true_val],X[test_index]
    # (the original assigned the third value to q_test instead of r_test)
    r_train, r_val, r_test = y[true_train], y[true_val],y[test_index]

    print(f"train: {len(q_train)} ({len(q_train)/(len(q_train)+len(q_test)+len(q_val)):.2f})\n"
          f"val: {len(q_val)} ({len(q_val)/(len(q_train)+len(q_test)+len(q_val)):.2f})\n"
          f"test: {len(q_test)} ({len(q_test)/(len(q_train)+len(q_test)+len(q_val)):.2f})")

    ## SAVE INTO NUMPY ARRAY
    fold_taskname[j][0] = np.asarray(true_train)
    fold_taskname[j][1] = np.asarray(true_val)
    fold_taskname[j][2] = np.asarray(test_index)
    lt.loc[test_index, 'foldinfo'] = j

    ## FOR VISUALIZING COUNTS PER DIAGNOSIS PER FOLD
    ## Counts are aligned on the diagnosis label, so a diagnosis that is
    ## missing from one of the splits shows 0 there instead of shifting the
    ## rows of that column.
    df3 = pd.DataFrame({
        'train': pd.Series(q_train).value_counts(),
        'val': pd.Series(q_val).value_counts(),
        'test': pd.Series(q_test).value_counts(),
    }).fillna(0).astype(int).sort_index()
    df3 = df3.rename_axis('diagnosis').reset_index()
    display(df3)
    print(df3['diagnosis'])
    print('---------')
print(fold_taskname)
display(lt)

In [None]:
# Number of donors per fold label (the result of this line is not assigned).
lt['foldinfo'].value_counts()
# Store the row index as a column; it is written out with the table later.
lt['indexes'] = lt.index
display(lt)

In [None]:
# fold_taskname[0,0]

In [None]:
# final_input

In [None]:
# final_input[fold_taskname[0][0]]

In [None]:
#np.set_printoptions(threshold=np.inf)
# Derive the sizes from the data instead of hard-coding them (was 5 folds and
# n_dim = 87, earlier 83).
n_fold = len(fold_taskname)
n_dim = final_input[0].shape[1]
mean_taskname = np.full((n_fold, 1, n_dim), np.nan)
std_taskname = np.full((n_fold, 1, n_dim), np.nan)
for i_split in range(n_fold):
    ## fold_taskname[i_split][0] holds the indices of the training donors of this fold,
    ## final_input[...] selects the arrays of those donors, and concatenate
    ## stacks them, so x_tr is all training rows of the fold
    x_tr = np.concatenate(final_input[fold_taskname[i_split][0]], axis=0)
    display(x_tr)
    ## mean of each training column, ignoring NaN
    mean_taskname[i_split][0] = np.nanmean(x_tr, axis=0)
    ## standard deviation of each training column, ignoring NaN
    std_taskname[i_split][0] = np.nanstd(x_tr, axis=0)

print(mean_taskname[0][0])
print(std_taskname[0][0])
mean_taskname

### SAVING

In [None]:
# if eighties == True:
#     prefix = '80_'
# elif eighties == False:
prefix = '60_'
savespace = f'clinical_history_{n}_observations'
output_path = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/temporal_model/data"

print(savespace)
# All outputs go into one folder named after the observation threshold.
save_dir = os.path.join(output_path, savespace)
os.makedirs(save_dir, exist_ok=True)

# Model arrays.
np.savez(os.path.join(save_dir, 'data.npz'),
         input=final_input,
         masking=final_masking,
         timestamp=final_timestamp,
         label_taskname=final_label_taskname)
# Fold indices plus the per-fold training mean and standard deviation.
np.savez(os.path.join(save_dir, 'fold.npz'),
         fold_taskname=fold_taskname,
         mean_taskname=mean_taskname,
         std_taskname=std_taskname)
# Donor table (labels, fold info, row indexes).
lt.to_excel(os.path.join(save_dir, 'donorindexes.xlsx'), index=False)

In [None]:
# A bare `break` outside a loop is a SyntaxError. It presumably served as a
# deliberate stop so the cells below are not executed on "Run All"; raise
# explicitly so that this intent is visible and the code stays valid Python.
raise RuntimeError("Intentional stop: run the cells below manually.")

In [None]:
# Write the current gru_d table to CSV without the DataFrame index.
gru_d.to_csv('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_diagnosis/output/gru_d_july.csv', index=False)

In [None]:
#### for seurat:

#### selecting a subset

In [None]:
# Name of the diagnosis selection passed to table_selector in the next cell.
# NOTE(review): the trailing note appears to list which selection belongs to
# which figure - confirm against helper_functions.table_selector.
table_of_choice = 'table1_p' #fig 4a table3_with_con_p #table2_p #fig 3a table1_P fig sup 5a:table2_p

In [None]:
# table_selector comes from helper_functions; it returns the selected rows
# and a second value stored as ordered_diagnoses.
selected_diagnoses,ordered_diagnoses = table_selector(table_of_choice, predictions_df)
selected_labels = selected_diagnoses['neuropathological_diagnosis'].unique()
n_selected_donors = selected_diagnoses['DonorID'].nunique()
print('After selecting for {}, we have {} donors'.format(selected_labels, n_selected_donors))
# Peek at a few AD rows of the selection.
is_ad = selected_diagnoses['neuropathological_diagnosis']=='AD'
display(selected_diagnoses[is_ad].head(5))


#### merge the table1 diagnoses back with the other donors
We do this so we can run the analysis on the table1 diagnoses, but also include other groups (such as AD,DLB).


In [None]:
unique_donor_ids = selected_diagnoses['DonorID'].unique().tolist()
print(len(unique_donor_ids))
# Rows of predictions_df that belong to donors outside the selection.
not_selected = ~predictions_df['DonorID'].isin(unique_donor_ids)
filtered_predictions_df = predictions_df[not_selected]
print(filtered_predictions_df.shape)
print(predictions_df.shape)
# Selected donors first, followed by all remaining donors.
merged_df = pd.concat([selected_diagnoses, filtered_predictions_df], ignore_index=True)
# merged_df['neuropathological_diagnosis'].value_counts().head(40)
# Donor counts per diagnosis (one row per donor/diagnosis pair).
unique_diagnoses = merged_df[['DonorID', 'neuropathological_diagnosis']].drop_duplicates()
unique_diagnoses['neuropathological_diagnosis'].value_counts().head(20)

In [None]:
# Write the merged table to CSV without the DataFrame index.
merged_df.to_csv('/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_diagnosis/output/selected_diagnoses_july.csv', index=False)

In [None]:
# psych = ['MDD', 'SCZ', 'BP']
# selected_diagnoses = selected_diagnoses[~selected_diagnoses['neuropathological_diagnosis'].isin(psych)]
# ordered_diagnoses =  ['AD', 'PD', 'VD','ATAXIA','DLB','FTD', 'MND', 'PSP', 'MS','MSA']

In [None]:
# Per diagnosis: how many donors (and what percentage) have at least one
# Constipation / Weight_loss observation.
pad = merged_df.copy()
pad['neuropathological_diagnosis'] = pad['neuropathological_diagnosis'].replace('PDD', 'PD')

pad = pad[['neuropathological_diagnosis','age_at_death','sex','Constipation','Weight_loss','DonorID','Age']]
# Collapse to one row per donor with the summed observations.
pad = pad.groupby(['neuropathological_diagnosis', 'age_at_death', 'sex', 'DonorID']).agg({'Constipation': 'sum', 'Weight_loss': 'sum'}).reset_index()

# Number of donors per diagnosis with a positive sum.
con = pad.groupby('neuropathological_diagnosis')['Constipation'].apply(lambda x: (x > 0).sum()).reset_index(name='Constipation Count')
# Fixed: this count was previously computed from the 'Constipation' column.
wl = pad.groupby('neuropathological_diagnosis')['Weight_loss'].apply(lambda x: (x > 0).sum()).reset_index(name='Weight loss Count')

# Percentage of donors per diagnosis with a positive sum.
percentage_df = pad.groupby('neuropathological_diagnosis')[['Constipation','Weight_loss']].apply(lambda x: (x > 0).mean() * 100).reset_index()
percentage_df.rename(columns={'Constipation': 'Constipation Percentage','Weight_loss':'Weight_loss Percentage'}, inplace=True)
# The three tables come from the same groupby key, so their rows line up.
percentage_df['Constipation Count'] = con['Constipation Count']
percentage_df['Weight loss Count'] = wl['Weight loss Count']
percentage_df.sort_values(by=['Constipation Percentage'],ascending=False,inplace=True)
display(percentage_df.head(30))

In [None]:
# attribute_grouping = pd.read_excel(path_to_attribute_grouping, engine='openpyxl', index_col=[0])#,header=3, sheet_name='S3. 90 signs and symptoms')
# # display(attribute_grouping.head())
# df = predictions_df.copy()

# df['symptoms'] = df[attributes].apply(lambda row: ', '.join([col for col in attributes if row[col] != 0]), axis=1)
# df.loc[(df[attributes] == 0).all(axis=1), 'symptoms'] = 'none'
# columns_to_keep = set(df.columns).difference(attributes)
# # df = df[columns_to_keep].copy()
# df['symptoms'] = df['symptoms'].str.split(',').apply(lambda x: ', '.join(set(x))).str.strip()

# df.tail(10)

# pd.set_option('display.max_colwidth', 100)
# dfgrouping = predictions_df.copy()

# # Iterate over the columns
# for column in attributes:
#     mask = dfgrouping[column] == 1
#     grouping = attribute_grouping.loc[attribute_grouping['ITname'] == column, 'Grouping'].iloc[0]
#     dfgrouping.loc[mask, column] = grouping

# dfgrouping['groupings'] = dfgrouping[attributes].apply(lambda x: ', '.join([val for val in x if val != 0]), axis=1)
# dfgrouping.loc[(dfgrouping[attributes] == 0).all(axis=1), 'groupings'] = 'none'

# columns_to_keep = set(dfgrouping.columns).difference(attributes)
# dfgrouping = dfgrouping[columns_to_keep].copy()
# dfgrouping['groupings'] = dfgrouping['groupings'].str.split(',').apply(lambda x: ', '.join(set([item.strip() for item in x]))).str.strip()



# ############ domain #############
# dfdomain = predictions_df.copy()
# for column in attributes:
#     mask = dfdomain[column] == 1
#     domain = attribute_grouping.loc[attribute_grouping['ITname'] == column, 'Domain'].iloc[0]
#     dfdomain.loc[mask, column] = domain

# dfdomain['Domains'] = dfdomain[attributes].apply(lambda x: ', '.join([val for val in x if val != 0]), axis=1)
# dfdomain.loc[(dfdomain[attributes] == 0).all(axis=1), 'Domains'] = 'none'

# columns_to_keep = set(dfdomain.columns).difference(attributes)
# dfdomain = dfdomain[columns_to_keep].copy()
# dfdomain['Domains'] = dfdomain['Domains'].str.split(',').apply(lambda x: ', '.join(set([item.strip() for item in x]))).str.strip()



# merged_df = df.merge(dfgrouping, on=['neuropathological_diagnosis', 'DonorID', 'Year', 'Age', 'sex', 'age_at_death'])
# predictions_df = merged_df.merge(dfdomain, on=['neuropathological_diagnosis', 'DonorID', 'Year', 'Age', 'sex', 'age_at_death'])
# display(predictions_df.head(40))