In [1]:
import pandas as pd
import numpy as np
import math
import random as python_random
import io
import os
import glob
import matplotlib.pyplot as plt
from numpy import linalg as LA
import matplotlib.pyplot as plt

from IPython.display import clear_output

# Seeds of the evaluated runs: the driver loop further down re-seeds the RNGs with
# each value and reads the matching `bipred_<seed>.csv` prediction file.
seeds=[19,31,38,47,77]

In [2]:
# Finding labels. Each label is used as a ground-truth column name; the matching
# binarised prediction column is looked up as "bi_<label>".
diseases = [
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
    'No Finding',
]

# Short display names. Most labels abbreviate to themselves; the exceptions are
# patched in afterwards (updating an existing key keeps its insertion position).
# Note that 'Effusion' is a key here although it is not one of `diseases`.
diseases_abbr = {
    label: label
    for label in (
        'Cardiomegaly',
        'Effusion',
        'Enlarged Cardiomediastinum',
        'Lung Lesion',
        'Atelectasis',
        'Pneumonia',
        'Pneumothorax',
        'Consolidation',
        'Edema',
        'Pleural Effusion',
        'Pleural Other',
        'Fracture',
        'Support Devices',
        'Lung Opacity',
        'No Finding',
    )
}
diseases_abbr.update({
    'Enlarged Cardiomediastinum': 'Enlarged Card.',
    'Pleural Effusion': 'Effusion',
    'Support Devices': 'Sup. Devices',
    'Lung Opacity': 'Air. Opacity',
})

# Protected-attribute groups.
gender = ['M', 'F']
age_decile = ['60-80', '40-60', '20-40', '80+', '0-20']
race = [
    'WHITE',
    'BLACK/AFRICAN AMERICAN',
    'HISPANIC/LATINO',
    'OTHER',
    'ASIAN',
    'AMERICAN INDIAN/ALASKA NATIVE',
]

# Parallel lists: factor[i] holds the group values of the column named factor_str[i].
factor_str = ['gender', 'age_decile', 'race']
factor = [gender, age_decile, race]


## TPR Disparities

In [3]:
def tpr(df, d, c, category_name):
  """Return the true-positive rate of disease `d` within one group.

  Args:
    df: DataFrame holding the ground-truth column `d`, the binarised
      prediction column "bi_" + d, and the grouping column `category_name`.
    d: Disease label (ground-truth column name).
    c: Group value to select, compared against df[category_name].
    category_name: Name of the grouping column.

  Returns:
    (# rows in group `c` with d == 1 and bi_d == 1) / (# rows in group `c`
    with d == 1) as a float, or NaN when the group has no positive row.
  """
  positive_in_group = (df[d] == 1) & (df[category_name] == c)
  detected_in_group = positive_in_group & (df["bi_" + d] == 1)

  # Count on the boolean masks instead of materialising filtered frames.
  n_positive = int(positive_in_group.sum())
  if n_positive == 0:
    # `np.nan` rather than `np.NAN`: the upper-case alias was removed in NumPy 2.0.
    return np.nan

  return int(detected_in_group.sum()) / n_positive

In [4]:
# Column-label suffixes of the race CSV, keyed by the race value found in the data.
_TPR_GAP_RACE_LABELS = {
  'WHITE': 'White',
  'BLACK/AFRICAN AMERICAN': 'Black',
  'HISPANIC/LATINO': 'Hisp',
  'OTHER': 'Other',
  'ASIAN': 'Asian',
  'AMERICAN INDIAN/ALASKA NATIVE': 'American',
}

# File-name tag of the CSV written for each supported grouping column.
_TPR_GAP_FILE_TAGS = {
  'gender': 'sex',
  'age_decile': 'Age',
  'race': 'race',
}


def _tpr_gap_column_label(c, category_name):
  """Return the suffix naming the "%<suffix>" / "Gap_<suffix>" columns of group `c`."""
  if category_name == 'race':
    # Unknown race values fall back to the raw value with ' ' and '/' replaced.
    fallback = str(c).replace(' ', '_').replace('/', '_')
    return _TPR_GAP_RACE_LABELS.get(c, fallback)
  return str(c)


def TPR_Disparities(df,diseases,category,category_name,seed=19,tpr_gaps_results_path_dir_dir="default"):
  """Compute per-group TPR gaps for every disease and save them as a CSV table.

  For each group `c` in `category` and each disease `d`:

  * TPR(c, d) = #(d == 1, bi_d == 1, group == c) / #(d == 1, group == c)
  * share(c, d) = #(d == 1, group == c) / #(d == 1, group != 0)
  * gap(c, d) is TPR(c, d) minus a reference value:
      - 'gender': the TPR over all rows whose group is neither `c` nor 0;
      - any other `category_name`: the median of the non-NaN TPRs over all
        groups listed in `category`.

  When the group has no positive row, or no positive row exists outside the
  group, the gap is NaN and the share is recorded as 0.

  Args:
    df: DataFrame with one ground-truth column per disease, a matching
      "bi_<disease>" prediction column, and the column `category_name`.
    diseases: Disease labels (one output row per label).
    category: Group values to evaluate (two output columns per value).
    category_name: Grouping column. A CSV is written only for 'gender',
      'age_decile' and 'race'.
    seed: Only used in the output file name.
    tpr_gaps_results_path_dir_dir: Prefix that is concatenated verbatim with
      the file name, so it must end with a path separator to denote a
      directory.

  Returns:
    None. Writes "<prefix>Run_seed<seed>_TPR_GAP_<sex|Age|race>.csv" with the
    columns diseases, %<group>, Gap_<group>, ... in the order of `category`.
    Columns are labelled from the group value itself, so the labels stay
    correct for any ordering of `category`.
  """
  # Kept from the original implementation: adjusts the global matplotlib font size.
  plt.rcParams.update({'font.size': 18})

  group = df[category_name]
  known_group = group != 0
  in_group = [group == c for c in category]
  out_group = [(group != c) & known_group for c in category]

  # One row per group, one entry per disease.
  GAP_total = [[] for _ in in_group]
  percentage_total = [[] for _ in in_group]

  for d in diseases:
    positive = df[d] == 1
    detected = positive & (df["bi_" + d] == 1)
    n_positive_known = int((positive & known_group).sum())

    # Per-group TPRs are computed once per disease and shared by all groups.
    n_pos = [int((positive & mask).sum()) for mask in in_group]
    group_tpr = [int((detected & mask).sum()) / n if n else np.nan
                 for mask, n in zip(in_group, n_pos)]

    median_tpr = np.nan
    if category_name != 'gender':
      observed = [t for t in group_tpr if not math.isnan(t)]
      if observed:
        median_tpr = float(np.median(observed))

    for i, rest in enumerate(out_group):
      n_rest_pos = int((positive & rest).sum())

      if n_pos[i] == 0 or n_rest_pos == 0 or n_positive_known == 0:
        GAP_total[i].append(np.nan)
        percentage_total[i].append(0)
        continue

      if category_name == 'gender':
        reference = int((detected & rest).sum()) / n_rest_pos
      else:
        reference = median_tpr

      GAP_total[i].append(group_tpr[i] - reference)
      percentage_total[i].append(n_pos[i] / n_positive_known)

  GAP_total = np.array(GAP_total)

  if category_name == 'age_decile':
    print(f'GAP_total: {GAP_total}')

  print("len(GAP_total): ",len(GAP_total))

  file_tag = _TPR_GAP_FILE_TAGS.get(category_name)
  if file_tag is None or len(GAP_total) == 0:
    # Unsupported grouping column or no group to report: nothing is written.
    return

  # Assemble the table in one go instead of growing it column by column.
  columns = [pd.Series(diseases, name="diseases")]
  for i, c in enumerate(category):
    label = _tpr_gap_column_label(c, category_name)
    columns.append(pd.Series(percentage_total[i], name="%" + label))
    columns.append(pd.Series(GAP_total[i], name="Gap_" + label))
  result = pd.concat(columns, axis=1)

  result.to_csv(tpr_gaps_results_path_dir_dir+"Run_seed"+str(seed)+"_TPR_GAP_"+file_tag+".csv")


In [5]:
for seed in seeds:

    # Re-seed both random number generators for this run.
    python_random.seed(seed)
    np.random.seed(seed)

    # Input location of the per-seed prediction files.
    base_path = "./Prediction_results/"
    tpr_results_path_dir = os.path.dirname(base_path)

    # Output directory for the TPR-gap tables (created on demand).
    tpr_gaps_results_path_dir_dir='./TPR_GAPS/'
    os.makedirs(os.path.dirname(tpr_gaps_results_path_dir_dir), exist_ok=True)

    df = pd.read_csv(f"{base_path}bipred_{seed}.csv")

    # TPR disparities: one table per protected attribute.
    for groups, column_name in zip(factor, factor_str):
        TPR_Disparities(df, diseases, groups, column_name, seed, tpr_gaps_results_path_dir_dir)

    print("Seed : ",seed)

len(GAP_total):  2
GAP_total: [[ 0.0154793   0.00982416  0.02030334  0.04222408  0.01761168  0.01775197
   0.          0.02199111 -0.00658593  0.00559609  0.11616057  0.02312821
   0.00184984 -0.05047492]
 [ 0.         -0.01185942  0.          0.         -0.00916579  0.0132735
   0.04600282  0.          0.01296507  0.          0.1032532   0.
   0.          0.01688716]
 [-0.01306007 -0.0156152  -0.01878958  0.04662005  0.         -0.00684125
   0.05329522 -0.00895238  0.         -0.01407383 -0.01053121 -0.03909605
  -0.00196078  0.05605945]
 [ 0.0339197   0.0282835   0.02209436 -0.01430645  0.02163467  0.
  -0.07140143  0.01506588 -0.13019203  0.01035318  0.          0.00205256
  -0.04667048 -0.20202091]
 [-0.03839773  0.         -0.04330726 -0.09364548 -0.09023145 -0.03423983
  -0.10346413 -0.08024103  0.15589281 -0.02467507 -0.08913043 -0.06473707
   0.04797508  0.        ]]
len(GAP_total):  5
len(GAP_total):  6
Seed :  19
len(GAP_total):  2
GAP_total: [[ 0.01409432  0.          0.019