In [1]:
import pandas as pd
import numpy as np
import math
import random as python_random
import io
import os
import glob
import matplotlib.pyplot as plt
from numpy import linalg as LA
import matplotlib.pyplot as plt

from IPython.display import clear_output

# Random seeds, one per evaluated run; the driver loop below reads the
# prediction file "bipred_<seed>.csv" for each of them.
seeds = [
    19,
    31,
    38,
    47,
    77,
]

In [2]:
# Finding labels; each one is used as a ground-truth column name in the
# prediction frame, with the predicted flag in the column "bi_<label>".
diseases = [
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
    'No Finding',
]

# Short names for the findings (presumably used as axis/table labels -- confirm).
# NOTE(review): 'Effusion' is a key here but not an entry of `diseases`.
diseases_abbr = dict([
    ('Cardiomegaly', 'Cardiomegaly'),
    ('Effusion', 'Effusion'),
    ('Enlarged Cardiomediastinum', 'Enlarged Card.'),
    ('Lung Lesion', 'Lung Lesion'),
    ('Atelectasis', 'Atelectasis'),
    ('Pneumonia', 'Pneumonia'),
    ('Pneumothorax', 'Pneumothorax'),
    ('Consolidation', 'Consolidation'),
    ('Edema', 'Edema'),
    ('Pleural Effusion', 'Effusion'),
    ('Pleural Other', 'Pleural Other'),
    ('Fracture', 'Fracture'),
    ('Support Devices', 'Sup. Devices'),
    ('Lung Opacity', 'Air. Opacity'),
    ('No Finding', 'No Finding'),
])

# Subgroup labels per demographic factor. The ORDER matters: downstream
# result columns are emitted in this order.
age_decile = ['60-80', '40-60', '20-40', '80+', '0-20']

gender = ['M', 'F']

race = [
    'WHITE',
    'BLACK/AFRICAN AMERICAN',
    'HISPANIC/LATINO',
    'OTHER',
    'ASIAN',
    'AMERICAN INDIAN/ALASKA NATIVE',
]

# Parallel lists: factor[i] holds the subgroup labels found in the data
# column named factor_str[i].
factor_str = ['gender', 'age_decile', 'race']

factor = [gender, age_decile, race]


## TPR Disparities

In [3]:
def tpr(df, d, c, category_name):
  """Return the true-positive rate of disease `d` inside subgroup `c`.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the ground-truth column `d`, the predicted-flag column
      "bi_" + d and the subgroup column `category_name`.
  d : str
      Disease (ground-truth column) name.
  c : object
      Subgroup label to select in `df[category_name]`.
  category_name : str
      Name of the column holding the subgroup labels.

  Returns
  -------
  float
      (# rows with d == 1, bi_d == 1, subgroup == c) /
      (# rows with d == 1, subgroup == c), or NaN when the subgroup has no
      positive rows for this disease.
  """
  # Both masks are built up front so a missing column raises KeyError even
  # when the subgroup turns out to be empty.
  positive_in_group = (df[d] == 1) & (df[category_name] == c)
  detected_in_group = positive_in_group & (df["bi_" + d] == 1)

  # Counting mask hits avoids materialising two DataFrame slices.
  n_positive = int(positive_in_group.sum())
  if n_positive == 0:
    # `np.nan` is the canonical spelling; the `np.NAN` alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    return np.nan

  return int(detected_in_group.sum()) / n_positive

In [4]:
def TPR_Disparities(df,diseases,category,category_name,seed=19,tpr_gaps_results_path_dir_dir="default"):
  """Compute per-subgroup TPR gaps for every disease and save them as a CSV.

  For each label `c` in `category` and each disease `d` two values are
  produced:

  * "%<suffix>"    -- share of the positive rows of `d` that belong to `c`
                      (positives whose subgroup value equals 0 are excluded
                      from the denominator; presumably 0 marks a missing
                      label -- confirm against the data preparation).
  * "Gap_<suffix>" -- TPR of `c` minus a reference TPR. For
                      category_name == 'gender' the reference is the TPR of
                      all other rows (subgroup != c and != 0); for any other
                      factor it is the median TPR over the labels in
                      `category` that have at least one positive row.

  When `c` has no positive row, when there is no positive row outside `c`,
  or when there is no positive row with a non-zero subgroup value, the gap is
  NaN and the share is 0.

  Column suffixes are derived from the subgroup LABEL (short names for the
  known race labels, the label itself otherwise), so the columns stay
  correctly named whatever the order of `category`.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs, for every disease d, the columns d and "bi_" + d, plus the
      column `category_name`.
  diseases : list of str
      Disease column names; one output row per entry.
  category : list of str
      Subgroup labels to evaluate; two output columns per entry.
  category_name : str
      Column holding the subgroup labels. 'gender', 'age_decile' and 'race'
      are written to disk; any other name is computed and returned only
      (such names never produced a file).
  seed : int, default 19
      Only used to build the output file name.
  tpr_gaps_results_path_dir_dir : str, default "default"
      Prefix of the output path. It is concatenated verbatim with the file
      name, so it must end with a path separator to denote a directory.

  Returns
  -------
  pandas.DataFrame
      Column "diseases" followed by the "%..." / "Gap_..." pairs.

  Side effects
  ------------
  Sets the global matplotlib font size to 18, prints the gap matrix size
  (and the matrix itself for 'age_decile'), and writes
  "<prefix>Run_seed<seed>_TPR_GAP_<sex|Age|race>.csv".
  """
  # Kept for backward compatibility: the function itself draws nothing, but
  # code running after it may rely on this global font size.
  plt.rcParams.update({'font.size': 18})

  # Output-file tag for each factor that is persisted.
  file_tags = {'gender': 'sex', 'age_decile': 'Age', 'race': 'race'}

  # Short column suffixes for the known race labels.
  race_suffixes = {
      'WHITE': 'White',
      'BLACK/AFRICAN AMERICAN': 'Black',
      'HISPANIC/LATINO': 'Hisp',
      'OTHER': 'Other',
      'ASIAN': 'Asian',
      'AMERICAN INDIAN/ALASKA NATIVE': 'American',
  }

  group_values = df[category_name]
  has_label = group_values != 0
  use_complement = category_name == 'gender'

  # Membership masks do not depend on the disease, so build them once.
  memberships = [(group_values == c, group_values != c) for c in category]

  # One row per subgroup, one entry per disease (same layout as before).
  GAP_total = [[] for _ in category]
  percentage_total = [[] for _ in category]

  for d in diseases:
    positive = df[d] == 1
    detected = positive & (df["bi_" + d] == 1)
    labelled_positive = positive & has_label
    n_labelled_positive = int(labelled_positive.sum())

    own_pos, own_hit, rest_pos, rest_hit = [], [], [], []
    for inside, outside in memberships:
      own_pos.append(int((positive & inside).sum()))
      own_hit.append(int((detected & inside).sum()))
      rest_pos.append(int((labelled_positive & outside).sum()))
      rest_hit.append(int((detected & has_label & outside).sum()))

    # Median TPR across subgroups, computed once per disease instead of once
    # per (disease, subgroup). Subgroups without positives are left out.
    median_tpr = np.nan
    if not use_complement:
      subgroup_tprs = [hit / pos for hit, pos in zip(own_hit, own_pos) if pos != 0]
      if subgroup_tprs:
        median_tpr = float(np.median(subgroup_tprs))

    for k in range(len(memberships)):
      if own_pos[k] == 0 or rest_pos[k] == 0 or n_labelled_positive == 0:
        # np.nan rather than np.NAN: the upper-case alias is gone in NumPy 2.0.
        GAP_total[k].append(np.nan)
        percentage_total[k].append(0)
        continue

      own_tpr = own_hit[k] / own_pos[k]
      if use_complement:
        reference_tpr = rest_hit[k] / rest_pos[k]
      else:
        reference_tpr = median_tpr

      GAP_total[k].append(own_tpr - reference_tpr)
      percentage_total[k].append(own_pos[k] / n_labelled_positive)

  GAP_total = np.array(GAP_total)

  if category_name == 'age_decile':
    print(f'GAP_total: {GAP_total}')

  print("len(GAP_total): ",len(GAP_total))

  # Assemble the table in one go: "diseases", then a (%, Gap) pair per label.
  columns = {"diseases": list(diseases)}
  for c, shares, gaps in zip(category, percentage_total, GAP_total):
    if category_name == 'race':
      suffix = race_suffixes.get(c, str(c))
    else:
      suffix = str(c)
    columns["%" + suffix] = shares
    columns["Gap_" + suffix] = gaps

  table = pd.DataFrame(columns)

  # Write once, after all columns exist. Nothing is written for an empty
  # `category` or an unsupported factor name, as before.
  file_tag = file_tags.get(category_name)
  if file_tag is not None and len(GAP_total) > 0:
    table.to_csv(tpr_gaps_results_path_dir_dir+"Run_seed"+str(seed)+"_TPR_GAP_"+file_tag+".csv")

  return table


In [5]:
for seed in seeds:

    # Seed both global RNGs with the current run's seed.
    python_random.seed(seed)
    np.random.seed(seed)

    # Folder holding the per-seed prediction files.
    base_path = "./Prediction_results/"
    tpr_results_path_dir = os.path.dirname(base_path)

    # Make sure the folder receiving the TPR-gap CSV files exists.
    tpr_gaps_results_path_dir_dir='./TPR_GAPS/'
    os.makedirs(
        os.path.dirname(tpr_gaps_results_path_dir_dir),
        exist_ok=True,
    )

    # Predictions of the run trained with this seed.
    df = pd.read_csv(f"{base_path}bipred_{seed}.csv")

    # TPR disparities: one table per demographic factor.
    for i in range(len(factor)):
        TPR_Disparities(
            df,
            diseases,
            factor[i],
            factor_str[i],
            seed=seed,
            tpr_gaps_results_path_dir_dir=tpr_gaps_results_path_dir_dir,
        )

    print(f'SEED : {seed}')

  df = pd.read_csv(f"{base_path}bipred_{seed}.csv")


len(GAP_total):  2
GAP_total: [[ 0.0187614   0.          0.020532    0.04718506  0.02716125  0.01961892
   0.          0.01743029  0.          0.01022471  0.09543162  0.03094044
   0.00188132 -0.04133736]
 [ 0.         -0.01748851  0.          0.          0.          0.01690214
   0.03594421  0.          0.00923281  0.00085307  0.07263477  0.
   0.          0.        ]
 [-0.02290441 -0.02915029 -0.01563051  0.04172494 -0.00405041 -0.00379564
   0.04601132 -0.00828745 -0.01009281  0.          0.         -0.02654592
  -0.00723083  0.04652956]
 [ 0.02722766  0.01779521  0.02655845 -0.00470661  0.03325851  0.
  -0.04822038  0.01329072 -0.12685027 -0.00105585 -0.01076806  0.01971942
  -0.04315355 -0.17347044]
 [-0.04582492  0.0062818  -0.0252133  -0.07056856 -0.07854    -0.01833722
  -0.10691294 -0.02520364  0.16808412 -0.02139483 -0.08975357 -0.12144285
   0.05320924  0.0001881 ]]
len(GAP_total):  5
len(GAP_total):  6
SEED : 19


  df = pd.read_csv(f"{base_path}bipred_{seed}.csv")


len(GAP_total):  2
GAP_total: [[ 0.02048882  0.02560821  0.02160958  0.04960981  0.02461043  0.00139563
   0.          0.01557015  0.          0.01134366  0.07485228  0.0245287
   0.0023559  -0.06163629]
 [ 0.          0.          0.          0.         -0.00071394  0.00235685
   0.03598905  0.          0.02597041  0.          0.05098978  0.
   0.          0.        ]
 [-0.02058169 -0.00965496 -0.01880599  0.05477855  0.         -0.02493287
   0.04474393 -0.02316704 -0.01247236 -0.01215151  0.         -0.01978509
  -0.01217677  0.03917095]
 [ 0.03915867  0.04769591  0.03003922 -0.03922178  0.03687641  0.
  -0.05912392  0.01970198 -0.11611572  0.01383007 -0.01443918  0.00491917
  -0.03716879 -0.20547191]
 [-0.08305652 -0.00598633 -0.01615543 -0.08082497 -0.07651102 -0.05651386
  -0.22371245 -0.03420664  0.14342044 -0.02345806 -0.06588846 -0.06553672
   0.03915965  0.00532949]]
len(GAP_total):  5
len(GAP_total):  6
SEED : 31


  df = pd.read_csv(f"{base_path}bipred_{seed}.csv")


len(GAP_total):  2
GAP_total: [[ 0.01953061  0.00862988  0.01847541  0.05411093  0.02953279  0.0100447
   0.          0.01892387 -0.00413768  0.01289078  0.07971014  0.01703278
   0.00216635 -0.04648943]
 [ 0.         -0.01501987 -0.00087747  0.          0.          0.
   0.02123182  0.          0.02253251  0.          0.06043417 -0.02101996
   0.          0.01134867]
 [-0.01948569 -0.0269033  -0.01529169  0.02261072 -0.00528315 -0.01518379
   0.03076303 -0.00980934  0.         -0.01448078 -0.01497209 -0.01422701
  -0.01293377  0.05510671]
 [ 0.03435596  0.02982204  0.02675644 -0.01923422  0.03895788  0.01368307
  -0.04156454  0.02270346 -0.11937042  0.0160797   0.          0.
  -0.03983783 -0.19212544]
 [-0.04408124  0.          0.         -0.093534   -0.07047417 -0.01154356
  -0.12599632 -0.00696899  0.15694938 -0.01499374 -0.12028986  0.03554675
   0.03420097  0.        ]]
len(GAP_total):  5
len(GAP_total):  6
SEED : 38


  df = pd.read_csv(f"{base_path}bipred_{seed}.csv")


len(GAP_total):  2
GAP_total: [[ 1.58449042e-02  5.64754463e-03  1.69958483e-02  5.35813824e-02
   2.20155585e-02  5.58741212e-03  0.00000000e+00  2.19800201e-02
   0.00000000e+00  8.72895836e-03  5.52564987e-02  3.75031910e-02
   6.53142783e-04 -5.31771465e-02]
 [ 0.00000000e+00 -1.53698861e-02 -7.10378433e-03  0.00000000e+00
  -7.61891843e-03  1.43852763e-04  3.58993658e-02  0.00000000e+00
   2.36497101e-02  0.00000000e+00  3.35193475e-02  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.40668615e-02 -2.90354948e-02 -1.60307873e-02  3.78787879e-02
   0.00000000e+00 -1.08371335e-02  3.55337443e-02 -5.86973327e-03
  -3.26954545e-03 -6.39876857e-03 -5.43393673e-02 -6.59133710e-03
  -7.64289187e-03  5.76196979e-02]
 [ 3.29882861e-02  2.63346838e-02  2.32061484e-02 -2.60384270e-02
   2.98583955e-02  0.00000000e+00 -5.22794409e-02  2.05895543e-02
  -1.20014683e-01  8.10241053e-03  0.00000000e+00  3.05114851e-02
  -4.33828318e-02 -2.22335659e-01]
 [-2.16709155e-02  0.00000000e+00  0.0

  df = pd.read_csv(f"{base_path}bipred_{seed}.csv")


len(GAP_total):  2
GAP_total: [[ 1.82112693e-02  1.41357300e-02  2.49841064e-02  3.98690078e-02
   2.57602254e-02  9.30654577e-03  0.00000000e+00  1.61185425e-02
  -1.96734826e-04  3.01509075e-03  8.05728088e-02  3.75164966e-02
   4.34751711e-05 -7.36152545e-02]
 [ 0.00000000e+00 -6.61944862e-03  0.00000000e+00  0.00000000e+00
  -3.89096724e-04  9.89666614e-03  3.37326244e-02  0.00000000e+00
   7.00814364e-03  0.00000000e+00  6.34236831e-02  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.86664213e-02 -2.15728082e-02 -1.46937304e-02  3.29836830e-02
   0.00000000e+00 -9.26972684e-03  3.67327246e-02 -2.22372524e-03
   0.00000000e+00 -1.59890368e-02 -9.60074437e-03 -2.26531517e-02
  -1.40715790e-02  4.93024314e-02]
 [ 3.64153980e-02  2.67706036e-02  2.95673244e-02 -2.18985417e-02
   3.20861476e-02  0.00000000e+00 -3.97664637e-02  1.62557397e-02
  -1.26580718e-01  1.96110606e-03  0.00000000e+00  1.93864587e-02
  -4.87885568e-02 -2.14045783e-01]
 [-2.63207266e-02  0.00000000e+00 -2.7