In [1]:
import os

import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Location of the HAM10000 metadata CSV. The original absolute path is kept as
# the default; set the HAM10000_METADATA environment variable to point at the
# file on another machine instead of editing this cell.
metadata_path = os.environ.get(
    'HAM10000_METADATA',
    '/mnt/c/Users/Morning/Desktop/MORNING_NEXT/HAM10000_metadata.csv',
)
df = pd.read_csv(metadata_path)
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [3]:
# Size of the raw metadata table as (rows, columns).
df.shape

(10015, 7)

In [4]:
# Per-column dtypes and non-null counts of the raw table (reveals missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lesion_id     10015 non-null  object 
 1   image_id      10015 non-null  object 
 2   dx            10015 non-null  object 
 3   dx_type       10015 non-null  object 
 4   age           9958 non-null   float64
 5   sex           10015 non-null  object 
 6   localization  10015 non-null  object 
dtypes: float64(1), object(6)
memory usage: 547.8+ KB


In [5]:
# Discard every row that contains a missing value and renumber the index from
# zero, then re-check the non-null counts.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9958 entries, 0 to 9957
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lesion_id     9958 non-null   object 
 1   image_id      9958 non-null   object 
 2   dx            9958 non-null   object 
 3   dx_type       9958 non-null   object 
 4   age           9958 non-null   float64
 5   sex           9958 non-null   object 
 6   localization  9958 non-null   object 
dtypes: float64(1), object(6)
memory usage: 544.7+ KB


In [6]:
# Remove the two identifier columns so only diagnosis and patient attributes remain.
# NOTE(review): the preview of the raw table shows several rows sharing one
# lesion_id; once this column is gone, splits can no longer be grouped by
# lesion - confirm that is acceptable for the cross-validation used later.
df = df.drop(columns=['lesion_id', 'image_id'])

In [7]:
# Preview the frame after the identifier columns were dropped.
df.head()

Unnamed: 0,dx,dx_type,age,sex,localization
0,bkl,histo,80.0,male,scalp
1,bkl,histo,80.0,male,scalp
2,bkl,histo,80.0,male,scalp
3,bkl,histo,80.0,male,scalp
4,bkl,histo,75.0,male,ear


In [8]:
# List the distinct diagnosis labels present in the dx column.
df['dx'].unique()

array(['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec'], dtype=object)

In [9]:
# Integer code assigned to each diagnosis label.
dx_codes = {'bkl': 0, 'nv': 1, 'df': 2, 'mel': 3, 'vasc': 4, 'bcc': 5, 'akiec': 6}
# Encode only while the column still holds string labels: map() on values that
# are already integer codes finds no matching key and would silently turn the
# whole column into NaN if this cell were executed a second time.
if not pd.api.types.is_numeric_dtype(df['dx']):
    df['dx'] = df['dx'].map(dx_codes)
df['dx'].unique()

array([0, 1, 2, 3, 4, 5, 6])

In [10]:
# List the distinct values of dx_type before it is one-hot encoded.
df['dx_type'].unique()

array(['histo', 'consensus', 'confocal', 'follow_up'], dtype=object)

In [11]:
# One-hot encode dx_type. The column is reshaped to a single-column 2-D array
# before being handed to the encoder.
dxtype = df['dx_type'].values.reshape(-1, 1)
enc = OneHotEncoder(categories='auto').fit(dxtype)

# Show the generated feature names so the column labels can be checked.
new_features = enc.get_feature_names_out()
print(new_features)

# Densify the encoder output and wrap it in a DataFrame (default integer column names).
dxtype_dense = enc.transform(dxtype).toarray()
new_dxtype = pd.DataFrame(dxtype_dense)

['x0_confocal' 'x0_consensus' 'x0_follow_up' 'x0_histo']


In [12]:
# Label the one-hot columns with the categories the encoder actually learned
# (same order as its output columns) instead of a hand-typed list, so the
# names cannot drift out of sync with the data.
new_dxtype.columns = list(enc.categories_[0])

In [13]:
# Append the one-hot columns side by side (rows are matched on the index) and
# remove the original categorical dx_type column.
df = pd.concat([df, new_dxtype], axis=1).drop(columns='dx_type')

In [14]:
# Preview the frame with dx_type replaced by its one-hot columns.
df.head()

Unnamed: 0,dx,age,sex,localization,confocal,consensus,follow_up,histo
0,0,80.0,male,scalp,0.0,0.0,0.0,1.0
1,0,80.0,male,scalp,0.0,0.0,0.0,1.0
2,0,80.0,male,scalp,0.0,0.0,0.0,1.0
3,0,80.0,male,scalp,0.0,0.0,0.0,1.0
4,0,75.0,male,ear,0.0,0.0,0.0,1.0


In [15]:
# Keep only the rows whose sex is recorded (drops the 'unknown' entries).
sex_is_known = df['sex'].ne('unknown')
df = df.loc[sex_is_known]

In [16]:
# Renumber the rows from zero after filtering; the old index is discarded.
df = df.reset_index(drop=True)

In [17]:
# Shape after removing the rows whose sex is 'unknown'.
df.shape

(9948, 8)

In [18]:
# Confirm which sex labels remain after the filter.
df['sex'].unique()

array(['male', 'female'], dtype=object)

In [19]:
# Keep only the rows with a recorded body site, then list the remaining sites.
site_is_known = df['localization'].ne('unknown')
df = df.loc[site_is_known]
df['localization'].unique()

array(['scalp', 'ear', 'face', 'back', 'trunk', 'chest',
       'upper extremity', 'abdomen', 'lower extremity', 'genital', 'neck',
       'hand', 'foot', 'acral'], dtype=object)

In [20]:
# Renumber the rows from zero after filtering; the old index is discarded.
df = df.reset_index(drop=True)

In [21]:
# Shape after removing the rows whose localization is 'unknown'.
df.shape

(9761, 8)

In [22]:
# One-hot encode localization. The column is reshaped to a single-column 2-D
# array before being handed to the encoder.
local = df['localization'].values.reshape(-1, 1)
enc2 = OneHotEncoder(categories='auto').fit(local)

# Show the generated feature names so the column labels can be checked.
new_features = enc2.get_feature_names_out()
print(new_features)

# Densify the encoder output and wrap it in a DataFrame (default integer column names).
local_dense = enc2.transform(local).toarray()
new_local = pd.DataFrame(local_dense)

['x0_abdomen' 'x0_acral' 'x0_back' 'x0_chest' 'x0_ear' 'x0_face' 'x0_foot'
 'x0_genital' 'x0_hand' 'x0_lower extremity' 'x0_neck' 'x0_scalp'
 'x0_trunk' 'x0_upper extremity']


In [23]:
# Label the one-hot columns with the categories the encoder actually learned
# (same order as its output columns) instead of a hand-typed list, so the
# names cannot drift out of sync with the data.
new_local.columns = list(enc2.categories_[0])

In [24]:
# Append the one-hot columns side by side (rows are matched on the index) and
# remove the original categorical localization column.
df = pd.concat([df, new_local], axis=1).drop(columns='localization')

In [25]:
# Shape after replacing localization with its one-hot columns.
df.shape

(9761, 21)

In [26]:
# Re-check the distinct sex labels in the merged frame.
df['sex'].unique()

array(['male', 'female'], dtype=object)

In [27]:
# Dtypes and non-null counts after both one-hot encodings were merged in.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9761 entries, 0 to 9760
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   dx               9761 non-null   int64  
 1   age              9761 non-null   float64
 2   sex              9761 non-null   object 
 3   confocal         9761 non-null   float64
 4   consensus        9761 non-null   float64
 5   follow_up        9761 non-null   float64
 6   histo            9761 non-null   float64
 7   abdomen          9761 non-null   float64
 8   acral            9761 non-null   float64
 9   back             9761 non-null   float64
 10  chest            9761 non-null   float64
 11  ear              9761 non-null   float64
 12  face             9761 non-null   float64
 13  foot             9761 non-null   float64
 14  genital          9761 non-null   float64
 15  hand             9761 non-null   float64
 16  lower extremity  9761 non-null   float64
 17  neck          

In [28]:
# Preview the frame with both categorical columns one-hot encoded.
df.head()

Unnamed: 0,dx,age,sex,confocal,consensus,follow_up,histo,abdomen,acral,back,...,ear,face,foot,genital,hand,lower extremity,neck,scalp,trunk,upper extremity
0,0,80.0,male,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,80.0,male,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,80.0,male,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,80.0,male,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,75.0,male,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Total number of missing values across the whole frame.
df.isna().sum().sum()

np.int64(0)

In [30]:
# Distinct values currently stored in the sex column.
df['sex'].unique()

array(['male', 'female'], dtype=object)

In [31]:
# Integer code assigned to each sex label.
sex_codes = {'female': 0, 'male': 1}
# Encode only while the column still holds string labels: map() on values that
# are already 0/1 finds no matching key and would silently turn the whole
# column into NaN if this cell were executed a second time.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = df['sex'].map(sex_codes)

In [32]:
# Preview the frame after sex was mapped to integer codes.
df.head()

Unnamed: 0,dx,age,sex,confocal,consensus,follow_up,histo,abdomen,acral,back,...,ear,face,foot,genital,hand,lower extremity,neck,scalp,trunk,upper extremity
0,0,80.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,80.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,80.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,80.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,75.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Check the column dtypes after sex was mapped to integer codes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9761 entries, 0 to 9760
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   dx               9761 non-null   int64  
 1   age              9761 non-null   float64
 2   sex              9761 non-null   int64  
 3   confocal         9761 non-null   float64
 4   consensus        9761 non-null   float64
 5   follow_up        9761 non-null   float64
 6   histo            9761 non-null   float64
 7   abdomen          9761 non-null   float64
 8   acral            9761 non-null   float64
 9   back             9761 non-null   float64
 10  chest            9761 non-null   float64
 11  ear              9761 non-null   float64
 12  face             9761 non-null   float64
 13  foot             9761 non-null   float64
 14  genital          9761 non-null   float64
 15  hand             9761 non-null   float64
 16  lower extremity  9761 non-null   float64
 17  neck          

In [34]:
# Separate the feature table from the target vector of dx codes.
# Only dx is removed, so the sex column is still part of X at this point.
X = df.drop(columns='dx')
y = df['dx'].values

In [35]:
# Feature-matrix shape before selection.
print(X.shape)

# Remove low-variance features (threshold 0.1); X is replaced by the reduced
# matrix returned by the selector.
selector = VarianceThreshold(threshold=0.1).fit(X)
X = selector.transform(X)

# Feature-matrix shape after selection.
print(X.shape)

(9761, 20)
(9761, 8)


In [36]:
# Standardise the selected features.
# NOTE(review): the scaler is fitted on every row here, before `kf` is used to
# create any fold, so held-out rows contribute to the scaling statistics -
# confirm this is acceptable for the evaluation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row masks, one per sex code (0 is reported as Female, 1 as Male).
gender_0_mask = df['sex'].eq(0)
gender_1_mask = df['sex'].eq(1)

# Report the size of each group.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Partition the scaled features and the targets by group.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  4453
Male:  5308


In [37]:
# One seed shared by model construction and fold shuffling.
seed = 42

# Shuffled 10-fold splitter with a fixed random state.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

# Models to evaluate, built by the project helper from the same seed.
models = fl.build_models(seed)

# Accumulator that is handed to the experiment runs and concatenated afterwards.
results_list = list()

In [38]:
# Start from an empty accumulator. run_experiment adds its results to
# results_list, so re-running this cell without clearing it would store a
# second copy of every fold and inflate the sample sizes seen by the
# statistical tests further down.
results_list.clear()

# Evaluate every model with the shared splitter on the female subset ...
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

# ... and on the male subset.
print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Stack the collected result frames into one table and show it.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KN



Training and evaluating model: NB
Processing fold 6 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 7 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 8 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 9 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN



Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 6 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 9 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
    Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0      1  Female  0.980066  0.370370  0.629630  0.019934     295      20   
1      2  Female  0.980707  0.512195  0.487805  0.019293     305      21   
2      3  Female  0.990260  0.481481  0.518519  0.009740     305      26   
3      4  Female  0.987539  0.285714  0.714286  0.012461     317      10   
4      5  Female  0.971154  0.511111  0.488889  0.028846     303      23   
5      6  Female  0.974441  0.595238  0.404762  0.025559     305      25   
6      7  Female  0.980456  0.400000  0.600000  0.019544     301      18   
7      8  Female  0.975000  0.533333  0.466667  0.025000     312      24   
8      

In [39]:
# Combine every frame in results_list into one table with a fresh 0..n-1 index.
results_df = pd.concat(results_list, ignore_index=True)

In [40]:
result_path = './results/k146_result.xlsx'
# to_excel does not create missing parent folders; make sure the target
# directory exists so the save cannot fail after the long experiment run.
os.makedirs(os.path.dirname(result_path) or '.', exist_ok=True)
# Write the results without the index column.
results_df.to_excel(result_path, index=False)

In [41]:
# Read the saved results back from disk and preview them.
# NOTE: this rebinds `df`, which held the feature table until now; from this
# cell onward `df` is the results table.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.980066,0.37037,0.62963,0.019934,295,20,34,6,...,25,8,0.988764,1,0,0.011236,176,1,0,2
1,2,Female,0.980707,0.512195,0.487805,0.019293,305,21,20,6,...,14,8,0.979487,1,0,0.020513,191,4,0,4
2,3,Female,0.99026,0.481481,0.518519,0.00974,305,26,28,3,...,18,7,0.98913,1,0,0.01087,182,2,0,2
3,4,Female,0.987539,0.285714,0.714286,0.012461,317,10,25,4,...,21,9,0.988636,1,0,0.011364,174,1,0,2
4,5,Female,0.971154,0.511111,0.488889,0.028846,303,23,22,9,...,16,9,0.978142,1,0,0.021858,179,3,0,4


In [42]:
# Group label passed to every test call.
label = 'Female'

# Run the same statistical comparison for each model, in the original order,
# instead of repeating the call once per model.
# NOTE(review): presumably perform_t_tests prints its own report; a value
# returned by the final call is no longer echoed by the notebook - confirm
# nothing relied on that.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: TtestResult(statistic=np.float64(-0.5242268681819191), pvalue=np.float64(0.6065166422498048), df=np.float64(18.0))
SVM - FPR: TtestResult(statistic=np.float64(-2.3773233158877596), pvalue=np.float64(0.028731558692566585), df=np.float64(18.0))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(64.0), pvalue=np.float64(0.3074894566186813))
DT -TPR: TtestResult(statistic=np.float64(-1.071256906442128), pvalue=np.float64(0.29820813785187966), df=np.float64(18.0))
DT - FPR: TtestResult(statistic=np.float64(-4.681081598843956), pvalue=np.float64(0.00018600521684467586), df=np.float64(18.0))
DT - FN/FP: TtestResult(statistic=np.float64(3.316385233781022), pvalue=np.float64(0.0038398420071534133), df=np.float64(18.0))
RF -TPR: TtestResult(statistic=np.float64(-1.8794998111613141), pvalue=np.float64(0.07646492162330884), df=np.float64(18.0))
RF - FPR: TtestResult(statistic=np.float64(-4.7053232038545465), pvalue=np.float64(0.00017638137254684188), df=np.float64(18.0))
RF - FN/FP: Tt