In [1]:
import kagglehub

# Fetch the dataset through kagglehub; `path` is the local directory the
# files end up in (its location is printed below).
path = kagglehub.dataset_download(
    "masterdatasan/lung-cancer-mortality-datasets-v2"
)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/masterdatasan/lung-cancer-mortality-datasets-v2/versions/2


In [2]:
import os
# Show which files are available inside the downloaded dataset directory.
os.listdir(path)

['lung_cancer_mortality_data_test_v2.csv',
 'lung_cancer_mortality_data_large_v2.csv']

In [3]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the `..._test_v2.csv` file from the dataset directory into `df`
# and preview its first rows.
df = pd.read_csv(f'{path}/lung_cancer_mortality_data_test_v2.csv')
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,beginning_of_treatment_date,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Female,Slovakia,2016-04-07,Stage IV,2016-04-09,No,Former Smoker,21.2,191,0,0,0,0,Surgery,2017-02-10,0
1,2,50.0,Male,Slovenia,2023-04-22,Stage III,2023-05-05,Yes,Current Smoker,36.4,258,1,0,0,0,Chemotherapy,2024-08-23,0
2,3,65.0,Male,Italy,2023-04-07,Stage II,2023-04-12,Yes,Former Smoker,18.9,174,1,0,1,0,Chemotherapy,2025-03-24,1
3,4,51.0,Male,Latvia,2016-02-07,Stage I,2016-03-08,No,Passive Smoker,38.8,279,1,0,0,0,Combined,2017-03-01,0
4,5,37.0,Female,Spain,2023-12-01,Stage II,2023-12-04,Yes,Former Smoker,37.7,273,0,0,0,0,Combined,2025-07-16,0


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1000, 18)

In [6]:
# Per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           1000 non-null   int64  
 1   age                          1000 non-null   float64
 2   gender                       1000 non-null   object 
 3   country                      1000 non-null   object 
 4   diagnosis_date               1000 non-null   object 
 5   cancer_stage                 1000 non-null   object 
 6   beginning_of_treatment_date  1000 non-null   object 
 7   family_history               1000 non-null   object 
 8   smoking_status               1000 non-null   object 
 9   bmi                          1000 non-null   float64
 10  cholesterol_level            1000 non-null   int64  
 11  hypertension                 1000 non-null   int64  
 12  asthma                       1000 non-null   int64  
 13  cirrhosis          

In [7]:
# Same preview as right after loading: df has not been modified yet.
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,beginning_of_treatment_date,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Female,Slovakia,2016-04-07,Stage IV,2016-04-09,No,Former Smoker,21.2,191,0,0,0,0,Surgery,2017-02-10,0
1,2,50.0,Male,Slovenia,2023-04-22,Stage III,2023-05-05,Yes,Current Smoker,36.4,258,1,0,0,0,Chemotherapy,2024-08-23,0
2,3,65.0,Male,Italy,2023-04-07,Stage II,2023-04-12,Yes,Former Smoker,18.9,174,1,0,1,0,Chemotherapy,2025-03-24,1
3,4,51.0,Male,Latvia,2016-02-07,Stage I,2016-03-08,No,Passive Smoker,38.8,279,1,0,0,0,Combined,2017-03-01,0
4,5,37.0,Female,Spain,2023-12-01,Stage II,2023-12-04,Yes,Former Smoker,37.7,273,0,0,0,0,Combined,2025-07-16,0


In [8]:
# Discard the identifier, the country and the three date columns; none of
# them is carried forward into the feature matrix.
df = df.drop(
    columns=[
        'id',
        'country',
        'diagnosis_date',
        'beginning_of_treatment_date',
        'end_treatment_date',
    ]
)

In [9]:
# Distinct treatment categories present in the data (one-hot encoded later).
df['treatment_type'].unique()

array(['Surgery', 'Chemotherapy', 'Combined', 'Radiation'], dtype=object)

In [10]:
# Integer codes for the string-valued columns: 0/1 flags for gender and
# family_history, increasing codes for cancer_stage and smoking_status.
category_codes = {
    'gender': {'Female': 0, 'Male': 1},
    'cancer_stage': {'Stage I': 0, 'Stage II': 1, 'Stage III': 2, 'Stage IV': 3},
    'family_history': {'No': 0, 'Yes': 1},
    'smoking_status': {'Never Smoked': 0, 'Passive Smoker': 1, 'Former Smoker': 2, 'Current Smoker': 3},
}

# Encode every column first and only touch `df` once all of them validated,
# so a failure never leaves the frame half-encoded.
encoded_columns = {}
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    # Series.map yields NaN for any value that is not a key of the dict.
    # Left unchecked, those NaNs would flow silently into scaling/training
    # (this is also what happens if the cell is run a second time on data
    # that is already encoded), so fail loudly instead.
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column '{column}' contains values without a code: {list(unmapped)}"
        )
    encoded_columns[column] = encoded

# assign() replaces the existing columns in place position-wise, so the
# column order of `df` is unchanged.
df = df.assign(**encoded_columns)

In [11]:
# Check the integer-encoded columns; treatment_type is still a string here.
df.head()

Unnamed: 0,age,gender,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived
0,64.0,0,3,0,2,21.2,191,0,0,0,0,Surgery,0
1,50.0,1,2,1,3,36.4,258,1,0,0,0,Chemotherapy,0
2,65.0,1,1,1,2,18.9,174,1,0,1,0,Chemotherapy,1
3,51.0,1,0,0,1,38.8,279,1,0,0,0,Combined,0
4,37.0,0,1,1,2,37.7,273,0,0,0,0,Combined,0


In [12]:
# One-hot encode treatment_type. The encoder is given a single-column 2-D
# array, which is why the generated feature names carry the generic "x0_"
# prefix.
enc = OneHotEncoder(categories='auto')
treatment = df['treatment_type'].to_numpy().reshape(-1, 1)

# Fit and transform in one step, then densify the sparse result into a frame
# (columns are still the default integer labels at this point).
new_treatment = pd.DataFrame(enc.fit_transform(treatment).toarray())

new_features = enc.get_feature_names_out()
print(new_features)

['x0_Chemotherapy' 'x0_Combined' 'x0_Radiation' 'x0_Surgery']


In [13]:
# Name the one-hot columns after the categories the encoder actually learned
# (same order as the columns it produced) instead of a hand-typed list that
# could mislabel or mismatch if the data contains different categories.
new_treatment.columns = list(enc.categories_[0])

In [14]:
# concat(axis=1) aligns rows on the index, so the one-hot frame must share
# df's index or the result would contain NaN-padded rows.
assert new_treatment.index.equals(df.index), "one-hot rows are not aligned with df"

# Drop the source column *before* concatenating: if this cell is re-run after
# treatment_type is already gone, the drop raises immediately and df is left
# untouched instead of first gaining a duplicate set of one-hot columns.
df = pd.concat([df.drop(columns='treatment_type'), new_treatment], axis=1)

In [15]:
# Sanity check: gender should now hold only the integer codes 0 and 1.
df['gender'].unique()

array([0, 1])

In [16]:
# Split the frame into the feature columns (X) and the `survived` target (y,
# as a plain NumPy array).
X = df.drop(columns='survived')
y = df['survived'].to_numpy()

In [17]:
# Filter out low-variance features (threshold 0.1) and report the feature
# count before and after. X is rebound to the selector's output here.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
X = selector.fit(X).transform(X)
print(X.shape)

(1000, 15)
(1000, 14)


In [18]:
# Boolean row selectors for the two gender codes (0 = Female, 1 = Male).
gender_0_mask = df['gender'].eq(0)
gender_1_mask = df['gender'].eq(1)

# Report how many rows fall into each group.
count_gender_0, count_gender_1 = gender_0_mask.sum(), gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise all features using statistics from the full data set.
# NOTE(review): the scaler is fit on every row before the K-fold split, so
# held-out folds contribute to the scaling statistics - confirm this is
# acceptable or move scaling inside the cross-validation loop.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Slice features and targets per group; the masks line up row-for-row with
# X_scaled and y because all three derive from df in its current row order.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  492
Male:  508


In [19]:
# One seed shared by the model factory and the fold shuffler.
seed = 42

# 10-fold cross-validation with shuffling, made repeatable through the seed.
kf = KFold(shuffle=True, n_splits=10, random_state=seed)

# Classifiers to evaluate, built by the project helper module.
models = fl.build_models(seed)

# Accumulator handed to the experiment runs; its entries are concatenated
# into a single results table afterwards.
results_list = list()

In [20]:
# results_list is created in an earlier cell and is passed to
# fl.run_experiment to be filled. Empty it first so that re-running this cell
# cannot stack a second copy of every fold on top of the previous run (which
# would silently double the sample size of the later significance tests).
results_list.clear()

# (display name, gender code, features, target) for each group, in run order.
groups = (
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
)
for group_name, gender_code, X_group, y_group in groups:
    print(f"Starting experiments for {group_name}({gender_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# Combine everything collected for both groups into one table.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
    Fold   Group  SVM_TPR  SVM_TNR  SVM_FPR  SVM_FNR  SVM_TP  SVM_TN  SVM_FP  \
0      1  Female      0.0      1.0      0.0      1.0       0      40       0   
1      2  Female      0.0      1.0      0.0      1.0       0      42       0   
2      3  Female      0.0      1.0      0.0      1.0       0      36       0   
3      4  Female      0.0      1.0      0.0      1.0       0      35       0   
4      5  Female      0.0      1.0      0.0      1.0       0      39       0   
5      6  Female      0.0      1.0      0.0      1.0       0      41       0   
6      7  Female      0.0      1.0      0.0      1.0       0      40       0   
7      8  Female      0.0      1.0      0.0      1.



In [21]:
# Combine the collected result frames into one table with a fresh index
# (built from the same results_list as final_results_df in the previous cell).
results_df = pd.concat(results_list, ignore_index=True)

In [22]:
result_path = './results/K37_result.xlsx'

# to_excel does not create missing parent directories, so make sure
# ./results exists; exist_ok keeps this safe on re-runs.
os.makedirs(os.path.dirname(result_path), exist_ok=True)

# Persist the per-fold results without the DataFrame index column.
results_df.to_excel(result_path, index=False)

In [23]:
# Reload the saved results from disk and preview them.
# NOTE(review): this rebinds `df`, which until now held the patient data;
# from here on `df` is the per-fold results table.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0,1,0,1,0,40,0,10,...,2,9,0.0,1.0,0.0,1.0,0,40,0,10
1,2,Female,0,1,0,1,0,42,0,8,...,0,8,0.0,1.0,0.0,1.0,0,42,0,8
2,3,Female,0,1,0,1,0,36,0,13,...,0,13,0.0,1.0,0.0,1.0,0,36,0,13
3,4,Female,0,1,0,1,0,35,0,14,...,0,14,0.0,1.0,0.0,1.0,0,35,0,14
4,5,Female,0,1,0,1,0,39,0,10,...,0,10,0.0,0.974359,0.025641,1.0,0,38,1,10


In [24]:
# Group label forwarded to fl.perform_t_tests.
# NOTE(review): presumably the reference group for the comparison - confirm
# against the helper's implementation.
label = 'Female'

# Run the statistical tests once per classifier. The tuple keeps the exact
# order of the original hand-written calls so the printed report is unchanged.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
SVM - FN/FP: TtestResult(statistic=np.float64(-1.4103495836700395), pvalue=np.float64(0.17548389789987048), df=np.float64(18.0))
DT -TPR: TtestResult(statistic=np.float64(-1.119380541323376), pvalue=np.float64(0.2776932438151955), df=np.float64(18.0))
DT - FPR: TtestResult(statistic=np.float64(0.045693079924945666), pvalue=np.float64(0.9640580010938415), df=np.float64(18.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(45.5), pvalue=np.float64(0.7621953377876631))
RF -TPR: MannwhitneyuResult(statistic=np.float64(35.0), pvalue=np.float64(0.07787247147704081))
RF - FPR: MannwhitneyuResult(statistic=np.float64(25.0), pvalue=np.float64(0.029771057697309585))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(66.5), pvalue=np.float64(0.22560188456509755))
LR -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.fl