In [73]:
import kagglehub

# Fetch the latest version of the Kaggle dataset; `path` is where the files were stored.
DATASET_HANDLE = "mysarahmadbhat/lung-cancer"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/mysarahmadbhat/lung-cancer/versions/1


In [74]:
import os
# Show which files the dataset directory at `path` contains.
os.listdir(path)

['survey lung cancer.csv']

In [75]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

In [76]:
# Build the CSV location with os.path.join instead of hand-concatenating a '/',
# so the separator is correct on every platform, then load the survey and
# preview its first rows.
df = pd.read_csv(os.path.join(path, 'survey lung cancer.csv'))
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [77]:
# Inspect column names, non-null counts and dtypes of the freshly loaded survey data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [78]:
# (rows, columns) of the loaded frame.
df.shape

(309, 16)

In [79]:
# Encode the two text columns as 0/1 integers.
gender_codes = {'M': 1, 'F': 0}
cancer_codes = {'YES': 1, 'NO': 0}

gender_encoded = df['GENDER'].map(gender_codes)
cancer_encoded = df['LUNG_CANCER'].map(cancer_codes)

# Series.map() turns every value that is not a key of the dict into NaN.
# That would happen silently for an unexpected spelling in the CSV, or if this
# cell is run a second time on the already-encoded columns. Validate before
# touching `df` so a bad run leaves the frame intact and fails loudly.
if gender_encoded.isna().any() or cancer_encoded.isna().any():
    raise ValueError(
        "GENDER / LUNG_CANCER contain values outside the expected categories "
        "(or were already encoded); reload the CSV before re-running this cell."
    )

df['GENDER'] = gender_encoded
df['LUNG_CANCER'] = cancer_encoded

In [80]:
# Re-check dtypes after the encoding step: GENDER and LUNG_CANCER should no longer be object columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   GENDER                 309 non-null    int64
 1   AGE                    309 non-null    int64
 2   SMOKING                309 non-null    int64
 3   YELLOW_FINGERS         309 non-null    int64
 4   ANXIETY                309 non-null    int64
 5   PEER_PRESSURE          309 non-null    int64
 6   CHRONIC DISEASE        309 non-null    int64
 7   FATIGUE                309 non-null    int64
 8   ALLERGY                309 non-null    int64
 9   WHEEZING               309 non-null    int64
 10  ALCOHOL CONSUMING      309 non-null    int64
 11  COUGHING               309 non-null    int64
 12  SHORTNESS OF BREATH    309 non-null    int64
 13  SWALLOWING DIFFICULTY  309 non-null    int64
 14  CHEST PAIN             309 non-null    int64
 15  LUNG_CANCER            309 non-null    i

In [81]:
# Separate the predictors from the target; the target is kept as a NumPy array.
TARGET_COLUMN = 'LUNG_CANCER'
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN].to_numpy()

In [82]:
# drop() above returned a new frame, so `df` itself still has all of its columns.
df.shape

(309, 16)

In [83]:
# Drop features whose variance does not exceed the cutoff, printing the
# feature-matrix shape before and after so the effect is visible.
VARIANCE_CUTOFF = 0.1

print(X.shape)
selector = VarianceThreshold(threshold=VARIANCE_CUTOFF)
X = selector.fit_transform(X)
print(X.shape)

(309, 15)
(309, 15)


In [84]:
# Row selectors for the two gender groups (0 is reported as Female, 1 as Male below).
gender_0_mask = df['GENDER'].eq(0)
gender_1_mask = df['GENDER'].eq(1)

# Standardise every feature using statistics from the full data set.
# NOTE(review): the scaler is fit on all rows before cross-validation; unless
# fl.run_experiment re-scales inside each fold, the held-out folds influence the
# scaling statistics - confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Group sizes.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()
print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Per-group feature matrices and targets.
X_scaled_Gender_0, X_scaled_Gender_1 = X_scaled[gender_0_mask], X_scaled[gender_1_mask]
y_Gender_0, y_Gender_1 = y[gender_0_mask], y[gender_1_mask]

Female:  147
Male:  162


In [85]:
# One seed drives both the fold shuffling and the model construction.
seed = 42

# 5-fold cross-validation with shuffled, reproducible splits.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Models to evaluate, as built by the project helper.
models = fl.build_models(seed)

# Collector for the result frames that are concatenated later.
results_list = list()

In [86]:
# Start from an empty collector so that re-running this cell does not stack a
# second copy of every result on top of what an earlier run left in the list.
results_list = []

# Same cross-validation and model set for each gender group; the results are
# gathered through `results_list`, which is passed to each call.
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Stack the collected frames into one table with a fresh 0..n-1 index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB




Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM




Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  1.000000  0.500000  0.500000  0.000000      26       2   
1     2  Female  1.000000  0.142857  0.857143  0.000000      23       1   
2     3  Female  0.884615  1.000000  0.000000  0.115385      23       3   
3     4  Female  1.000000  1.000000  0.000000  0.000000      27       2   
4     5  Female  0.956522  0.833333  0.166667  0.043478      22       5   
5     1    Male  0.968750  0.000000  1.000000  0.031250      31       0   
6     2    Male  1.000000  0.200000  0.800000  0.000000      28       1   
7     3    Male  1.000000  1.000000  0.000000  0.000000      31       1   
8     4    Male  



In [87]:
# Same concatenation as `final_results_df` in the previous cell, stored under the name the Excel export uses.
results_df = pd.concat(results_list, ignore_index=True)

In [88]:
result_path = './results/K6_result.xlsx'

# to_excel() does not create missing parent folders, so on a fresh checkout the
# write would fail; make sure ./results exists before exporting.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [89]:
# Read the exported metrics back from disk and preview the first rows.
# NOTE(review): this rebinds `df`, which until now held the survey data. Cells
# above that read df['GENDER'] need the CSV reloaded before they can be re-run.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,1.0,0.5,0.5,0.0,26,2,2,0,...,2,0,0.769231,0.75,0.25,0.230769,20,3,1,6
1,2,Female,1.0,0.142857,0.857143,0.0,23,1,6,0,...,5,0,0.869565,0.857143,0.142857,0.130435,20,6,1,3
2,3,Female,0.884615,1.0,0.0,0.115385,23,3,0,3,...,0,4,0.653846,1.0,0.0,0.346154,17,3,0,9
3,4,Female,1.0,1.0,0.0,0.0,27,2,0,0,...,1,0,0.703704,1.0,0.0,0.296296,19,2,0,8
4,5,Female,0.956522,0.833333,0.166667,0.043478,22,5,1,1,...,0,1,0.782609,1.0,0.0,0.217391,18,6,0,5


In [90]:
label = 'Female'

# Run the same statistical comparison for every model, in the original order.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)

SVM -TPR: MannwhitneyuResult(statistic=np.float64(9.0), pvalue=np.float64(0.440686016488678))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(5.5), pvalue=np.float64(0.16660739402832658))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(15.5), pvalue=np.float64(0.5186050164287256))
DT -TPR: TtestResult(statistic=np.float64(-0.12015064487415217), pvalue=np.float64(0.9073265055651987), df=np.float64(8.0))
DT - FPR: TtestResult(statistic=np.float64(-0.8059437181004329), pvalue=np.float64(0.4435701097053385), df=np.float64(8.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(12.0), pvalue=np.float64(1.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(8.0), pvalue=np.float64(0.3727480704181131))
RF - FPR: MannwhitneyuResult(statistic=np.float64(4.0), pvalue=np.float64(0.08266225205671854))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.6558364965906589))
LR -TPR: MannwhitneyuResult(statistic=np.float64(9.0), pvalue=np.float64(0.5023349543605021