In [1]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub and keep
# the value it returns in `path` for the cells below.
path = kagglehub.dataset_download(
    "thedevastator/cancer-patients-and-air-pollution-a-new-link"
)

print(f"Path to dataset files: {path}")

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/morning/.cache/kagglehub/datasets/thedevastator/cancer-patients-and-air-pollution-a-new-link/versions/2


In [2]:
import os
# Show the entries of the downloaded dataset directory so the data file name
# used by the loading cell can be read off the output.
os.listdir(path)

['cancer patient data sets.csv']

In [3]:
import pandas as pd
import fairtl as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

In [None]:
# Read the dataset's CSV file from the download directory into the working frame.
csv_path = f"{path}/cancer patient data sets.csv"
df = pd.read_csv(csv_path)

In [5]:
# Total number of missing cells in the frame: per-column NaN counts, summed again.
missing_per_column = df.isna().sum()
print(missing_per_column.sum())

0


In [6]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   index                     1000 non-null   int64 
 1   Patient Id                1000 non-null   object
 2   Age                       1000 non-null   int64 
 3   Gender                    1000 non-null   int64 
 4   Air Pollution             1000 non-null   int64 
 5   Alcohol use               1000 non-null   int64 
 6   Dust Allergy              1000 non-null   int64 
 7   OccuPational Hazards      1000 non-null   int64 
 8   Genetic Risk              1000 non-null   int64 
 9   chronic Lung Disease      1000 non-null   int64 
 10  Balanced Diet             1000 non-null   int64 
 11  Obesity                   1000 non-null   int64 
 12  Smoking                   1000 non-null   int64 
 13  Passive Smoker            1000 non-null   int64 
 14  Chest Pain               

In [7]:
# (rows, columns) of the frame before any column is dropped.
df.shape

(1000, 26)

In [8]:
# Remove the `index` and `Patient Id` columns (presumably pure row identifiers
# with no predictive meaning — confirm against the dataset description).
# The frame is rebound instead of using inplace=True, and errors='ignore' is
# passed, so re-running this cell is a no-op rather than a KeyError on the
# already-missing columns.
df = df.drop(columns=['index', 'Patient Id'], errors='ignore')
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


In [9]:
# List the distinct target labels so the encoding in the next cell can cover all of them.
print(df['Level'].unique())

['Low' 'Medium' 'High']


In [10]:
# Encode the text target as integers: Low -> 0, Medium -> 1, High -> 2.
level_codes = {'Low': 0, 'Medium': 1, 'High': 2}

# Only map while the column still holds the text labels. Mapping a second time
# would look up 0/1/2 in the dict and silently replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df['Level']):
    df['Level'] = df['Level'].map(level_codes)

# Series.map yields NaN for any label that is not a key of the dict; stop here
# instead of passing NaN targets on to the models.
assert df['Level'].notna().all(), "unexpected or missing value in 'Level'"
df.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,0
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,1
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,2
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,2
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,2


In [11]:
# Separate the encoded target vector from the feature table.
target_col = 'Level'
X = df.drop(columns=target_col)
y = df[target_col].values

In [12]:
# Distinct codes stored in the Gender column; the group masks further down
# compare against these values.
df['Gender'].unique()

array([1, 2])

In [13]:
# Drop features whose variance does not exceed 0.1, printing the shape before
# and after so the number of removed columns is visible.
shape_before = X.shape
print(shape_before)

selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
X = selector.transform(X)

shape_after = X.shape
print(shape_after)

(1000, 23)
(1000, 23)


In [14]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before the K-fold split that
# is set up later — confirm that sharing statistics across folds is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Boolean row selectors, one per Gender code, taken from the original frame.
gender_1_mask = df['Gender'].eq(1)
gender_2_mask = df['Gender'].eq(2)

# Group sizes (number of True entries in each selector).
count_gender_1 = gender_1_mask.sum()
count_gender_2 = gender_2_mask.sum()

print("Female: ", count_gender_2)
print("Male: ", count_gender_1)

# Slice the scaled features and the targets with the same selectors so rows
# stay aligned inside each group.
X_scaled_Gender_2, y_Gender_2 = X_scaled[gender_2_mask], y[gender_2_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  402
Male:  598


In [15]:
# One seed shared by the fold splitter and the model factory.
seed = 42

# Shuffled 10-fold splitter, made repeatable through the seed.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

# Models to evaluate, as built by the project's fairtl helper.
models = fl.build_models(seed)

# Collector handed to fl.run_experiment and concatenated afterwards.
results_list = []

In [16]:
# Run the cross-validated evaluation once per gender group and gather the
# results into a single table.

# results_list is filled by fl.run_experiment and is only created in the setup
# cell. Emptying it here keeps this cell idempotent: without it, re-running the
# cell would stack a second copy of every result on top of the first.
results_list.clear()

print("Starting experiments for Female(2)")
fl.run_experiment(kf, models, X_scaled_Gender_2, y_Gender_2, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Stitch the collected frames together with a fresh 0..n-1 index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(2)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KN

In [17]:
# Concatenate everything collected in results_list into one table with a fresh
# 0..n-1 index; this is the frame that gets written to disk below.
results_df = pd.concat(results_list, ignore_index=True)

In [18]:
# Persist the combined results table as an Excel workbook.
# NOTE(review): the file name says "k5" while the KFold splitter above is
# configured with n_splits=10 — confirm which one is intended.
result_path = './results/k5_result.xlsx'

# to_excel does not create missing parent directories, so make sure the output
# folder exists; on a fresh checkout the write would otherwise fail.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [19]:
# Read the saved results workbook back from disk and preview the first rows.
# NOTE(review): this rebinds `df`, which until now held the patient data. Cells
# above that read df['Gender'] or df['Level'] will fail if re-run after this
# point without first reloading the CSV.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,1,1.0,0.0,0,13,18,0,0,...,1,0,1,0.944444,0.055556,0,10,17,1,0
1,2,Female,1,1.0,0.0,0,8,19,0,0,...,0,0,1,0.947368,0.052632,0,6,18,1,0
2,3,Female,1,1.0,0.0,0,18,13,0,0,...,0,0,1,1.0,0.0,0,16,13,0,0
3,4,Female,1,1.0,0.0,0,11,14,0,0,...,0,0,1,0.928571,0.071429,0,11,13,1,0
4,5,Female,1,1.0,0.0,0,11,16,0,0,...,0,0,1,1.0,0.0,0,9,16,0,0


In [20]:
# Run the fairtl group-comparison tests for every model, using 'Female' as the
# group label passed to the helper.
# NOTE(review): the helper is named perform_t_tests, but the recorded output
# shows Mann-Whitney U results — confirm which test is meant to be reported.
label = 'Female'

# One loop instead of seven copy-pasted calls; the order matches the original
# so the printed lines come out in the same sequence.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(45.0), pvalue=np.float64(0.36812025069351895))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
DT -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
DT - FPR: MannwhitneyuResult(statistic=np.float64(49.5), pvalue=np.float64(1.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
RF - FPR: MannwhitneyuResult(statistic=np.float64(45.0), pvalue=np.float64(0.36812025069351895))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
LR -TPR: MannwhitneyuResult(statistic=np.float64(50.0), pvalue=np.float64(1.0))
LR - FPR: MannwhitneyuResult(statistic=np.float64(45.0), pvalue=np.float64(0.5842485531772132))
LR - FN/FP: MannwhitneyuResult(statistic=np.float64(50.0)