In [44]:
import kagglehub

# Download the latest version of the osteoporosis lifestyle dataset through kagglehub.
# `path` is the local directory that holds the downloaded files; later cells read the CSV from it.
path = kagglehub.dataset_download("amitvkulkarni/lifestyle-factors-influencing-osteoporosis")

print("Path to dataset files:", path)

Path to dataset files: /home/morning/.cache/kagglehub/datasets/amitvkulkarni/lifestyle-factors-influencing-osteoporosis/versions/2


In [45]:
import os
# List the files inside the download directory to find the CSV file name.
os.listdir(path)

['osteoporosis.csv']

In [46]:
import pandas as pd
import fairtl_statisticaltest as fl
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [47]:
# Read the raw survey table from the downloaded dataset directory and preview it.
# NOTE(review): several columns load with missing values; presumably the CSV stores the
# literal text 'None', which pandas parses as NaN by default — confirm against the file.
csv_file = f"{path}/osteoporosis.csv"
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Id,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,1734616,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,Moderate,Rheumatoid Arthritis,Corticosteroids,Yes,1
1,1419098,32,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,No,,,,Yes,1
2,1797916,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,Moderate,Hyperthyroidism,Corticosteroids,No,1
3,1805337,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,,Rheumatoid Arthritis,Corticosteroids,No,1
4,1351334,38,Male,Postmenopausal,Yes,African American,Normal,Low,Sufficient,Active,Yes,,Rheumatoid Arthritis,,Yes,1


In [48]:
# Remove the record identifier so it is not used as a feature.
# Non-inplace with errors='ignore': re-running this cell is a harmless no-op
# instead of raising KeyError once 'Id' has already been removed.
df = df.drop(columns='Id', errors='ignore')

In [49]:
# (rows, columns) of the table after removing 'Id'.
df.shape

(1958, 15)

In [50]:
# Inspect dtypes and non-null counts to spot the columns that contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1958 entries, 0 to 1957
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1958 non-null   int64 
 1   Gender               1958 non-null   object
 2   Hormonal Changes     1958 non-null   object
 3   Family History       1958 non-null   object
 4   Race/Ethnicity       1958 non-null   object
 5   Body Weight          1958 non-null   object
 6   Calcium Intake       1958 non-null   object
 7   Vitamin D Intake     1958 non-null   object
 8   Physical Activity    1958 non-null   object
 9   Smoking              1958 non-null   object
 10  Alcohol Consumption  970 non-null    object
 11  Medical Conditions   1311 non-null   object
 12  Medications          973 non-null    object
 13  Prior Fractures      1958 non-null   object
 14  Osteoporosis         1958 non-null   int64 
dtypes: int64(2), object(13)
memory usage: 229.6+ KB


In [51]:
# Binary alcohol flag: 'Moderate' -> 1; every other entry (missing values included) -> 0.
# The intermediate NaNs from map() make the resulting column float (1.0 / 0.0).
alcohol_flag = df['Alcohol Consumption'].map({'Moderate': 1}).fillna(0)
df['Alcohol Consumption'] = alcohol_flag
df.head()

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,1.0,Rheumatoid Arthritis,Corticosteroids,Yes,1
1,32,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,No,0.0,,,Yes,1
2,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,1.0,Hyperthyroidism,Corticosteroids,No,1
3,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,0.0,Rheumatoid Arthritis,Corticosteroids,No,1
4,38,Male,Postmenopausal,Yes,African American,Normal,Low,Sufficient,Active,Yes,0.0,Rheumatoid Arthritis,,Yes,1


In [52]:
# Treat a missing condition as 'No', then encode the three categories as integer codes.
# NOTE(review): the 0/1/2 codes impose an order on what look like unordered categories —
# confirm this is intended (one-hot encoding would be the alternative).
condition_codes = {'No': 0, 'Hyperthyroidism': 1, 'Rheumatoid Arthritis': 2}
df['Medical Conditions'] = df['Medical Conditions'].fillna('No').map(condition_codes)
df['Medical Conditions'].unique()


array([2, 0, 1])

In [53]:
# Treat a missing medication as 'No', then encode: No -> 0, Corticosteroids -> 1.
medication_codes = {'No': 0, 'Corticosteroids': 1}
df['Medications'] = df['Medications'].fillna('No').map(medication_codes)
df['Medications'].unique()


array([1, 0])

In [54]:
# Re-check non-null counts and dtypes after filling and encoding the three incomplete columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1958 entries, 0 to 1957
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  1958 non-null   int64  
 1   Gender               1958 non-null   object 
 2   Hormonal Changes     1958 non-null   object 
 3   Family History       1958 non-null   object 
 4   Race/Ethnicity       1958 non-null   object 
 5   Body Weight          1958 non-null   object 
 6   Calcium Intake       1958 non-null   object 
 7   Vitamin D Intake     1958 non-null   object 
 8   Physical Activity    1958 non-null   object 
 9   Smoking              1958 non-null   object 
 10  Alcohol Consumption  1958 non-null   float64
 11  Medical Conditions   1958 non-null   int64  
 12  Medications          1958 non-null   int64  
 13  Prior Fractures      1958 non-null   object 
 14  Osteoporosis         1958 non-null   int64  
dtypes: float64(1), int64(4), object(10)
me

In [55]:
# Preview the table with the alcohol, medical-condition and medication columns now numeric.
df.head()

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,1.0,2,1,Yes,1
1,32,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,No,0.0,0,0,Yes,1
2,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,1.0,1,1,No,1
3,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,0.0,2,1,No,1
4,38,Male,Postmenopausal,Yes,African American,Normal,Low,Sufficient,Active,Yes,0.0,2,0,Yes,1


In [56]:
# Binary-encode gender (Female -> 0, Male -> 1); the subgroup masks built later use these codes.
# Caution: map() turns any value not in the dict into NaN, so re-running this cell on the
# already-encoded column would blank it out.
gender_codes = {'Female': 0, 'Male': 1}
df['Gender'] = df['Gender'].map(gender_codes)


In [57]:
# Binary-encode hormonal status (Normal -> 0, Postmenopausal -> 1) and show the resulting codes.
hormonal_codes = {'Normal': 0, 'Postmenopausal': 1}
df['Hormonal Changes'] = df['Hormonal Changes'].map(hormonal_codes)
df['Hormonal Changes'].unique()

array([0, 1])

In [58]:
# Check which labels occur in 'Family History' before encoding it.
df['Family History'].unique()

array(['Yes', 'No'], dtype=object)

In [59]:
# Binary-encode family history: No -> 0, Yes -> 1.
family_history_codes = {'No': 0, 'Yes': 1}
df['Family History'] = df['Family History'].map(family_history_codes)

In [60]:
# Check which labels occur in 'Race/Ethnicity' before one-hot encoding it.
df['Race/Ethnicity'].unique()

array(['Asian', 'Caucasian', 'African American'], dtype=object)

In [61]:
# One-hot encode 'Race/Ethnicity'. The encoder expects a 2-D input, hence the single-column reshape.
race = df['Race/Ethnicity'].to_numpy().reshape(-1, 1)
enc = OneHotEncoder(categories='auto').fit(race)

# Show the generated feature names, then densify the sparse encoder output into a DataFrame
# (its columns are positional integers until they are renamed).
new_features = enc.get_feature_names_out()
print(new_features)
new_race = pd.DataFrame(enc.transform(race).toarray())

['x0_African American' 'x0_Asian' 'x0_Caucasian']


In [62]:
# Label the one-hot columns from the encoder's own fitted category list instead of a
# hand-typed list, so the names can never drift out of order with the encoded columns.
new_race.columns = list(enc.categories_[0])

In [63]:
# Replace the text 'Race/Ethnicity' column with its one-hot indicator columns,
# which are appended on the right. The concat aligns rows on the index.
df = pd.concat([df.drop(columns='Race/Ethnicity'), new_race], axis=1)

In [64]:
# Confirm the one-hot race columns were appended and 'Race/Ethnicity' is gone.
df.head()

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis,African American,Asian,Caucasian
0,69,0,0,1,Underweight,Low,Sufficient,Sedentary,Yes,1.0,2,1,Yes,1,0.0,1.0,0.0
1,32,0,0,1,Underweight,Low,Sufficient,Sedentary,No,0.0,0,0,Yes,1,0.0,1.0,0.0
2,89,0,1,0,Normal,Adequate,Sufficient,Active,No,1.0,1,1,No,1,0.0,0.0,1.0
3,78,0,0,0,Underweight,Adequate,Insufficient,Sedentary,Yes,0.0,2,1,No,1,0.0,0.0,1.0
4,38,1,1,1,Normal,Low,Sufficient,Active,Yes,0.0,2,0,Yes,1,1.0,0.0,0.0


In [65]:
# Check which labels occur in 'Body Weight' before encoding it.
df['Body Weight'].unique()

array(['Underweight', 'Normal'], dtype=object)

In [66]:
# Binary-encode body weight: Underweight -> 1, Normal -> 0.
body_weight_codes = {'Underweight': 1, 'Normal': 0}
df['Body Weight'] = df['Body Weight'].map(body_weight_codes)

In [67]:
# Binary-encode calcium intake: Low -> 1, Adequate -> 0.
calcium_codes = {'Low': 1, 'Adequate': 0}
df['Calcium Intake'] = df['Calcium Intake'].map(calcium_codes)

In [68]:
# Preview the table after encoding 'Body Weight' and 'Calcium Intake'.
df.head()

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis,African American,Asian,Caucasian
0,69,0,0,1,1,1,Sufficient,Sedentary,Yes,1.0,2,1,Yes,1,0.0,1.0,0.0
1,32,0,0,1,1,1,Sufficient,Sedentary,No,0.0,0,0,Yes,1,0.0,1.0,0.0
2,89,0,1,0,0,0,Sufficient,Active,No,1.0,1,1,No,1,0.0,0.0,1.0
3,78,0,0,0,1,0,Insufficient,Sedentary,Yes,0.0,2,1,No,1,0.0,0.0,1.0
4,38,1,1,1,0,1,Sufficient,Active,Yes,0.0,2,0,Yes,1,1.0,0.0,0.0


In [69]:
# Check which labels occur in 'Vitamin D Intake' before encoding it.
df['Vitamin D Intake'].unique()

array(['Sufficient', 'Insufficient'], dtype=object)

In [70]:
# Binary-encode vitamin D intake: Insufficient -> 1, Sufficient -> 0.
vitamin_d_codes = {'Insufficient': 1, 'Sufficient': 0}
df['Vitamin D Intake'] = df['Vitamin D Intake'].map(vitamin_d_codes)

In [71]:
# Check which labels occur in 'Physical Activity' before encoding it.
df['Physical Activity'].unique()

array(['Sedentary', 'Active'], dtype=object)

In [72]:
# Binary-encode physical activity: Active -> 0, Sedentary -> 1.
activity_codes = {'Active': 0, 'Sedentary': 1}
df['Physical Activity'] = df['Physical Activity'].map(activity_codes)

In [73]:
# Preview the table after encoding 'Vitamin D Intake' and 'Physical Activity'.
df.head()

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis,African American,Asian,Caucasian
0,69,0,0,1,1,1,0,1,Yes,1.0,2,1,Yes,1,0.0,1.0,0.0
1,32,0,0,1,1,1,0,1,No,0.0,0,0,Yes,1,0.0,1.0,0.0
2,89,0,1,0,0,0,0,0,No,1.0,1,1,No,1,0.0,0.0,1.0
3,78,0,0,0,1,0,1,1,Yes,0.0,2,1,No,1,0.0,0.0,1.0
4,38,1,1,1,0,1,0,0,Yes,0.0,2,0,Yes,1,1.0,0.0,0.0


In [74]:
# Binary-encode smoking status: No -> 0, Yes -> 1.
smoking_codes = {'No': 0, 'Yes': 1}
df['Smoking'] = df['Smoking'].map(smoking_codes)

In [75]:
# Check which labels occur in 'Prior Fractures' before encoding it.
df['Prior Fractures'].unique()

array(['Yes', 'No'], dtype=object)

In [76]:
# Binary-encode prior fractures: No -> 0, Yes -> 1.
prior_fracture_codes = {'No': 0, 'Yes': 1}
df['Prior Fractures'] = df['Prior Fractures'].map(prior_fracture_codes)

In [77]:
# Final preview: every column should now be numeric.
df.head()

Unnamed: 0,Age,Gender,Hormonal Changes,Family History,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis,African American,Asian,Caucasian
0,69,0,0,1,1,1,0,1,1,1.0,2,1,1,1,0.0,1.0,0.0
1,32,0,0,1,1,1,0,1,0,0.0,0,0,1,1,0.0,1.0,0.0
2,89,0,1,0,0,0,0,0,0,1.0,1,1,0,1,0.0,0.0,1.0
3,78,0,0,0,1,0,1,1,1,0.0,2,1,0,1,0.0,0.0,1.0
4,38,1,1,1,0,1,0,0,1,0.0,2,0,1,1,1.0,0.0,0.0


In [78]:
# Separate the encoded table into the feature frame X and the target vector y.
# NOTE(review): 'Gender' stays in X although the experiments below are run separately per
# gender group, so that feature is constant within each group — confirm this is intended.
target_col = 'Osteoporosis'
X = df.drop(columns=target_col)
y = df[target_col].values

In [79]:
# Filter out low-variance features (threshold 0.1), printing the shape before and after
# so that any removed column is visible.
print(X.shape)
selector = VarianceThreshold(threshold=0.1).fit(X)
X = selector.transform(X)
print(X.shape)

(1958, 16)
(1958, 16)


In [80]:
# Boolean row masks for the two gender groups (0 = Female, 1 = Male).
gender_0_mask = df['Gender'].eq(0)
gender_1_mask = df['Gender'].eq(1)

# Group sizes, reported as a quick balance check.
count_gender_0 = gender_0_mask.sum()
count_gender_1 = gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise all features using statistics from the pooled data (both groups together).
# NOTE(review): the scaler is fitted on every row before any cross-validation split. If
# fl.run_experiment creates its folds from these arrays, the held-out folds have already
# influenced the scaling statistics — confirm whether scaling should be done per fold.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Slice the features and the target into per-group arrays using the masks.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  966
Male:  992


In [81]:
# Shared experiment configuration: one seed drives both the model construction and the
# shuffled 10-fold splitter, so both are reproducible.
seed = 42
kf = KFold(n_splits=10, shuffle=True, random_state=seed)
models = fl.build_models(seed)

# Accumulator handed to fl.run_experiment; its contents are concatenated after the runs.
results_list = []

In [82]:
# Start from an empty accumulator so that re-running this cell does not stack a second copy
# of every fold onto the results of the previous run (which would duplicate rows in the
# table below and in the statistical tests). clear() keeps the same list object.
results_list.clear()

# Run the cross-validated experiments separately for each gender group; run_experiment is
# handed results_list and is expected to add its result frames to it.
print("Starting experiments for Female(0)")
fl.run_experiment(kf, models, X_scaled_Gender_0, y_Gender_0, 'Female', results_list)

print("Starting experiments for Male(1)")
fl.run_experiment(kf, models, X_scaled_Gender_1, y_Gender_1, 'Male', results_list)

# Stack the collected frames into a single table with a fresh 0..n-1 index.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 6 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 7 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 8 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 9 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 10 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
    Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0      1  Female  0.829268  0.982143  0.017857  0.170732      34      55   
1      2  Female  0.681818  0.962264  0.037736  0.318182      30      51   
2      3  Female  0.736842  0.900000  0.100000  0.263158      42      36   
3      4  Female  0.685185  0.953488  0.046512  0.314815      37      41   
4      5  Female  0.807692  0.933333  0.066667  0.192308      42      42   
5      6  Female  0.830189  0.977273  0.022727  0.169811      44      43   
6      7  Female  0.690476  0.962963  0.037037  0.309524      29      52   
7      8  Female  0.736842  0.982759  0.017241  0.263158      28      57   
8      



In [83]:
# Combine the collected result frames into one table for export.
# NOTE(review): this repeats the concat already stored in final_results_df by the previous cell.
results_df = pd.concat(results_list, ignore_index=True)

In [84]:
# Persist the per-fold metrics to an Excel workbook; the next cell reads it back.
# NOTE(review): the file is named 'K50' while the KFold above uses n_splits=10 — confirm the name.
result_path = './results/K50_result.xlsx'
# Create the output folder first: to_excel does not create missing directories and would
# fail on a fresh checkout where './results' does not exist yet.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
results_df.to_excel(result_path, index=False)

In [85]:
# Reload the saved metrics from disk and preview them.
# NOTE(review): this rebinds `df`, which until here held the encoded dataset. Earlier cells
# that read df (e.g. the gender masks) would see the results table if re-run afterwards —
# consider using a separate name for the results.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,0.829268,0.982143,0.017857,0.170732,34,55,1,7,...,8,4,0.878049,0.982143,0.017857,0.121951,36,55,1,5
1,2,Female,0.681818,0.962264,0.037736,0.318182,30,51,2,14,...,1,11,0.75,0.962264,0.037736,0.25,33,51,2,11
2,3,Female,0.736842,0.9,0.1,0.263158,42,36,4,15,...,6,14,0.77193,0.925,0.075,0.22807,44,37,3,13
3,4,Female,0.685185,0.953488,0.046512,0.314815,37,41,2,17,...,7,14,0.759259,0.953488,0.046512,0.240741,41,41,2,13
4,5,Female,0.807692,0.933333,0.066667,0.192308,42,42,3,10,...,9,8,0.826923,0.933333,0.066667,0.173077,43,42,3,9


In [86]:
# Run the group-comparison statistical tests once per classifier. `label` is the group
# name forwarded to fl.perform_t_tests; `df` is the reloaded per-fold results table.
label = 'Female'

# One loop replaces seven copy-pasted calls; the order matches the original cell, so the
# printed reports appear in the same sequence.
for model_name in ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB'):
    fl.perform_t_tests(df, model_name, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(57.0), pvalue=np.float64(0.622783737416428))
SVM - FPR: TtestResult(statistic=np.float64(-1.6868512650806007), pvalue=np.float64(0.1088853295282254), df=np.float64(18.0))
SVM - FN/FP: TtestResult(statistic=np.float64(1.00390776415118), pvalue=np.float64(0.3287287314287508), df=np.float64(18.0))
DT -TPR: TtestResult(statistic=np.float64(0.5134734782289122), pvalue=np.float64(0.6163589779926506), df=np.float64(12.81316790598145))
DT - FPR: TtestResult(statistic=np.float64(-1.4137977289230907), pvalue=np.float64(0.1744837856924343), df=np.float64(18.0))
DT - FN/FP: TtestResult(statistic=np.float64(0.023175434400842538), pvalue=np.float64(0.9817653205211401), df=np.float64(18.0))
RF -TPR: TtestResult(statistic=np.float64(0.3127971047781034), pvalue=np.float64(0.7580298783076265), df=np.float64(18.0))
RF - FPR: TtestResult(statistic=np.float64(-0.27908615271753967), pvalue=np.float64(0.7833599809278983), df=np.float64(18.0))
RF - FN/FP: Mann