In [1]:
from ucimlrepo import fetch_ucirepo

# Pull dataset id 419 (children's ASD screening data) from the UCI ML Repository.
autistic_spectrum_disorder_screening_data_for_children = fetch_ucirepo(id=419)

# Unpack the feature and target tables (pandas DataFrames).
X, y = (
    autistic_spectrum_disorder_screening_data_for_children.data.features,
    autistic_spectrum_disorder_screening_data_for_children.data.targets,
)

# Show the dataset-level metadata first, then the per-variable descriptions.
print(autistic_spectrum_disorder_screening_data_for_children.metadata)
print(autistic_spectrum_disorder_screening_data_for_children.variables)


{'uci_id': 419, 'name': 'Autistic Spectrum Disorder Screening Data for Children  ', 'repository_url': 'https://archive.ics.uci.edu/dataset/419/autistic+spectrum+disorder+screening+data+for+children', 'data_url': 'https://archive.ics.uci.edu/static/public/419/data.csv', 'abstract': 'Children screening data for autism suitable for classification and predictive tasks ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 292, 'num_features': 20, 'feature_types': ['Integer'], 'demographics': ['\x00', 'Age', 'Gender', 'Ethnicity', 'Nationality'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2017, 'last_updated': 'Wed Apr 03 2024', 'dataset_doi': '10.24432/C5659W', 'creators': ['Fadi Thabtah'], 'intro_paper': None, 'additional_info': {'summary': "see attached file for variables' description ", 'purpose': None, 'funded_by': None, 'instances_repr

In [2]:
import pandas as pd

# Wrap the feature and target tables and join them column-wise into one frame.
# The former mid-cell `feature.head()` was removed: only a cell's final
# expression is rendered, so that call's result was silently discarded.
feature = pd.DataFrame(X)
target = pd.DataFrame(y)
df = pd.concat([feature, target], axis=1)
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jaundice,autism,country_of_res,used_app_before,result,age_desc,relation,class
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5,'4-11 years',Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,'Middle Eastern ',no,no,Jordan,no,5,'4-11 years',Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,,no,no,Jordan,yes,5,'4-11 years',,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,,yes,no,Jordan,no,4,'4-11 years',,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,'United States',no,10,'4-11 years',Parent,YES


In [3]:

from pathlib import Path

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

import fairtl_statisticaltest as fl

In [4]:
# Dimensions of the combined frame as (rows, columns).
df.shape

(292, 21)

In [5]:
# Per-column non-null counts and dtypes, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         292 non-null    int64  
 1   A2_Score         292 non-null    int64  
 2   A3_Score         292 non-null    int64  
 3   A4_Score         292 non-null    int64  
 4   A5_Score         292 non-null    int64  
 5   A6_Score         292 non-null    int64  
 6   A7_Score         292 non-null    int64  
 7   A8_Score         292 non-null    int64  
 8   A9_Score         292 non-null    int64  
 9   A10_Score        292 non-null    int64  
 10  age              288 non-null    float64
 11  gender           292 non-null    object 
 12  ethnicity        249 non-null    object 
 13  jaundice         292 non-null    object 
 14  autism           292 non-null    object 
 15  country_of_res   292 non-null    object 
 16  used_app_before  292 non-null    object 
 17  result          

In [6]:
# Discard every row that contains a missing value, renumber the index from 0,
# then re-inspect the column summary.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         248 non-null    int64  
 1   A2_Score         248 non-null    int64  
 2   A3_Score         248 non-null    int64  
 3   A4_Score         248 non-null    int64  
 4   A5_Score         248 non-null    int64  
 5   A6_Score         248 non-null    int64  
 6   A7_Score         248 non-null    int64  
 7   A8_Score         248 non-null    int64  
 8   A9_Score         248 non-null    int64  
 9   A10_Score        248 non-null    int64  
 10  age              248 non-null    float64
 11  gender           248 non-null    object 
 12  ethnicity        248 non-null    object 
 13  jaundice         248 non-null    object 
 14  autism           248 non-null    object 
 15  country_of_res   248 non-null    object 
 16  used_app_before  248 non-null    object 
 17  result          

In [7]:
# Preview the first rows now that rows with missing values are gone.
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jaundice,autism,country_of_res,used_app_before,result,age_desc,relation,class
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5,'4-11 years',Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,'Middle Eastern ',no,no,Jordan,no,5,'4-11 years',Parent,NO
2,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,'United States',no,10,'4-11 years',Parent,YES
3,1,0,1,1,1,1,0,1,0,1,...,m,White-European,no,no,'United Kingdom',no,7,'4-11 years',Parent,YES
4,1,1,1,1,1,1,1,1,0,0,...,f,'Middle Eastern ',no,no,Bahrain,no,8,'4-11 years',Parent,YES


In [8]:
# Encode gender numerically: 'f' -> 0, 'm' -> 1.
gender_codes = {'f': 0, 'm': 1}
df['gender'] = df['gender'].map(gender_codes)

In [9]:
# One-hot encode ethnicity; the encoder is given a single 2-D column.
race = df['ethnicity'].to_numpy().reshape(-1, 1)
enc = OneHotEncoder(categories='auto')
race_encoded = enc.fit_transform(race)

# Show the generated feature names so the learned category order is visible.
new_features = enc.get_feature_names_out()
print(new_features)

# Densify the encoder output into a plain DataFrame (default integer columns).
new_race = pd.DataFrame(race_encoded.toarray())

["x0_'Middle Eastern '" "x0_'South Asian'" 'x0_Asian' 'x0_Black'
 'x0_Hispanic' 'x0_Latino' 'x0_Others' 'x0_Pasifika' 'x0_Turkish'
 'x0_White-European']


In [10]:
# Name the one-hot columns after the categories the encoder actually learned,
# stripping the stray quotes/spaces present in some raw labels. A hand-typed
# list would silently mislabel columns if the category set or order changed.
new_race.columns = [str(category).strip("' ") for category in enc.categories_[0]]

In [11]:
# Swap the raw ethnicity column for its one-hot indicator columns
# (indicator columns are appended on the right).
df = pd.concat([df.drop(columns='ethnicity'), new_race], axis=1)

In [12]:
# Re-check columns and dtypes after replacing ethnicity with indicator columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         248 non-null    int64  
 1   A2_Score         248 non-null    int64  
 2   A3_Score         248 non-null    int64  
 3   A4_Score         248 non-null    int64  
 4   A5_Score         248 non-null    int64  
 5   A6_Score         248 non-null    int64  
 6   A7_Score         248 non-null    int64  
 7   A8_Score         248 non-null    int64  
 8   A9_Score         248 non-null    int64  
 9   A10_Score        248 non-null    int64  
 10  age              248 non-null    float64
 11  gender           248 non-null    int64  
 12  jaundice         248 non-null    object 
 13  autism           248 non-null    object 
 14  country_of_res   248 non-null    object 
 15  used_app_before  248 non-null    object 
 16  result           248 non-null    int64  
 17  age_desc        

In [13]:
# Encode the three yes/no flag columns as 0/1 using one shared mapping.
yes_no_codes = {'no': 0, 'yes': 1}
for flag_column in ('jaundice', 'autism', 'used_app_before'):
    df[flag_column] = df[flag_column].map(yes_no_codes)

In [14]:
# Remove the country and age-description columns from the frame in place.
df.drop(columns=['country_of_res', 'age_desc'], inplace=True)

In [15]:
# List the distinct `relation` labels to check for inconsistent spellings.
df['relation'].unique()

array(['Parent', 'Self', 'Relative', "'Health care professional'", 'self'],
      dtype=object)

In [16]:
# Fold the lowercase 'self' spelling into 'Self' so both count as one category.
df['relation'] = df['relation'].replace({'self': 'Self'})

In [17]:
# One-hot encode relation; the encoder is given a single 2-D column.
relation = df['relation'].to_numpy().reshape(-1, 1)
enc2 = OneHotEncoder(categories='auto')
relation_encoded = enc2.fit_transform(relation)

# Show the generated feature names so the learned category order is visible.
new_features = enc2.get_feature_names_out()
print(new_features)

# Densify the encoder output into a plain DataFrame (default integer columns).
new_relation = pd.DataFrame(relation_encoded.toarray())


["x0_'Health care professional'" 'x0_Parent' 'x0_Relative' 'x0_Self']


In [18]:
# Name the one-hot columns after the categories the encoder actually learned,
# stripping the stray quotes/spaces present in some raw labels. A hand-typed
# list would silently mislabel columns if the category set or order changed.
new_relation.columns = [str(category).strip("' ") for category in enc2.categories_[0]]

In [19]:
# Swap the raw relation column for its one-hot indicator columns
# (indicator columns are appended on the right).
df = pd.concat([df.drop(columns='relation'), new_relation], axis=1)

In [20]:
# List the distinct target labels before encoding them numerically.
df['class'].unique()

array(['NO', 'YES'], dtype=object)

In [21]:
# Encode the target numerically: 'NO' -> 0, 'YES' -> 1.
class_codes = {'NO': 0, 'YES': 1}
df['class'] = df['class'].map(class_codes)

In [22]:
# Separate the predictors (every column except the target) from the labels.
# NOTE(review): `result` stays in X; in the previewed rows it equals the sum of
# A1..A10 and tracks `class` exactly, so it may leak the target - confirm.
X = df.drop(columns='class')
y = df['class'].to_numpy()

In [23]:
# Filter out low-variance features (threshold 0.1), printing the shape before
# and after. X is rebound to the transformer's output.
print(X.shape)
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
X = selector.transform(X)
print(X.shape)

(248, 30)
(248, 18)


In [24]:
# Boolean row selectors per gender code (0 is reported as Female, 1 as Male).
gender_0_mask = df['gender'].eq(0)
gender_1_mask = df['gender'].eq(1)

count_gender_0, count_gender_1 = gender_0_mask.sum(), gender_1_mask.sum()

print("Female: ", count_gender_0)
print("Male: ", count_gender_1)

# Standardise the features using statistics computed over all rows.
# NOTE(review): the scaler is fitted before any cross-validation split, so
# held-out rows contribute to the mean/std - confirm this is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Partition the scaled features and the labels by gender.
X_scaled_Gender_0, y_Gender_0 = X_scaled[gender_0_mask], y[gender_0_mask]
X_scaled_Gender_1, y_Gender_1 = X_scaled[gender_1_mask], y[gender_1_mask]

Female:  74
Male:  174


In [25]:
# Single seed shared by the model factory and the fold shuffling.
seed = 42
# Helper from fairtl_statisticaltest; presumably returns the classifiers to
# evaluate - confirm against that module.
models = fl.build_models(seed)

# Accumulator later handed to fl.run_experiment and concatenated into one frame.
results_list = []

# 5-fold cross-validation with shuffling, seeded for repeatability.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

In [26]:
# Start from an empty accumulator so that re-running this cell cannot append a
# second copy of every fold's results (which would duplicate rows in the
# concatenated frame and in anything computed from it).
results_list.clear()

# Run the same experiment once per gender group, Female (0) then Male (1).
# fl.run_experiment is given results_list; the concat below relies on it
# having added its per-fold result frames there.
for group_name, group_code, X_group, y_group in (
    ('Female', 0, X_scaled_Gender_0, y_Gender_0),
    ('Male', 1, X_scaled_Gender_1, y_Gender_1),
):
    print(f"Starting experiments for {group_name}({group_code})")
    fl.run_experiment(kf, models, X_group, y_group, group_name, results_list)

# Stack all collected frames into one table and show it.
final_results_df = pd.concat(results_list, ignore_index=True)
print(final_results_df)

Starting experiments for Female(0)
Processing fold 1 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 2 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 3 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Processing fold 5 for group Female
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
Starting experiments for Male(1)
Processing fold 1 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF




Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 2 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 3 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 4 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN




Training and evaluating model: NB
Processing fold 5 for group Male
Training and evaluating model: SVM
Training and evaluating model: LR
Training and evaluating model: KNN
Training and evaluating model: RF
Training and evaluating model: DT
Training and evaluating model: ANN
Training and evaluating model: NB
   Fold   Group   SVM_TPR   SVM_TNR   SVM_FPR   SVM_FNR  SVM_TP  SVM_TN  \
0     1  Female  1.000000  1.000000  0.000000  0.000000       8       7   
1     2  Female  1.000000  0.625000  0.375000  0.000000       7       5   
2     3  Female  1.000000  0.750000  0.250000  0.000000       7       6   
3     4  Female  0.833333  1.000000  0.000000  0.166667       5       9   
4     5  Female  0.875000  1.000000  0.000000  0.125000       7       6   
5     1    Male  1.000000  0.947368  0.052632  0.000000      16      18   
6     2    Male  0.941176  0.777778  0.222222  0.058824      16      14   
7     3    Male  1.000000  1.000000  0.000000  0.000000      19      16   
8     4    Male  



In [27]:
# Stack all collected result frames into a single table with a fresh index.
results_df = pd.concat(results_list, ignore_index=True)

In [28]:
# Persist the per-fold results as an Excel workbook.
result_path = './results/U6_result.xlsx'
# to_excel does not create missing folders, so make sure ./results exists;
# otherwise a fresh checkout fails here.
Path(result_path).parent.mkdir(parents=True, exist_ok=True)
results_df.to_excel(result_path, index=False)

In [29]:
# Read the saved results back from disk and preview them.
# NOTE: this rebinds `df`, which until now held the screening data set.
df = pd.read_excel(result_path)
df.head()

Unnamed: 0,Fold,Group,SVM_TPR,SVM_TNR,SVM_FPR,SVM_FNR,SVM_TP,SVM_TN,SVM_FP,SVM_FN,...,ANN_FP,ANN_FN,NB_TPR,NB_TNR,NB_FPR,NB_FNR,NB_TP,NB_TN,NB_FP,NB_FN
0,1,Female,1.0,1.0,0.0,0.0,8,7,0,0,...,0,0,1.0,1.0,0.0,0.0,8,7,0,0
1,2,Female,1.0,0.625,0.375,0.0,7,5,3,0,...,2,0,0.571429,0.75,0.25,0.428571,4,6,2,3
2,3,Female,1.0,0.75,0.25,0.0,7,6,2,0,...,0,0,0.714286,1.0,0.0,0.285714,5,8,0,2
3,4,Female,0.833333,1.0,0.0,0.166667,5,9,0,1,...,0,1,0.833333,1.0,0.0,0.166667,5,9,0,1
4,5,Female,0.875,1.0,0.0,0.125,7,6,0,1,...,0,1,0.875,1.0,0.0,0.125,7,6,0,1


In [30]:
# Group label passed to every call (its role inside perform_t_tests is not
# visible from this notebook).
label = 'Female'

# Run the same test routine for each classifier, in the original order.
# The last call stays as the cell's final expression so that any value it
# returns is still rendered exactly as before.
*leading_models, last_model = ('SVM', 'DT', 'RF', 'LR', 'KNN', 'ANN', 'NB')
for model_name in leading_models:
    fl.perform_t_tests(df, model_name, label)
fl.perform_t_tests(df, last_model, label)


SVM -TPR: MannwhitneyuResult(statistic=np.float64(9.0), pvalue=np.float64(0.440686016488678))
SVM - FPR: MannwhitneyuResult(statistic=np.float64(13.0), pvalue=np.float64(1.0))
SVM - FN/FP: MannwhitneyuResult(statistic=np.float64(16.0), pvalue=np.float64(0.4385780260809998))
DT -TPR: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
DT - FPR: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
DT - FN/FP: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
RF -TPR: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
RF - FPR: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
RF - FN/FP: MannwhitneyuResult(statistic=np.float64(12.5), pvalue=np.float64(1.0))
LR -TPR: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
LR - FPR: MannwhitneyuResult(statistic=np.float64(15.0), pvalue=np.float64(0.4237107971667934))
LR - FN/FP: MannwhitneyuResult(statistic=np.fl