## Load libraries and data

In [1]:
import os
import numpy as np
import pandas as pd

from modules.preprocessing import (
    label_classes,
    preprocess_dataset,
    remove_duplicates_and_columns,
    save_processed_dataset,
)
from modules.utils import load_dataset

In [2]:
# Load files and extract dataset names
raw_files_dir = '../data/raw'
train_files_dir = '../data/processed/train'
test_files_dir = '../data/processed/test'
dataset_files = os.listdir(raw_files_dir)
dataset_names = [filename.split('.csv.gz')[0] for filename in dataset_files]

In [3]:
dataset_names

['kaggle_credit_card_fraud',
 'kaggle_patient_survival',
 'uci_android_permissions',
 'uci_breast_cancer',
 'uci_heart_disease',
 'uci_indian_liver',
 'uci_mushroom',
 'uci_phishing_url',
 'uci_secondary_mushroom',
 'uci_spect_heart']

## Kaggle Credit Card Fraud

In [4]:
# Load dataset
dataset_num = 0
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [5]:
# All columns are numeric
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   time    284807 non-null  float64
 1   v1      284807 non-null  float64
 2   v2      284807 non-null  float64
 3   v3      284807 non-null  float64
 4   v4      284807 non-null  float64
 5   v5      284807 non-null  float64
 6   v6      284807 non-null  float64
 7   v7      284807 non-null  float64
 8   v8      284807 non-null  float64
 9   v9      284807 non-null  float64
 10  v10     284807 non-null  float64
 11  v11     284807 non-null  float64
 12  v12     284807 non-null  float64
 13  v13     284807 non-null  float64
 14  v14     284807 non-null  float64
 15  v15     284807 non-null  float64
 16  v16     284807 non-null  float64
 17  v17     284807 non-null  float64
 18  v18     284807 non-null  float64
 19  v19     284807 non-null  float64
 20  v20     284807 non-null  float64
 21  v21     28

In [6]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [7]:
# No purely unique columns
pure_unique_columns

[]

In [8]:
# Only binary column is the target column
binary_columns

['class']

In [9]:
# Drop duplicates, no obvious columns to drop
df = remove_duplicates_and_columns(df)
df

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [10]:
# Class already labeled
df['class'].unique()

array([0, 1])

In [11]:
# Define target column and column types
y = 'class'
categorical_cols = []
ordinal_cols = []
numerical_cols = [col for col in df.columns if col != y]

In [12]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [13]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## Kaggle Patient Survival

In [14]:
# Load dataset
dataset_num = 1
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,66154,25312,118,68.0,22.730000,0,Caucasian,M,180.3,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,114252,59342,81,77.0,27.420000,0,Caucasian,F,160.0,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,119783,50777,118,25.0,31.950000,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,79267,46918,118,81.0,22.640000,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91708,91592,78108,30,75.0,23.060250,0,Caucasian,M,177.8,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,Sepsis,Cardiovascular,0
91709,66119,13486,121,56.0,47.179671,0,Caucasian,F,183.0,Floor,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
91710,8981,58179,195,48.0,27.236914,0,Caucasian,M,170.2,Accident & Emergency,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
91711,33776,120598,66,,23.297481,0,Caucasian,F,154.9,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0


In [15]:
# Mixture of data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91713 entries, 0 to 91712
Data columns (total 84 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   encounter_id                   91713 non-null  int64  
 1   patient_id                     91713 non-null  int64  
 2   hospital_id                    91713 non-null  int64  
 3   age                            87485 non-null  float64
 4   bmi                            88284 non-null  float64
 5   elective_surgery               91713 non-null  int64  
 6   ethnicity                      90318 non-null  object 
 7   gender                         91688 non-null  object 
 8   height                         90379 non-null  float64
 9   icu_admit_source               91601 non-null  object 
 10  icu_id                         91713 non-null  int64  
 11  icu_stay_type                  91713 non-null  object 
 12  icu_type                       91713 non-null 

In [16]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [17]:
# These two are ID columns and should be dropped
pure_unique_columns

['encounter_id', 'patient_id']

In [18]:
# All of these should be ordinally encoded except for the target which is hospital death
binary_columns

['elective_surgery',
 'gender',
 'apache_post_operative',
 'arf_apache',
 'gcs_unable_apache',
 'intubated_apache',
 'ventilated_apache',
 'aids',
 'cirrhosis',
 'diabetes_mellitus',
 'hepatic_failure',
 'immunosuppression',
 'leukemia',
 'lymphoma',
 'solid_tumor_with_metastasis',
 'hospital_death']

In [19]:
# No ordinal columns in the object columns
df.select_dtypes('object')

Unnamed: 0,ethnicity,gender,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem
0,Caucasian,M,Floor,admit,CTICU,Sepsis,Cardiovascular
1,Caucasian,F,Floor,admit,Med-Surg ICU,Respiratory,Respiratory
2,Caucasian,F,Accident & Emergency,admit,Med-Surg ICU,Metabolic,Metabolic
3,Caucasian,F,Operating Room / Recovery,admit,CTICU,Cardiovascular,Cardiovascular
4,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Trauma,Trauma
...,...,...,...,...,...,...,...
91708,Caucasian,M,Floor,admit,Cardiac ICU,Sepsis,Cardiovascular
91709,Caucasian,F,Floor,admit,Med-Surg ICU,Sepsis,Cardiovascular
91710,Caucasian,M,Accident & Emergency,admit,Med-Surg ICU,Metabolic,Metabolic
91711,Caucasian,F,Accident & Emergency,admit,Med-Surg ICU,Respiratory,Respiratory


In [20]:
# Drop duplicates and ID columns
df = remove_duplicates_and_columns(df, columns_to_drop = pure_unique_columns)
df

Unnamed: 0,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,icu_id,icu_stay_type,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,118,68.0,22.730000,0,Caucasian,M,180.3,Floor,92,admit,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,81,77.0,27.420000,0,Caucasian,F,160.0,Floor,90,admit,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,118,25.0,31.950000,0,Caucasian,F,172.7,Accident & Emergency,93,admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,118,81.0,22.640000,1,Caucasian,F,165.1,Operating Room / Recovery,92,admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
4,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,91,admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91708,30,75.0,23.060250,0,Caucasian,M,177.8,Floor,927,admit,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,Sepsis,Cardiovascular,0
91709,121,56.0,47.179671,0,Caucasian,F,183.0,Floor,925,admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
91710,195,48.0,27.236914,0,Caucasian,M,170.2,Accident & Emergency,908,admit,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
91711,66,,23.297481,0,Caucasian,F,154.9,Accident & Emergency,922,admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0


In [21]:
# Class already labeled
df['hospital_death'].unique()

array([0, 1])

In [22]:
# Define target column and column types
# Column types determined using data dictionary from: https://www.kaggle.com/datasets/mitishaagarwal/patient/data
y = 'hospital_death'
categorical_cols = [col for col in df.select_dtypes('object').columns if col not in binary_columns and col != y]\
                    + ['hospital_id', 'icu_id', 'apache_3j_diagnosis']
ordinal_cols = [col for col in binary_columns if col != y]\
                    + ['apache_2_diagnosis', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_verbal_apache']
numerical_cols = [col for col in df.columns if col not in categorical_cols and col not in ordinal_cols and col != y]

In [23]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [24]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## UCI Android Permissions

In [25]:
# Load dataset
dataset_num = 2
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,android.permission.get_accounts,com.sonyericsson.home.permission.broadcast_badge,android.permission.read_profile,android.permission.manage_accounts,android.permission.write_sync_settings,android.permission.read_external_storage,android.permission.receive_sms,com.android.launcher.permission.read_settings,android.permission.write_settings,com.google.android.providers.gsf.permission.read_gservices,...,com.android.launcher.permission.uninstall_shortcut,com.sec.android.iap.permission.billing,com.htc.launcher.permission.update_shortcut,com.sec.android.provider.badge.permission.write,android.permission.access_network_state,com.google.android.finsky.permission.bind_get_install_referrer_service,com.huawei.android.launcher.permission.read_settings,android.permission.read_sms,android.permission.process_incoming_calls,result
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29327,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
29328,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
29329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
29330,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# All numeric data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29332 entries, 0 to 29331
Data columns (total 87 columns):
 #   Column                                                                         Non-Null Count  Dtype
---  ------                                                                         --------------  -----
 0   android.permission.get_accounts                                                29332 non-null  int64
 1   com.sonyericsson.home.permission.broadcast_badge                               29332 non-null  int64
 2   android.permission.read_profile                                                29332 non-null  int64
 3   android.permission.manage_accounts                                             29332 non-null  int64
 4   android.permission.write_sync_settings                                         29332 non-null  int64
 5   android.permission.read_external_storage                                       29332 non-null  int64
 6   android.permission.receive_sms        

In [27]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [28]:
# No purely unique columns
pure_unique_columns

[]

In [29]:
# All of these should be ordinally encoded except for the target which is result
binary_columns

['android.permission.get_accounts',
 'com.sonyericsson.home.permission.broadcast_badge',
 'android.permission.read_profile',
 'android.permission.manage_accounts',
 'android.permission.write_sync_settings',
 'android.permission.read_external_storage',
 'android.permission.receive_sms',
 'com.android.launcher.permission.read_settings',
 'android.permission.write_settings',
 'com.google.android.providers.gsf.permission.read_gservices',
 'android.permission.download_without_notification',
 'android.permission.get_tasks',
 'android.permission.write_external_storage',
 'android.permission.record_audio',
 'com.huawei.android.launcher.permission.change_badge',
 'com.oppo.launcher.permission.read_settings',
 'android.permission.change_network_state',
 'com.android.launcher.permission.install_shortcut',
 'android.permission.android.permission.read_phone_state',
 'android.permission.call_phone',
 'android.permission.write_contacts',
 'android.permission.read_phone_state',
 'com.samsung.android.p

In [30]:
# All features are actually categorical
len(binary_columns)

87

In [31]:
# Note that because there is no identifier column, due to the nature of the data there are many duplicate entries
# Presumably these are different applications and the app names were scrubbed from the dataset, so ignore duplicates
# df = remove_duplicates_and_columns(df)

In [32]:
# Class already labeled
df['result'].unique()

array([0, 1])

In [33]:
# Define target column and column types
y = 'result'
categorical_cols = []
ordinal_cols = [col for col in binary_columns if col != y]
numerical_cols = []

In [34]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [35]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## UCI Breast Cancer

In [36]:
# Load dataset
dataset_num = 3
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,patient_id,diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [37]:
# All numeric data types except for diagnosis which is the target
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   patient_id          569 non-null    int64  
 1   diagnosis           569 non-null    object 
 2   radius1             569 non-null    float64
 3   texture1            569 non-null    float64
 4   perimeter1          569 non-null    float64
 5   area1               569 non-null    float64
 6   smoothness1         569 non-null    float64
 7   compactness1        569 non-null    float64
 8   concavity1          569 non-null    float64
 9   concave_points1     569 non-null    float64
 10  symmetry1           569 non-null    float64
 11  fractal_dimension1  569 non-null    float64
 12  radius2             569 non-null    float64
 13  texture2            569 non-null    float64
 14  perimeter2          569 non-null    float64
 15  area2               569 non-null    float64
 16  smoothne

In [38]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [39]:
# Patient ID is purely unique and should be dropped
pure_unique_columns

['patient_id']

In [40]:
# Only binary column is the target which is diagnosis
binary_columns

['diagnosis']

In [41]:
# Drop duplicates and id column
df = remove_duplicates_and_columns(df, columns_to_drop = pure_unique_columns)
df

Unnamed: 0,diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [42]:
# Class is not already labeled
df['diagnosis'].unique()

array(['M', 'B'], dtype=object)

In [43]:
# Beneficial masses labeled as 0, malignant masses as 1
df = label_classes(df, y = 'diagnosis', label_map = {'B': 0, 'M': 1})

In [44]:
# Define target column and column types
# Column types determined using data dictionary from: https://www.kaggle.com/datasets/mitishaagarwal/patient/data
y = 'diagnosis'
categorical_cols = []
ordinal_cols = []
numerical_cols = [col for col in df.columns if col != y]

In [45]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [46]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## UCI Heart Disease

In [47]:
# Load dataset
dataset_num = 4
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,resting_ecg,maximum_heart_rate,exercise_induced_angina,st_depression,slope,num_major_vessels,thal,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [48]:
# Mixed data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      303 non-null    float64
 1   sex                      303 non-null    float64
 2   chest_pain_type          303 non-null    float64
 3   resting_blood_pressure   303 non-null    float64
 4   cholestoral              303 non-null    float64
 5   fasting_blood_sugar      303 non-null    float64
 6   resting_ecg              303 non-null    float64
 7   maximum_heart_rate       303 non-null    float64
 8   exercise_induced_angina  303 non-null    float64
 9   st_depression            303 non-null    float64
 10  slope                    303 non-null    float64
 11  num_major_vessels        303 non-null    object 
 12  thal                     303 non-null    object 
 13  diagnosis                303 non-null    int64  
dtypes: float64(11), int64(1), 

In [49]:
# Notice that there is special category '?' for missing
df.num_major_vessels.unique()

array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)

In [50]:
# Notice that there is special category '?' for missing
df.thal.unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

In [51]:
# Replace placeholder values with NaN
df.num_major_vessels = df.num_major_vessels.replace('?', np.nan)
df.thal = df.thal.replace('?', np.nan)

In [52]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [53]:
# No purely unique columns
pure_unique_columns

[]

In [54]:
# All of these should be ordinally encoded except for the target which is diagnosis
binary_columns

['sex', 'fasting_blood_sugar', 'exercise_induced_angina', 'diagnosis']

In [55]:
# Drop duplicates
df = remove_duplicates_and_columns(df)
df

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,resting_ecg,maximum_heart_rate,exercise_induced_angina,st_depression,slope,num_major_vessels,thal,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [56]:
df.resting_ecg.unique()

array([2., 0., 1.])

In [57]:
# Class already labeled
df['diagnosis'].unique()

array([0, 1])

In [58]:
# Define target column and column types
# Column types determined using data dictionary from: https://archive.ics.uci.edu/dataset/45/heart+disease
y = 'diagnosis'
categorical_cols = ['chest_pain_type', 'resting_ecg', 'slope', 'num_major_vessels', 'thal']
ordinal_cols = [col for col in binary_columns if col != y]
numerical_cols = [col for col in df.columns if col not in categorical_cols and col not in ordinal_cols and col != y]

In [59]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [60]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## UCI Indian Liver

In [61]:
# Load dataset
dataset_num = 5
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminoransferase,aspartate_aminotransferase,total_proteins,albumin,almumin_globulin_ratio,has_liver_disease
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,0
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [62]:
# Mixed data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         583 non-null    int64  
 1   gender                      583 non-null    object 
 2   total_bilirubin             583 non-null    float64
 3   direct_bilirubin            583 non-null    float64
 4   alkaline_phosphotase        583 non-null    int64  
 5   alamine_aminoransferase     583 non-null    int64  
 6   aspartate_aminotransferase  583 non-null    int64  
 7   total_proteins              583 non-null    float64
 8   albumin                     583 non-null    float64
 9   almumin_globulin_ratio      579 non-null    float64
 10  has_liver_disease           583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [63]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [64]:
# No purely unique columns
pure_unique_columns

[]

In [65]:
# All of these should be ordinally encoded except for the target which is has_liver_disease
binary_columns

['gender', 'has_liver_disease']

In [66]:
# Remove duplicates
df = remove_duplicates_and_columns(df)
df

Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminoransferase,aspartate_aminotransferase,total_proteins,albumin,almumin_globulin_ratio,has_liver_disease
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,0
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [67]:
# Class already labeled
df['has_liver_disease'].unique()

array([1, 0])

In [68]:
# Define target column and column types
# Column types determined using data dictionary from: https://archive.ics.uci.edu/dataset/225/ilpd+indian+liver+patient+dataset
y = 'has_liver_disease'
categorical_cols = []
ordinal_cols = [col for col in binary_columns if col != y]
numerical_cols = [col for col in df.columns if col not in ordinal_cols and col != y]

In [69]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [70]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## UCI Mushroom

In [71]:
# Load dataset
dataset_num = 6
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,poisonous,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,_veil_color,ring_number,_ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [72]:
# All object data types, every feature is categorical
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   poisonous                 8124 non-null   object
 1   cap_shape                 8124 non-null   object
 2   cap_surface               8124 non-null   object
 3   cap_color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill_attachment           8124 non-null   object
 7   gill_spacing              8124 non-null   object
 8   gill_size                 8124 non-null   object
 9   gill_color                8124 non-null   object
 10  stalk_shape               8124 non-null   object
 11  stalk_root                8124 non-null   object
 12  stalk_surface_above_ring  8124 non-null   object
 13  stalk_surface_below_ring  8124 non-null   object
 14  stalk_color_above_ring  

In [73]:
# Some columns seem to be misnamed, fix the column names
df = df.rename({'_veil_color': 'veil_color', '_ring_type': 'ring_type'}, axis = 1)

In [74]:
# Replace missing value with nan using info from: https://archive.ics.uci.edu/dataset/73/mushroom
df.stalk_root = df.stalk_root.replace({'?': np.nan})

In [75]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [76]:
# No purely unique columns
pure_unique_columns

[]

In [77]:
# All of these should be ordinally encoded except for the target which is poisonous
binary_columns

['poisonous',
 'bruises',
 'gill_attachment',
 'gill_spacing',
 'gill_size',
 'stalk_shape']

In [78]:
# Remove duplicates
df = remove_duplicates_and_columns(df)
df

Unnamed: 0,poisonous,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [79]:
# Class not already labeled
df['poisonous'].unique()

array(['p', 'e'], dtype=object)

In [80]:
# Edible mushrooms labeled as 0, poisonous mushrooms labeled as 1
df = label_classes(df, y = 'poisonous', label_map = {'e': 0, 'p': 1})

In [81]:
# Define target column and column types
# Column types determined using data dictionary from: https://archive.ics.uci.edu/dataset/73/mushroom
y = 'poisonous'
categorical_cols = [col for col in df.columns if col not in binary_columns and col != y]
ordinal_cols = [col for col in binary_columns if col != y]
numerical_cols = []

In [82]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [83]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## UCI Phishing URL

In [15]:
# Load dataset
dataset_num = 7
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,file_name,url,url_length,domain,domain_length,is_domain_ip,tld,url_similarity_index,char_continuation_rate,tld_legitimate_prob,...,pay,crypto,has_copyright_info,num_image,num_css,num_js,num_self_ref,num_empty_ref,num_external_ref,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.000000,1.000000,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.000000,0.666667,0.032650,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.000000,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.000000,1.000000,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.000000,1.000000,0.079963,...,1,0,1,244,15,34,72,1,85,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235790,660997.txt,https://www.skincareliving.com,29,www.skincareliving.com,22,0,com,100.000000,1.000000,0.522907,...,1,0,1,51,7,21,187,2,191,1
235791,77185.txt,https://www.winchester.gov.uk,28,www.winchester.gov.uk,21,0,uk,100.000000,0.785714,0.028555,...,1,0,0,50,1,7,88,0,31,1
235792,622132.txt,https://www.nononsensedesign.be,30,www.nononsensedesign.be,23,0,be,100.000000,1.000000,0.003319,...,0,0,1,27,10,30,58,2,67,1
235793,7503962.txt,https://patient-cell-40f5.updatedlogmylogin.wo...,55,patient-cell-40f5.updatedlogmylogin.workers.dev,47,0,dev,28.157537,0.465116,0.000961,...,0,0,0,0,0,3,0,0,0,0


In [16]:
# Mixed data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235795 entries, 0 to 235794
Data columns (total 56 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   file_name                    235795 non-null  object 
 1   url                          235795 non-null  object 
 2   url_length                   235795 non-null  int64  
 3   domain                       235795 non-null  object 
 4   domain_length                235795 non-null  int64  
 5   is_domain_ip                 235795 non-null  int64  
 6   tld                          235795 non-null  object 
 7   url_similarity_index         235795 non-null  float64
 8   char_continuation_rate       235795 non-null  float64
 9   tld_legitimate_prob          235795 non-null  float64
 10  url_char_prob                235795 non-null  float64
 11  tld_length                   235795 non-null  int64  
 12  num_sub_domain               235795 non-null  int64  
 13 

In [17]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()
highly_unique_columns = unique_values[unique_values > df.shape[0] * 0.75].index.tolist()

In [21]:
unique_values

file_name                      235795
url                            235370
url_length                        482
domain                         220086
domain_length                     101
is_domain_ip                        2
tld                               695
url_similarity_index            36360
char_continuation_rate            898
tld_legitimate_prob               465
url_char_prob                  227421
tld_length                         12
num_sub_domain                     10
has_obfuscation                     2
num_obfuscated_char                20
obfuscation_ratio                 146
num_letters_url                   421
letter_ratio_url                  709
num_digits_url                    182
digit_ratio_url                   575
num_equals_url                     25
num_qmark_url                       5
num_amp_url                        31
num_other_special_chars_url        74
spacial_char_ratio_url            240
is_https                            2
line_of_code

In [18]:
# File names are purely unique and should be dropped
pure_unique_columns

['file_name']

In [19]:
# All of these should be ordinally encoded except for the target which is label
binary_columns

['is_domain_ip',
 'has_obfuscation',
 'is_https',
 'has_title',
 'has_favicon',
 'robots',
 'is_responsive',
 'num_url_redirect',
 'num_self_redirect',
 'has_description',
 'has_external_form_submit',
 'has_social_net',
 'has_submit_button',
 'has_hidden_fields',
 'has_password_field',
 'bank',
 'pay',
 'crypto',
 'has_copyright_info',
 'label']

In [22]:
# Note that there are some categorical columns with over 75% uniqueness for a large dataset
# These should be dropped because they are too specific
highly_unique_columns

['file_name', 'url', 'domain', 'url_char_prob', 'title']

In [23]:
# Drop duplicates and file names
df = remove_duplicates_and_columns(df, columns_to_drop = pure_unique_columns + highly_unique_columns)
df

Unnamed: 0,url_length,domain_length,is_domain_ip,tld,url_similarity_index,char_continuation_rate,tld_legitimate_prob,tld_length,num_sub_domain,has_obfuscation,...,pay,crypto,has_copyright_info,num_image,num_css,num_js,num_self_ref,num_empty_ref,num_external_ref,label
0,31,24,0,com,100.000000,1.000000,0.522907,3,1,0,...,0,0,1,34,20,28,119,0,124,1
1,23,16,0,de,100.000000,0.666667,0.032650,2,1,0,...,0,0,1,50,9,8,39,0,217,1
2,29,22,0,uk,100.000000,0.866667,0.028555,2,2,0,...,0,0,1,10,2,7,42,2,5,1
3,26,19,0,com,100.000000,1.000000,0.522907,3,1,0,...,1,1,1,3,27,15,22,1,31,1
4,33,26,0,org,100.000000,1.000000,0.079963,3,1,0,...,1,0,1,244,15,34,72,1,85,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235790,29,22,0,com,100.000000,1.000000,0.522907,3,1,0,...,1,0,1,51,7,21,187,2,191,1
235791,28,21,0,uk,100.000000,0.785714,0.028555,2,2,0,...,1,0,0,50,1,7,88,0,31,1
235792,30,23,0,be,100.000000,1.000000,0.003319,2,1,0,...,0,0,1,27,10,30,58,2,67,1
235793,55,47,0,dev,28.157537,0.465116,0.000961,3,2,0,...,0,0,0,0,0,3,0,0,0,0


In [24]:
# Class already labeled
df['label'].unique()

array([1, 0])

In [25]:
# Define target column and column types
# Column types determined using data dictionary from: https://archive.ics.uci.edu/dataset/967/phiusiil+phishing+url+dataset
y = 'label'
categorical_cols = ['tld']
ordinal_cols = [col for col in binary_columns if col != y]
numerical_cols = [col for col in df.columns if col not in categorical_cols and col not in ordinal_cols and col != y]

In [26]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [27]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## UCI Secondary Mushroom

In [94]:
# Load dataset
dataset_num = 8
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,class,cap_diameter,cap_shape,cap_surface,cap_color,does_bruise_or_bleed,gill_attachment,gill_spacing,gill_color,stem_height,...,stem_root,stem_surface,stem_color,veil_type,veil_color,has_ring,ring_type,spore_print_color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.60,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.80,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,s,y,f,f,f,f,3.93,...,,,y,,,f,f,,d,a
61065,p,1.27,f,s,y,f,f,f,f,3.18,...,,,y,,,f,f,,d,a
61066,p,1.27,s,s,y,f,f,f,f,3.86,...,,,y,,,f,f,,d,u
61067,p,1.24,f,s,y,f,f,f,f,3.56,...,,,y,,,f,f,,d,u


In [95]:
# Mixed data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap_diameter          61069 non-null  float64
 2   cap_shape             61069 non-null  object 
 3   cap_surface           46949 non-null  object 
 4   cap_color             61069 non-null  object 
 5   does_bruise_or_bleed  61069 non-null  object 
 6   gill_attachment       51185 non-null  object 
 7   gill_spacing          36006 non-null  object 
 8   gill_color            61069 non-null  object 
 9   stem_height           61069 non-null  float64
 10  stem_width            61069 non-null  float64
 11  stem_root             9531 non-null   object 
 12  stem_surface          22945 non-null  object 
 13  stem_color            61069 non-null  object 
 14  veil_type             3177 non-null   object 
 15  veil_color         

In [96]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [97]:
# No purely unique columns
pure_unique_columns

[]

In [98]:
# All of these should be ordinally encoded except for the target which is class
binary_columns

['class', 'does_bruise_or_bleed', 'has_ring']

In [99]:
# Drop duplicates and highly missing columns
df = remove_duplicates_and_columns(df)
df

Unnamed: 0,class,cap_diameter,cap_shape,cap_surface,cap_color,does_bruise_or_bleed,gill_attachment,gill_spacing,gill_color,stem_height,stem_width,stem_color,has_ring,ring_type,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,17.09,w,t,g,d,w
1,p,16.60,x,g,o,f,e,,w,17.99,18.19,w,t,g,d,u
2,p,14.07,x,g,o,f,e,,w,17.80,17.74,w,t,g,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,15.98,w,t,p,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,17.20,w,t,p,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,s,y,f,f,f,f,3.93,6.22,y,f,f,d,a
61065,p,1.27,f,s,y,f,f,f,f,3.18,5.43,y,f,f,d,a
61066,p,1.27,s,s,y,f,f,f,f,3.86,6.37,y,f,f,d,u
61067,p,1.24,f,s,y,f,f,f,f,3.56,5.44,y,f,f,d,u


In [100]:
# Class not already labeled
df['class'].unique()

array(['p', 'e'], dtype=object)

In [101]:
# Edible mushrooms labeled as 0, poisonous mushrooms labeled as 1
df = label_classes(df, y = 'class', label_map = {'e': 0, 'p': 1})

In [102]:
# Define target column and column types
# Column types determined using data dictionary from: https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset
y = 'class'
categorical_cols = [col for col in df.select_dtypes('object').columns if col not in binary_columns and col != y]
ordinal_cols = [col for col in binary_columns if col != y]
numerical_cols = ['cap_diameter', 'stem_height', 'stem_width']

In [103]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [104]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)

## UCI Spect Heart

In [105]:
# Load dataset
dataset_num = 9
df = load_dataset(os.path.join(raw_files_dir, dataset_files[dataset_num]))
df

Unnamed: 0,diagnosis,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22
0,1,1,0,0,1,1,0,0,0,1,...,0,1,1,1,0,0,1,1,0,0
1,1,1,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,1,0,0,1,...,0,1,1,0,0,0,0,0,0,1
3,1,0,1,1,1,0,0,1,0,1,...,1,1,0,1,0,0,0,0,1,0
4,1,0,0,1,0,0,0,0,1,0,...,1,1,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
263,0,1,0,0,0,1,1,0,0,1,...,0,1,0,0,0,0,1,1,0,0
264,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
265,0,0,0,1,1,0,0,1,0,0,...,1,1,0,0,0,0,0,0,1,1


In [106]:
# All numeric data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267 entries, 0 to 266
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   diagnosis  267 non-null    int64
 1   f1         267 non-null    int64
 2   f2         267 non-null    int64
 3   f3         267 non-null    int64
 4   f4         267 non-null    int64
 5   f5         267 non-null    int64
 6   f6         267 non-null    int64
 7   f7         267 non-null    int64
 8   f8         267 non-null    int64
 9   f9         267 non-null    int64
 10  f10        267 non-null    int64
 11  f11        267 non-null    int64
 12  f12        267 non-null    int64
 13  f13        267 non-null    int64
 14  f14        267 non-null    int64
 15  f15        267 non-null    int64
 16  f16        267 non-null    int64
 17  f17        267 non-null    int64
 18  f18        267 non-null    int64
 19  f19        267 non-null    int64
 20  f20        267 non-null    int64
 21  f21        267 n

In [107]:
# Unique value checks
unique_values = df.nunique()

# Purely unique columns
pure_unique_columns = unique_values[unique_values == df.shape[0]].index.tolist()
binary_columns = unique_values[unique_values == 2].index.tolist()

In [108]:
# No purely unique columns
pure_unique_columns

[]

In [109]:
# All of these should be ordinally encoded except for the target which is result
# Every feature is binary
binary_columns

['diagnosis',
 'f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f20',
 'f21',
 'f22']

In [110]:
# Note that because of the nature of the data, there are likely to be many duplicates
# Presumably each record is a patient whose identifying information has been scrubbed, no duplicates will be dropped
# df = remove_duplicates_and_columns(df)

In [111]:
# Class already labeled
df['diagnosis'].unique()

array([1, 0])

In [112]:
# Define target column and column types
# Column types determined using data dictionary from: https://www.kaggle.com/datasets/mitishaagarwal/patient/data
y = 'diagnosis'
categorical_cols = []
ordinal_cols = [col for col in binary_columns if col != y]
numerical_cols = []

In [113]:
# Preprocess dataset
train, test = preprocess_dataset(df, y = y,
                                 categorical_columns = categorical_cols,
                                 numeric_columns = numerical_cols,
                                 ordinal_columns = ordinal_cols)

In [114]:
# Write to file
train_path = os.path.join(train_files_dir, dataset_names[dataset_num] + '-train.csv.gz')
test_path = os.path.join(test_files_dir, dataset_names[dataset_num] + '-test.csv.gz')

save_processed_dataset(train = train, test = test, train_path = train_path, test_path = test_path)