In [1]:
import csv
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
data = pd.read_csv("../data/copd_synthetic.csv", index_col=0)

In [4]:
data.head()

Unnamed: 0,ID,AGE,PackHistory,COPDSEVERITY,MWT1,MWT2,MWT1Best,FEV1,FEV1PRED,FVC,...,HAD,SGRQ,AGEquartiles,gender,smoking,Diabetes,muscular,hypertension,AtrialFib,IHD
0,25,74.0,71,SEVERE,258,214,258,1.401366,11.0,2.377036,...,13,42.881273,3,1.0,2,1,0,0,0,0
1,104,77.0,41,MODERATE,307,280,313,1.347227,57.0,2.322822,...,15,38.69324,4,0.0,2,1,0,0,1,1
2,66,74.0,51,MODERATE,370,367,370,2.275576,74.0,4.849031,...,18,24.038178,3,1.0,2,0,0,0,0,0
3,142,63.0,45,VERY SEVERE,233,244,244,0.486223,19.0,1.513124,...,28,65.759074,2,1.0,2,0,0,1,1,0
4,152,71.0,35,SEVERE,216,226,226,1.086112,44.0,2.453488,...,19,64.818536,2,0.0,2,1,0,0,1,0


In [5]:
data.shape

(101, 22)

### Target feature exists twice in the dataset so we will remove one of them.

In [6]:
data = data.drop(columns=['copd'], axis=1)
data.shape

KeyError: "['copd'] not found in axis"

In [7]:
data.columns.values

array(['ID', 'AGE', 'PackHistory', 'COPDSEVERITY', 'MWT1', 'MWT2',
       'MWT1Best', 'FEV1', 'FEV1PRED', 'FVC', 'FVCPRED', 'CAT', 'HAD',
       'SGRQ', 'AGEquartiles', 'gender', 'smoking', 'Diabetes',
       'muscular', 'hypertension', 'AtrialFib', 'IHD'], dtype=object)

### Step 1: Expert Knowledge Checks

Based on bibliography about acceptable range of values for various features.

In [8]:
# Check MWT1
mwt1_error_idx = data.index[(data['MWT1'] < 100) | (data['MWT1'] > 790)].tolist()
mwt1_error_idx

[15, 45, 61, 88]

In [9]:
# Check MWT2
mwt2_error_idx = data.index[(data['MWT2'] < 100) | (data['MWT2'] > 790)].tolist()
mwt2_error_idx

[7, 10, 32, 71, 80]

In [10]:
# Check MWT1Best
mwt1best_error_idx = data.index[(data['MWT1Best'] < 100) | (data['MWT1Best'] > 790)].tolist()
mwt1best_error_idx

[13, 63, 64, 70, 96]

In [11]:
# Check FEV1PRED
fev1pred_error_idx = data.index[(data['FEV1PRED'] < 11) | (data['FEV1PRED'] > 133)].tolist()
fev1pred_error_idx

[20, 32, 62, 69, 71, 90]

In [12]:
# Check SGRQ
sgrq_error_idx = data.index[(data['SGRQ'] < 0) | (data['SGRQ'] > 93)].tolist()
sgrq_error_idx

[19, 63, 84]

In [13]:
expert_knowledge_results = {
    'mwt1_error_idx': mwt1_error_idx,
    'mwt2_error_idx': mwt2_error_idx,
    'mwt1best_error_idx': mwt1best_error_idx,
    'fev1pred_error_idx': fev1pred_error_idx,
    'sgrq_error_idx': sgrq_error_idx
}

expert_knowledge_results

{'mwt1_error_idx': [15, 45, 61, 88],
 'mwt2_error_idx': [7, 10, 32, 71, 80],
 'mwt1best_error_idx': [13, 63, 64, 70, 96],
 'fev1pred_error_idx': [20, 32, 62, 69, 71, 90],
 'sgrq_error_idx': [19, 63, 84]}

### Step 2: Statistical Analysis Checks

In [14]:
statistical_analysis_results = {}

In [15]:
# Check for duplicates
duplicates_idx = data[data.duplicated()].index.tolist()
statistical_analysis_results['duplicates_idx'] = duplicates_idx

In [16]:
# Check for null values
null_results = {}
for column in data.columns:
    column_null_idx = data[data[column].isnull()].index.tolist()
    null_results[column] = column_null_idx
statistical_analysis_results['null_idx'] = null_results

In [17]:
# Check for data type consistency

# We first load the data into a dictionary, checking for their original data types before creating the pandas Dataframe.
# Unfortunately we have to do this whole workaround because if we load the data immediately using pandas it enforces its own data types to columns homogeneously.

def convert_to_numeric(value):
    if value is None:
        return float('nan')
    if '.' in value:
        try:
            return float(value)
        except ValueError:
            return value
    else:
        try:
            return int(value)
        except ValueError:
            return value

def read_csv_to_dict(file_path):
    data_dict = {}
    
    with open(file_path, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        
        for row in csv_reader:
            for key, value in row.items():
                if key not in data_dict:
                    data_dict[key] = []
                
                # Convert to int or float if possible
                converted_value = convert_to_numeric(value)
                data_dict[key].append(converted_value)
    
    return data_dict

In [18]:
data_path = "../data/copd_synthetic.csv"
data_dict = read_csv_to_dict(data_path)
true_datatypes_df = pd.DataFrame(data_dict).drop(columns=[''], axis=1)

In [19]:
# Check for data type consistency
data_type_results = {}
for column in true_datatypes_df.columns:
    column_types = {}
    for value in true_datatypes_df[column]:
        if type(value).__name__ in list(column_types.keys()):
            column_types[type(value).__name__] += 1
        else:
            column_types[type(value).__name__] = 1
    data_type_results[column] = column_types
statistical_analysis_results['feature_data_type'] = data_type_results

In [20]:
# Check features where all values are the same (0 cardinality)
zero_cardinality_columns = []
for column in data.columns:
    if data[column].nunique() == 1:
        zero_cardinality_columns.append(column)
statistical_analysis_results['zero_cardinality_columns'] = zero_cardinality_columns

In [21]:
# TODO: Decide if there is going to be a check for high cardinality
# This requires deciding on which data types this will be applied
# Also the threshold tthat determines what cardinality is considered high should be decided

In [22]:
# Check for outliers based on IQR
# Only a small fraction of the original dataset should be excluded
# This refers to features where more than 90% of the data are floats
# TODO: Reiterate to see if there are going to be checks for ints, too

def check_feature_outliers(data, feature, iqr_factor=1.7):
    q1 = data[feature].quantile(0.25)
    q3 = data[feature].quantile(0.75)
    iqr = q3-q1

    lower_bound = q1 - iqr_factor*iqr
    upper_bound = q3 + iqr_factor*iqr

    return data[(data[feature] > upper_bound) | (data[feature] < lower_bound)].index.tolist()

In [23]:
outliers_results = {}
for column in data.columns:
    feature_data_types = statistical_analysis_results['feature_data_type'][column]

    total_counter =0
    float_counter = 0

    for key, value in feature_data_types.items():

        total_counter += value

        if key == 'float':
            float_counter += value
    float_fraction = float_counter / total_counter
    if float_fraction >= 0.9:
        outliers_idx = check_feature_outliers(data, column)
        feature_outliers_key = column + "_outliers_idx"
        outliers_results[column] = outliers_idx
statistical_analysis_results['outliers_idx'] = outliers_results

In [24]:
statistical_analysis_results

{'duplicates_idx': [94],
 'null_idx': {'ID': [],
  'AGE': [],
  'PackHistory': [],
  'COPDSEVERITY': [],
  'MWT1': [],
  'MWT2': [],
  'MWT1Best': [],
  'FEV1': [7, 9, 46, 96],
  'FEV1PRED': [7, 9, 46, 96],
  'FVC': [],
  'FVCPRED': [],
  'CAT': [7, 9, 46, 96],
  'HAD': [],
  'SGRQ': [],
  'AGEquartiles': [],
  'gender': [7, 9, 46, 96],
  'smoking': [],
  'Diabetes': [],
  'muscular': [],
  'hypertension': [],
  'AtrialFib': [],
  'IHD': []},
 'feature_data_type': {'ID': {'int': 101},
  'AGE': {'float': 101},
  'PackHistory': {'int': 101},
  'COPDSEVERITY': {'str': 101},
  'MWT1': {'int': 101},
  'MWT2': {'int': 101},
  'MWT1Best': {'int': 101},
  'FEV1': {'float': 95, 'str': 4, 'int': 2},
  'FEV1PRED': {'float': 97, 'str': 4},
  'FVC': {'float': 101},
  'FVCPRED': {'int': 101},
  'CAT': {'float': 97, 'str': 4},
  'HAD': {'int': 101},
  'SGRQ': {'float': 101},
  'AGEquartiles': {'int': 101},
  'gender': {'float': 97, 'str': 4},
  'smoking': {'int': 101},
  'Diabetes': {'int': 101},
  '