In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Fix the seed of NumPy's global random generator so reruns draw the same numbers.
np.random.seed(seed=42)

In [3]:
# Load the real COPD table (the first CSV column becomes the index) and remove
# the 'copd' column before any synthetic rows are derived from it.
data_real = pd.read_csv("./data/copd.csv", index_col=0).drop(columns=['copd'])

In [4]:
# Start from an empty frame that has the same column labels as the real data;
# the synthetic rows are added to it afterwards.
data_synthetic = pd.DataFrame(data=None, columns=data_real.columns)

In [5]:
# Build the first 50 synthetic rows. Each one is a convex combination of a real
# row and a randomly drawn real row that shares its COPDSEVERITY value.
for position in range(50):
    base_row = data_real.iloc[position]
    severity = base_row['COPDSEVERITY']

    # Partner row: one random draw among the rows with the same severity
    # (the draw may return the base row itself).
    same_severity = data_real.loc[data_real['COPDSEVERITY'] == severity]
    partner_row = same_severity.sample(1).iloc[0]

    # Two random weights, normalised so that they sum to 1.
    weights = np.random.rand(2)
    weights = weights / weights.sum()
    base_weight, partner_weight = weights

    # Blend every column except the severity label, then put the label back.
    blended = (
        base_weight * base_row.drop('COPDSEVERITY')
        + partner_weight * partner_row.drop('COPDSEVERITY')
    )
    blended['COPDSEVERITY'] = severity

    # Append as the next row; its label is the current number of rows.
    data_synthetic.loc[len(data_synthetic)] = blended

In [6]:
# The rest of the samples are created by adding white noise to existing samples.
# This maps each perturbed column to the standard deviation of its noise.
noise_stds = dict(
    ID=10,
    AGE=7,
    PackHistory=10,
    MWT1=50,
    MWT2=50,
    MWT1Best=50,
    FEV1=0.1,
    FEV1PRED=7,
    FVC=0.2,
    FVCPRED=9,
    CAT=10,
    HAD=5,
    SGRQ=5,
)

In [7]:
# Create 51 more synthetic rows: draw one real row at random and add zero-mean
# Gaussian noise to every column listed in noise_stds.
noisy_rows = []
for _ in range(51):
    modified_row = data_real.sample(1).copy()

    for key, value in noise_stds.items():
        # One scalar draw per column; `value` is that column's noise std.
        modified_row[key] += np.random.normal(0, value)

    noisy_rows.append(modified_row)

# Concatenate once instead of re-building the frame on every iteration, and
# renumber the rows with ignore_index=True. Each sampled row carries the index
# label it had in data_real; the same real row can be drawn more than once and
# its label can collide with labels already present in data_synthetic. Repeated
# labels make label-based writes such as `.at[label, col] = x` modify every
# row that shares the label instead of a single row.
data_synthetic = pd.concat([data_synthetic, *noisy_rows], ignore_index=True)

In [8]:
# Display (rows, columns): 50 convex-combination rows + 51 noisy rows were added.
data_synthetic.shape

(101, 22)

In [9]:
# Columns that hold integer values; they are rounded and cast to int.
round_columns = [
    'ID',
    'AGE',
    'PackHistory',
    'MWT1',
    'MWT2',
    'MWT1Best',
    'FEV1PRED',
    'FVCPRED',
    'CAT',
    'HAD',
    'AGEquartiles',
    'gender',
    'smoking',
    'Diabetes',
    'muscular',
    'hypertension',
    'AtrialFib',
    'IHD',
]

In [10]:
# Snap the integer-valued columns back to whole numbers: the convex
# combinations and the added noise can leave non-integer values in them.
for column in round_columns:
    # `errors='ignore'` is intentionally not passed: the option is deprecated in
    # pandas and would hand back the unconverted column, merely postponing the
    # failure to round()/astype(int). Letting to_numeric raise points straight
    # at the column that could not be converted.
    data_synthetic[column] = pd.to_numeric(data_synthetic[column]).round().astype(int)

  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').round().astype(int)
  data_synthetic[column] = pd.to_numeric(data_synthetic[column], errors='ignore').

In [11]:
# Add some outliers in MWT1 on purpose
outliers = [80, 70, 810]

# Pick distinct row *positions* instead of index labels. Drawing labels with
# replacement can select the same row twice (one outlier then overwrites
# another), and a label that occurs more than once in the index would make
# `.at` write the outlier into every row carrying that label.
random_positions = np.random.choice(len(data_synthetic), size=len(outliers), replace=False)
column_position = data_synthetic.columns.get_loc('MWT1')

for random_position, outlier in zip(random_positions, outliers):
    data_synthetic.iloc[random_position, column_position] = outlier

In [12]:
# Add some outliers in MWT2 on purpose
outliers = [80, 70, 810]

# Pick distinct row *positions* instead of index labels. Drawing labels with
# replacement can select the same row twice (one outlier then overwrites
# another), and a label that occurs more than once in the index would make
# `.at` write the outlier into every row carrying that label.
random_positions = np.random.choice(len(data_synthetic), size=len(outliers), replace=False)
column_position = data_synthetic.columns.get_loc('MWT2')

for random_position, outlier in zip(random_positions, outliers):
    data_synthetic.iloc[random_position, column_position] = outlier

In [13]:
# Add some outliers in MWT1Best on purpose
outliers = [80, 70, 810]

# Pick distinct row *positions* instead of index labels. Drawing labels with
# replacement can select the same row twice (one outlier then overwrites
# another), and a label that occurs more than once in the index would make
# `.at` write the outlier into every row carrying that label.
random_positions = np.random.choice(len(data_synthetic), size=len(outliers), replace=False)
column_position = data_synthetic.columns.get_loc('MWT1Best')

for random_position, outlier in zip(random_positions, outliers):
    data_synthetic.iloc[random_position, column_position] = outlier

In [14]:
# Add some outliers in FEV1PRED on purpose
outliers = [8, 140, -5]

# Pick distinct row *positions* instead of index labels. Drawing labels with
# replacement can select the same row twice (one outlier then overwrites
# another), and a label that occurs more than once in the index would make
# `.at` write the outlier into every row carrying that label.
random_positions = np.random.choice(len(data_synthetic), size=len(outliers), replace=False)
column_position = data_synthetic.columns.get_loc('FEV1PRED')

for random_position, outlier in zip(random_positions, outliers):
    data_synthetic.iloc[random_position, column_position] = outlier

In [15]:
# Add some outliers in SGRQ on purpose
outliers = [-2.4, 140, -5]

# Pick distinct row *positions* instead of index labels. Drawing labels with
# replacement can select the same row twice (one outlier then overwrites
# another), and a label that occurs more than once in the index would make
# `.at` write the outlier into every row carrying that label.
random_positions = np.random.choice(len(data_synthetic), size=len(outliers), replace=False)
column_position = data_synthetic.columns.get_loc('SGRQ')

for random_position, outlier in zip(random_positions, outliers):
    data_synthetic.iloc[random_position, column_position] = outlier

In [16]:
# Renumber the rows and keep the previous index as a regular column; the
# resulting 'index' column is dropped again before the CSV is written.
data_synthetic = data_synthetic.reset_index(drop=False)

In [17]:
# Add a duplicate row: draw two distinct index labels and overwrite the first
# row with the values of the second one.
target_label, source_label = np.random.choice(data_synthetic.index, size=2, replace=False)
source_values = data_synthetic.loc[source_label].values
data_synthetic.loc[target_label] = source_values

In [18]:
# Add null values: blank out every (row, column) pairing of 4 distinct rows and
# 4 column picks. The column picks are drawn with replacement, so they may repeat.
null_rows = np.random.choice(data_synthetic.index, size=4, replace=False)
null_columns = np.random.choice(data_synthetic.columns, size=4, replace=True)

data_synthetic.loc[null_rows, null_columns] = np.nan

In [19]:
# Add float values in age
# Cast to object first, presumably so the injected floats can sit next to the
# existing values without changing the whole column's dtype.
data_synthetic['AGE'] = data_synthetic['AGE'].astype('object')

fake_floats = [33.4, 66.2, 72.1]
# replace=False: drawing with replacement could select the same row twice, in
# which case a later value overwrites an earlier one and fewer floats than
# listed end up in the column.
random_index = np.random.choice(data_synthetic.index, size=len(fake_floats), replace=False)

for random_idx, fake_float in zip(random_index, fake_floats):
    data_synthetic.at[random_idx, 'AGE'] = fake_float

In [20]:
# Add int values in FEV1
# Cast to object first, presumably so the injected ints keep their integer
# type instead of being converted to the column's dtype.
data_synthetic['FEV1'] = data_synthetic['FEV1'].astype('object')

fake_ints = [1, 2]
# replace=False: drawing with replacement could select the same row twice, in
# which case the second value overwrites the first and only one int is injected.
random_index = np.random.choice(data_synthetic.index, size=len(fake_ints), replace=False)

for random_idx, fake_int in zip(random_index, fake_ints):
    data_synthetic.at[random_idx, 'FEV1'] = fake_int

In [21]:
# Make 'muscular' a constant column: every row gets the value 0.
data_synthetic = data_synthetic.assign(muscular=0)

In [22]:
# Add some outliers in FVC on purpose
outliers = [1000.0, 2000.0, 3000.0]

# Pick distinct row *positions* instead of index labels. Drawing labels with
# replacement can select the same row twice (one outlier then overwrites
# another), and positional writes stay single-row even if a label repeats.
random_positions = np.random.choice(len(data_synthetic), size=len(outliers), replace=False)
column_position = data_synthetic.columns.get_loc('FVC')

for random_position, outlier in zip(random_positions, outliers):
    data_synthetic.iloc[random_position, column_position] = outlier

In [23]:
# Remove the helper 'index' column so it does not end up in the exported data.
data_synthetic = data_synthetic.drop(columns='index')

In [24]:
# Write the synthetic dataset to disk; the row index is written as the first column.
data_synthetic.to_csv(path_or_buf="data/copd_synthetic.csv", index=True)