In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [10]:
def read_data(path, 
              save_file = True,
              return_file = True,
              set_index = None):
    '''
    Read a CSV file into a DataFrame and optionally pickle it.
    
    Parameters
    ----------
    path: str
          path to the CSV file, handed to ``pd.read_csv``
    save_file: bool, default True
          if True, dump the DataFrame to "../output/data.pkl" with joblib;
          the destination is fixed, so every call overwrites the last dump
    return_file: bool, default True
          if True, return the DataFrame; otherwise the function returns None
    set_index: label or list of labels, default None
          column(s) to promote to the index; None keeps the default index
    
    Returns
    -------
    pandas.DataFrame or None
    '''
    
    data = pd.read_csv(path)
    
    # Honour the index request; this argument used to be accepted and ignored.
    if set_index is not None:
        data = data.set_index(set_index)
    
    if save_file:
        joblib.dump(data, "../output/data.pkl")
    
    if return_file:
        return data

In [30]:
def split_input_output(dataset,
                       target_column,
                       save_file = True,
                       return_file = True):
    '''
    Separate a dataset into its target column and the remaining columns.

    Parameters
    ----------
    dataset: pandas.DataFrame
          frame holding both the predictors and the target
    target_column: str
          name of the column to pull out as the target
    save_file: bool, default True
          if True, pickle the target to "../output/output_df.pkl" and the
          predictors to "../output/input_df.pkl"
    return_file: bool, default True
          if True, return both pieces; otherwise return None

    Returns
    -------
    tuple or None
          (target, predictors) -- note the target comes first
    '''
    target = dataset[target_column]
    predictors = dataset.drop(columns=[target_column])

    if save_file:
        joblib.dump(target, "../output/output_df.pkl")
        joblib.dump(predictors, "../output/input_df.pkl")

    return (target, predictors) if return_file else None

In [31]:
def split_train_test(x, y, TEST_SIZE, random_state=123, stratify=True):
    '''
    Split predictors and target into a train part and a test part.

    Parameters
    ----------
    x: predictors, forwarded to ``train_test_split``
    y: target, forwarded to ``train_test_split``
    TEST_SIZE: forwarded as ``test_size``
    random_state: default 123
          seed forwarded to ``train_test_split`` so the split is repeatable
    stratify: bool, default True
          if True, stratify on ``y`` (wanted for classification targets);
          pass False to split without stratification

    Returns
    -------
    tuple
          (x_train, x_test, y_train, y_test)
    '''
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=TEST_SIZE,
        random_state=random_state,
        # Stratifying keeps the class proportions of y in both parts.
        stratify=y if stratify else None)

    return x_train, x_test, y_train, y_test


def split_data(data_input, data_ouput, return_file=False, TEST_SIZE=0.2,
               save_file=True):
    '''
    Split predictors and target into train, validation and test parts.

    The data is split twice with ``split_train_test``: first into train/test,
    then the train part is split again into train/validation. Both splits use
    the same ``TEST_SIZE``, so the validation fraction is relative to the rows
    left after the test part is removed, not to the full dataset.

    Parameters
    ----------
    data_input: predictors
    data_ouput: target
    return_file: bool, default False
          if True, return the six parts; otherwise return None
    TEST_SIZE: default 0.2
          size passed to each of the two splits
    save_file: bool, default True
          if True, pickle every part to "../output/<name>.pkl"

    Returns
    -------
    tuple or None
          (x_train, y_train, x_valid, y_valid, x_test, y_test)
    '''

    x_train, x_test, \
        y_train, y_test = split_train_test(
            data_input,
            data_ouput,
            TEST_SIZE)

    # The validation part is carved out of the remaining training rows.
    x_train, x_valid, \
        y_train, y_valid = split_train_test(
            x_train,
            y_train,
            TEST_SIZE)

    if save_file:
        parts = {
            "x_train": x_train,
            "y_train": y_train,
            "x_valid": x_valid,
            "y_valid": y_valid,
            "x_test": x_test,
            "y_test": y_test,
        }
        for name, part in parts.items():
            joblib.dump(part, f"../output/{name}.pkl")

    if return_file:
        return x_train, y_train, \
            x_valid, y_valid, \
            x_test, y_test

In [12]:
# Raw inputs: two CSV extracts, named for the 2010-2014 and 2015-2016 ranges.
DATA_PATH_1 = '../data/ar-2010-2014-csv.csv'
DATA_PATH_2 = '../data/ar-2015-2016-csv.csv'

# NOTE(review): read_data defaults to save_file=True and always dumps to
# "../output/data.pkl", so the second call overwrites the pickle written by
# the first and the file ends up holding only the DATA_PATH_2 frame.
df1 = read_data(DATA_PATH_1)
df2 = read_data(DATA_PATH_2)

In [14]:
# Stack the two extracts row-wise. ignore_index=True rebuilds a unique
# 0..n-1 index; otherwise the row labels of df1 and df2 would repeat.
data = pd.concat([df1, df2], ignore_index=True)

In [15]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,Patient Age at Treatment,Date patient started trying to become pregnant OR date of last pregnancy,"Total Number of Previous cycles, Both IVF and DI","Total Number of Previous treatments, Both IVF and DI at clinic",Total Number of Previous IVF cycles,Total Number of Previous DI cycles,"Total number of previous pregnancies, Both IVF and DI",Total number of IVF pregnancies,Total number of DI pregnancies,Total number of live births - conceived through IVF or DI,...,Heart Three Birth Weight,Heart Three Sex,Heart Three Delivery Date,Heart Three Birth Congenital Abnormalities,Heart Four Weeks Gestation,Heart Four Birth Outcome,Heart Four Birth Weight,Heart Four Sex,Heart Four Delivery Date,Heart Four Birth Congenital Abnormalities
0,18 - 34,,1,1,0,1,0,0,0,0,...,,,,,,,,,,
1,35-37,,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2,18 - 34,,0,0,0,0,0,0,0,0,...,,,,,,,,,,
3,38-39,,1,1,0,1,0,0,0,0,...,,,,,,,,,,
4,35-37,,0,0,0,0,0,0,0,0,...,,,,,,,,,,


In [16]:
# Keep an untouched copy of the combined frame; `data` is reassigned by later cells.
ori_data = data.copy()

In [17]:
# Count rows per treatment type (IVF vs DI).
data['Type of treatment - IVF or DI'].value_counts()

IVF    463358
DI      32270
Name: Type of treatment - IVF or DI, dtype: int64

In [18]:
def exclude_di(input_data):
    '''
    Return a new frame without the rows whose treatment type is 'DI'.

    The input frame is left unmodified. Rows are kept whenever the
    'Type of treatment - IVF or DI' column is anything other than 'DI'.
    '''
    not_di = input_data['Type of treatment - IVF or DI'] != 'DI'

    return input_data[not_di].copy()

In [19]:
# Relevant fields according to the paper.
# The column names are copied verbatim, including their irregular spacing and
# spelling; they are used as exact DataFrame column labels.
# The last entry, 'Live Birth Occurrence', is the target column.

cols = ['Patient Age at Treatment',
        'Total Number of Previous IVF cycles',
        'Total number of IVF pregnancies',
        'Total number of live births - conceived through IVF',
        'Type of Infertility - Female Primary',
        'Type of Infertility - Female Secondary',
        'Type of Infertility - Male Primary',
        'Type of Infertility - Male Secondary',
        'Type of Infertility -Couple Primary',
        'Type of Infertility -Couple Secondary',
        'Cause  of Infertility - Tubal disease',
        'Cause of Infertility - Ovulatory Disorder',
        'Cause of Infertility - Male Factor',
        'Cause of Infertility - Patient Unexplained',
        'Cause of Infertility - Endometriosis',
        'Cause of Infertility - Cervical factors',
        'Cause of Infertility - Female Factors',
        'Cause of Infertility - Partner Sperm Concentration',
        'Cause of Infertility -  Partner Sperm Morphology',
        'Causes of Infertility - Partner Sperm Motility',
        'Cause of Infertility -  Partner Sperm Immunological factors',
        'Stimulation used',
        'Egg Source',
        'Sperm From', 
        'Fresh Cycle', 
        'Frozen Cycle', 
        'Eggs Thawed',
        'Fresh Eggs Collected', 
        'Eggs Mixed With Partner Sperm',
        'Embryos Transfered',
        'Live Birth Occurrence']

In [20]:
def select_feats(input_data, selected_cols):
    '''
    Return a new frame restricted to ``selected_cols``.

    The columns come back in the order given by ``selected_cols`` and the
    input frame is left unmodified.
    '''
    return input_data[selected_cols].copy()

In [21]:
def main_data(input_data, selected_cols):
    '''
    Build the working dataset: drop the DI rows, then keep ``selected_cols``.

    The DI filter runs first because it needs the treatment-type column,
    which does not have to be part of ``selected_cols``.
    '''
    without_di = exclude_di(input_data)

    return select_feats(without_di, selected_cols)

In [22]:
# Build the working frame from the untouched copy so this cell can be re-run:
# feeding the already-filtered `data` back in would raise a KeyError, because
# the treatment-type column that exclude_di reads is not among `cols`.
data = main_data(ori_data, cols)
data.head()

Unnamed: 0,Patient Age at Treatment,Total Number of Previous IVF cycles,Total number of IVF pregnancies,Total number of live births - conceived through IVF,Type of Infertility - Female Primary,Type of Infertility - Female Secondary,Type of Infertility - Male Primary,Type of Infertility - Male Secondary,Type of Infertility -Couple Primary,Type of Infertility -Couple Secondary,...,Stimulation used,Egg Source,Sperm From,Fresh Cycle,Frozen Cycle,Eggs Thawed,Fresh Eggs Collected,Eggs Mixed With Partner Sperm,Embryos Transfered,Live Birth Occurrence
8214,38-39,3,1,1,0,0,0,0,0,0,...,0,Patient,Partner,0.0,1.0,0.0,0.0,0.0,2.0,
8215,18 - 34,2,1,0,0,0,0,0,0,0,...,0,Patient,Partner,0.0,1.0,0.0,0.0,0.0,1.0,
8216,18 - 34,3,0,0,0,0,0,0,0,0,...,0,Patient,Partner,0.0,1.0,0.0,0.0,0.0,2.0,1.0
8217,999,0,0,0,0,0,0,0,0,0,...,1,Patient,Partner,1.0,0.0,0.0,11.0,0.0,0.0,
8218,18 - 34,0,0,0,0,0,0,0,0,0,...,1,Patient,Partner,1.0,0.0,0.0,19.0,18.0,0.0,


In [23]:
# Inspect which distinct values the target column takes before imputation.
data['Live Birth Occurrence'].unique()

array([nan,  1.])

In [24]:
# Share of rows whose target is missing, as a percentage of all rows.
print(f'Percentage of Null value: {(data["Live Birth Occurrence"].isnull().sum()/data.shape[0])*100:.2f}%')

Percentage of Null value: 75.18%


In [25]:
def impute_target(input_data, target_col):
    '''
    Return a copy of the frame with missing target values replaced by 0.

    Only ``target_col`` is touched; the input frame is left unmodified.
    '''
    filled_target = input_data[target_col].fillna(0)

    imputed = input_data.copy()
    imputed[target_col] = filled_target

    return imputed

In [26]:
# Name of the label column used for imputation and for the input/output split.
TARGET_COLUMN = "Live Birth Occurrence"

# The only non-null target value seen above is 1, so a missing target is
# filled with 0 to obtain a binary label.
data = impute_target(data, TARGET_COLUMN)

In [27]:
# Sanity check: no missing target values should remain after imputation.
data['Live Birth Occurrence'].isna().sum()

0

In [28]:
# Sanity check: the target should now contain only 0 and 1.
data['Live Birth Occurrence'].unique()

array([0., 1.])

In [32]:
# Fraction handed to each of the two splits performed by split_data.
TEST_SIZE = 0.2

# Separate the label from the predictors (the label is returned first).
output_df, input_df = split_input_output(data, TARGET_COLUMN)

# Train / validation / test split; return_file=True asks for the six parts back.
X_train, y_train, X_valid, y_valid, X_test, y_test = split_data(
    input_df,
    output_df,
    return_file=True,
    TEST_SIZE=TEST_SIZE,
)

In [33]:
# Number of rows that ended up in the test part.
y_test.shape

(92672,)

In [34]:
# Verify the stratification: the class proportions of the target should be
# (nearly) the same in the train, validation and test parts.
print(y_train.value_counts(normalize = True),
      '--------------',
      y_valid.value_counts(normalize = True),
      '--------------',
      y_test.value_counts(normalize = True),
      sep='\n')

0.0    0.751784
1.0    0.248216
Name: Live Birth Occurrence, dtype: float64
--------------
0.0    0.751787
1.0    0.248213
Name: Live Birth Occurrence, dtype: float64
--------------
0.0    0.751791
1.0    0.248209
Name: Live Birth Occurrence, dtype: float64
