In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import zscore

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif

In [2]:
# Load the pickled train / validation / test splits; the paths are listed in
# the same order as the names they are unpacked into.
x_train, y_train, x_valid, y_valid, x_test, y_test = (
    joblib.load(pkl_path)
    for pkl_path in (
        "../output/x_train.pkl",
        "../output/y_train.pkl",
        "../output/x_valid.pkl",
        "../output/y_valid.pkl",
        "../output/x_test.pkl",
        "../output/y_test.pkl",
    )
)

In [3]:
def to_numeric(input_data):
    """Return a copy of ``input_data`` with the capped count columns made numeric.

    The open-ended string categories are first swapped for a number
    (``'> 50'`` becomes 51, ``'>=5'`` becomes 6) and each affected column is
    then passed through ``pd.to_numeric``. The input frame is left untouched.
    """
    # column -> (open-ended label, numeric stand-in)
    capped_columns = {
        'Fresh Eggs Collected': ('> 50', 51),
        'Eggs Mixed With Partner Sperm': ('> 50', 51),
        'Total Number of Previous IVF cycles': ('>=5', 6),
        'Total number of IVF pregnancies': ('>=5', 6),
    }

    converted = input_data.copy()
    for column, (label, stand_in) in capped_columns.items():
        converted[column] = pd.to_numeric(
            converted[column].replace([label], [stand_in])
        )

    return converted

In [4]:
def replace_age(input_data, cats):
    """Return a copy of ``input_data`` whose age column is recoded via ``cats``.

    ``cats`` is handed straight to ``Series.replace`` for the
    'Patient Age at Treatment' column; the input frame is not modified.
    """
    age_col = 'Patient Age at Treatment'

    recoded = input_data.copy()
    recoded[age_col] = recoded[age_col].replace(cats)
    return recoded

In [5]:
def get_dummies(input_data, col):
    """One-hot encode the columns listed in ``col`` and return the new frame.

    Thin wrapper over ``pd.get_dummies`` in which the column names double as
    the prefixes of the generated indicator columns. The input frame is not
    modified.
    """
    return pd.get_dummies(input_data.copy(), columns=col, prefix=col)

In [6]:
def replace_eggsrc(input_data):
    """Return a copy of ``input_data`` with 'Egg Source' encoded numerically.

    'Patient' is replaced by 0 and 'Donor' by 1; the input frame is not
    modified.
    """
    source_col = 'Egg Source'
    labels, codes = ['Patient', 'Donor'], [0, 1]

    encoded = input_data.copy()
    encoded[source_col] = encoded[source_col].replace(labels, codes)
    return encoded

In [7]:
def remove_cols(input_data, cols):
    """Return a copy of ``input_data`` without the columns named in ``cols``."""
    return input_data.copy().drop(columns=cols)

In [8]:
def undersampling(x_train, y_train, random_state=42):
    """Balance the training data by randomly undersampling the majority class.

    Parameters
    ----------
    x_train
        Features passed to ``RandomUnderSampler.fit_resample``.
    y_train
        Class labels aligned with ``x_train``.
    random_state : int, RandomState instance or None, default 42
        Seed for the sampler's random row selection. The fixed default makes
        repeated runs pick the same rows; pass ``None`` to get the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train_under, y_train_under)`` as returned by ``fit_resample``.
    """
    undersample = RandomUnderSampler(sampling_strategy='majority',
                                     random_state=random_state)
    X_train_under, y_train_under = undersample.fit_resample(x_train, y_train)

    return X_train_under, y_train_under

In [9]:
# compile all dict & lists needed

# Age bands in ascending order; each band is encoded by its position (0-5).
age_replace = {
    band: code
    for code, band in enumerate(
        ['18 - 34', '35-37', '38-39', '40-42', '43-44', '45-50']
    )
}

# Categorical columns to one-hot encode.
to_dummy = ['Sperm From']

# Columns to drop from the engineered frame (includes a generated dummy column).
to_remove = [
    'Cause of Infertility - Female Factors',
    'Cause of Infertility -  Partner Sperm Immunological factors',
    'Type of Infertility -Couple Primary',
    'Type of Infertility - Male Primary',
    'Frozen Cycle',
    'Fresh Cycle',
    'Sperm From_Partner',
    'Total number of live births - conceived through IVF',
    'Eggs Mixed With Partner Sperm',
]

In [10]:
# compile all engineering

def main_feat_engineering(input_data, age_value, to_dummy_feat, to_remove_feat):
    """Run the full feature-engineering pipeline on one data split.

    Steps, in order: numeric conversion of the capped count columns, removal
    of rows whose age field is '999', recoding of the age column, one-hot
    encoding, 0/1 encoding of 'Egg Source', column removal, and z-score
    normalisation of every feature column.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Features plus the 'Live Birth Occurrence' target column.
    age_value : dict
        Mapping passed to ``replace_age``.
    to_dummy_feat : list
        Columns passed to ``get_dummies``.
    to_remove_feat : list
        Columns passed to ``remove_cols``.

    Returns
    -------
    pandas.DataFrame
        Normalised feature columns followed by the unscaled target column,
        indexed like the rows that were kept.

    Notes
    -----
    The z-score uses the mean and standard deviation of the frame passed in,
    so every split is scaled with its own statistics.
    """
    target_col = 'Live Birth Occurrence'
    age_col = 'Patient Age at Treatment'

    data = input_data.copy()

    # convert object columns to numerical
    data = to_numeric(data)

    # remove rows whose age field is '999'; a boolean mask removes exactly
    # those rows even when index labels are repeated
    data = data[data[age_col] != '999']

    # replace age value
    data = replace_age(data, age_value)

    # get dummy variables
    data = get_dummies(data, to_dummy_feat)

    # encode egg source as 0/1
    data = replace_eggsrc(data)

    # remove constant & correlated features
    data = remove_cols(data, to_remove_feat)

    # split to x & y
    x = data.drop(columns=[target_col])
    y = data[target_col]

    # normalise data: feed zscore a plain float array (indicator columns may
    # be bool in recent pandas versions) and rebuild the frame explicitly, so
    # the result does not depend on whether zscore returns a DataFrame or a
    # bare ndarray
    X_norm = pd.DataFrame(
        zscore(x.to_numpy(dtype=float), axis=0),
        index=x.index,
        columns=x.columns,
    )

    df_clean = pd.concat([X_norm, pd.DataFrame(y)], axis=1)

    return df_clean

In [11]:
# Re-attach the target to the training features so both pass through the
# feature-engineering pipeline together.
df_train = pd.concat(objs=[x_train, pd.DataFrame(y_train)], axis="columns")

In [12]:
# Apply the feature-engineering pipeline to the training split.
df_train_clean = main_feat_engineering(
    input_data=df_train,
    age_value=age_replace,
    to_dummy_feat=to_dummy,
    to_remove_feat=to_remove,
)

In [13]:
# Separate the engineered training frame back into target and features.
y_train_clean = df_train_clean['Live Birth Occurrence']
X_train_clean = df_train_clean.drop(columns='Live Birth Occurrence')

In [14]:
# Undersample the engineered training split.
X_train_ready, y_train_ready = undersampling(
    x_train=X_train_clean,
    y_train=y_train_clean,
)

In [15]:
# Shape of the undersampled training features: (rows, columns).
X_train_ready.shape

(146466, 24)

In [16]:
# Missing-value count per column of the undersampled training features.
X_train_ready.isna().sum()

Patient Age at Treatment                              0
Total Number of Previous IVF cycles                   0
Total number of IVF pregnancies                       0
Type of Infertility - Female Primary                  0
Type of Infertility - Female Secondary                0
Type of Infertility - Male Secondary                  0
Type of Infertility -Couple Secondary                 0
Cause  of Infertility - Tubal disease                 0
Cause of Infertility - Ovulatory Disorder             0
Cause of Infertility - Male Factor                    0
Cause of Infertility - Patient Unexplained            0
Cause of Infertility - Endometriosis                  0
Cause of Infertility - Cervical factors               0
Cause of Infertility - Partner Sperm Concentration    0
Cause of Infertility -  Partner Sperm Morphology      0
Causes of Infertility - Partner Sperm Motility        0
Stimulation used                                      0
Egg Source                                      

In [17]:
# Re-attach the target to the validation features so both pass through the
# feature-engineering pipeline together.
df_valid = pd.concat(objs=[x_valid, pd.DataFrame(y_valid)], axis="columns")

In [18]:
# Apply the feature-engineering pipeline to the validation split.
df_valid_clean = main_feat_engineering(
    input_data=df_valid,
    age_value=age_replace,
    to_dummy_feat=to_dummy,
    to_remove_feat=to_remove,
)

In [19]:
# Separate the engineered validation frame back into target and features.
y_valid_ready = df_valid_clean['Live Birth Occurrence']
X_valid_ready = df_valid_clean.drop(columns='Live Birth Occurrence')

In [20]:
# Shape of the engineered validation features: (rows, columns).
X_valid_ready.shape

(73057, 24)

In [21]:
# Column dtypes and non-null counts of the engineered validation features.
X_valid_ready.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73057 entries, 199094 to 80719
Data columns (total 24 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Patient Age at Treatment                            73057 non-null  float64
 1   Total Number of Previous IVF cycles                 73057 non-null  float64
 2   Total number of IVF pregnancies                     73057 non-null  float64
 3   Type of Infertility - Female Primary                73057 non-null  float64
 4   Type of Infertility - Female Secondary              73057 non-null  float64
 5   Type of Infertility - Male Secondary                73057 non-null  float64
 6   Type of Infertility -Couple Secondary               73057 non-null  float64
 7   Cause  of Infertility - Tubal disease               73057 non-null  float64
 8   Cause of Infertility - Ovulatory Disorder           73057 non-null  flo

In [23]:
# Re-attach the target to the test features so both pass through the
# feature-engineering pipeline together.
df_test = pd.concat(objs=[x_test, pd.DataFrame(y_test)], axis="columns")

In [24]:
# Apply the feature-engineering pipeline to the test split.
df_test_clean = main_feat_engineering(
    input_data=df_test,
    age_value=age_replace,
    to_dummy_feat=to_dummy,
    to_remove_feat=to_remove,
)

In [25]:
# Separate the engineered test frame back into target and features.
y_test_ready = df_test_clean['Live Birth Occurrence']
X_test_ready = df_test_clean.drop(columns='Live Birth Occurrence')

In [26]:
# Shape of the engineered test features: (rows, columns).
X_test_ready.shape

(91401, 24)

In [None]:
# Persist the model-ready splits. All calls sit at the same (top) level: the
# stray indentation on the later lines made the cell fail to parse.
joblib.dump(X_train_ready, "../output/X_train_ready.pkl")
joblib.dump(y_train_ready, "../output/y_train_ready.pkl")
joblib.dump(X_valid_ready, "../output/X_valid_ready.pkl")
joblib.dump(y_valid_ready, "../output/y_valid_ready.pkl")
joblib.dump(X_test_ready, "../output/X_test_ready.pkl")
joblib.dump(y_test_ready, "../output/y_test_ready.pkl")