In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
def get_train_test_sets(y, x, test_size, random_state=None):
    """Split features and target into a train part and a test part.

    Note the argument order: the target ``y`` comes first and the features
    ``x`` second, matching the ``(y, x)`` tuple returned by ``get_y_x``.

    Parameters
    ----------
    y : array-like
        Target values.
    x : array-like
        Feature matrix with the same number of rows as ``y``.
    test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. The default ``None`` keeps the split random.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    # Split the function's own arguments rather than notebook-level names.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [3]:
def get_train_validation_test_sets(x, y, train_size, test_size):
    """Split features and target into train, test and validation parts.

    The split is done in two steps with fixed seeds (0, then 1), so the
    result is reproducible:

    1. ``train_size`` of the full data becomes the training part.
    2. The remainder is split again, with ``test_size`` passed as the
       ``train_size`` of that second split. ``test_size`` is therefore
       interpreted relative to the remainder, not to the full data set.
       NOTE(review): confirm this is the intended meaning of ``test_size``.

    Returns
    -------
    tuple
        ``(x_train, x_test, x_valid, y_train, y_test, y_valid)``.
    """
    # Step 1: carve out the training part; everything else is held out.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, train_size=train_size, random_state=0
    )

    # Step 2: divide the held-out rows into test and validation parts.
    x_test, x_valid, y_test, y_valid = train_test_split(
        x_holdout, y_holdout, train_size=test_size, random_state=1
    )

    return x_train, x_test, x_valid, y_train, y_test, y_valid

In [4]:
def get_list_of_continuous_column():
    """Return the names of the columns treated as continuous.

    A new list is built on every call, so callers may modify the result.
    The names are exact DataFrame column labels (including trailing
    separators and spacing) and must not be normalised.
    """
    continuous_columns = [
        'Przebieg choroby, Maksymalne stężnenie CRP [mg/l] w momencie rozpoznania choroby, ',
        'Leczenie, Całkowita dawka kumulacyjna [g], ',
        'Leczenie, Skumulowany czas stosowania sterydów w latach , ',
        'Leczenie, Całkowita dawka kumulacyjna [mg], ',
        'Badania Laboratoryjne , ANCA IF typ świecenia, ',
        'Przebieg choroby, Maksymalne stężenie kreatyniny, mg/dl, ujednolicone',
        'Przebieg choroby, Czas między rozpoznaniem choroby a pierwszym zaostrzeniem (miesiące) , ',
        'Przebieg choroby, Całkowita liczba zaostrzeń , ',
        # NOTE(review): same label as the fifth entry above, so selecting with
        # this list yields that column twice. Kept as-is; confirm whether the
        # duplicate is intended.
        'Badania Laboratoryjne , ANCA IF typ świecenia, ',
        'wiek zachorowania',
        'opóźnienie rozpoznania',
        'czas trwania choroby',
        'Badania Laboratoryjne , Anti GBM    , ',
        'Badania Laboratoryjne , Krioglobuliny, ',
    ]
    return continuous_columns

In [5]:
def get_target_colums():
    """Return the two outcome column names: dialysis first, death second.

    ``prepare_target`` relies on this order (index 0 and index 1).
    """
    dialysis_column = 'Leczenie, Pacjent dializowany  , Tak'
    death_column = 'Przebieg choroby, Zgon pacjenta, Tak'
    return [dialysis_column, death_column]

In [6]:
def prepare_data(df_):
    """Split a frame into its non-continuous and continuous columns.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain every column named by
        ``get_list_of_continuous_column()``. It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(bin_df, const_df)``: every remaining column cast to ``int32``,
        and the continuous columns unchanged.
    """
    continuous_names = get_list_of_continuous_column()

    # Everything that is not continuous is cast to int32.
    binary_part = df_.drop(columns=continuous_names).astype('int32')
    continuous_part = df_[continuous_names]

    return binary_part, continuous_part

In [7]:
def prepare_data_to_unsupervised(df_):
    """Split a frame into non-continuous and continuous columns for clustering.

    Works like ``prepare_data`` but additionally removes two columns from the
    non-continuous part: the ones named by ``mpa_ids.name`` and
    ``gpa_ids.name``.

    NOTE(review): ``mpa_ids`` and ``gpa_ids`` are not defined in this part of
    the notebook; they must exist as notebook-level objects with a ``.name``
    attribute (presumably pandas Series) before this function is called.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain the continuous columns and the two columns above.
        It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(bin_df, const_df)``: the remaining columns cast to ``int32``, and
        the continuous columns of ``df_`` unchanged.
    """
    continuous_names = get_list_of_continuous_column()

    bin_df = df_.drop(continuous_names, axis=1)
    bin_df = bin_df.drop([mpa_ids.name, gpa_ids.name], axis=1)
    bin_df = bin_df.astype('int32')

    # Take the continuous columns from the argument, not from a notebook-level
    # `df`, so the result always describes the frame that was passed in.
    const_df = df_[continuous_names]

    return bin_df, const_df

In [8]:
def prepare_target(df_):
    """Build a single outcome label and remove the outcome columns from ``df_``.

    Side effect: the columns named by ``get_target_colums()`` are dropped
    from ``df_`` IN PLACE.

    The label is ``death + 2 * dialysis`` with the value 3 mapped back to 1.
    Assuming both columns hold 0/1 flags, this yields:

    * 0 - neither outcome,
    * 1 - death (with or without dialysis),
    * 2 - dialysis only.

    Returns
    -------
    pandas.Series
        The combined label, indexed like ``df_``.
    """
    target_names = get_target_colums()

    # Copy the outcomes out before removing them from the caller's frame.
    outcomes = df_[target_names]
    df_.drop(columns=target_names, inplace=True)

    dialysis_name = outcomes.columns[0]
    death_name = outcomes.columns[1]

    # replace(1, 1) leaves 0/1 data unchanged; it is retained so the death
    # flags go through exactly the same call as before.
    death_code = outcomes[death_name].replace(1, 1)
    dialysis_code = outcomes[dialysis_name].replace(1, 2)

    combined = death_code + dialysis_code
    # 3 means "both outcomes"; it is folded into the death class.
    return combined.replace(3, 1)

In [9]:
def get_y_x(data):
    """Build the target vector and the feature matrix for supervised models.

    Steps, all performed on a copy so ``data`` itself is left untouched:

    1. Extract the target with ``prepare_target``; this also removes the
       target columns from the working copy.
    2. Split the remaining columns with ``prepare_data``.
    3. Reduce the non-continuous columns with ``mca.MCA(...).fs_r(0.99)``.
    4. Scale the continuous columns to ``[-1, 1]`` with ``MinMaxScaler``.
    5. Stack both parts column-wise into one DataFrame.

    NOTE(review): ``mca`` is not imported in the notebook's import cell shown
    here; confirm it is imported before this function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target columns and the continuous columns.

    Returns
    -------
    tuple
        ``(y, x)``: target first, features second. ``x`` is a DataFrame with
        a fresh default index and integer column labels.
    """
    data_ = data.copy()

    # Extract the target BEFORE building features. prepare_target() drops the
    # target columns from data_ in place; doing this first keeps the outcome
    # out of the MCA input, so it cannot leak into x.
    y = prepare_target(data_)

    bin_df, const_df = prepare_data(data_)

    mca_factors = mca.MCA(bin_df).fs_r(0.99)

    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    const_scaled = scaler.fit_transform(const_df)

    x = pd.DataFrame(np.column_stack((mca_factors, const_scaled)))

    return y, x

In [10]:
def get_y_x_to_unsupervised(data):
    """Build the target vector and the feature matrix for unsupervised models.

    Same pipeline as ``get_y_x``, except that the non-continuous columns come
    from ``prepare_data_to_unsupervised``. All work is done on a copy, so
    ``data`` itself is left untouched.

    NOTE(review): ``mca`` is not imported in the notebook's import cell shown
    here; confirm it is imported before this function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target columns, the continuous columns and whatever
        ``prepare_data_to_unsupervised`` removes.

    Returns
    -------
    tuple
        ``(y, x)``: target first, features second. ``x`` is a DataFrame with
        a fresh default index and integer column labels.
    """
    data_ = data.copy()

    # Extract the target BEFORE building features. prepare_target() drops the
    # target columns from data_ in place; doing this first keeps the outcome
    # out of the MCA input, so the features do not encode the label.
    y = prepare_target(data_)

    bin_df, _ = prepare_data_to_unsupervised(data_)

    mca_factors = mca.MCA(bin_df).fs_r(0.99)

    # The continuous block is taken straight from the working copy.
    const_df = data_[get_list_of_continuous_column()]
    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    const_scaled = scaler.fit_transform(const_df)

    x = pd.DataFrame(np.column_stack((mca_factors, const_scaled)))

    return y, x

In [11]:
def get_mca_data(data):
    """Return the MCA row factor scores of the non-continuous columns.

    A copy of ``data`` is split with ``prepare_data``; only the
    non-continuous part is used. The result is whatever
    ``mca.MCA(...).fs_r(0.99)`` returns.

    NOTE(review): ``mca`` is not imported in the notebook's import cell shown
    here; confirm it is imported before this function is called.
    """
    # The continuous part returned by prepare_data is not needed here.
    binary_part, _ = prepare_data(data.copy())

    return mca.MCA(binary_part).fs_r(0.99)

In [12]:
def get_GPA_data(data):
    """Return the rows of ``data`` whose GPA diagnosis column equals 1."""
    gpa_column = "Ogólne, Rozpoznanie kliniczne, Ziarniakowatość z zapaleniem naczyń (Wegenera)(GPA)"
    is_gpa = data[gpa_column] == 1
    return data.loc[is_gpa]

In [13]:
def get_MPA_data(data):
    """Return the rows of ``data`` whose MPA diagnosis column equals 1."""
    mpa_column = "Ogólne, Rozpoznanie kliniczne, Mikroskopowe zapalenie naczyń (MPA)"
    is_mpa = data[mpa_column] == 1
    return data.loc[is_mpa]

In [14]:
def get_pANCA_data(data):
    """Return the rows of ``data`` whose ANCA IF pattern "P" column equals 1."""
    p_pattern_column = "Badania Laboratoryjne , ANCA IF typ świecenia, P"
    has_p_pattern = data[p_pattern_column] == 1
    return data.loc[has_p_pattern]

In [15]:
def get_cANCA_data(data):
    """Return the rows of ``data`` whose ANCA IF pattern "C" column equals 1."""
    c_pattern_column = "Badania Laboratoryjne , ANCA IF typ świecenia, C"
    has_c_pattern = data[c_pattern_column] == 1
    return data.loc[has_c_pattern]

In [16]:
def get_PR3_data(data):
    """Return the rows of ``data`` where the column at position 103 equals 1.

    NOTE(review): the column is addressed by position, so the result depends
    on the exact column order of ``data``. Presumably position 103 is the PR3
    marker; confirm against the data set.
    """
    pr3_column = data.columns[103]
    is_positive = data[pr3_column] == 1
    return data.loc[is_positive]

In [17]:
def get_MPO_data(data):
    """Return the rows of ``data`` where the column at position 108 equals 1.

    NOTE(review): the column is addressed by position, so the result depends
    on the exact column order of ``data``. Presumably position 108 is the MPO
    marker; confirm against the data set.
    """
    mpo_column = data.columns[108]
    is_positive = data[mpo_column] == 1
    return data.loc[is_positive]

In [18]:
def get_pANCA_MPO_data(data):
    """Return rows with the ANCA IF "P" pattern OR a 1 in the column at position 108.

    A row is kept when at least one of the two conditions holds.

    NOTE(review): the second column is addressed by position, so the result
    depends on the exact column order of ``data``. Presumably position 108 is
    the MPO marker (``get_MPO_data`` uses the same index); confirm.
    """
    has_p_pattern = data["Badania Laboratoryjne , ANCA IF typ świecenia, P"] == 1
    # Compare the column's VALUES with 1; comparing the column label itself
    # would never select any row.
    has_marker = data[data.columns[108]] == 1

    return data[has_p_pattern | has_marker]

In [19]:
def get_cANCA_PR3_data(data):
    """Return rows with the ANCA IF "C" pattern OR a 1 in the column at position 103.

    A row is kept when at least one of the two conditions holds.

    NOTE(review): the second column is addressed by position, so the result
    depends on the exact column order of ``data``. Presumably position 103 is
    the PR3 marker (``get_PR3_data`` uses the same index); confirm.
    """
    has_c_pattern = data["Badania Laboratoryjne , ANCA IF typ świecenia, C"] == 1
    # Compare the column's VALUES with 1; comparing the column label itself
    # would never select any row.
    has_marker = data[data.columns[103]] == 1

    return data[has_c_pattern | has_marker]