Basic imports

In [None]:
import numpy as np
import os
from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import pandas as pd
import matplotlib.pyplot as plt

Basic functions for data preprocessing

In [None]:
def drop_line_if_it_contains(data_csv: DataFrame, column: str, value):
    """Drop, in place, every row whose ``column`` entry equals ``value``.

    Args:
        data_csv: Frame to filter; it is modified in place.
        column: Name of the column to inspect.
        value: Rows whose entry in ``column`` compares equal to this value
            are removed.

    Returns:
        The same ``data_csv`` object, after the matching rows were removed.
    """
    # Select the rows with a boolean mask instead of probing labels 0..n-1
    # with ``.at``: the label loop only worked for a default RangeIndex and
    # dropped the rows one at a time.
    matching = (data_csv[column] == value).to_numpy()
    data_csv.drop(data_csv.index[matching], axis=0, inplace=True)
    return data_csv

def null_to_NaN(X, except_list):
    """Replace zeros with NaN in every column of ``X`` that contains a zero.

    Args:
        X: Frame to process. ``replace_val_in_cols`` assigns the replaced
            columns back onto this same object, so it is modified in place.
        except_list: Column labels that must be left untouched.

    Returns:
        ``X`` with ``0`` replaced by ``np.nan`` in all affected columns.

    Note:
        Callers must be able to handle the NaN values this introduces.
    """
    zero_counts = detect_vals(X, 0, except_list)
    # Take every column that actually holds a zero. The previous
    # ``.index[0]`` picked only the first non-excluded column (whether or not
    # it had zeros) and raised IndexError when no column was left.
    cols_with_missing_val = list(zero_counts[zero_counts > 0].index)
    return replace_val_in_cols(X, cols_with_missing_val, 0, np.nan)

def detect_vals(obj_to_describe, val, exclude_col=None):
    """Count, per column, how many entries equal ``val``.

    Args:
        obj_to_describe: Frame to inspect; it is not modified.
        val: Value to look for.
        exclude_col: Column labels to leave out of the count. ``None``
            (the default) excludes nothing.

    Returns:
        A Series indexed by the remaining column labels, holding the number
        of occurrences of ``val`` in each column.
    """
    # ``None`` sentinel instead of a mutable ``[]`` default argument.
    excluded = [] if exclude_col is None else exclude_col
    considered = obj_to_describe.drop(excluded, axis=1)
    return considered.isin([val]).sum()

def read_csv(name):
    """Read the UTF-8 CSV file ``name`` located under ``../DataSource/``."""
    csv_path = '../DataSource/' + name
    frame = pd.read_csv(csv_path, encoding='utf-8')
    return frame

def replace_val_in_cols(data_csv: DataFrame, column_list, old_value: str, new_value: str):
    """Swap ``old_value`` for ``new_value`` in each listed column.

    The replaced columns are assigned back onto ``data_csv``, which is then
    returned; columns not named in ``column_list`` are left as they are.
    """
    for name in column_list:
        replaced = data_csv[name].replace(to_replace=old_value, value=new_value)
        data_csv[name] = replaced
    return data_csv

def replace_val_in_cols_except(data_csv: DataFrame, column: str, except_values: list, new_value: str):
    """Overwrite every entry of ``column`` that is not in ``except_values``.

    Args:
        data_csv: Frame to update; it is modified in place.
        column: Name of the column to rewrite.
        except_values: Values that must be kept as they are.
        new_value: Value written into every other row of ``column``.

    Returns:
        The same ``data_csv`` object after the update.
    """
    keep = data_csv[column].isin(list(except_values))
    # A single ``.loc`` assignment writes straight into the frame. The former
    # chained ``data_csv[column].at[i] = ...`` is not guaranteed to reach the
    # frame under pandas Copy-on-Write, and its 0..n-1 label loop assumed a
    # default RangeIndex.
    data_csv.loc[~keep, column] = new_value
    return data_csv

def replace_inf_with_nan_and_impute(X: DataFrame):
    """Encode ``X``, turn +/-inf into NaN, then impute the missing values.

    ``apply_encoder`` hands back the frame it was given, so the in-place
    infinity replacement below also affects the caller's ``X``. The frame is
    passed to ``simple_imputing_data`` as both the training and the
    validation set, and the imputed training frame is returned.
    """
    encoded = apply_encoder(X)
    infinities = [np.inf, -np.inf]
    encoded.replace(infinities, np.nan, inplace=True)
    return simple_imputing_data(encoded, encoded)[0]

Functions for imputing and encoding non numerical values

In [None]:
def simple_imputing_data(X_train, X_valid, strategy: str = 'mean'):
    """Impute missing values (NaN) in both frames with a ``SimpleImputer``.

    The imputer is fitted on ``X_train`` only and then applied to both
    frames.

    Args:
        X_train: Frame used to fit the imputer and to be imputed.
        X_valid: Frame imputed with the statistics learned from ``X_train``.
        strategy: Imputation strategy passed to ``SimpleImputer``.

    Returns:
        A tuple ``(imputed_X_train, imputed_X_valid)`` of new DataFrames that
        carry the column labels and the row index of their inputs.
    """
    simple_imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    # The imputer returns plain arrays: put back the column names and the
    # row index as well, so rows stay identifiable after imputation.
    imputed_X_train = pd.DataFrame(
        simple_imputer.fit_transform(X_train),
        index=X_train.index,
        columns=X_train.columns,
    )
    imputed_X_valid = pd.DataFrame(
        simple_imputer.transform(X_valid),
        index=X_valid.index,
        columns=X_valid.columns,
    )

    return imputed_X_train, imputed_X_valid

def apply_encoder(X: DataFrame):
    """Label-encode every ``object``-dtype column of ``X``.

    Each such column is overwritten on ``X`` with the output of a fresh
    ``LabelEncoder``; the same frame is returned.
    """
    object_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

    for name in object_cols:
        encoder = LabelEncoder()
        X[name] = encoder.fit_transform(X[name])

    return X

Enumerating the initial files (DataSource/metrics folder) and saving them as a single file, inlined_ecgs.csv

In [None]:
def prepare_ecg(ECGS_filename):
    """Merge all per-ECG CSV files from ``../DataSource/metrics/``.

    Two files are written into ``../DataSource/``: ``ECGS_filename`` with the
    raw rows of every file concatenated, and ``'inlined_' + ECGS_filename``
    with one row per input file as produced by ``inline_single_ecg``.

    Args:
        ECGS_filename: File name (not path) used for the two output files.

    Returns:
        The concatenated inlined frame.

    Raises:
        FileNotFoundError: If the metrics folder contains no regular file.
    """
    source_path = '../DataSource/'
    metrics_subdir = 'metrics/'
    directory = source_path + metrics_subdir

    raw_frames = []
    inlined_frames = []
    for filename in os.listdir(directory):
        # Skip anything that is not a regular file (e.g. sub-folders).
        if not os.path.isfile(os.path.join(directory, filename)):
            continue
        # read_csv() already prepends '../DataSource/', so hand it the path
        # relative to that folder instead of prefixing it a second time.
        current = read_csv(metrics_subdir + filename)
        raw_frames.append(current)
        # inline_single_ecg() drops columns of the frame it receives; give it
        # a copy so the raw rows keep their id columns in the saved file.
        inlined_frames.append(inline_single_ecg(current.copy()))

    if not raw_frames:
        raise FileNotFoundError('no ECG files found in ' + directory)

    # Concatenate once at the end rather than re-copying inside the loop.
    df = pd.concat(raw_frames)
    df_inlined = pd.concat(inlined_frames)

    df.to_csv(source_path + ECGS_filename)
    df_inlined.to_csv(source_path + 'inlined_' + ECGS_filename)
    return df_inlined

def inline_single_ecg(df: DataFrame):
    """Flatten one ECG frame into a single-row frame.

    The ``patient_id``, ``ecg_id`` and ``lead`` columns are left out; every
    remaining cell becomes a column named ``<column>_<row number>`` with row
    numbers starting at 1. A ``patient_id`` column is appended at the end.

    Args:
        df: Frame of one ECG. It is not modified.

    Returns:
        A one-row DataFrame with the flattened values and ``patient_id``.
    """
    # Taken from the first cell of the frame.
    # NOTE(review): assumes the first column is patient_id; confirm.
    patient_id = df.iloc[0, 0]

    # Non-in-place drop: work on a new frame so the caller's data keeps its
    # id columns and its original index.
    values = df.drop(columns=['patient_id', 'ecg_id', 'lead'])
    values.index = values.index + 1  # 1-based row numbers in the column names

    stacked = values.stack()
    # (row, column) index pairs become '<column>_<row>' labels.
    stacked.index = stacked.index.map('{0[1]}_{0[0]}'.format)

    df_inlined = stacked.to_frame().T
    df_inlined['patient_id'] = patient_id
    return df_inlined



Casting to a binary task by replacing:
    MI with false (0);
    NORM with true (1);
    all other values are dropped from the dataframe

In [None]:
def prepare_dataset_core(name_csv: str):
    """Load the patients CSV and reduce its target to the MI / NORM classes.

    In the ``diagnostic_superclass`` column ``"['MI']"`` becomes ``'False'``
    and ``"['NORM']"`` becomes ``'True'``; rows with any other value are
    dropped. The resulting frame is then passed through ``apply_encoder``.

    Args:
        name_csv: File name inside ``../DataSource/``.

    Returns:
        The frame returned by ``apply_encoder``.
    """
    # read_csv() already prepends '../DataSource/'; adding the prefix here as
    # well produced '../DataSource/../DataSource/<name>'.
    df = read_csv(name_csv)
    col = 'diagnostic_superclass'
    replace_val_in_cols(df, [col], "['MI']", 'False')
    replace_val_in_cols(df, [col], "['NORM']", 'True')

    # Mark every remaining row with a sentinel (the column name itself),
    # then drop exactly those marked rows.
    replace_val_in_cols_except(df, col, ['True', 'False'], col)
    drop_line_if_it_contains(df, col, col)

    return apply_encoder(df)

Building a Random Forest model and evaluating it with a confusion matrix, accuracy (accuracy_score) and MAE (mean absolute error regression loss)

In [None]:
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
def build_confusion_matrix(predictions, y_test):
    """Compute the confusion matrix of ``predictions`` vs ``y_test`` and plot it."""
    matrix = confusion_matrix(y_test, predictions)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
    plt.show()


from random import uniform
from sklearn.ensemble import RandomForestClassifier
def rfc(X_train, X_valid, y_train, y_valid, random_state=None, n_estimators=10000):
    """Fit a random forest classifier and report its validation scores.

    Shows the confusion matrix and prints the accuracy and the mean absolute
    error of the predictions on the validation set.

    Args:
        X_train: Training features.
        X_valid: Validation features.
        y_train: Training labels.
        y_valid: Validation labels.
        random_state: Seed for the forest. When ``None`` a seed between 0 and
            300 is drawn for this call.
        n_estimators: Number of trees in the forest.
    """
    # Draw the seed per call: a ``round(uniform(0, 300))`` default argument is
    # evaluated only once, when the function is defined.
    if random_state is None:
        random_state = round(uniform(0, 300))

    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, n_jobs=-1)
    model.fit(X_train, y_train)
    predicted = model.predict(X_valid)
    predictions = [round(value) for value in predicted]
    accuracy = accuracy_score(y_valid, predictions)
    build_confusion_matrix(predicted, y_valid)
    print("Accuracy: %.3f%%" % (accuracy * 100.0))
    # scikit-learn metrics take (y_true, y_pred) in that order.
    print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))


from sklearn.model_selection import train_test_split
def build_and_score_ml_model_core(X_full: DataFrame, beside_list=None):
    """Split ``X_full`` into train/validation sets and score a random forest.

    The ``diagnostic_superclass`` column is the target. The remaining columns
    go through ``null_to_NaN`` before an 82% / 18% train/validation split;
    the same random seed is used for the split and for the model.

    Args:
        X_full: Frame holding the features and the target column.
        beside_list: Column labels passed to ``null_to_NaN`` as columns to
            leave out. ``None`` (the default) means no column is left out.
    """
    # ``None`` sentinel instead of a mutable ``[]`` default argument.
    excluded = [] if beside_list is None else beside_list

    target_column = 'diagnostic_superclass'
    seed = round(uniform(0, 300))
    y = X_full[target_column]
    X = null_to_NaN(X_full.drop([target_column], axis=1), excluded)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.82, test_size=0.18, random_state=seed)

    rfc(X_train, X_valid, y_train, y_valid, seed)


Main function. Uncomment the line with the "prepare_ecg" call only if there is no
inlined_ecgs.csv in the DataSource folder (enumerating the files might take up to 5 min)

In [None]:
def main():
    """Run the pipeline: load the data, join it, impute it and score the model.

    Reads ``inlined_ecgs.csv`` and ``patientsInfo.csv`` from
    ``../DataSource/``, joins them on ``patient_id``, writes the joined frame
    to ``joined_data.csv`` in the same folder, imputes it and trains/scores
    the random forest model.
    """
    patients_info_filename = 'patientsInfo.csv'
    ECGS_filename = 'ecgs.csv'
    path = '../DataSource/'

    # print('converting initial data to single inlined data')
    # # converts initial data to single inlined data/(5min)
    # # run only once if inlined_ecgs.csv was not generated
    # prepare_ecg(ECGS_filename)

    # Loading the inlined ECG data. read_csv() already prepends
    # '../DataSource/', so only the file name is passed.
    ECGS_data = read_csv('inlined_' + ECGS_filename)

    # prepare generated DF(values replacing, imputing)
    print('preparing generated DF')
    dataframe = prepare_dataset_core(patients_info_filename)

    # Joining patients info with the ECG data on patient_id
    X = dataframe.set_index('patient_id').join(ECGS_data.set_index('patient_id'))
    X.to_csv(path + "joined_data.csv")
    X = replace_inf_with_nan_and_impute(X)

    # building random forest model
    print('building ML')
    build_and_score_ml_model_core(X)

Executing pipeline

In [None]:
# Run the whole pipeline defined in main().
main()