# Setup

In [None]:
# Mount Google Drive at /content/drive so the Drive paths used below resolve.
# This import is only available when running inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import statistics

In [28]:
# Per-dataset configuration used throughout the notebook:
#   'ts'       -> number of timesteps in each time series
#   'features' -> names of the continuous (time-varying) features to process
# The commented-out names after the 'mimic' list are additional MIMIC features
# that are currently disabled.
dataset_info = {'mimic': {'ts': 48,
                          'features': ['WBC', 'Chloride (serum)', 'Glucose (serum)']#, 'Magnesium', 'Sodium (serum)', 'BUN',
                            #   'Phosphorous', 'Anion gap', 'Potassium (serum)', 'HCO3 (serum)', 'Platelet Count',
                            #   'Prothrombin time', 'PTT', 'Lactic Acid']
                          },
                'toy': {'ts': 10,
                        'features': ['F1_constant', 'F2_early', 'F3_late', 'F4_narrow', 'F5_wide']
                        }
}

# Create Toy Dataset

### Parameters and variables

In [82]:
# Output path for the toy dataset
output_path_toy = "/content/drive/MyDrive/OptionalProject/MIMIC_data_labels_toy.csv"

# Number of timesteps in the timeseries
TIMESTEPS = 10
# Number of patients to generate
RECORDS = 1
# Show timeseries plots when generating
VERBOSE = True

# Define the static features and the label
# `label` holds the two class names as strings; populate_df indexes into it
# with 0 or 1 and later casts the stored label column to int.
label = ['0', '1']
gender_values = ['M', 'F']
# Candidate ages: every integer from 18 to 98 inclusive.
age_values = list(range(18, 99, 1))
insurance_values = ['Other', 'Medicare', 'Medicaid']

# Names of the static columns, stored under timestep -1 by populate_df.
static_feat = ['gender', 'Age', 'insurance', 'label']


# Define continuous features
# One column per (timestep, feature) pair is created for these names.
cont_feat = ['F1_constant', 'F2_early', 'F3_late', 'F4_narrow', 'F5_wide']

### Helper functions

In [123]:
def logistic_function(x, a, b):
    """Evaluate the logistic curve ``1 / (1 + exp(-(a * x + b)))``.

    Args:
        x: Scalar or NumPy array of input positions.
        a: Slope applied to ``x`` before the sigmoid.
        b: Offset added to ``a * x`` before the sigmoid.

    Returns:
        The sigmoid value(s), with the same shape as ``x``.
    """
    linear_term = a * x + b
    return 1 / (1 + np.exp(-linear_term))


def generate_y(early=True):
    """Return the hand-written target profile that the logistic curve is fitted to.

    Args:
        early: When truthy, return the profile that rises from 0 to 1 along the
            timeline; otherwise return its mirror image ``1 - profile``.

    Returns:
        A NumPy array with one target value per timestep.

    Raises:
        AssertionError: If the hard-coded profile length differs from ``TIMESTEPS``.
    """
    # TODO: Make this dependent on the timesteps, not fixed
    profile = np.array([0, 0, 0, 0, 0.3, 0.5, 0.7, 1, 1, 1])
    assert len(profile) == TIMESTEPS, 'Change length of y_data'

    return profile if early else 1 - profile


def show(x_data, y_data, y_constant, axes, title):
    """Draw the two candidate trajectories of one feature on ``axes``.

    ``y_data`` is drawn in blue and ``y_constant`` in green, both with red
    scatter markers. Legend entries (and the legend itself) are added only
    when ``title`` is ``'Constant'``.

    Args:
        x_data: X coordinates shared by both trajectories.
        y_data: Values of the trajectory labelled ``Label_1``.
        y_constant: Values of the trajectory labelled ``Label_0``.
        axes: Matplotlib-like axes object to draw on.
        title: Panel title; also decides whether a legend is drawn.
    """
    with_legend = title == 'Constant'

    def _legend_entry(name):
        # Attach a legend label only on the panel that shows the legend.
        return {'label': name} if with_legend else {}

    # Markers first, then the connecting lines (same draw order for every panel).
    axes.scatter(x_data, y_data, color='red', **_legend_entry('Data Points'))
    axes.scatter(x_data, y_constant, color='red')
    axes.plot(x_data, y_data, color='blue', **_legend_entry('Label_1'))
    axes.plot(x_data, y_constant, color='green', **_legend_entry('Label_0'))

    if with_legend:
        axes.legend(fontsize="8", loc="best")

    axes.set_title(title)
    axes.set_xlabel('x')
    axes.set_ylabel('y')


def generate_continuous_function(early, axes, title, verbose=False):
    """Fit a logistic curve to the target profile and shift it by a random offset.

    The two parameters of ``logistic_function`` are fitted to ``generate_y(early)``
    by minimising the mean binary cross-entropy with SLSQP. The fitted curve is
    then shifted vertically by one uniform draw from [-2, 2).

    Args:
        early: Forwarded to ``generate_y`` to choose the target profile.
        axes: Axes used for plotting when ``verbose`` is truthy.
        title: Panel title forwarded to ``show``.
        verbose: When truthy, plot both returned series.

    Returns:
        Tuple ``(y_constant, y_data)``: a flat series at the minimum of the
        shifted curve, and the shifted curve itself, each of length ``TIMESTEPS``.
    """
    timeline = np.array(range(0, TIMESTEPS))
    targets = generate_y(early)

    def loss_function(params):
        # Mean binary cross-entropy between the targets and the logistic prediction.
        slope, intercept = params
        probs = logistic_function(timeline, slope, intercept)
        log_likelihood = targets * np.log(probs) + (1 - targets) * np.log(1 - probs)
        return -np.mean(log_likelihood)

    # Start the optimiser from (0, 0).
    fit = minimize(loss_function, np.zeros(2), method='SLSQP')
    slope, intercept = fit.x

    # Evaluate the fitted curve, then apply the random vertical shift in place.
    curve = logistic_function(timeline, slope, intercept)
    curve += np.random.uniform(low=-2.0, high=2.0, size=None)
    flat = np.full(TIMESTEPS, min(curve))

    if verbose:
        show(timeline, curve, flat, axes, title)

    return flat, curve


def constant_function(verbose, gender, axes, title):
    """Generate a gender-dependent straight line and its flat counterpart.

    For ``gender == 'M'`` the slope is drawn from [0, 0.5) and the vertical
    offset from [-2, 0); for any other value the slope is drawn from [1.5, 2)
    and the offset from [0, 2).

    Args:
        verbose: When truthy, plot both returned series.
        gender: Selects the sampling ranges (``'M'`` versus anything else).
        axes: Axes used for plotting when ``verbose`` is truthy.
        title: Panel title forwarded to ``show``.

    Returns:
        Tuple ``(y_constant, y_data)``: a flat series at the minimum of the
        line, and the line itself, each of length ``TIMESTEPS``.
    """
    if gender == 'M':
        slope_bounds, offset_bounds = (0, 0.5), (-2.0, 0)
    else:
        slope_bounds, offset_bounds = (1.5, 2), (0, 2.0)

    # The slope is sampled before the offset so the random stream is consumed
    # in a fixed order.
    slope = np.random.uniform(low=slope_bounds[0], high=slope_bounds[1], size=None)

    x_data = np.array(range(0, TIMESTEPS, 1))
    y_data = slope * x_data
    y_data += np.random.uniform(low=offset_bounds[0], high=offset_bounds[1], size=None)

    y_constant = np.full(TIMESTEPS, min(y_data))

    if verbose:
        show(x_data, y_data, y_constant, axes, title)

    return y_constant, y_data


def early_late_function(early, axes, title, verbose):
    """Thin wrapper around ``generate_continuous_function``.

    Returns:
        The ``(y_constant, y_data)`` tuple produced by
        ``generate_continuous_function`` for the same arguments.
    """
    return generate_continuous_function(early, axes, title, verbose)


def wave_function(narrow, axes, title, verbose):
    """Generate a randomly shifted sine window and its flat counterpart.

    A sine wave with 4 cycles (``narrow`` truthy) or 2 cycles is sampled on
    ``2 * TIMESTEPS`` points and shifted vertically by one uniform draw from
    [-2, 2). A window of ``TIMESTEPS`` consecutive samples, starting at a
    random offset in ``[0, TIMESTEPS)``, is then cut out.

    Args:
        narrow: Truthy for the 4-cycle wave, falsy for the 2-cycle wave.
        axes: Axes used for plotting when ``verbose`` is truthy.
        title: Panel title forwarded to ``show``.
        verbose: When truthy, plot both returned series.

    Returns:
        Tuple ``(y_constant, y_data)``: a flat series at the median of the full
        shifted wave, and the windowed wave.
    """
    resolution = TIMESTEPS * 2  # how many datapoints to generate
    n_cycles = 4 if narrow else 2
    span = np.pi * 2 * n_cycles

    wave = np.sin(np.arange(0, span, span / resolution))
    # Vertical shift is drawn before the window start (fixed RNG order).
    wave += np.random.uniform(low=-2.0, high=2.0, size=None)
    baseline = np.full(resolution, statistics.median(sorted(wave)))

    start = np.random.choice(TIMESTEPS)
    window = slice(start, start + TIMESTEPS)

    x_data = np.array(range(0, resolution, 1))[window]
    y_constant = baseline[window]
    y_data = wave[window]

    if verbose:
        show(x_data, y_data, y_constant, axes, title)

    return y_constant, y_data


def populate_df(n_rows, cont_feat, static_feat, verbose=False):
    """Build the toy dataset as a DataFrame with (timestep, feature) columns.

    Continuous features occupy columns ``(t, name)`` for every ``t`` in
    ``range(TIMESTEPS)``; static features and the label occupy columns
    ``(-1, name)``. Rows with ``idx < n_rows // 2`` get label index 0, the
    remaining rows label index 1, and gender alternates M/F by row parity.
    The rows are shuffled before returning.

    Args:
        n_rows: Number of patient rows to generate.
        cont_feat: Names of the continuous features; values are assigned in the
            order F1..F5 produced inside this function.
        static_feat: Names of the static columns, matched positionally with
            ``[gender, age, insurance, label]``.
        verbose: When truthy, plot every generated series and print the static
            values of each row.

    Returns:
        The shuffled DataFrame with a fresh 0..n-1 index.
    """

    timesteps = range(TIMESTEPS)
    cont_mi = pd.MultiIndex.from_product([timesteps, cont_feat])
    # Static columns live under the pseudo-timestep -1.
    timestep = [-1]

    mi = cont_mi.append(pd.MultiIndex.from_product([timestep, static_feat]))
    df = pd.DataFrame(columns = mi)

    # NOTE(review): the figure is created even when verbose is False.
    fig, axes = plt.subplots(1, 5, figsize=(13, 4))

    label_index = 0
    for idx in range(n_rows):
        # Second half of the rows (by generation order) belongs to label index 1.
        if idx >= n_rows // 2:
            label_index = 1

        l = label[label_index]

        # Each generator returns (series for label 0, series for label 1);
        # label_index picks the one matching this row.
        # NOTE(review): early=True is paired with the title 'Late' and
        # early=False with 'Early' -- confirm this naming is intended.
        f2 = early_late_function(True, axes[1], 'Late', verbose)[label_index]
        f3 = early_late_function(False, axes[2], 'Early', verbose)[label_index]
        f4 = wave_function(True, axes[3], 'Narrow', verbose)[label_index]
        f5 = wave_function(False, axes[4], 'Wide', verbose)[label_index]
        cont = [f2, f3, f4, f5]

        # Create a balanced dataset regarding gender
        if idx % 2 == 0:
            g = 'M'
        else:
            g = 'F'
        # g = gender_values[np.random.choice(len(gender_values))]

        a = age_values[np.random.choice(len(age_values))]
        i = insurance_values[np.random.choice(len(insurance_values))]
        static = [g, a, i, l]

        # F1 depends on gender, so it is generated after gender is known and
        # then placed first to line up with cont_feat.
        f1 = constant_function(verbose, g, axes[0], 'Constant')[label_index]
        cont.insert(0, f1)

        # Write the row cell by cell (enlarges the frame on first access).
        for t in range(TIMESTEPS):
            for j in range(len(cont_feat)):
                df.loc[idx, (t, cont_feat[j])] = cont[j][t]

        for k in range(len(static_feat)):
            df.loc[idx, (-1, static_feat[k])] = static[k]

        # Labels are stored as the strings '0'/'1'; cast the column to int.
        df[(-1, 'label')] = df[(-1, 'label')].astype(int)

        if verbose:
            print("Gender: {}, Age: {}, Insurance: {}, Label: {}".format(g, a, i, l))
            fig.tight_layout()
            fig.show()

    # Shuffle the rows so the two label groups are no longer contiguous.
    df = df.sample(frac=1).reset_index(drop=True)

    return df

### Creation and saving

In [None]:
# Generate the toy dataset; writing it to output_path_toy is currently disabled.
df = populate_df(RECORDS, cont_feat, static_feat, verbose=VERBOSE)
# df.to_csv(output_path_toy, index=False)
# Preview the first rows.
df.head(10)

# Plot predicted data or generated data

In [69]:
def plot_generated(df, timesteps, features, nr_patients):
    """Plot the generated series of the first ``nr_patients`` rows, one figure per patient.

    Only timesteps ``1 .. timesteps - 1`` are plotted (column level 0 is looked
    up with string keys). Each figure has one panel per feature.

    Args:
        df: DataFrame with two-level columns (timestep as string, feature name).
        timesteps: Exclusive upper bound of the timestep range to plot.
        features: Names of the continuous features to plot.
        nr_patients: Number of leading rows (index labels 0..n-1) to plot.
    """
    x = list(range(1, timesteps))
    cont_df = df.loc[:, pd.IndexSlice[[str(step) for step in x], :]]

    # Panel titles for these two features are deliberately swapped.
    display_names = {'F2_early': 'F2_late', 'F3_late': 'F3_early'}
    single_panel = len(features) == 1

    for patient in range(nr_patients):
        fig, ax = plt.subplots(1, len(features), figsize=(15, 4))

        # First entry among this patient's 'label' / 'gender' columns.
        label = np.array(df.xs('label', level = 1, axis = 1).loc[patient].values[0])
        gender = np.array(df.xs('gender', level = 1, axis = 1).loc[patient].values[0])

        for panel_idx, feat in enumerate(features):
            # plt.subplots returns a bare Axes when there is a single panel.
            pos = ax if single_panel else ax[panel_idx]
            y = cont_df.xs(feat, level = 1, axis = 1).loc[patient].values

            pos.scatter(np.array(x), np.array(y), color='gray')
            pos.plot(x, y, color='red', label='Generated')

            pos.set_xlabel('x')
            pos.set_ylabel('y')

            title = display_names.get(feat, feat)
            pos.set_title(f"{title}_patient_{patient} - {gender}, {label}")
            pos.set_ylim([-4, 4])
            pos.set_xticks(range(1, timesteps, 3))
            pos.legend()

        plt.tight_layout()
        plt.show()

def plot_predicted_vs_gt(df, gt_df, timesteps, features, nr_patients):
    """Plot predicted versus ground-truth series and mark wrongly labelled timesteps.

    Timesteps ``0 .. timesteps - 2`` are plotted (column level 0 is looked up
    with string keys). Only timesteps where the prediction is not NaN are
    drawn. Timesteps whose predicted label differs from the ground-truth label
    are highlighted with blue stars.

    Args:
        df: Predicted data with two-level columns (timestep as string, feature).
        gt_df: Ground-truth data with the same column layout.
        timesteps: Number of timesteps; the last one is not plotted.
        features: Names of the continuous features to plot.
        nr_patients: Number of leading rows (index labels 0..n-1) to plot.
    """
    indexes = [str(i) for i in range(0, timesteps - 1)]
    cont_df = df.loc[:, pd.IndexSlice[indexes, :]]
    cont_df_gt = gt_df.loc[:, pd.IndexSlice[indexes, :]]

    # Panel titles for these two features are deliberately swapped.
    display_names = {'F2_early': 'F2_late', 'F3_late': 'F3_early'}

    for i in range(nr_patients):
        fig, ax = plt.subplots(1, len(features), figsize=(15, 4))

        # Take labels from timestep 0 to end (on index 0 there is the aggregated predicted label across timesteps)
        label = np.array(df.xs('label', level = 1, axis = 1).loc[i].values[1:])
        label_gt = gt_df.xs('label', level = 1, axis = 1).loc[i].values[0]
        gender = df.loc[:, ('-1', 'gender')].loc[i]

        # Timestep positions whose predicted label disagrees with the ground
        # truth. Computed once per patient and never overwritten, so every
        # feature panel starts from the same set.
        wrong_idx = np.flatnonzero(label[:-1] != label_gt)

        for j, f in enumerate(features):
            # plt.subplots returns a bare Axes when there is a single panel.
            pos = ax if len(features) == 1 else ax[j]

            # Keep the unfiltered arrays so they can be indexed by timestep position.
            y_all = cont_df.xs(f, level = 1, axis = 1).loc[i].values
            y_gt_all = cont_df_gt.xs(f, level = 1, axis = 1).loc[i].values

            valid_idx = np.flatnonzero(~np.isnan(y_all))

            # Wrong-label timesteps that are actually drawn for THIS feature.
            wrong_valid_idx = np.intersect1d(wrong_idx, valid_idx)

            pos.scatter(valid_idx, y_all[valid_idx], color='red',  label='Predicted')
            pos.scatter(valid_idx, y_gt_all[valid_idx], color='green',  label='GT')

            # Plot wrong preds (index the unfiltered arrays: the positions refer
            # to timesteps, not to positions inside the NaN-filtered arrays).
            if wrong_valid_idx.size > 0:
                pos.scatter(wrong_valid_idx, y_all[wrong_valid_idx], color='blue', marker = '*', label='Wrong label pred.')
                pos.scatter(wrong_valid_idx, y_gt_all[wrong_valid_idx], color='blue', marker = '*')

            pos.set_xlabel('x')
            pos.set_ylabel('y')

            title = display_names.get(f, f)

            pos.set_title(f"{title}_patient_{i} - {gender}, {label_gt}")
            pos.set_ylim([-2.5, 2.5])
            pos.set_xticks(range(0, timesteps, 3))
            pos.legend()

        plt.tight_layout()
        plt.show()

In [70]:
# Set info for plotting
dataset = 'mimic'

gt_test_data_path = "/content/drive/MyDrive/OptionalProject/GT_predicted_data_labelenc_notarget_randominit_small.csv"
pred_data_path = "/content/drive/MyDrive/OptionalProject/predicted_data_labelenc_notarget_randominit_small.csv"
nr_patients = 2

In [None]:
# Both CSVs use a two-row header: (timestep, feature name).
df = pd.read_csv(pred_data_path, header=[0, 1])
# The first column of the ground-truth file is dropped.
# NOTE(review): presumably a saved index column -- confirm against the file.
gt_df = pd.read_csv(gt_test_data_path, header=[0, 1]).iloc[:, 1:]

timesteps = dataset_info[dataset]['ts']
features = dataset_info[dataset]['features']

plot_predicted_vs_gt(df, gt_df, timesteps, features, nr_patients)

In [None]:
# CSV with the generated series; only the first patient is plotted.
gen_data_path = "/content/drive/MyDrive/OptionalProject/generated_data_labelenc_notarget_randominit_small.csv"
nr_patients = 1

df = pd.read_csv(gen_data_path, header=[0, 1])

# Relies on `timesteps` and `features` set by an earlier cell.
plot_generated(df, timesteps, features, nr_patients)

# Plot MIMIC continuous distributions

In [2]:
# MIMIC CSV whose continuous features are summarised in this section.
mimic_data_path = "/content/drive/MyDrive/OptionalProject/MIMIC_data_labels_small.csv"

# Look up the number of timesteps and the feature names for the chosen dataset.
dataset = 'mimic'
timesteps = dataset_info[dataset]['ts']
features = dataset_info[dataset]['features']

In [None]:
# Plot, for every configured continuous feature, the smoothed per-timestep mean
# of the positive and of the negative patients.
df = pd.read_csv(mimic_data_path, header=[0, 1])

indexes = [str(i) for i in range(0, timesteps)]
cont_df = df.loc[:, pd.IndexSlice[indexes, :]]

# Boolean mask of positive patients. It is deliberately not called `label`,
# so the toy-dataset `label` list read by populate_df is not overwritten.
is_positive = df.xs('label', level = 1, axis = 1)['-1'].values.astype(bool)
pos_data = cont_df.loc[is_positive].reset_index(drop=True)
neg_data = cont_df.loc[~is_positive].reset_index(drop=True)

# Window (in timesteps) of the rolling mean used for smoothing.
smoothing_window = 12

# Iterate over `features` (set in the configuration cell of this section)
# instead of `mimic_feats`, which is only defined further down the notebook.
for f in features:
    # (patients x timesteps) matrices for this feature.
    feat_data_pos = pos_data.xs(f, level = 1, axis = 1).to_numpy(dtype=float)
    feat_data_neg = neg_data.xs(f, level = 1, axis = 1).to_numpy(dtype=float)

    # Mean over patients at each timestep, ignoring missing values.
    feat_data_mean_pos = np.nanmean(feat_data_pos, axis=0)
    feat_data_mean_neg = np.nanmean(feat_data_neg, axis=0)

    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    ax[0].plot(pd.DataFrame(feat_data_mean_pos).rolling(smoothing_window).mean(), label='Smoothed positive', color='red')
    ax[1].plot(pd.DataFrame(feat_data_mean_neg).rolling(smoothing_window).mean(), label='Smoothed negative', color='red')
    ax[0].set_title(f"Mean {f} - positive patients across time")
    ax[1].set_title(f"Mean {f} - negative patients across time")
    ax[0].legend()
    ax[1].legend()

    plt.tight_layout()
    plt.show()

# Plot static distributions

In [15]:
# MIMIC CSV whose static (timestep '-1') columns are summarised in this section.
mimic_data_path = "/content/drive/MyDrive/OptionalProject/MIMIC_data_labels_small.csv"

# Names of static columns of interest for this section.
mimic_feats = ['Age', 'gender', 'insurance', 'ethnicity']

In [None]:
df = pd.read_csv(mimic_data_path, header=[0, 1])

# Keep only the columns under timestep '-1', drop the timestep level and
# discard the last of those columns.
# NOTE(review): presumably the discarded column is the label -- confirm.
stat_df = df.loc[:, pd.IndexSlice['-1', :]].droplevel(level=0, axis=1).iloc[:, :-1]

# NOTE(review): four panels are created but only axes[0..3] are used for
# gender, insurance, ethnicity and age respectively.
fig, axes = plt.subplots(1, 4, figsize=(15, 4))

# Each categorical panel shows the share of rows per category, in percent.
gender_values = stat_df['gender'].value_counts()
axes[0].bar(gender_values.index, gender_values / stat_df.shape[0] * 100)
axes[0].set_title("Gender distribution in percentages")
axes[0].tick_params(axis='x', rotation=90)
insurance_values = stat_df['insurance'].value_counts()
axes[1].bar(insurance_values.index, insurance_values / stat_df.shape[0] * 100)
axes[1].set_title("Insurance distribution in percentages")
axes[1].tick_params(axis='x', rotation=90)
ethnicity_values = stat_df['ethnicity'].value_counts()
# NOTE(review): the tick labels are hard-coded; this assumes value_counts()
# yields exactly eight categories in this order -- verify against the data.
axes[2].bar(['White', 'Unknown', 'Black', 'Other', 'Hispanic', 'Asian', 'Not obtained', 'Alaska native'], ethnicity_values / stat_df.shape[0] * 100)
axes[2].set_title("Ethnicity distribution in percentages")
axes[2].tick_params(axis='x', rotation=90)
axes[3].hist(stat_df['Age'])
axes[3].set_title("Age distribution")
axes[3].tick_params(axis='x', rotation=90)


plt.tight_layout()
# The figure is also written to the current working directory.
plt.savefig('statc_feats.png')
plt.show()


# Get data missingness MIMIC

In [14]:
# MIMIC CSV whose missing-value rates are analysed in this section.
mimic_data_path = "/content/drive/MyDrive/OptionalProject/MIMIC_data_labels_small.csv"

dataset = 'mimic'

# Number of timesteps and continuous feature names for the chosen dataset.
timesteps = dataset_info[dataset]['ts']
features = dataset_info[dataset]['features']

In [5]:
def compute_missing_data_perc(df, feats, timesteps):
    """Compute, per feature, the fraction of missing values at each timestep.

    Args:
        df: DataFrame with two-level columns (timestep as string, feature name).
        feats: Names of the features to analyse.
        timesteps: Number of timesteps; columns ``'0' .. str(timesteps - 1)``
            are inspected.

    Returns:
        Dict mapping each feature name to a list of NumPy arrays. Each array
        holds, per timestep, the share of rows (in the range 0..1) whose value
        is NaN. A feature listed several times in ``feats`` gets one array per
        occurrence.
    """
    indexes = [str(i) for i in range(0, timesteps)]
    cont_df = df.loc[:, pd.IndexSlice[indexes, :]]

    missing_data_perc = {}
    for c in feats:
        data = cont_df.xs(c, level = 1, axis = 1)
        miss = data.isna().sum() / data.shape[0]
        # setdefault creates the list on first sight and appends afterwards;
        # the dict itself has no .append, so it must never be called on it.
        missing_data_perc.setdefault(c, []).append(miss.values)

    return missing_data_perc


def compute_missing_data_perc_class(df, feats, timesteps):
    """Compute per-feature missing-value fractions separately for each class.

    Rows are split by the boolean value of the ``('-1', 'label')`` column.

    Args:
        df: DataFrame with two-level columns (timestep as string, feature name)
            that also contains a ``('-1', 'label')`` column.
        feats: Names of the features to analyse.
        timesteps: Number of timesteps; columns ``'0' .. str(timesteps - 1)``
            are inspected.

    Returns:
        Tuple ``(missing_pos, missing_neg)``. Each is a dict mapping a feature
        name to a list of NumPy arrays with, per timestep, the share of rows
        (in the range 0..1) of that class whose value is NaN. A feature listed
        several times in ``feats`` gets one array per occurrence.
    """
    indexes = [str(i) for i in range(0, timesteps)]
    cont_df = df.loc[:, pd.IndexSlice[indexes, :]]

    label = df.xs('label', level = 1, axis = 1)['-1'].values.astype(bool)
    pos_data = cont_df.loc[label].reset_index(drop=True)
    neg_data = cont_df.loc[~label].reset_index(drop=True)

    missing_data_perc_pos = {}
    missing_data_perc_neg = {}
    for c in feats:
        data_pos = pos_data.xs(c, level = 1, axis = 1)
        data_neg = neg_data.xs(c, level = 1, axis = 1)
        miss_pos = data_pos.isna().sum() / data_pos.shape[0]
        miss_neg = data_neg.isna().sum() / data_neg.shape[0]

        # setdefault creates the list on first sight and appends afterwards;
        # the dicts themselves have no .append.
        missing_data_perc_pos.setdefault(c, []).append(miss_pos.values)
        missing_data_perc_neg.setdefault(c, []).append(miss_neg.values)

    return missing_data_perc_pos, missing_data_perc_neg

In [None]:
# Plot the per-timestep share of missing values of every continuous feature,
# separately for positive (left column) and negative (right column) patients.
df = pd.read_csv(mimic_data_path, header=[0, 1])

# Use `features` (the continuous features configured for this section).
# `mimic_feats` holds static column names at this point, which do not exist
# under the timestep columns.
missing_data_perc_pos, missing_data_perc_neg = compute_missing_data_perc_class(df, features, timesteps)

# One row of panels per analysed feature (2.5 inches each). squeeze=False keeps
# `axes` two-dimensional even when only one feature is configured.
n_panels = len(missing_data_perc_pos)
fig, axes = plt.subplots(n_panels, 2, figsize=(15, 2.5 * n_panels), squeeze=False)

means_pos = []
means_neg = []

# (panel column, per-feature data, class shown in the title, list collecting the means)
panel_specs = [
    (0, missing_data_perc_pos, 1, means_pos),
    (1, missing_data_perc_neg, 0, means_neg),
]

for col, per_feature, class_value, means in panel_specs:
    for row, (key, value) in enumerate(per_feature.items()):
        axes[row, col].plot(value[0])
        mean = np.array(value[0]).mean()
        means.append(mean)
        axes[row, col].hlines(mean, 0, len(value[0]) - 1, label=f'Mean percentage of missing {key} - {mean:.2f}', color='red')
        axes[row, col].set_title(f'Percentage of missing {key} for label {class_value}')
        axes[row, col].legend()

plt.tight_layout()
plt.show()

# Bar labels come from the result dicts so they always match the means.
plt.bar(list(missing_data_perc_pos), means_pos, label='Positive cases', alpha=0.5)
plt.bar(list(missing_data_perc_neg), means_neg, label='Negative cases', alpha=0.5)
plt.xticks(rotation = 90)
plt.yticks([0.1 * x for x in range(0, 10)])
plt.legend(loc="lower right")
plt.title("Average percentage of missing data across time, per feature")
plt.show()

In [None]:
# Plot the per-timestep share of missing values of every continuous feature
# over all patients. `features` is used because `mimic_feats` holds static
# column names, which do not exist under the timestep columns.
missing_data_perc = compute_missing_data_perc(df, features, timesteps)

for key, value in missing_data_perc.items():
    plt.plot(value[0])
    mean = np.array(value[0]).mean()
    # Horizontal line at the average share across all timesteps.
    plt.hlines(mean, 0, len(value[0]) - 1, label=f'Mean percentage of missing {key} - {mean:.2f}', color='red')
    plt.title(f'Percentage of missing {key}')
    plt.legend()
    plt.show()

# Insert missing values in toy dataset

In [None]:
# Toy dataset that will receive artificial missing values.
toy_data_path = "/content/drive/MyDrive/OptionalProject/MIMIC_data_labels_toy.csv"

# Alternative, currently disabled configuration with higher missing rates:
# output_toy_missing_path = "/content/drive/MyDrive/OptionalProject/MIMIC_data_labels_toy_missing.csv"
# missing_data_perc = np.array([0.82, 0.88, 0.91, 0.93, 0.92, 0.91, 0.93, 0.93, 0.94, 0.93])

# Active configuration: one missing-value fraction per timestep (10 entries).
output_toy_missing_path = "/content/drive/MyDrive/OptionalProject/MIMIC_data_labels_toy_missing_30.csv"
missing_data_perc = np.array([0.22, 0.28, 0.31, 0.33, 0.32, 0.31, 0.33, 0.33, 0.34, 0.33])

dataset = 'toy'
timesteps = dataset_info[dataset]['ts']
toy_cont_feats = dataset_info[dataset]['features']

In [None]:
# Insert nan's and save dataframe
df = pd.read_csv(toy_data_path, header=[0, 1])

for c in toy_cont_feats:
    for t in range(timesteps):
        nr_missing_values = int(missing_data_perc[t] * df.shape[0])
        miss_indexes = np.random.choice(df.index, nr_missing_values, replace=False)
        df.loc[sorted(miss_indexes), (str(t), c)] = np.nan

df.to_csv(output_toy_missing_path, index=False)

In [None]:
# Re-read the saved file and report the average number of NaN cells per row.
# The last five columns are excluded from the count.
# NOTE(review): static_feat lists four static columns; confirm that dropping
# five columns does not also remove a continuous column.
x = pd.read_csv(output_toy_missing_path, header=[0, 1]).iloc[:, :-5] # don't consider static data and label
print(f'Total features per row: {x.shape[1]}')
print(f'Missing features per row: {x.isnull().sum(axis=1).mean()}')

# Metrics and plots on generated data

In [29]:
# The MAE is not a good indicator, so we rely only on the plots generated by the function below

def generated_data_metrics(df, gt_df, timesteps, features, verbose=False):
    """Compare per-timestep means of generated data against the ground truth.

    For every feature the rows are grouped by ground-truth label ('No'/'Yes');
    for the feature 'F1_constant' they are additionally grouped by ground-truth
    gender ('M'/'F'). Within each group, generated values at positions where
    the ground truth is NaN are masked out, timestep 0 is skipped, and the
    absolute difference between the ground-truth mean and the generated mean
    is computed for each remaining timestep.

    Args:
        df: Generated data with two-level columns (timestep as string, feature).
        gt_df: Ground-truth data with the same layout; its 'gender' and 'label'
            columns define the groups for both frames, so rows are assumed to
            be aligned positionally.
        timesteps: Number of timesteps; columns '0' .. str(timesteps - 1) are read.
        features: Names of the continuous features to evaluate.
        verbose: When truthy, plot every individual series together with the
            group means.

    Returns:
        Tuple ``(results_neg, results_pos)`` for the labels 'No' and 'Yes'
        respectively. Each dict maps a key (the feature name, suffixed with the
        gender for 'F1_constant') to the per-timestep absolute difference of
        the two means.
    """

    # NOTE(review): `metrics` is never used.
    metrics = {}
    indexes = [str(i) for i in range(0, timesteps)]
    label_values = ['No', 'Yes']
    gender_values = ['M', 'F']
    results_pos = {}
    results_neg = {}

    # NOTE(review): the two branches below are near-identical; they differ only
    # in the extra gender grouping, the legend text and the result key.
    for j, f in enumerate(features):

        if f == 'F1_constant':
            # For each gender
            for gender_index in range(len(gender_values)):
                # Both frames are filtered with the ground-truth gender mask.
                new_gt_df_gender = gt_df[(gt_df.xs('gender', level = 1, axis = 1) == gender_values[gender_index]).values].reset_index()
                new_df_gender = df[(gt_df.xs('gender', level = 1, axis = 1) == gender_values[gender_index]).values].reset_index()

                # For each label
                for label_index in range(len(label_values)):
                    # Both frames are filtered with the ground-truth label mask.
                    new_gt_df = new_gt_df_gender[(new_gt_df_gender.xs('label', level = 1, axis = 1) == label_values[label_index]).values].reset_index()
                    new_df = new_df_gender[(new_gt_df_gender.xs('label', level = 1, axis = 1) == label_values[label_index]).values].reset_index()

                    cont_df = new_df.loc[:, pd.IndexSlice[indexes, :]]
                    cont_df_gt = new_gt_df.loc[:, pd.IndexSlice[indexes, :]]

                    # One row per patient, one column per timestep 1..timesteps-1.
                    all_data = pd.DataFrame(columns=list(range(1, timesteps)))
                    all_data_gt = pd.DataFrame(columns=list(range(1, timesteps)))

                    for i in range(new_df.shape[0]):

                        y_gt = cont_df_gt.xs(f, level = 1, axis = 1).loc[i].values
                        y_gt = np.array(y_gt)
                        valid_idx = np.argwhere(~np.isnan(y_gt))

                        # Start from all-NaN and copy the generated values only
                        # where the ground truth is present.
                        pred = np.empty((y_gt.shape[0],))
                        pred[:] = np.nan
                        y = cont_df.xs(f, level = 1, axis = 1).loc[i].values
                        pred[valid_idx] = y[valid_idx]

                        # Timestep 0 is skipped.
                        all_data.loc[i] = pred[1:]
                        all_data_gt.loc[i] = y_gt[1:]

                    all_data_gt_mean = all_data_gt.mean()
                    # NOTE(review): `std_gt` and `std` are computed but never used.
                    std_gt = all_data_gt.std()
                    all_data_mean = all_data.mean()
                    std = all_data.std()

                    if verbose:
                        # Faint lines: individual patients; thick lines: group means.
                        for i in range(all_data_gt.shape[0]):
                            plt.plot(all_data_gt.loc[i], color='#ABCDFF', alpha=0.4)

                        plt.plot(all_data_gt_mean, label=f'Ground_truth - {label_values[label_index]} - {gender_values[gender_index]}', linewidth=3)

                        for i in range(all_data.shape[0]):
                            plt.plot(all_data.loc[i], color='#FACC8D', alpha=0.4)

                        plt.plot(all_data_mean, label=f'Predicted - {label_values[label_index]} - {gender_values[gender_index]}', linewidth=3)

                        plt.title(f"Mean values of {f} across timesteps")
                        plt.ylim([-2.5, 2.5])
                        plt.legend()
                        plt.show()

                    abs_dif = np.abs(all_data_gt_mean - all_data_mean)

                    # Label index 0 ('No') goes to the negative results.
                    if label_index == 0:
                        results_neg[f + gender_values[gender_index]] = abs_dif
                    else:
                        results_pos[f + gender_values[gender_index]] = abs_dif
        else:
                # For each label
                for label_index in range(len(label_values)):
                    # Both frames are filtered with the ground-truth label mask.
                    new_gt_df = gt_df[(gt_df.xs('label', level = 1, axis = 1) == label_values[label_index]).values].reset_index()
                    new_df = df[(gt_df.xs('label', level = 1, axis = 1) == label_values[label_index]).values].reset_index()

                    cont_df = new_df.loc[:, pd.IndexSlice[indexes, :]]
                    cont_df_gt = new_gt_df.loc[:, pd.IndexSlice[indexes, :]]

                    # One row per patient, one column per timestep 1..timesteps-1.
                    all_data = pd.DataFrame(columns=list(range(1, timesteps)))
                    all_data_gt = pd.DataFrame(columns=list(range(1, timesteps)))

                    for i in range(new_df.shape[0]):

                        y_gt = cont_df_gt.xs(f, level = 1, axis = 1).loc[i].values
                        y_gt = np.array(y_gt)
                        valid_idx = np.argwhere(~np.isnan(y_gt))

                        # Start from all-NaN and copy the generated values only
                        # where the ground truth is present.
                        pred = np.empty((y_gt.shape[0],))
                        pred[:] = np.nan
                        y = cont_df.xs(f, level = 1, axis = 1).loc[i].values
                        pred[valid_idx] = y[valid_idx]

                        # Timestep 0 is skipped.
                        all_data.loc[i] = pred[1:]
                        all_data_gt.loc[i] = y_gt[1:]

                    all_data_gt_mean = all_data_gt.mean()
                    # NOTE(review): `std_gt` and `std` are computed but never used.
                    std_gt = all_data_gt.std()
                    all_data_mean = all_data.mean()
                    std = all_data.std()

                    if verbose:
                        # Faint lines: individual patients; thick lines: group means.
                        for i in range(all_data_gt.shape[0]):
                            plt.plot(all_data_gt.loc[i], color='#ABCDFF', alpha=0.4)

                        plt.plot(all_data_gt_mean, label=f'Ground_truth - {label_values[label_index]}', linewidth=3)

                        for i in range(all_data.shape[0]):
                            plt.plot(all_data.loc[i], color='#FACC8D', alpha=0.4)

                        plt.plot(all_data_mean, label=f'Predicted - {label_values[label_index]}', linewidth=3)

                        plt.title(f"Mean values of {f} across timesteps")
                        plt.ylim([-2.5, 2.5])
                        plt.legend()
                        plt.show()

                    abs_dif = np.abs(all_data_gt_mean - all_data_mean)

                    # Label index 0 ('No') goes to the negative results.
                    if label_index == 0:
                        results_neg[f] = abs_dif
                    else:
                        results_pos[f] = abs_dif

    return results_neg, results_pos

In [None]:
# Generated data and the matching ground truth to compare.
gen_data_path = "/content/drive/MyDrive/OptionalProject/generated_data_labelenc_notarget_randominit_small.csv"
gt_gen_data_path = "/content/drive/MyDrive/OptionalProject/GT_generated_data_labelenc_notarget_randominit_small.csv"

# NOTE(review): generated_data_metrics groups rows by the label values
# 'No'/'Yes' and special-cases the toy feature 'F1_constant'; confirm that the
# 'mimic' files loaded here use the same label encoding.
dataset = 'mimic'
timesteps = dataset_info[dataset]['ts']
features = dataset_info[dataset]['features']

df = pd.read_csv(gen_data_path, header=[0, 1])
gt_df = pd.read_csv(gt_gen_data_path, header=[0, 1])

# verbose=True also draws one plot per feature and group.
results_neg, results_pos = generated_data_metrics(df, gt_df, timesteps, features, verbose=True)