In [None]:
import pandas as pd
import numpy as np
# Raise pandas' display limits so wide / long frames are shown without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 200)
# `IPython.display` is the public import location for `display` and `HTML`;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Let the notebook cells use the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """
    Print a per-column summary of a DataFrame and return its first rows.

    The shape is printed first, followed by a psql-style table with one row
    per column giving its name, number of null values, number of unique
    values and dtype.

    Parameters:
        df (pd.DataFrame): frame to summarise.

    Returns:
        pd.DataFrame: `df.head()`.
    """
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape," *****\n")

    # One list per summary statistic; all are ordered like df.columns.
    summary = pd.DataFrame(
        {
            'Name': df.columns.values.tolist(),
            'Null': df.isnull().sum().values.tolist(),
            'Unique': df.nunique().values.tolist(),
            'Dtypes': df.dtypes.tolist(),
        }
    )
    print(tabulate(summary, headers='keys', tablefmt='psql'))
    return df.head()

### Load the labelbox annotations

In [None]:
import os

import labelbox

# The API key is read from the environment rather than stored in the notebook.
# Any key that was previously committed in this file should be treated as
# leaked and revoked in the Labelbox settings.
LB_API_KEY = os.environ.get("LABELBOX_API_KEY")
if not LB_API_KEY:
    raise RuntimeError(
        "LABELBOX_API_KEY is not set. Export your Labelbox API key in the "
        "environment before running this cell."
    )
# Create Labelbox client
client = labelbox.Client(api_key=LB_API_KEY)
PROJECT_ID = 'cl8vxju2k0z0q07ztfyt5dr7l'
project = client.get_project(PROJECT_ID)
# Every optional export section is switched off; only the default payload is requested.
labels = project.export_v2(params={
	"data_row_details": False,
	"metadata_fields": False,
	"attachments": False,
	"project_details": False,
	"performance_details": False,
	"label_details": False,
	"interpolated_frames": False
  })
# Block until the export task has finished.
labels.wait_till_done()

if labels.errors:
  print(labels.errors)

# Export payload consumed by the post-processing functions further down.
export_json = labels.result

### Load the previous parquet and make sure that the patient id has a zero-padded length of 7

In [None]:
import pandas as pd
import numpy as np

# Previously generated ECG metadata table (absolute path on the analysis machine).
dir_ = '/media/data1/ravram/DeepECG/ekg_waveforms_output/df_xml_2023_03_30_n_1633856_with_labelbox.parquet'
df_ = pd.read_parquet(dir_, engine='fastparquet')

# Left-pad every patient ID with zeros to a width of 7.
# NOTE(review): calls str.zfill on each value, so this assumes the column holds
# strings only (a missing value would raise AttributeError) -- confirm.
df_['RestingECG_PatientDemographics_PatientID'] = [n.zfill(7) for n in df_['RestingECG_PatientDemographics_PatientID'].tolist()]
display(len(df_))  

# Group by xml_path, collecting every other column's values into a list per path;
# the displayed length is the number of distinct xml_path values.
df_group = df_.groupby('xml_path').agg(lambda x: x.tolist())
display(len(df_group))  

### Load the annotated ECG CSV and zero-pad its patient IDs in the same way

In [None]:
# Annotated ECG sample; patient IDs are cast to str and left-padded with zeros
# to a width of 7 so they use the same format as df_.
new_df = pd.read_csv("/media/data1/anolin/20221002_ECG_mod_diagnosis_sampled_3600.csv")
new_df['patientid'] = new_df['patientid'].map(lambda raw_id: str(raw_id).zfill(7))
new_df

### Map each diagnosis string to the patient IDs that carry it

In [None]:
# Build a lookup from each distinct diagnosis string to the list of patient IDs
# whose annotated row carries exactly that string (in file order).
dict_diag = dict()
#ROBERT : 2023-11-22 CHANGED TO DIAGNOSIS
for k,v in zip(new_df['Diagnosis'].tolist(),new_df['patientid'].tolist()):
    # setdefault creates the list the first time a diagnosis is seen.
    dict_diag.setdefault(k, []).append(v)

#dict_diag

### Generate the one-hot encoding of the Labelbox labels

In [None]:
def flatten_labels(label_dict):
    """
    Flattens the labels from a Labelbox sub-dictionary into a predictable structure.

    The result always contains the eight known category names (with an empty
    list when nothing was ticked). A category name that is not in that preset
    list is added to the result instead of raising KeyError, and a
    classification without a 'checklist_answers' entry contributes no labels.

    Parameters:
        label_dict (dict): A Labelbox sub-dictionary object with a
            'classifications' list; each item has a 'name' and, for checklist
            classifications, a 'checklist_answers' list of {'name': ...} dicts.

    Returns:
        dict: Flattened dictionary with key categories and their corresponding features.
    """
    flattened_dict = {
        'Rhythm': [], 'QRS complex': [], 'Wave criterias': [], 
        'Conduction': [], 'Chamber enlargement': [], 'Other': [],
        'ST segments': [], 'P-wave morphology': []
    }

    for category in label_dict['classifications']:
        # setdefault keeps labels from categories that are missing in the preset above.
        bucket = flattened_dict.setdefault(category['name'], [])
        # `or []` also covers an explicit None value.
        for feature in category.get('checklist_answers') or []:
            bucket.append(feature['name'])

    return flattened_dict


def adjust_name(label_dict):
    """
    Re-keys a dictionary by the second underscore-separated field of each key.

    Parameters:
        label_dict (dict): Dictionary keyed by the Labelbox ECG ID; every key
            must contain at least one underscore.

    Returns:
        dict: Same values, keyed by `key.split('_')[1]` (the patient ID part).
            When two keys share that part, the later one wins.
    """
    return {ecg_id.split('_')[1]: payload for ecg_id, payload in label_dict.items()}


def get_single_value(label_dict):
    """
    Merges the per-category label lists of every entry into one flat list.

    The input is left untouched: labels are copied into a new list per key
    rather than extending the first category's list in place.

    Parameters:
        label_dict (dict): Maps an ID to a dict whose 'annotations' entry maps
            category name -> list of labels (categories may be empty).

    Returns:
        dict: Maps each ID to the concatenation of all its category lists, in
            category order. An ID whose 'annotations' dict is empty is omitted.
    """
    filtered_dict = {}

    for key, value in label_dict.items():
        for labels in value['annotations'].values():
            # A fresh list is created on first use so the caller's lists are never mutated.
            filtered_dict.setdefault(key, []).extend(labels)

    return filtered_dict


def get_hot_labels(dict_):
    """
    Generates a one-hot encoded vector for every entry of a dictionary of positive labels.

    A first pass collects the distinct labels in order of first appearance; a
    second pass builds, for each key, a 0/1 vector over that label list. The
    label -> position lookup is a dict, so each label is resolved in constant
    time instead of scanning the feature list.

    Parameters:
        dict_ (dict): maps an ID to a list of its positive labels
            (labels must be hashable, e.g. strings).

    Returns:
        dict: maps each ID to its one-hot encoded vector (list of 0/1).
        list: the labels in column order, used to name the DataFrame columns.
    """
    # label -> column position, in order of first appearance
    feature_index = dict()
    for labels in dict_.values():
        for label in labels:
            feature_index.setdefault(label, len(feature_index))

    list_features = list(feature_index)

    out_dict = dict()
    for key, labels in dict_.items():
        out_list = [0] * len(list_features)
        for label in labels:
            out_list[feature_index[label]] = 1
        out_dict[key] = out_list

    return out_dict,list_features


from tqdm.notebook import tqdm


def generate_cleaned_dict_v2(label_box_dict, project_id='cl8vxju2k0z0q07ztfyt5dr7l'):
    """
    Filters the raw Labelbox export to extract the labels and one-hot encode them.

    For every exported data row, the first label of the given project is read;
    its 'annotations' entry is flattened with `flatten_labels`, and all other
    entries are kept as-is. The positive labels of all rows are then re-keyed
    by patient ID, merged across categories and one-hot encoded once, after
    the loop.

    Parameters:
        label_box_dict (iterable of dict): exported Labelbox rows.
        project_id (str): key of the project inside each row's 'projects' dict.

    Returns:
        dict: per external-id stem, the first label with flattened annotations.
        dict: per patient ID, the flat list of positive labels.
        dict: per patient ID, the one-hot encoded vector.
        list: feature names, in column order.
        pd.DataFrame: one-hot vectors indexed by patient ID.
    """
    dict_flatten = dict()
    raw_labels = dict()
    for annotation in tqdm(label_box_dict):

        temp_dict_flatten = dict()
        temp_dict_labels = dict()

        name = annotation['data_row']['external_id'].split('.')[0]
        for key, values in annotation['projects'][project_id]['labels'][0].items():
            if isinstance(values, dict) and key == 'annotations':
                d_ = flatten_labels(values)
                temp_dict_flatten[key] = d_
                # Independent copy so merging labels later cannot alter dict_flatten.
                temp_dict_labels[key] = {cat: list(vals) for cat, vals in d_.items()}
            else:
                temp_dict_flatten[key] = values

        # Store this row's flattened label (not the accumulator itself).
        dict_flatten[name] = temp_dict_flatten
        # Rows without an 'annotations' dict have nothing to encode.
        if temp_dict_labels:
            raw_labels[name] = temp_dict_labels

    # Post-process once, after every row has been collected.
    dict_labels = get_single_value(adjust_name(raw_labels))
    one_hot_enconded, feature_list = get_hot_labels(dict_labels)
    return dict_flatten, dict_labels, one_hot_enconded, feature_list, pd.DataFrame.from_dict(one_hot_enconded,orient='index',columns=feature_list)

In [None]:
from tqdm.notebook import tqdm


def generate_cleaned_dict_v2_no_tqdm(label_box_data, project_id='cl8vxju2k0z0q07ztfyt5dr7l'):
    """
    Processes Labelbox data to extract and flatten labels, and generates one-hot encoded vectors.
    Same pipeline as the tqdm variant, without the progress bar.

    Rows whose project entry has an empty 'labels' list, or whose first label
    has no annotations, are skipped.

    Parameters:
        label_box_data (list): List of Labelbox data entries.
        project_id (str): key of the project inside each entry's 'projects' dict.

    Returns:
        Tuple of:
            dict: per external-id stem, the first project label (unflattened).
            dict: per patient ID, the flat list of positive labels.
            dict: per patient ID, the one-hot encoded vector.
            list: feature names, in column order.
            pd.DataFrame: one-hot vectors indexed by patient ID.
    """
    import pandas as pd

    flattened_dict = {}
    positive_labels_dict = {}

    for annotation in label_box_data:
        name = annotation['data_row']['external_id'].split('.')[0]
        project_label_list = annotation['projects'][project_id]['labels']
        if not project_label_list:
            # Unlabeled data row: nothing to flatten or encode.
            continue

        project_labels = project_label_list[0]
        annotations = project_labels.get('annotations', {})
        if not annotations:
            continue

        flattened_labels = flatten_labels(annotations)
        positive_labels = adjust_name(get_single_value({name: {'annotations': flattened_labels}}))
        positive_labels_dict.update(positive_labels)
        flattened_dict[name] = project_labels

    one_hot_encoded, feature_list = get_hot_labels(positive_labels_dict)
    df_one_hot = pd.DataFrame.from_dict(one_hot_encoded, orient='index', columns=feature_list)

    return flattened_dict, positive_labels_dict, one_hot_encoded, feature_list, df_one_hot

### Generate the final one-hot-encoded dataset

In [None]:
# Run the Labelbox post-processing on the export and keep all five results;
# `one_hot_enconded` and `feature_list` are reused by the matching cells below.
dict_flatten, dict_labels, one_hot_enconded, feature_list, final_one_hot = generate_cleaned_dict_v2_no_tqdm(export_json)

In [None]:
# Sanity check: number of flattened entries vs. number of one-hot rows.
# dict_flatten is keyed by the external-id stem while final_one_hot is indexed
# by the patient-ID part of that stem, so the two counts differ whenever
# several entries share the same patient-ID part.
key_list = list(dict_flatten.keys())
print(len(key_list))
print(len(final_one_hot))

In [None]:
#display(final_one_hot.head(n=5))
#display(final_one_hot.to_csv('final_one_hot.csv'))

In [None]:
# Add a column 'all_values' holding, for each row, the array of that row's values.
# NOTE(review): this mutates final_one_hot, so re-running the cell feeds the
# existing 'all_values' column back into the new one.
final_one_hot['all_values'] = final_one_hot.apply(lambda row: row.values, axis=1)

# Select the rows whose label columns are all 0 (the 'all_values' column is
# dropped before the comparison); reset_index exposes the ID as column 'index'.
zero_value_rows = final_one_hot[final_one_hot.drop(columns=['all_values']).eq(0).all(axis=1)].reset_index()

# Show summary statistics and the IDs of the all-zero rows.
display(zero_value_rows.describe())
display(zero_value_rows['index'])
# ROBERT: the number of all-zero rows is expected to be 0.

# Find the ECGs whose diagnosis text matches an annotated ECG
# (annotation transfer by exact string matching)

In [None]:
# This cell iterates over the original dataset (df_) and attaches the one-hot
# label vector of an annotated ECG whenever the row's diagnosis text is a key
# of dict_diag.

dict_out_adjusted = dict()
list_matched = list()
## ROBERT : Used DIAGNOSIS instead of original_diagnosis
for patient_id, in_, diag in zip(df_['RestingECG_PatientDemographics_PatientID'].tolist(),df_.index.tolist(),df_['diagnosis'].tolist()):
    if diag in dict_diag:
        # Use the one-hot vector of the first annotated patient with this diagnosis.
        dict_out_adjusted[f'{patient_id}_{in_}'] = one_hot_enconded[dict_diag[diag][0]]
        list_matched.append(0)  # 0 marks a match

    else:
        # All-zero vector sized from feature_list, so it always fits the
        # columns used below (no hard-coded width).
        dict_out_adjusted[f'{patient_id}_{in_}'] = [0] * len(feature_list)
        list_matched.append(-1)  # -1 marks no match

# Turn the resulting dict into a DataFrame (one row per ECG, one column per feature).
temp_ = pd.DataFrame.from_dict(dict_out_adjusted, orient='index',columns=feature_list)
temp_ = temp_.reset_index()
temp_['annotated'] = list_matched
temp_

In [None]:
# Concatenate column-wise with the original frame so that all data is kept.
# NOTE(review): axis=1 aligns on the index; temp_ has a fresh 0..n-1 index after
# reset_index, so this presumably relies on df_ having the same index -- confirm.
temp_out = pd.concat([df_,temp_], axis=1)
temp_out

In [None]:
from collections import Counter

Counter(temp_out['annotated'].tolist())

# Do a string cleanup

In [None]:
# proposed string cleanup
import re

import pandas as pd
from unidecode import unidecode


def normalize_string(s):
    """
    Normalizes a diagnosis string so that near-identical texts compare equal.

    Steps: drop everything up to and including the poor-quality warning
    prefix, lowercase, drop everything from ' ecg anormal' onwards,
    transliterate to ASCII, collapse every run of non-word characters into a
    single space, and finally strip leading/trailing spaces.

    Stripping is done after the punctuation substitution so that leading or
    trailing punctuation does not leave a space behind; "rythme sinusal." and
    "rythme sinusal" therefore normalize to the same string, and normalizing
    an already-normalized string no longer changes its edges.

    Parameters:
        s (str): raw diagnosis text.

    Returns:
        str: normalized text.
    """
    # Removed because several ECGs differed only by this warning prefix.
    s = s.split("*** ATTENTION! mauvaise qualité de l'ECG*** ")[-1]
    s = s.lower()
    # Removed because several ECGs differed only by this trailing statement.
    s = s.split(' ecg anormal')[0]
    s = unidecode(s)
    s = re.sub(r"\W+", " ", s)
    return s.strip()

In [None]:
# Apply the string normalization to the diagnosis text of df_ and to the keys
# of dict_diag so both sides of the lookup use the same form.
# NOTE(review): dict_diag is overwritten in place, so re-running this cell
# normalizes its keys again; keys that normalize to the same string collapse
# into one entry (the last one wins).
df_["Normalized_Diag"] = df_["original_diagnosis"].apply(normalize_string)
dict_diag = {normalize_string(k):v for k,v in dict_diag.items()}

In [None]:
# Re-run the matching, this time on the normalized diagnosis text.

dict_out_adjusted = dict()
list_matched = list()
for patient_id, in_, diag in zip(df_['RestingECG_PatientDemographics_PatientID'].tolist(),df_.index.tolist(),df_['Normalized_Diag'].tolist()):
    if diag in dict_diag:
        # Use the one-hot vector of the first annotated patient with this diagnosis.
        dict_out_adjusted[f'{patient_id}_{in_}'] = one_hot_enconded[dict_diag[diag][0]]
        list_matched.append(0)  # 0 marks a match

    else:
        # All-zero vector sized from feature_list (no hard-coded width).
        dict_out_adjusted[f'{patient_id}_{in_}'] = [0] * len(feature_list)
        list_matched.append(-1)  # -1 marks no match

temp_ = pd.DataFrame.from_dict(dict_out_adjusted, orient='index',columns=feature_list)
temp_ = temp_.reset_index()
temp_['annotated'] = list_matched

# Column-wise concat with the original frame to keep all data.
temp_out = pd.concat([df_,temp_], axis=1)
temp_out

In [None]:
from collections import Counter

Counter(temp_out['annotated'].tolist())

In [None]:
# Spot-check one unmatched row: show the normalized diagnosis of the 401st
# row flagged -1 (raises IndexError when there are fewer unmatched rows).
temp_out_bad = temp_out[temp_out['annotated'] == -1]
temp_out_bad.iloc[400]['Normalized_Diag']

In [None]:
# Persist the matched table.
# NOTE(review): the file is written in parquet format although its name ends
# in '.csv'; readers must use pd.read_parquet -- consider renaming.
temp_out.to_parquet('/media/data1/anolin/out_v2.csv')

# add other ECGs

In [None]:
__author__ = "alexis nolin-lapalme"
__email__ = "alexis.nolin-lapalme@umontreal.ca"


import argparse
import base64
import os
import struct
from datetime import datetime

# utils
import numpy as np
import pandas as pd
import xmltodict
from tqdm.notebook import tqdm


def is_interactive():
    """Return True when the `__main__` module has no `__file__` attribute.

    Used below to choose between the notebook and the plain tqdm progress bar.
    """
    import __main__ as entry_module

    has_script_path = hasattr(entry_module, '__file__')
    return not has_script_path


if is_interactive():
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm


class tinyxml2df:
    """
    Converts a directory of resting-ECG XML files into a flat DataFrame and
    one .npy waveform file per ECG.

    Parameters:
        in_path (str): directory whose entries are parsed as XML.
        out_path (str): directory receiving the 'ecg_npy/' sub-directory and,
            when `save` is true, the summary CSV.
        verbose (bool): stored on the instance (not read by the methods below).
        save (bool): whether `read2flatten` writes the summary CSV.
    """

    def __init__(
        self,
        in_path: str,
        out_path: str = "/media/data1/anolin/ECG",
        verbose: bool = True,
        save: bool = True,
    ):
        self.path = in_path
        self.out_path = out_path
        self.verbose = verbose
        self.save = save

    def remove_a_key(self, d, remove_key):
        """Delete `remove_key` in place from `d` and from every dict nested in its values.

        Non-dict values (including lists) are not descended into.
        """
        if isinstance(d, dict):
            for key in list(d.keys()):
                if key == remove_key:
                    del d[key]
                else:
                    self.remove_a_key(d[key], remove_key)

    def decode_ekg_muse(self, raw_wave):
        """
        Ingest a base64 encoded waveform and return it as a tuple of ints.

        The decoded bytes are read as consecutive little-endian signed 16-bit
        integers; struct.error is raised when the byte count is odd.
        """
        # convert the waveform from base64 to a byte array
        arr = base64.b64decode(bytes(raw_wave, "utf-8"))

        # "<" pins little-endian byte order regardless of the host machine
        return struct.unpack(f"<{len(arr) // 2}h", arr)

    def decode_ekg_muse_to_array(self, raw_wave, downsample=1):
        """
        Ingest a base64 encoded waveform and return it as a numpy array.

        downsample: fraction of samples to keep, in (0, 1]; every
            int(1 // downsample)-th sample is returned, so 0.5 takes every
            other value (e.g. to halve the sampling rate).

        Raises:
            ValueError: if `downsample` is not in (0, 1].
        """
        if downsample <= 0:
            raise ValueError("You must downsample by more than 0")
        dwnsmpl = int(1 // downsample)
        if dwnsmpl < 1:
            raise ValueError("downsample must be in the interval (0, 1]")

        return np.array(self.decode_ekg_muse(raw_wave))[::dwnsmpl]

    def xml_to_np_array_file(self, dic, path_to_output=None):
        """
        Extract the 12-lead waveform of one parsed ECG and save it as a .npy file.

        Leads with 5000 samples are halved (every other sample), leads with
        2500 samples are kept, any other length is ignored. Leads III, aVR,
        aVF and aVL are then computed from leads I and II. The array is saved
        as [time, leads, 1] with leads ordered
        I, II, III, aVR, aVL, aVF, V1, V2, V3, V4, V5, V6.

        Parameters:
            dic (dict): parsed XML (as returned by xmltodict.parse).
            path_to_output (str | None): output directory; defaults to the
                current working directory at call time.

        Returns:
            str | None: path of the written file
                ('<PatientID>_<AcquisitionDate>_<AcquisitionTime>.npy'), or
                None when the waveform could not be extracted.
        """
        if path_to_output is None:
            path_to_output = os.getcwd()

        try:
            pt_id = dic["RestingECG"]["PatientDemographics"]["PatientID"]
        except (KeyError, TypeError):
            print("no PatientID")
            pt_id = "none"
        try:
            AcquisitionDateTime = (
                dic["RestingECG"]["TestDemographics"]["AcquisitionDate"]
                + "_"
                + dic["RestingECG"]["TestDemographics"]["AcquisitionTime"].replace(":", "-")
            )
        except (KeyError, TypeError, AttributeError):
            print("no AcquisitionDateTime")
            AcquisitionDateTime = "none"

        # need to instantiate leads in the proper order for the model
        lead_order = [
            "I",
            "II",
            "III",
            "aVR",
            "aVL",
            "aVF",
            "V1",
            "V2",
            "V3",
            "V4",
            "V5",
            "V6",
        ]

        # lead name -> np.array of samples (None until decoded)
        lead_data = dict.fromkeys(lead_order)

        try:
            for lead in dic["RestingECG"]["Waveform"]:
                for single_lead in lead["LeadData"]:
                    # decode once, then pick the resampling from the sample count
                    samples = self.decode_ekg_muse_to_array(single_lead["WaveFormData"])
                    if len(samples) == 5000:
                        lead_data[single_lead["LeadID"]] = samples[::2]
                    elif len(samples) == 2500:
                        lead_data[single_lead["LeadID"]] = samples
                    # any other length (e.g. the short waveform) is skipped

            # limb and augmented leads derived from leads I and II
            lead_data["III"] = np.array(lead_data["II"]) - np.array(lead_data["I"])
            lead_data["aVR"] = -(np.array(lead_data["I"]) + np.array(lead_data["II"])) / 2
            lead_data["aVF"] = (np.array(lead_data["II"]) + np.array(lead_data["III"])) / 2
            lead_data["aVL"] = (np.array(lead_data["I"]) - np.array(lead_data["III"])) / 2

            # keep only the 12 standard leads (drops any extra lead), in model order
            ordered_leads = [lead_data[k] for k in lead_order]

            # transpose to [time, leads], then expand dims to [time, leads, 1]
            ekg_array = np.expand_dims(np.array(ordered_leads).T, axis=-1)

            filename = f"{pt_id}_{AcquisitionDateTime}.npy"
            path_to_output = os.path.join(path_to_output, filename)
            with open(path_to_output, "wb") as f:
                np.save(f, ekg_array)
            return path_to_output

        except Exception as exc:
            # Best effort: a malformed ECG must not stop the whole batch.
            print(f"error extracting waveform for {pt_id}_{AcquisitionDateTime}: {exc!r}")
            return None

    def flatten(self, input_node: dict, key_: str = "", output_dict: dict = None):
        """
        Flatten a nested dict/list structure into a single-level dict.

        Keys are joined with '_' and list items contribute their position.
        The 'Waveform', 'OriginalDiagnosis' and 'Diagnosis' keys are removed
        from `input_node` in place before flattening.

        Parameters:
            input_node: nested structure (dict, list or leaf value).
            key_ (str): prefix accumulated so far.
            output_dict (dict | None): dict to fill; a new one is created for
                each top-level call so results never leak between calls.

        Returns:
            dict: the filled `output_dict`.
        """
        if output_dict is None:
            output_dict = {}

        self.remove_a_key(input_node, "Waveform")
        self.remove_a_key(input_node, "OriginalDiagnosis")
        self.remove_a_key(input_node, "Diagnosis")

        if isinstance(input_node, dict):
            for key, val in input_node.items():
                new_key = f"{key_}_{key}" if key_ else f"{key}"
                self.flatten(val, new_key, output_dict)
        elif isinstance(input_node, list):
            for idx, item in enumerate(input_node):
                self.flatten(item, f"{key_}_{idx}", output_dict)
        else:
            output_dict[key_] = input_node

        return output_dict

    def check_abnoramlity(self, data: pd.DataFrame):
        """
        Add a 'warnings' column: -1 when 'original_diagnosis' or 'diagnosis'
        contains one of the warning phrases, 0 otherwise. Returns `data`.
        """
        warn = ["Analyse impossible", "ECG anormal"]
        list_abnormality = [0] * data.shape[0]
        for column in ("original_diagnosis", "diagnosis"):
            for pos, entry in enumerate(data[column].values):
                if any(x in entry for x in warn):
                    list_abnormality[pos] = -1

        data["warnings"] = list_abnormality
        return data

    def _diagnosis_text(self, ecg_dict, section):
        """Join the StmtText of every DiagnosisStatement under `section`; "-1" when unavailable."""
        try:
            statements = ecg_dict["RestingECG"][section]["DiagnosisStatement"]
            # xmltodict yields a dict (not a list) when there is a single statement
            if isinstance(statements, dict):
                statements = [statements]
            return " ".join(line["StmtText"] for line in statements)
        except (KeyError, TypeError):
            return "-1"

    def read2flatten(self):
        """
        Parse every entry of `self.path`, save its waveform and build the summary frame.

        Returns:
            pd.DataFrame: one row per file with the flattened XML fields plus
                'diagnosis', 'original_diagnosis', 'xml_path', 'npy_path',
                'extracted' ("True"/"False") and 'warnings'. When `self.save`
                is true the frame is also written to
                '<out_path>/df_xml_<YYYY_MM_DD>_n_<rows>.csv'.
        """
        xml_dict_list = list()
        xml_list = list()
        extracted = list()
        npy_list = list()
        dx_txt_list = list()
        original_dx_txt_list = list()

        # Make the output directories if they don't exist
        npy_dir = os.path.join(self.out_path, "ecg_npy/")
        if not os.path.exists(npy_dir):
            os.makedirs(npy_dir, exist_ok=True)
            print("Creating directory")

        list_files = os.listdir(self.path)
        for file_xml in tqdm(
            list_files, total=len(list_files), desc="Transforming xml files into dict"
        ):
            xml_path = os.path.join(self.path, file_xml)
            with open(xml_path) as xml:
                ECG_data_nested = xmltodict.parse(xml.read())

            npy_extracted = self.xml_to_np_array_file(ECG_data_nested, npy_dir)

            # Diagnosis texts must be read before flatten(), which removes those keys.
            dx_txt_list.append(self._diagnosis_text(ECG_data_nested, "Diagnosis"))
            original_dx_txt_list.append(
                self._diagnosis_text(ECG_data_nested, "OriginalDiagnosis")
            )

            xml_dict_list.append(self.flatten(ECG_data_nested))

            if npy_extracted is None:
                extracted.append("False")
                npy_list.append("Error")
            else:
                extracted.append("True")
                npy_list.append(npy_extracted)

            xml_list.append(xml_path)

        df = pd.DataFrame(xml_dict_list)
        df["diagnosis"] = dx_txt_list
        df["original_diagnosis"] = original_dx_txt_list
        df["xml_path"] = xml_list
        df["npy_path"] = npy_list
        df["extracted"] = extracted
        df = self.check_abnoramlity(df)

        if self.save:
            df.to_csv(
                os.path.join(
                    self.out_path,
                    "df_xml_{}_n_{}.csv".format(datetime.now().strftime("%Y_%m_%d"), df.shape[0]),
                )
            )

        return df


# Parse the new XML directory; waveforms are written under
# '/media/data1/anolin/ecg_npy/' and, since save=True, a dated summary CSV is
# written to '/media/data1/anolin/'.
out_new = tinyxml2df('/media/data1/anolin/out_ECG_latest/xml/', '/media/data1/anolin/', True, True).read2flatten()

# Cache the result in the working directory so the extraction need not be re-run.
out_new.to_csv("temp_new_xml.csv")


# Reload from the cache.
# NOTE(review): the CSV round trip re-infers dtypes and brings the saved index
# back as an extra unnamed column; patient IDs are presumably read as numbers
# here (losing any zero padding) -- confirm.
out_new = pd.read_csv("temp_new_xml.csv")

In [None]:
out_new = pd.read_csv("temp_new_xml.csv")

In [None]:
# Stack the newly extracted ECGs under the original table (row-wise); index
# values from both frames are kept as-is, so they may repeat.
new_out_combined = pd.concat([df_,out_new], axis=0)
new_out_combined

In [None]:
import re

import pandas as pd
from unidecode import unidecode


def normalize_string(s):
    """
    Normalizes a diagnosis string so that near-identical texts compare equal.

    Steps: drop everything up to and including the poor-quality warning
    prefix, lowercase, drop everything from ' ecg anormal' onwards,
    transliterate to ASCII, collapse every run of non-word characters into a
    single space, and finally strip leading/trailing spaces.

    Stripping is done after the punctuation substitution so that leading or
    trailing punctuation does not leave a space behind; "rythme sinusal." and
    "rythme sinusal" therefore normalize to the same string, and normalizing
    an already-normalized string no longer changes its edges.

    Parameters:
        s (str): raw diagnosis text.

    Returns:
        str: normalized text.
    """
    # Removed because several ECGs differed only by this warning prefix.
    s = s.split("*** ATTENTION! mauvaise qualité de l'ECG*** ")[-1]
    s = s.lower()
    # Removed because several ECGs differed only by this trailing statement.
    s = s.split(' ecg anormal')[0]
    s = unidecode(s)
    s = re.sub(r"\W+", " ", s)
    return s.strip()

In [None]:
# Normalize the diagnosis text of the combined table.
new_out_combined["Normalized_Diag"] = new_out_combined["original_diagnosis"].apply(normalize_string)

# Normalize the lookup keys as well.
# NOTE(review): dict_diag was already normalized in an earlier cell, so its
# keys go through normalize_string a second time here while the DataFrame
# column is normalized once; matching relies on both giving the same string.
dict_diag = {normalize_string(k):v for k,v in dict_diag.items()}

In [None]:
# Match every ECG of the combined table against the annotated diagnoses.
# `one_hot_enconded` and `feature_list` are the notebook-level results of the
# Labelbox post-processing (`out_dict` / `list_features` only exist as locals
# inside get_hot_labels and are undefined on a fresh kernel).
new_out_combined_dict_out_adjusted = dict()
new_out_combined_list_matched = list()
# A running counter is part of each key because the combined index can repeat.
counter = 0
for patient_id, in_, diag in zip(new_out_combined['RestingECG_PatientDemographics_PatientID'].tolist(),new_out_combined.index.tolist(),new_out_combined['Normalized_Diag'].tolist()):
    if diag in dict_diag:
        # Use the one-hot vector of the first annotated patient with this diagnosis.
        new_out_combined_dict_out_adjusted[f'{patient_id}_{in_}_{counter}'] = one_hot_enconded[dict_diag[diag][0]]
        new_out_combined_list_matched.append(0)  # 0 marks a match

    else:
        # All-zero vector sized from feature_list (no hard-coded width).
        new_out_combined_dict_out_adjusted[f'{patient_id}_{in_}_{counter}'] = [0] * len(feature_list)
        new_out_combined_list_matched.append(-1)  # -1 marks no match

    counter += 1

temp_new_out = pd.DataFrame.from_dict(new_out_combined_dict_out_adjusted, orient='index',columns=feature_list)
temp_new_out = temp_new_out.reset_index()

print(len(new_out_combined_list_matched))
temp_new_out['annotated'] = new_out_combined_list_matched

In [None]:
new_out_combined

In [None]:
temp_new_out

In [None]:
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows
# by position instead of by their original (possibly repeating) index values.
new_out_combined = new_out_combined.reset_index(drop=True)
temp_new_out = temp_new_out.reset_index(drop=True)
final_all_out = pd.concat([new_out_combined,temp_new_out], axis=1)
final_all_out

In [None]:
#Counter({-1: 497339, 0: 1396046})

Counter(final_all_out['annotated'].tolist())

In [None]:
# Fraction of ECGs that received labels (annotated == 0), computed from the
# data so it cannot drift from the counts shown above.
(final_all_out['annotated'] == 0).mean()

In [None]:
final_all_out.to_csv('/media/data1/anolin/added_label_box_2M.csv')

In [None]:
final_all_out