In [None]:
import pandas as pd
import numpy as np
# Widen pandas' display limits so long/wide frames are not truncated in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 200)
# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated (IPython >= 7.14).
from IPython.display import display, HTML
# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """
    Print a per-column summary of a DataFrame and return its first rows.

    The summary has one line per column with its name, number of null values,
    number of distinct values and dtype, rendered as a 'psql' table by tabulate.

    Parameters:
        df (pd.DataFrame): Frame to summarise.

    Returns:
        pd.DataFrame: The result of ``df.head()``.
    """
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape," *****\n")

    # One list per summary column; all four have one entry per column of df.
    summary = pd.DataFrame({
        'Name': df.columns.values.tolist(),
        'Null': df.isnull().sum().values.tolist(),
        'Unique': df.nunique().values.tolist(),
        'Dtypes': df.dtypes.tolist(),
    })
    print(tabulate(summary, headers='keys', tablefmt='psql'))

    return df.head()

### Load the labelbox annotations

In [None]:
import os

import labelbox

# The Labelbox API key is a secret and must not be stored in the notebook.
# Export LABELBOX_API_KEY in the environment before starting the kernel.
# SECURITY: any key that was previously committed in this cell should be revoked.
LB_API_KEY = os.environ.get("LABELBOX_API_KEY")
if not LB_API_KEY:
    raise RuntimeError(
        "Set the LABELBOX_API_KEY environment variable to your Labelbox API key."
    )

client = labelbox.Client(api_key=LB_API_KEY)
PROJECT_ID = 'cl8vxju2k0z0q07ztfyt5dr7l'
project = client.get_project(PROJECT_ID)

# Every optional section of the export is switched off in the request parameters.
labels = project.export_v2(params={
    "data_row_details": False,
    "metadata_fields": False,
    "attachments": False,
    "project_details": False,
    "performance_details": False,
    "label_details": False,
    "interpolated_frames": False,
})
# Wait for the export task to finish before reading errors/results.
labels.wait_till_done()

if labels.errors:
    print(labels.errors)

# Export payload consumed by the following cells.
export_json = labels.result

### Generate the one-hot encoding of the Labelbox annotations

In [None]:
def flatten_labels(label_dict):
    """
    Group the checklist answers of one Labelbox annotation by category.

    Parameters:
        label_dict (dict): Annotation dictionary with a 'classifications' list.
            Each entry must provide 'name' (the category) and
            'checklist_answers' (a list of dicts that each have a 'name').

    Returns:
        dict: One key per known category, always in the same order, mapped to
        the list of answer names selected for that category (possibly empty).

    Raises:
        KeyError: If an answer belongs to a category that is not one of the
            known categories, or an expected key is missing.
    """
    known_categories = (
        'Rhythm', 'QRS complex', 'Wave criterias',
        'Conduction', 'Chamber enlargement', 'Other',
        'ST segments', 'P-wave morphology',
    )
    flattened_dict = {category_name: [] for category_name in known_categories}

    for classification in label_dict['classifications']:
        category_name = classification['name']
        answer_names = (answer['name'] for answer in classification['checklist_answers'])
        for answer_name in answer_names:
            flattened_dict[category_name].append(answer_name)

    return flattened_dict


def adjust_name(label_dict):
    """
    Re-key a dictionary by dropping the first '_'-separated token of each key.

    Each key is split on '_' and the new key is built from the 2nd, 3rd and 4th
    tokens joined with '_'; any further tokens are discarded.

    Parameters:
        label_dict (dict): Dictionary keyed by the Labelbox ECG identifier.

    Returns:
        dict: Same values, keyed by the shortened identifier.

    Raises:
        IndexError: If a key has fewer than four '_'-separated tokens.
    """
    def _shorten(labelbox_id):
        tokens = labelbox_id.split('_')
        return '_'.join((tokens[1], tokens[2], tokens[3]))

    return {_shorten(key): value for key, value in label_dict.items()}


def get_single_value(label_dict):
    """
    Merge the per-category label lists of each entry into one flat list.

    Parameters:
        label_dict (dict): Maps an identifier to a dict whose 'annotations'
            entry maps category names to lists of label names.

    Returns:
        dict: Maps each identifier to the concatenation of all its category
        lists, in category order. Identifiers whose 'annotations' dict has no
        categories at all are omitted.

    Notes:
        The returned lists are new objects. The input lists are never modified
        (storing the first category list by reference and extending it would
        silently grow the caller's data).
    """
    filtered_dict = {}

    for key, value in label_dict.items():
        for labels in value['annotations'].values():
            # setdefault creates a fresh list on first use, so extending it
            # cannot alias any list owned by the caller.
            filtered_dict.setdefault(key, []).extend(labels)

    return filtered_dict


def get_hot_labels(dict_, df):
    """
    Build one-hot label vectors keyed by diagnosis text.

    Every row of ``df`` is turned into a '<PatientID>_<AcquisitionDate>_<AcquisitionTime>'
    key. Rows whose key is present in ``dict_`` contribute a one-hot vector, stored
    under the row's 'Diagnosis' value. When several rows share a diagnosis, their
    vectors are merged with an element-wise max.

    Parameters:
        dict_: Maps the row key described above to the list of positive labels.
        df: DataFrame with the three RestingECG_* key columns and 'Diagnosis'.

    Returns:
        dict: 'Diagnosis' value -> one-hot list aligned with the feature list.
        list: Distinct labels in first-seen order (the vector positions).
    """
    # Distinct labels, in the order they are first encountered.
    list_features = []
    for labels in dict_.values():
        for label in labels:
            if label in list_features:
                continue
            list_features.append(label)

    out_dict = {}
    for _, row in df.iterrows():
        # Same key layout as the keys of dict_.
        key = f"{row['RestingECG_PatientDemographics_PatientID']}_{row['RestingECG_TestDemographics_AcquisitionDate']}_{row['RestingECG_TestDemographics_AcquisitionTime']}"
        if key not in dict_:
            continue

        positives = dict_[key]
        out_list = [
            1 if any(feature == label for label in positives) else 0
            for feature in list_features
        ]

        diagnosis = row['Diagnosis']
        previous = out_dict.get(diagnosis)
        if previous is None:
            out_dict[diagnosis] = out_list
        else:
            # Same diagnosis seen before: keep a 1 wherever either vector has one.
            out_dict[diagnosis] = [max(a, b) for a, b in zip(previous, out_list)]

    return out_dict, list_features

# Example usage
# out_dict, list_features = get_hot_labels(positive_labels_dict, labelbox_df_prelabelling)


from tqdm.notebook import tqdm


def generate_cleaned_dict_v2_no_tqdm(label_box_data, labelbox_df_prelabelling,
                                     project_id='cl8vxju2k0z0q07ztfyt5dr7l', verbose=True):
    """
    Processes Labelbox data to extract and flatten labels, and generates one-hot encoded vectors.

    Despite its name, this function does wrap the loop in ``tqdm`` for progress display.

    Parameters:
        label_box_data (list): List of Labelbox export entries. Each entry must have
            ['data_row']['external_id'] and ['projects'][project_id]['labels'].
        labelbox_df_prelabelling (pd.DataFrame): Pre-labelling table passed to
            get_hot_labels to map annotated ECGs to their 'Diagnosis' text.
        project_id (str): Labelbox project whose labels are read. Defaults to the
            project id this notebook was written for.
        verbose (bool): When True (default), print the one-hot dictionary and the
            feature list, as the function has always done.

    Returns:
        Tuple of:
            dict: data-row name -> raw first label of the project, for annotated rows.
            dict: shortened ECG id -> list of positive labels.
            dict: diagnosis -> one-hot vector.
            list: feature names (vector positions).
            pd.DataFrame: the one-hot vectors, indexed by diagnosis, one column per feature.
    """
    import pandas as pd

    flattened_dict = {}
    positive_labels_dict = {}

    for annotation in tqdm(label_box_data):
        # Data-row name is the external id up to its first '.'.
        name = annotation['data_row']['external_id'].split('.')[0]

        project_label_list = annotation['projects'][project_id]['labels']
        if not project_label_list:
            # Data row without any label: nothing to flatten, and [0] would raise.
            continue
        project_labels = project_label_list[0]
        annotations = project_labels.get('annotations', {})

        if annotations:
            flattened_labels = flatten_labels(annotations)
            positive_labels = adjust_name(get_single_value({name: {'annotations': flattened_labels}}))
            positive_labels_dict.update(positive_labels)
            flattened_dict[name] = project_labels

    one_hot_encoded, feature_list = get_hot_labels(positive_labels_dict, labelbox_df_prelabelling)
    if verbose:
        print(one_hot_encoded)
        print(feature_list)
    df_one_hot = pd.DataFrame.from_dict(one_hot_encoded, orient='index', columns=feature_list)

    return flattened_dict, positive_labels_dict, one_hot_encoded, feature_list, df_one_hot

In [None]:
# Load the pre-labelling table.
labelbox_df_prelabelling = pd.read_csv("/media/data1/anolin/20221002_ECG_mod_diagnosis_sampled_3600.csv")

# Rebuild the patient ID as a string left-padded with zeros to at least 7 characters.
padded_patient_ids = [str(n).zfill(7) for n in labelbox_df_prelabelling['patientid'].tolist()]
labelbox_df_prelabelling['RestingECG_PatientDemographics_PatientID'] = padded_patient_ids

# Drop the raw id column and rename the acquisition date/time columns to their
# RestingECG_TestDemographics_* names.
labelbox_df_prelabelling = (
    labelbox_df_prelabelling
    .drop(columns=['patientid'])
    .rename(columns={
        'AcquisitionDate': 'RestingECG_TestDemographics_AcquisitionDate',
        'AcquisitionTime': 'RestingECG_TestDemographics_AcquisitionTime',
    })
)

# Map each distinct diagnosis text to the list of patient IDs whose row has
# exactly that diagnosis (one list entry per row, in row order).
dict_diag = dict()
#ROBERT : 2023-11-22 CHANGED TO DIAGNOSIS
diagnosis_values = labelbox_df_prelabelling['Diagnosis'].tolist()
patient_ids = labelbox_df_prelabelling['RestingECG_PatientDemographics_PatientID'].tolist()
for diagnosis_text, patient_id in zip(diagnosis_values, patient_ids):
    dict_diag.setdefault(diagnosis_text, []).append(patient_id)

# Spot-check the rows of one patient.
display(labelbox_df_prelabelling.loc[labelbox_df_prelabelling['RestingECG_PatientDemographics_PatientID'] == '0590682'])



### Generate the final one-hot-encoded dataset

In [None]:
# Run the label pipeline on the Labelbox export and the pre-labelling table. Returns the
# raw labels per data row, the positive labels per ECG, the one-hot vector per diagnosis,
# the feature names, and the one-hot vectors as a DataFrame.
dict_flatten, dict_labels, one_hot_encoded, feature_list, final_one_hot = generate_cleaned_dict_v2_no_tqdm(export_json, labelbox_df_prelabelling)

In [None]:
# Label names, in the same order as the positions of the one-hot vectors.
display(feature_list)

In [None]:
# Add an 'all_values' column holding, for each row, the array of that row's values.
final_one_hot['all_values'] = final_one_hot.apply(lambda row: row.values, axis=1)

# Find the rows whose columns are all 0, ignoring the 'all_values' helper column.
label_columns = final_one_hot.drop(columns=['all_values'])
all_zero_mask = label_columns.eq(0).all(axis=1)
zero_value_rows = final_one_hot[all_zero_mask].reset_index()

# Summary of the all-zero rows, followed by their original index values.
display(zero_value_rows.describe())
display(zero_value_rows['index'])
## ROBERT : This should be "0"

# find the patients equivalent to those annotated
# annotation with exact string matching

In [None]:
import pandas as pd
import numpy as np

# Path of the ECG table (parquet file, read with the fastparquet engine).
dir_ = '/media/data1/anolin/temp_new_dataset/df_xml_2023_11_25_FINAL_CONCAT.parquet'
df_ecg = pd.read_parquet(dir_, engine='fastparquet')

# Left-pad every patient ID with zeros to at least 7 characters.
# Each ID must already be a string: .zfill is called on the value directly,
# so a missing or numeric ID raises AttributeError here.
df_ecg['RestingECG_PatientDemographics_PatientID'] = [n.zfill(7) for n in df_ecg['RestingECG_PatientDemographics_PatientID'].tolist()]


In [None]:
# Number of columns currently in the ECG table.
display(len(df_ecg.columns))

In [None]:
import pandas as pd

# Ensure the date column is in datetime format
# NOTE(review): no explicit `format=` is passed, so pandas infers the date layout;
# confirm the day/month order is parsed as intended for this dataset.
df_ecg['RestingECG_TestDemographics_AcquisitionDate'] = pd.to_datetime(df_ecg['RestingECG_TestDemographics_AcquisitionDate'])

# Extract year and month
df_ecg['Year'] = df_ecg['RestingECG_TestDemographics_AcquisitionDate'].dt.year
df_ecg['Month'] = df_ecg['RestingECG_TestDemographics_AcquisitionDate'].dt.month
# Collapse to one row per npy_path. GroupBy.first() takes the first non-null value of
# each column within a group, so a kept row can combine values from several source rows.
# After reset_index, npy_path is the first column and df_ecg is rebound to the result.
df_ecg = df_ecg.groupby(['npy_path']).first().reset_index()
# Group by year and month and count the ECGs
ecg_counts = df_ecg.groupby(['Year', 'Month']).size().reset_index(name='ECG_Counts')
# Display the results by months
print(ecg_counts)

In [None]:
# Widen pandas' display limits so long/wide frames are not truncated in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 200)
# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated (IPython >= 7.14).
from IPython.display import display, HTML
# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))


# Spot-check the diagnosis text of a single ECG, selected by patient ID,
# acquisition date and acquisition time.
display(df_ecg.loc[(df_ecg['RestingECG_PatientDemographics_PatientID'] == '0067241') & (df_ecg['RestingECG_TestDemographics_AcquisitionDate'] == '03-08-2022') & (df_ecg['RestingECG_TestDemographics_AcquisitionTime']=='00:11:07')].diagnosis)

In [None]:
def append_one_hot_vectors(df_ecg, one_hot_encoded, feature_list):
    """
    Appends one-hot label columns to df_ecg by exact match on the 'diagnosis' text.

    A row is a match when its 'diagnosis' is a string of at least 4 characters
    that is a key of ``one_hot_encoded``. Every other row (including rows whose
    diagnosis is missing or not a string) gets an all-zero vector.

    Parameters:
        df_ecg: DataFrame containing a 'diagnosis' column. Any index is accepted.
        one_hot_encoded: Dictionary with diagnosis as keys and one-hot encoded vectors as values.
        feature_list: List of feature names corresponding to positions in the one-hot encoded vectors.

    Returns:
        DataFrame: Copy of df_ecg with one column per feature plus an 'annotated'
        column (1 = match found, 0 = no match). Row order and index are those of df_ecg.
    """
    zero_vector = [0] * len(feature_list)
    one_hot_vectors = []
    list_matched = []

    for diagnosis in tqdm(df_ecg['diagnosis']):
        # `and` short-circuits, so len() is never called on a non-string value
        # such as NaN (which has no length).
        is_match = (
            isinstance(diagnosis, str)
            and len(diagnosis) >= 4
            and diagnosis in one_hot_encoded
        )
        one_hot_vectors.append(one_hot_encoded[diagnosis] if is_match else zero_vector)
        list_matched.append(1 if is_match else 0)

    # Reuse df_ecg's index so the column-wise concat lines rows up one-to-one
    # even when df_ecg does not carry a default 0..n-1 index.
    one_hot_df = pd.DataFrame(one_hot_vectors, columns=feature_list, index=df_ecg.index)

    df_ecg_extended = pd.concat([df_ecg, one_hot_df], axis=1)
    df_ecg_extended['annotated'] = list_matched

    return df_ecg_extended

# Example usage
# df_ecg_extended = append_one_hot_vectors(df_ecg, one_hot_encoded, feature_list)


# Attach the one-hot labels to df_ecg by exact match on the raw 'diagnosis' text.
df_ecg_extended = append_one_hot_vectors(df_ecg, one_hot_encoded, feature_list)
# Number of rows with (1) and without (0) an exact match.
display(df_ecg_extended.annotated.value_counts())

In [None]:
# French statement fragment -> list of label names it implies (empty list = no label).
original_dict = {
    'Axe gauche': ['Left axis deviation'],
    'Axe indéterminé': [],
    "Axe droit": ['Right axis deviation'],
    'Axe nord-ouest': [],
    'Axe P anormal': ['Ectopic atrial rhythm (< 100 BPM)'],
    'Axe P anormal, rythme auriculaire ectopique possible': ['Ectopic atrial rhythm (< 100 BPM)'],
    'onde P intrinsèques': ['Ectopic atrial rhythm (< 100 BPM)'],
    'rythme auriculaire ectopique': ['Ectopic atrial rhythm (< 100 BPM)'],
    'Rythme sinusal': ['Regular', 'Sinusal']
}

# Reverse the mapping: label name -> list of fragments that imply it,
# in the order the fragments appear above.
inverted_dict = {}
for key, values in original_dict.items():
    for value in values:
        inverted_dict.setdefault(value, []).append(key)

print(inverted_dict)


# Do a string cleanup

In [None]:
import re
import pandas as pd
from unidecode import unidecode

def clean_and_format_ecg_string(s):
    """
    Normalise an ECG diagnosis statement for text matching.

    Steps, in order: join a list input with spaces; keep only the text after the
    poor-quality warning banner; lowercase; keep only the text before
    ' ecg anormal'; transliterate to ASCII with unidecode; collapse every run of
    non-word characters into a single space; trim surrounding whitespace.

    Parameters:
        s (str | list[str]): Diagnosis statement, or a list of statement parts.

    Returns:
        str: The normalised statement.
    """
    quality_banner = "*** ATTENTION! mauvaise qualité de l'ECG*** "
    abnormal_marker = ' ecg anormal'

    text = ' '.join(s) if isinstance(s, list) else s

    # Text after the banner (the whole text when the banner is absent).
    text = text.split(quality_banner)[-1]

    # Lowercase first: the marker below is matched in lowercase.
    text = text.lower().partition(abnormal_marker)[0]

    text = unidecode(text).strip()

    # Punctuation and whitespace runs become one space each.
    return re.sub(r"\W+", " ", text).strip()

# Add a normalised copy of every diagnosis statement to df_ecg, and normalise the keys of
# dict_diag and one_hot_encoded the same way so all three can be matched on cleaned text.
# NOTE(review): when two different raw keys clean to the same string, these dict
# comprehensions keep only the value of the last one; confirm that is acceptable.
df_ecg["Clean_and_format_Diag"] = df_ecg["diagnosis"].apply(clean_and_format_ecg_string)
dict_diag = {clean_and_format_ecg_string(k): v for k, v in dict_diag.items()}
one_hot_encoded_cleaned = {clean_and_format_ecg_string(k): v for k, v in one_hot_encoded.items()}


In [None]:
# Normalise the keys of both dictionaries with the same cleaning function.
dict_diag = {clean_and_format_ecg_string(k): v for k, v in dict_diag.items()}
one_hot_encoded_cleaned = {clean_and_format_ecg_string(k): v for k, v in one_hot_encoded.items()}

# Cleaned diagnoses of the pre-labelling table that have no one-hot vector.
missing_keys = [key for key in dict_diag if key not in one_hot_encoded_cleaned]
print("Missing keys from labelbox vs dict_diag - should be 0", len(missing_keys))

# Cleaned diagnoses present in both dictionaries.
common_keys = [key for key in dict_diag if key in one_hot_encoded_cleaned]
print("Common keys - should be ALL", len(common_keys))

In [None]:
def append_one_hot_vectors_clean_diagnosis(df_ecg_extended_, one_hot_encoded_cleaned, feature_list):
    """
    Appends one-hot label columns to a DataFrame by match on the cleaned diagnosis text.

    A row is a match when its 'Clean_and_format_Diag' value is a key of
    ``one_hot_encoded_cleaned``; every other row gets an all-zero vector.

    Parameters:
        df_ecg_extended_: DataFrame containing a 'Clean_and_format_Diag' column.
            Any index is accepted.
        one_hot_encoded_cleaned: Dictionary with cleaned diagnosis text as keys and
            one-hot encoded vectors as values.
        feature_list: List of feature names corresponding to positions in the one-hot encoded vectors.

    Returns:
        DataFrame: Copy of df_ecg_extended_ with one column per feature plus an
        'annotated_method_2' column (1 = match found, 0 = no match). Row order and
        index are those of the input.
    """
    zero_vector = [0] * len(feature_list)
    one_hot_vectors = []
    list_matched = []

    for diagnosis in tqdm(df_ecg_extended_['Clean_and_format_Diag']):
        is_match = diagnosis in one_hot_encoded_cleaned
        one_hot_vectors.append(one_hot_encoded_cleaned[diagnosis] if is_match else zero_vector)
        list_matched.append(1 if is_match else 0)

    # Reuse the input's index so the column-wise concat lines rows up one-to-one
    # even when the input does not carry a default 0..n-1 index.
    one_hot_df = pd.DataFrame(one_hot_vectors, columns=feature_list, index=df_ecg_extended_.index)

    df_ecg_extended_extended = pd.concat([df_ecg_extended_, one_hot_df], axis=1)
    df_ecg_extended_extended['annotated_method_2'] = list_matched

    return df_ecg_extended_extended

# Attach the one-hot labels to df_ecg by match on the cleaned diagnosis text.
df_ecg_extended_extended = append_one_hot_vectors_clean_diagnosis(df_ecg, one_hot_encoded_cleaned, feature_list)

In [None]:
# Number of rows with (1) and without (0) a match on the cleaned diagnosis text.
display(df_ecg_extended_extended.annotated_method_2.value_counts())

In [None]:
from tqdm import tqdm
# Comma-separated names of the positive labels of each row. The label columns are
# selected by name (feature_list) instead of hard-coded column positions, so the
# result stays correct if the number or order of the other columns changes.
df_ecg_extended_extended['labelbox_diagnosis'] = (
    df_ecg_extended_extended.loc[:, feature_list]
    .apply(lambda x: ','.join(x.index[x == 1]), axis=1)
)

### Checking whether the labels are accurate

In [None]:
# The 80 most frequent label combinations among the matched rows (annotated_method_2 == 1)
# of the first 500000 rows. Despite its name, this holds up to 80 entries, not 40.
top_40_diagnosis = df_ecg_extended_extended[0:500000].loc[df_ecg_extended_extended['annotated_method_2']==1].labelbox_diagnosis.value_counts().head(80)

# All label combinations kept above (positions 0 to 79), most frequent first.
diagnoses_20_to_40 = top_40_diagnosis.index[0:80]

# Denominator for the percentages: rows covered by the kept combinations only.
total_count = top_40_diagnosis.sum()

for diagnosis in diagnoses_20_to_40:
    count = top_40_diagnosis[diagnosis]
    # Example raw statement: the first row of the whole table (not only the matched
    # rows counted above) that carries this label combination.
    diagnosis_statement = df_ecg_extended_extended.loc[df_ecg_extended_extended['labelbox_diagnosis'] == diagnosis, 'diagnosis'].iloc[0]
    percentage = (count / total_count) * 100
    print(f"Diagnosis: {diagnosis}\nStatement: {diagnosis_statement}\nFrequency: {count}\nPercentage: {percentage:.2f}%")
    print(f"Labelbox Diagnosis: {diagnosis}\n\n\n")


In [None]:
#display(df_ecg_extended_extended.loc[df_ecg_extended_extended['Ventricular paced'] ==1][['RestingECG_PatientDemographics_PatientID','RestingECG_TestDemographics_AcquisitionDate','RestingECG_TestDemographics_AcquisitionTime','diagnosis', 'labelbox_diagnosis']].head(n=30)

### Saving dataset for dictionary processing

In [None]:
# Write the labelled ECG table to parquet for the next processing step
# (absolute path on the analysis machine).
df_ecg_extended_extended.to_parquet('/media/data1/ravram/ecg_temp_out.parquet')