In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import indexable, _safe_indexing
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _validate_shuffle_split
import warnings
import uuid
import pandas as pd
import json

# Import the display helpers from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Pandas display limits used by every table rendered in this notebook.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 300)

# Silences pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# unchanged because other cells may refer to it.
dir = '/media/data1/ravram/ecg_temp_out.parquet'
df_ecg_parquet_ = pd.read_parquet(dir)

# Turn every patient ID into a string left-padded with zeros to 7 characters,
# then order the rows by that ID and rebuild a 0..n-1 index.
patient_id_col = 'RestingECG_PatientDemographics_PatientID'
df_ecg_parquet_[patient_id_col] = [
    str(raw_id).zfill(7) for raw_id in df_ecg_parquet_[patient_id_col].tolist()
]
df_ecg_parquet_ = df_ecg_parquet_.sort_values(by=[patient_id_col])
df_ecg_parquet_ = df_ecg_parquet_.reset_index(drop=True)

## Add new diagnoses to be propagated by dictionary
# Each label becomes a column initialised to 0, in this order.
new_diagnosis_columns = [
    'Third Degree AV Block',
    'Sinus Pause',
    'Acute MI',
    'Early repolarization',
    'Q wave (posterior - V7-V9)',
    'Right atrial enlargement',
    'Bi-atrial enlargement',
    'LV pacing',
    'Dextrocardia',
    'Brugada',
    'Ventricular Rhythm',
    'ST depression (posterior - V7-V8-V9)',
]
for new_column in new_diagnosis_columns:
    df_ecg_parquet_[new_column] = 0

In [None]:
import random
import re

# Print up to five random diagnoses that contain `substring` (case-insensitive).
substring = "Aspect de lésion"
escaped_substring = re.escape(substring)  # Escape the special characters in the substring
# na=False: a missing diagnosis counts as "no match"; without it the mask would
# contain NaN and could not be used for boolean indexing.
matching_indices = df_ecg_parquet_[
    df_ecg_parquet_.diagnosis.str.contains(escaped_substring, case=False, na=False)
].index

if len(matching_indices) == 0:
    print("No match found")
else:
    print("Found {} matches".format(len(matching_indices)))
    matching_indices_list = list(matching_indices)
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size at the number of matches.
    sample_size = min(5, len(matching_indices_list))
    random_indices = random.sample(matching_indices_list, sample_size)
    for index in random_indices:
        print(df_ecg_parquet_.loc[index, 'diagnosis'])

    random_index = random.choice(matching_indices_list)
    # Small frame holding only the sampled rows (old index kept as a column).
    df_ecg_parquet = df_ecg_parquet_.loc[random_indices].reset_index()


In [None]:
#display(random_indices)
# Plain name binding, not a copy: after this line both names refer to the same
# DataFrame object, so columns added through one are visible through the other.
df_ecg_parquet = df_ecg_parquet_
#display(df_ecg_parquet.npy_path)

In [None]:
# 'validated by MD' is 0 when the overreader field holds one of the two values
# below and 1 for anything else.
# NOTE(review): 'VALIDÉ PAR MD' is mapped to 0 here — confirm this is intended.
overreader = df_ecg_parquet['RestingECG_TestDemographics_OverreaderLastName']
df_ecg_parquet['validated by MD'] = np.where((overreader == 'NON VALIDÉ') | (overreader == 'VALIDÉ PAR MD'), 0, 1)

# Count validated ECGs whose diagnosis differs from the original diagnosis.
# Both conditions are combined into ONE boolean mask; chaining two boolean
# selections (df[a][b]) makes pandas re-align the second mask and warn.
modified_mask = (df_ecg_parquet['validated by MD'] == 1) & (df_ecg_parquet['diagnosis'] != df_ecg_parquet['original_diagnosis'])
count = int(modified_mask.sum())
print(count, "of ECGs with modified diagnosis based on validated by MD overread")
#display(df_ecg_parquet.loc[df_ecg_parquet['diagnosis low/no QRS available']==1].diagnosis.value_counts())

# no_qrs is 1 when any of the four "low/no QRS" or "defectuous machine" flags is 1.
df_ecg_parquet['no_qrs'] = np.where((df_ecg_parquet['diagnosis low/no QRS available'] == 1) | (df_ecg_parquet['original_diagnosis low/no QRS available'] == 1) | (df_ecg_parquet['diagnosis defectuous machine'] == 1) | (df_ecg_parquet['original_diagnosis defectuous machine'] == 1), 1, 0)
#If diagnosis column contains "*** ATTENTION! mauvaise qualité de l'ECG***" then add column low_quality == 1 else 0
# Escaping the asterisks in the pattern
pattern = r"\*\*\* ATTENTION! mauvaise qualité de l'ECG\*\*\*"

# Using the escaped pattern in str.contains()
# na=False: a missing text yields False; otherwise the NaN result would be
# treated as truthy by np.where and the row would be flagged as low quality.
df_ecg_parquet['diagnosis_low_quality'] = np.where(df_ecg_parquet['diagnosis'].str.contains(pattern, na=False), 1, 0)
df_ecg_parquet['original_diagnosis_low_quality'] = np.where(df_ecg_parquet['original_diagnosis'].str.contains(pattern, na=False), 1, 0)

# Normalise column names: spacing in two lead labels, the 'rythm' -> 'rhythm'
# spelling, and 'annotated_method_2' -> 'annotated_method'.
df_ecg_parquet.rename(
    columns={
        'ST elevation (anterior- V3-V4)': 'ST elevation (anterior - V3-V4)',
        'T wave inversion (anterior- V3-V4)': 'T wave inversion (anterior - V3-V4)',
        'Junctional rythm': 'Junctional rhythm',
        'annotated_method_2': 'annotated_method',
        'Ectopic atrial rythm (< 100 BPM)': 'Ectopic atrial rhythm (< 100 BPM)',
    },
    inplace=True,
)

# Move columns annotated_method	labelbox_diagnosis to the end of the dataframe
trailing_columns = ['annotated_method', 'labelbox_diagnosis']
df_ecg_parquet = df_ecg_parquet[[c for c in df_ecg_parquet if c not in trailing_columns] + trailing_columns]


In [None]:
#df_ecg_parquet = df_ecg_parquet[df_ecg_parquet['diagnosis'].str.contains("Pause sinusale", case=False, na=False)]

In [None]:
# Load the equivalence dictionary from JSON. Other cells iterate it as
# key -> iterable of strings that are replaced by that key.
with open('dictionary/1_simplify_diagnosis.json', 'r', encoding='utf-8') as file:
    dict_equivalence_0 = json.load(file)

In [None]:
# Load the list of key phenotypes from JSON. Other cells lower-case its items
# and look them up as substrings of the diagnosis text.
with open('dictionary/2_list_key_phenotype.json', 'r', encoding='utf-8') as file:
    list_key_phenotype = json.load(file)

In [None]:
# One independent zero-filled list per phenotype. Building the values with
# `[[0] * n] * k` would repeat a reference to a single list, so writing through
# one key would change the values seen under every other key.
dict_ = {phenotype: [0] * len(df_ecg_parquet) for phenotype in list_key_phenotype}


### Explanation:
The objective here is to process ECG diagnosis data within a DataFrame in a multi-step approach:

1. **Diagnosis Translation**: 
   - We have a DataFrame `df_ecg_parquet` containing ECG diagnoses in one of its columns (the translation code in this notebook reads `'original_diagnosis'`).
   - A dictionary `dict_equivalence_0` is provided, mapping certain strings (symptoms or diagnoses) to a list of equivalent terms or phrases.
   - The goal is to iterate through each diagnosis in the DataFrame and replace every occurrence of a listed equivalent term with the dictionary key it belongs to. This standardizes the diagnosis terminology.

2. **Phenotype Matching**:
   - There's a list `list_key_phenotype` containing key phenotypes (specific medical terms or conditions).
   - After the translation of diagnoses, the script checks if any of these key phenotypes are present in the newly standardized diagnosis strings.
   - It also includes additional specific terms to be matched, which are critical for accurate phenotype identification.

3. **DataFrame Column Addition**:
   - For each unique phenotype identified in the diagnoses, a new column is created in the DataFrame.
   - These columns are binary (1 or 0), indicating the presence or absence of the corresponding phenotype in each diagnosis.

4. **Challenges and Solutions**:
   - **String Matching and Replacing**: The code efficiently searches and replaces terms in the diagnosis strings using the provided dictionary. This step is crucial for standardizing the data.
   - **Ensuring Accuracy**: By including specific terms in the matching process, the code ensures that nuanced medical conditions are accurately identified.
   - **Optimizing Performance**: The use of list comprehensions and set operations aids in optimizing the script for performance, especially important when dealing with large datasets.

This process is particularly valuable in medical data analysis, where standardizing and accurately categorizing diagnosis data can lead to more reliable insights and conclusions.

In [None]:
import pandas as pd
from tqdm import tqdm

# Function to replace diagnosis strings with equivalents from a dictionary
def translate_diagnoses(diagnoses, equivalence_dict):
    """Rewrite each diagnosis string using an equivalence dictionary.

    Args:
        diagnoses: sized iterable of strings.
        equivalence_dict: mapping of replacement text -> iterable of strings
            that should be replaced by it.

    Returns:
        A new list with one rewritten string per input, in input order.
        Replacements are applied one after another in dictionary order, so a
        later replacement sees the text produced by the earlier ones.
    """
    # Flatten the mapping once into (text to find, replacement) pairs, keeping
    # the dictionary's iteration order.
    substitutions = [
        (variant, canonical)
        for canonical, variants in equivalence_dict.items()
        for variant in variants
    ]

    translated_diagnoses = []
    for text in tqdm(diagnoses, total=len(diagnoses)):
        for variant, canonical in substitutions:
            text = text.replace(variant, canonical)
        translated_diagnoses.append(text)
    return translated_diagnoses

# Function to refine and create a dictionary of diagnoses based on key phenotypes
def create_diagnosis_dict(diagnoses, key_phenotypes):
    """Map each diagnosis position to the phenotypes found in its text.

    Every diagnosis is whitespace-normalised and lower-cased; then each entry
    of ``key_phenotypes`` and each entry of a fixed list of extra terms is
    searched for as a plain substring.

    Args:
        diagnoses: iterable of diagnosis strings.
        key_phenotypes: iterable of strings, compared exactly as given against
            the lower-cased text.

    Returns:
        dict of position (0-based) -> list of the distinct matched strings.
        The order inside each list is not specified.
    """
    # Extra terms that are always searched for, lower-cased once up front.
    extra_terms = tuple(
        term.lower()
        for term in (
            'anomalies non-spécifiques des segments st-t',
            'extrasystole ventriculaire avec conduction aberrante',
            'extrasystole auriculaire avec conduction aberrante',
            'fibrillation auriculaire avec réponse ventriculaire lent',
            'fibrillation auriculaire avec foyer jonctionnel en compétition',
            'extrasystole ventriculaire',
            'conduction aberrante',
            'Inversion possible des électrodes',
            # Add more terms as needed
        )
    )

    diagnosis_dict = {}
    for position, raw_text in enumerate(tqdm(diagnoses)):
        normalised = " ".join(raw_text.split()).lower()
        hits = [phenotype for phenotype in key_phenotypes if phenotype in normalised]
        hits.extend(term for term in extra_terms if term in normalised)
        # Collapse repeated matches.
        diagnosis_dict[position] = list(set(hits))
    return diagnosis_dict

# Translate the original diagnosis strings with the equivalence dictionary.
original_diagnoses = df_ecg_parquet['original_diagnosis'].tolist()
translated_list = translate_diagnoses(original_diagnoses, dict_equivalence_0)

# Match the lower-cased key phenotypes against the translated text.
key_phenotypes_lower = list(map(str.lower, list_key_phenotype))
diagnosis_dict = create_diagnosis_dict(translated_list, key_phenotypes_lower)

# One 0/1 list per phenotype that was matched at least once.
unique_phenotypes = list({item for sublist in diagnosis_dict.values() for item in sublist})
row_count = len(df_ecg_parquet)
phenotype_columns = {phenotype: [0] * row_count for phenotype in unique_phenotypes}

# diagnosis_dict is keyed by row position, so its keys index straight into the lists.
for row_position, matched in diagnosis_dict.items():
    for phenotype in matched:
        phenotype_columns[phenotype][row_position] = 1

# Append the phenotype columns to the right of the existing ones
# (reset_index() keeps the previous index as a column).
df_ecg_parquet_with_diagnosis = pd.concat(
    [df_ecg_parquet.reset_index(), pd.DataFrame.from_dict(phenotype_columns)],
    axis=1,
)

In [None]:
#instances = df_ecg_parquet[df_ecg_parquet['diagnosis'].str.contains("Sus-décalage du ST; possibilité de lésion inférolatérale ou infarctus aigu", na=False, case=False)].diagnosis.value_counts()
#display(instances)

In [None]:
from tqdm import tqdm

import ast
import pandas as pd
from tqdm import tqdm

# Function to read and process the diagnosis dictionary
def read_diagnosis_dictionary(file_path):
    """Load a Python-literal dictionary from a text file, lower-casing its keys.

    The file is decoded as UTF-8 explicitly, like the other dictionary files
    read in this notebook, so accented keys do not depend on the platform's
    default encoding.

    Args:
        file_path: path of a text file containing a dict literal.

    Returns:
        dict with every key lower-cased and values unchanged. On any error
        (missing file, invalid literal, content that is not a dict) the error
        is printed and an empty dict is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
        # literal_eval only accepts Python literals; it does not execute code.
        return {key.lower(): value for key, value in ast.literal_eval(file_content).items()}
    except Exception as e:
        # Deliberate best effort: report and fall back to an empty mapping.
        print(f"Error reading or processing the dictionary: {e}")
        return {}

# Function to check for column presence in DataFrame and report missing columns
def check_columns_presence(df, diagnosis_dictionary):
    """Print the dictionary values that are not column names of ``df``.

    Args:
        df: DataFrame whose columns are checked.
        diagnosis_dictionary: mapping whose values are iterables of expected
            column names.

    Returns:
        None. A single line is printed only when at least one name is missing.
    """
    expected_columns = set().union(*diagnosis_dictionary.values())
    missing_columns = expected_columns.difference(df.columns)
    if not missing_columns:
        return
    print(f"Missing columns in DataFrame: {missing_columns}")

# Function to update DataFrame based on diagnosis dictionary
def update_diagnosis_counts(df, diagnosis_dictionary):
    """Increment label columns of ``df`` from each row's 'dictionary_diagnosis'.

    The 'dictionary_diagnosis' text of every row is split on 'μ'; each piece is
    looked up in ``diagnosis_dictionary`` and every resulting name that is a
    column of ``df`` is increased by 1 for that row.

    Args:
        df: DataFrame, modified in place.
        diagnosis_dictionary: mapping of diagnosis text -> iterable of column names.

    Returns:
        The same ``df`` object.
    """
    for row_label, record in tqdm(df.iterrows(), total=len(df)):
        pieces = record.get('dictionary_diagnosis', '').split('μ')
        for piece in pieces:
            # Unknown pieces (including the empty string) map to no columns.
            for target in diagnosis_dictionary.get(piece, []):
                if target not in df.columns:
                    continue
                df.at[row_label, target] += 1
    return df

# Path to your downloaded file
file_path = 'dictionary/final_reformatted_dictionary.txt'
diagnosis_dictionary = read_diagnosis_dictionary(file_path)

## NB This was added because we can add concepts and the IDS of the pertinent columns change over time so you want to always have the right columns for further processing
labelbox_diagnosis_col = df_ecg_parquet_with_diagnosis.columns.get_loc('labelbox_diagnosis') + 1
print("Labelbox diagnosis iloc +1 ", labelbox_diagnosis_col)
last_col_index = df_ecg_parquet_with_diagnosis.shape[1] - 1
last_col_iloc = df_ecg_parquet_with_diagnosis.columns.get_loc(df_ecg_parquet_with_diagnosis.columns[-1])
print("Last col ", last_col_iloc)

# Indicator columns = everything to the right of 'labelbox_diagnosis'.
# The slice runs to the END of the frame: a slice stopping at last_col_iloc
# excludes the column at that position, so the final indicator column would
# never contribute to 'dictionary_diagnosis'. An existing 'dictionary_diagnosis'
# column (present when this cell is re-run) is left out of the inputs.
phenotype_frame = df_ecg_parquet_with_diagnosis.iloc[:, labelbox_diagnosis_col:].drop(
    columns=['dictionary_diagnosis'], errors='ignore'
)
display(phenotype_frame)

# Join the names of all indicator columns equal to 1 with 'μ', one string per row.
df_ecg_parquet_with_diagnosis['dictionary_diagnosis'] = phenotype_frame.apply(lambda x: 'μ'.join(x.index[x == 1]), axis=1)

# Check if each value in diagnosis_dictionary is present in DataFrame columns
check_columns_presence(df_ecg_parquet_with_diagnosis, diagnosis_dictionary)

# Applying the function to update the DataFrame (on a copy, so the source frame is untouched)
updated_df = update_diagnosis_counts(df_ecg_parquet_with_diagnosis.copy(), diagnosis_dictionary)


In [None]:
# Displaying the first few rows of the updated DataFrame for verification
# (a bare expression in the middle of a cell renders nothing, hence display()).
display(updated_df.head())

# annotated_method: 0 = neither source present, 1 = labelbox_diagnosis only,
# 2 = dictionary_diagnosis only, 3 = both present.
# "Present" uses Python truthiness of each cell value.
has_labelbox = pd.Series(
    [bool(value) for value in updated_df['labelbox_diagnosis']], index=updated_df.index
)
has_dictionary = pd.Series(
    [bool(value) for value in updated_df['dictionary_diagnosis']], index=updated_df.index
)
# Every case is written to 'annotated_method'. The labelbox-only case used to be
# written to a misspelled column name, which created a stray column and left
# those rows at 0.
updated_df['annotated_method'] = has_labelbox.astype(int) + 2 * has_dictionary.astype(int)

updated_df.annotated_method.value_counts()

In [None]:
#brugada_matches = updated_df[updated_df['diagnosis'].str.contains("Brugada", case=False, na=False)]

In [None]:
# Show the value distribution of four of the label columns, one table each.
for label in ('Third Degree AV Block', 'Acute MI', 'Brugada', 'Sinus Pause'):
    display(updated_df[label].value_counts())

In [None]:
# Rebinds updated_df to the frame stored in this parquet file; any updated_df
# already held in the kernel is replaced.
updated_df = pd.read_parquet('/media/data1/muse_ge/ECG_ad202207_1453937_cat_labels_original_diagnosis_v1.1.parquet')

In [None]:
# Second labelled frame, read from parquet; it is passed to
# merge_and_clean_dataframes together with updated_df.
updated_df_old = pd.read_parquet('/media/data1/muse_ge/ECG_ad202207_1453937_cat_labels_v1.1.parquet')

In [None]:
def merge_and_clean_dataframes(updated_df, updated_df_old, to_remove_labels):
    """Drop unwanted label columns from both frames and merge them on 'npy_path'.

    The label block is the run of columns of ``updated_df`` between 'Sinusal'
    and 'annotated_method' (both included). Those columns are taken from both
    frames; from ``updated_df_old`` a fixed list of metadata columns is kept as
    well. Columns present on both sides get the suffix '_CARDIOLOGIST' (from
    ``updated_df_old``) or '_MUSE' (from ``updated_df``).

    Args:
        updated_df: DataFrame with 'npy_path', 'Sinusal' and 'annotated_method'.
        updated_df_old: DataFrame with 'npy_path' and the metadata columns
            listed in the body.
        to_remove_labels: column names dropped from BOTH frames.

    Returns:
        The merged DataFrame (pandas' default inner join on 'npy_path').

    Side effects:
        Both input frames are modified in place (the columns are dropped) and
        progress lines are printed.

    Raises:
        KeyError: if a name in ``to_remove_labels`` or a required column is missing.
    """
    # Remove specified columns from updated_df_old
    print(f"Number of columns before removal in updated_df_old: {updated_df_old.shape[1]}")
    updated_df_old.drop(columns=to_remove_labels, inplace=True)
    print(f"Number of columns removed from updated_df_old: {len(to_remove_labels)}")
    print(f"Number of columns after removal in updated_df_old: {updated_df_old.shape[1]}")

    # Remove the same columns from updated_df
    print(f"Number of columns before removal in updated_df: {updated_df.shape[1]}")
    updated_df.drop(columns=to_remove_labels, inplace=True)
    print(f"Number of columns removed from updated_df: {len(to_remove_labels)}")
    print(f"Number of columns after removal in updated_df: {updated_df.shape[1]}")

    # Get column indices in updated_df
    sinusal_col_index = updated_df.columns.get_loc('Sinusal')
    annotated_method_col_index = updated_df.columns.get_loc('annotated_method')

    # The two columns may appear in either order
    start_index = min(sinusal_col_index, annotated_method_col_index)
    end_index = max(sinusal_col_index, annotated_method_col_index)

    # Names of the label block in updated_df
    label_columns = updated_df.columns[start_index:end_index + 1].tolist()

    # dict.fromkeys removes repeated names while keeping order, so 'npy_path'
    # is not selected twice when it already lies inside the label block.
    columns_to_keep_updated_df = list(dict.fromkeys(label_columns + ['npy_path']))
    updated_df_selected = updated_df[columns_to_keep_updated_df]

    # Select the label block from updated_df_old BY NAME. Re-using the
    # positions computed on updated_df would pick unrelated columns whenever
    # the two frames are not laid out identically.
    old_column_names = set(updated_df_old.columns)
    columns_to_keep_updated_df_old = [name for name in label_columns if name in old_column_names]

    # Combine with additional columns to keep for updated_df_old (deduplicated)
    columns_to_keep_total = list(dict.fromkeys([
        'npy_path','validated by MD','diagnosis', 'original_diagnosis', 'xml_path', 'extracted', 'error_reading_diag', 
        'error_reading_original_diag', 'reading_xml_error', 'original_shape', 'Clean_and_format_Diag', 
        'RestingECG_MuseInfo_MuseVersion', 'new_PatientID', 'changed_ID', 
        'still_potentially_bad_ID', 'RestingECG_PatientDemographics_AgeUnits', 'date_of_birth', 
        'calculated_age', 'emergency ecg', 'RestingECG_PatientDemographics_Gender'
    ] + columns_to_keep_updated_df_old))

    # Select columns for updated_df_old
    updated_df_old_selected = updated_df_old[columns_to_keep_total]

    # Merge the dataframes
    merged_df = pd.merge(updated_df_old_selected, updated_df_selected, on='npy_path', suffixes=('_CARDIOLOGIST', '_MUSE'))

    return merged_df



# Label columns dropped from both frames before merging
to_remove_labels = [
    'ST depression (posterior - V7-V8-V9)',
    'Tall >2.5 mm',
    'J wave',
    'Auricular bigeminy',
    'Ventricular bigeminy',
    'Sinus Pause',
    'Dextrocardia',
    'Hyperacute T wave (lateral, V5-V6)',
    'Hyperacute T wave (septal, V1-V2)',
    'Hyperacute T wave (anterior, V3-V4)',
    'Bifid',
    'RaVL + SV3 > 28 mm (H) or 20 mm (F)',
    'Large >0.08 s',
    'Biphasic',
    'ST depression et T inversion in V5 or V6',
]

# Clean both frames and merge them on 'npy_path'
merged_df = merge_and_clean_dataframes(
    updated_df=updated_df,
    updated_df_old=updated_df_old,
    to_remove_labels=to_remove_labels,
)

# Now merged_df contains the merged and cleaned data


In [None]:
def create_combined_condition_column(df, suffix):
    """Build, for every row, a comma-separated list of the active suffixed columns.

    A column takes part when its name ends with ``suffix``; it is "active" for
    a row when its value in that row equals 1. The name is reported with
    ``suffix`` removed.

    Args:
        df: DataFrame with string column names.
        suffix: column-name ending to select, e.g. '_MUSE'.

    Returns:
        list of str, one entry per row in row order; rows with no active
        column get the empty string.
    """
    # The set of suffixed columns is the same for every row, so resolve it
    # (together with the reported name) once instead of once per row.
    suffixed_columns = [
        (col, col.replace(suffix, '')) for col in df.columns if col.endswith(suffix)
    ]

    combined_column = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Keep the columns whose value is 1 in this row
        active_labels = [label for col, label in suffixed_columns if row[col] == 1]
        combined_column.append(', '.join(active_labels))

    return combined_column

# Add one summary column per source: 'Diagnosis_One_Hot_CARDIOLOGIST' first,
# then 'Diagnosis_One_Hot_MUSE'.
for source_suffix in ('_CARDIOLOGIST', '_MUSE'):
    merged_df['Diagnosis_One_Hot' + source_suffix] = create_combined_condition_column(merged_df, source_suffix)


In [None]:
import pandas as pd

def find_duplicate_columns(dataframe):
    """Return the column labels that repeat an earlier column label.

    The first occurrence of a label is not reported; every later occurrence
    is, so a label that appears three times is listed twice.

    Args:
        dataframe: DataFrame to inspect.

    Returns:
        list of the repeated labels, in column order (empty when all labels
        are distinct).
    """
    column_index = dataframe.columns
    repeated_mask = column_index.duplicated()
    return column_index[repeated_mask].tolist()

# Report whether merged_df has repeated column names
duplicates = find_duplicate_columns(merged_df)
if not duplicates:
    print("No duplicate columns found.")
else:
    print("Duplicate columns found:", duplicates)



# Now try saving the DataFrame to Parquet again


In [None]:
# Rebinds merged_df to the frame stored in this parquet file; any merged_df
# already held in the kernel is replaced.
merged_df = pd.read_parquet('/media/data1/muse_ge/ECG_ad202207_1453937_cat_labels_MUSE_vs_CARDIOLOGIST_v1.1.parquet')


In [None]:
# Per-column non-null counts over the rows that are validated by an MD and
# whose diagnosis text differs from the original diagnosis text.
md_validated = merged_df['validated by MD'] == 1
diagnosis_changed = merged_df['diagnosis'] != merged_df['original_diagnosis']
display(merged_df.loc[md_validated & diagnosis_changed].count())

In [None]:
# Show the first five values of the 'validated by MD' column.
display(merged_df['validated by MD'].head(n=5))

In [None]:
# Positions of the two columns that delimit the label block in updated_df
annotated_method_col_index = updated_df.columns.get_loc('annotated_method')
sinusal_col_index = updated_df.columns.get_loc('Sinusal')

# Rows validated by an MD, restricted to the columns from 'Sinusal' up to, but
# not including, 'annotated_method'.
validated_rows = updated_df.loc[updated_df['validated by MD'] == 1]
selected_columns = validated_rows.iloc[:, sinusal_col_index:annotated_method_col_index]

# Display the absolute value counts of each selected column
for col in selected_columns.columns:
    value_counts = selected_columns[col].value_counts(normalize=False)
    display(value_counts)

# English Translation

In [None]:
#df_.to_parquet('/media/data1/anolin/mhi_ecg_in_ENGLISH.parquet')

In [None]:
# Starting value of the tally that the manual-inspection cells increment by
# one on each run and print.
counter = 192

In [None]:
import random

counter+=1

# Pick a random valid position. random.randint includes BOTH endpoints, so
# randint(0, len(data)) can return len(data), which is one past the last
# element; randrange excludes the upper bound.
id_ = random.randrange(len(data))

print(id_)
print(counter)

# Show the diagnosis text next to the entries stored at the same position in
# new_list and new_dict_r.
print(data['diagnosis'].tolist()[id_])
print(new_list[id_])
print(new_dict_r[id_])
print(len(new_dict_r[id_]))

In [None]:
import random

counter+=1

# Fixed position to inspect; the random draw is commented out.
id_ =  133626 #random.randint(0,len(data))

print(id_)
print(counter)

# Show the diagnosis text next to the entries stored at the same position in
# new_list and new_dict_r.
print(data['diagnosis'].tolist()[id_])
print(new_list[id_])
print(new_dict_r[id_])
print(len(new_dict_r[id_]))

In [None]:
import random

counter+=1

# Pick a random valid position. random.randint includes BOTH endpoints, so
# randint(0, len(data)) can return len(data), which is one past the last
# element; randrange excludes the upper bound.
id_ = random.randrange(len(data))

print(id_)
print(counter)

# Show the diagnosis text next to the entries stored at the same position in
# new_list and new_dict_r.
print(data['diagnosis'].tolist()[id_])
print(new_list[id_])
print(new_dict_r[id_])
print(len(new_dict_r[id_]))

In [None]:
# Case-insensitive check: does the phrase occur in the new_list entry at the
# currently selected position id_?
'Anomalies non-spécifiques des segments ST-T'.lower() in new_list[id_].lower()

In [None]:
# Render dict_ as a DataFrame (one column per key).
pd.DataFrame.from_dict(dict_)

In [None]:
from collections import Counter
import operator

# For every diagnosis, apply the equivalence dictionary and collect one string
# per record in list_replacement, then count the distinct collected strings.
list_replacement = list()
df_ecg_parquet_list = data['diagnosis'].tolist()
for i in tqdm(df_ecg_parquet_list):
    # `string`: the text with every listed variant removed.
    # `i`: the text with every listed variant replaced by its dictionary key.
    string = i
    for k,v in dict_equivalence_0.items():
        for v_ in v:
            string = string.replace(v_,'')
            i = i.replace(v_,k)

    # NOTE(review): `string` is never read after the loop, and `i` is appended
    # only when it consists of whitespace alone, so every element of
    # list_replacement is empty or whitespace. This looks unintended
    # (perhaps the leftover `string` was meant to be tested/collected) — confirm.
    if "".join(i.split()) == '':
        list_replacement.append(i)
    else:
        list_replacement.append('')

# Collapse runs of whitespace in each collected string.
new_list = list()
for i in list_replacement:
    new_list.append(" ".join(i.split()))

# Frequency of each distinct string, most frequent first.
x = dict(Counter(new_list))
dict( sorted(x.items(), key=operator.itemgetter(1),reverse=True))

In [None]:
# Inspect the entry collected for the record at position 93.
list_replacement[93]

In [None]:
# Print every key of dict_equivalence_0 (lower-cased) that has no entry in
# dict_equivalence_eng. dict_equivalence_eng must already exist in the kernel
# when this cell runs.
# Both sides are compared in lower case: the English dictionary's keys are
# lower-cased once into a set, so a key written with a capital letter still
# matches and the lookup is not rebuilt for every key.
english_keys_lower = {key.lower() for key in dict_equivalence_eng}
for i in dict_equivalence_0.keys():
    if i.lower() not in english_keys_lower:
        print(i.lower() )

In [None]:
dict_equivalence_eng = {

    'rapport r/s augmenté en v1, considérer rotation horaire ou infarctus postérieur': 'increased r/s ratio in v1, consider clockwise rotation or posterior infarction',
    'rapport r/s augmenté en v1, considérer rotation horaire': 'increased r/s ratio in v1, consider clockwise rotation',


    # Axe (Concepts)
    'axe gauche':                             'left axis deviation',
    'axe indéterminé':                        'undetermined axis deviation',
    "axe droit":                              'right axis deviation',
    'axe nord-ouest':                         'extreme axis deviation',
    'axe p anormal':                          'abnormal p-axis',
    'à éliminer':                             'to eliminate',
    'onde p intrinsèques':                    'intrinsic p-wave',
    'rythme auriculaire ectopique':           'atrial ectopic beat',
    'rythme sinusal':                         'sinus rhythm',

    #Arythmie (Diagnosis)
    'fibrillo-flutter auriculaire':           'atrial fibrillation-flutter',
    'arythmie sinusale':                      'sinus arrhythmia',
    'arythmie supraventriculaire':            'supraventricular arrhythmia',
    'fibrillo':                               'fibrillo',
    'fibrillation auriculaire':               'atrial fibrillation',
    'flutter auriculaire':                    'atrial flutter',
    'flutter ventriculaire':                  'ventricular flutter',
    "flutter":                                'flutter',
    'bradycardie sinusale':                   'sinus bradycardia',
    'rythme jonctionnel':                     'junctional rhythm',
    'tachycardie régulière':                  'regular tachycardia',
    'tachycardie sinusale':                   'sinus tachycardia',
    'tachycardie supraventriculaire':         'supraventricular tachycardia',
    'tachycardie à qrs fins':                  'narrow-complex tachycardia',
    'pacing auriculaire':                     'atrial pacing',
    'pacing ventriculaire':                   'ventricular pacing',
    "pacing dans l'onde t":                   't-wave pacing',
    'sous-décalage du st; possibilité de lésion sous-endocardique ou imprégnation digitalique': 'st elevation; possible subendocardial lesion or digitalis impregnation',
    'bradycardie auriculaire inférieure droite': 'right inferior atrial bradycardia',
    'bradycardie auriculaire inférieure gauche': 'left inferior atrial bradycardia',
    'réponse ventriculaire lente':             'slow ventricular response',

    'bradycardie auriculaire gauche':         'left atrial bradycardia',
    'bradycardie auriculaire droite':         'right atrial bradycardia',

    'rythme idioventriculaire':               'idioventricular rhythm',
    'aspect de lésion latérale':              'aspect de lésion latérale',
    'bradycardie auriculaire ectopique possible': 'possible ectopic atrial bradycardia',
    'bradycardie jonctionnelle':              'junctional bradycardia',
    'tachycardie auriculaire ectopique possible':'possible ectopic atrial tachycardia',
    "tachycardie jonctionnelle":              'junctional tachycardia',
    'tachycardie auriculaire paroxystique':   'junctional paroxysmal atrial tachycardia',
    'bradycardie auriculaire ectopique':      'ectopic atrial bradycardia',     

    #Arythmie (Concepts)
    'rythme de nature indéterminée':           'rhythm of indeterminate nature',                 # <- same ?
    'rythme irrégulier de nature indéterminée':'irregular rhythm of indeterminate nature', # <- same ?

    'dissociation a-v':                        'av dissociation',                        
    'conduction av prolongée':                 'prolonged av conduction',
    'rythme à qrs large':                      'wide complex rhythm',
    'tachycardie à qrs large':                 'wide complex tachycardia',

    'pré-excitation ventriculaire, wpw de type b': 'ventricular pre-excitation, wolff-parkinson-white type b',
    'pré-excitation ventriculaire, wpw de type a': 'ventricular pre-excitation, wolff-parkinson-white type a',

    'pause sinusale':                         'sinus pause',
    'échappement ventriculaire':              'ventricular escape',
    'réponse ventriculaire rapide':           'rapid ventricular response',
    'réponse ventriculaire lente':            'slow ventricular response',
    "anomalie de la repolarisation":          'repolarization anomaly',
    'échappement jonctionnel':                'junctional escape rhythm',
    'conduction rétrograde':                  'retrograde conduction',
    'foyer jonctionnel en compétition':       'competitive junctional rhythm',
    "forme bigiminé":                         'with bigimined shape',
    "complexe(s) de fusion":                  'fusion complex(es)',
    "complexes sinusaux":                     'sinus complexes',

    'axe p anormal, rythme auriculaire ectopique possible':  'abnormal p-axis, possible ectopic atrial rhythm',

    'rythme ventriculaire entrainé':          'entrainment of ventricular tachycardia',
    'rythme auriculaire entrainé':            'entrainment of atrial tachycardia',

    'rythme av entrainé sequentiel':          'sequential entrainment of atrioventricular rhythm',
    'rythme auriculaire gauche':              'left atrial rhythm',
    
    'rythme auriculaire inférieur droit':     'right inferior atrial rhythm',
    "rythme sinusal":                         'sinus rhythm',
    'rythme sinusal avec esv':                'Sinus rhythm with esv',  

    #Infarctus (Diagnosis)
    'infarctus inférieur':                    'inferior wall infarct',
    'infarctus antérieur':                    'anterior wall infarct',
    'infarctus inféropostérieur':             'inferoposterior wall infarct',
    'infarctus antérolatéral':                'anterolateral wall infarct',
    'infarctus antéroseptal':                 'anteroseptal wall infarct',
    'infarctus septal':                       'septal wall infarct',
    'infarctus latéral':                      'lateral wall infarct',
    'infarctus postérieur':                   'posterior wall infarct',
    'infarctus inféro-latéral':               'inferolatéral wall infarct',


    'ischémie inférieur possible':           'possible inferior ischemia',
    'ischémie antérieur possible':           'possible anterior ischemia',
    'ischémie inféropostérieur possible':    'possible inferoposterior ischemia',
    'ischémie antérolatéral possible':       'possible anterolateral ischemia',
    'ischémie antéroseptal possible':        'possible anteroseptal ischemia',
    'ischémie septal possible':              'possible septal ischemia',
    'ischémie latéral possible':             'possible lateral ischemia',
    'ischémie postérieur possible':          'possible posterior ischemia',
    'ischémie inféro-latéral possible':     'possible inferolateral ischemia',

    'ischémie inférieur':                    'inferior ischemia',
    'ischémie antérieur':                    'anterior ischemia',
    'ischémie inféropostérieur':             'inferoposterior ischemia',
    'ischémie antérolatéral':                'anterolateral ischemia',
    'ischémie antéroseptal':                 'anteroseptal ischemia',
    'ischémie septal':                       'septal ischemia',
    'ischémie latéral':                      'lateral ischemia',
    'ischémie postérieur':                   'posterior ischemia',
    'ischémie inféro-latéral':               'inferolateral ischemia',



    #Autre (Concept)
    'bas voltage des qrs':                    'low voltage',
    "qrs fin":                                'narrow complexes',
    "qrs large":                              'wide complexes',
    "trouble de conduction intraventriculaire non-spécifique":   'non-specific intraventricular conduction disorder',
    'trouble de conduction':                 'conduction disorder',

    'qt allongé':                             'long qt',
    'repolarisation précoce':                 'early repolarization',

    #Autre (Diagnosis)
    'dilatation auriculaire droite':          'right atrial enlargement',
    'dilatation auriculaire gauche':          'lef atrial enlargement',
    'dilatation bi-auriculaire':              'bi-atrial enlargement',

    #Pacemaker (Diagnosis)
    'cardio-stimulateur ventriculaire':       'ventricular pacemaker',
    'cardio-stimulateur à demande':           'on-demand pacemaker',
    'cardio-stimulateur auriculaire':         'atrial pacemaker',
    'cardio-stimulateur séquentiel (double chambre)':            'dual chamber sequential pacemaker',
    'stimulateur cardiaque biventriculaire détecté':             'biventricular pacemaker detected',
    'pacemaker auriculaire':                                     'atrial pacemaker',
    'pacemaker sentinelle':                                      'sentinel pacemaker',
    'pacemaker ventriculaire':                                   'ventricular pacemaker',

    #Anomalie de l'onde T (Concept)
    "anomalie de l'onde t; ischémie inférieure possible":        "t-wave anomaly; possible inferior ischemia",
    "anomalie de l'onde t; ischémie latérale possible":          "t-wave anomaly; possible lateral ischemia",

    #Anomalie  de l'onde T Autre (Concept)
    "anomalie non-spécifique de l'onde t":    'non-specific t-wave anomaly',
    'anomalie du segment st':                 'st segment anomaly',
    'anomalie marquée du st; lésion sous-endocardique septale possible':          'marked st anomaly; subendocardial septal lesion possible',
    "anomalie de l'onde t; ischémie latérale possible":                           't-wave anomaly; possible lateral wall ischemia',
    "anomalie de l'onde t; ischémie antérieure possible":                         't-wave anomaly; possible anterior wall ischemia',
    "anomalie de l'onde t; ischémie antérolatérale possible":                     't-wave anomaly; possible anterolateral wall ischemia',
    "anomalie de l'onde t; ischémie inférolatérale possible":                     't-wave anomaly; possible inferolateral wall ischemia',
    "anomalie marquée de l'onde t; ischémie antérolatérale possible":             'large t-wave anomaly; possible anterolateral wall ischemia',
    "anomalie non-spécifiques de l'onde t":                                       "non-specific t-wave anomaly",
    "anomlies de l'onde t non-spécifique en antéroseptal":                        'non-specific t-wave anomaly; possible anterospetal wall ischemia',
    'anomalie importante du st; lésion sous-endocardique septale possible':       'large st anomaly; possible subendocardial septal lesion',
    'anomalie marquée du st; lésion sous-endocardique antérieure possible':       'large st anomaly; possible anterior subendocardial lesion',
    'anomalie marquée du st; lésion sous-endocardique inférolatérale possible':   'large st anomaly; possible inferolateral subendocardial lesion',
    'anomalie marquée du st; lésion sous-endocardique latérale possible':         'large st anomaly; possible lateral subendocardial lesion',
    'anomalie marquée du st; lésion sous-endocardique antéroseptale possible':    'large st anomaly; possible anteroseptal subendocardial lesion',


    'onde u':              'prominent u-wave',
    #Sus-decalage du ST (Concept)
    'sus-décalage du st, possibilité de repolarisation précoce':                        'st elevation, possibility of early repolarization',
    'sus-décalage du st; possibilité de lésion antérolatérale ou infarctus aigu':       'st elevation, possibility of anterolateral lesion or acute infarct',
    'sus-décalage du st; possibilité de lésion inférieure ou infarctus aigu':           'st elevation, possibility of inferior wall lesion or acute infarct',
    'sus-décalage du st; possibilité de repolarisation précoce, péricardite ou lésion': 'st elevation, possibility of anterolateral lesion or acute possibility of early repolarization, pericarditis or lesion',
    'sus-décalage du st; possibilité le lésion antérieure ou infarctus aigu':           'st elevation, possibility of anterior wall lesion or acute infarct',
    'sus-décalage du st; possibilité de lésion latérale ou infarctus aigu':             'st elevation, possibility of lateral wall lesion or acute infarct',

    "léger sus-décalage de st: répolarisation précoce":                                 'slight st elevation: early repolarization',
    'léger sus-décalage de st':                                                         'slight st elevation',
    'sus-décalage ascendant du st, probablement normal':                                'st elevation, probably normal',
    'Sus-décalage du st en dérivations antérolatérales':                                'st elevation in anterolateral leads',
    'Sus-décalage du st en dérivations inférieures':                                    'st elevation in inferior leads',
    'Sus-décalage du st en dérivations latérales':                                      'st elevation in lateral leads',
    'Sus-décalage du st en dérivations antérolatérales':                                'st elevation in anterolateral leads',
    'Sus-décalage du st en dérivations inférolatérales':                                'st elevation in inferolateral leads',
    'Sus-décalage du st en dérivations antérieures':                                    'st elevation in anterior leads',
    'sus-décalage du segment st(v3,v4,v5)':                                             'st elevation (v3,v4,v5)',

    'sous-décalage ascendant du st, probablement anormal':                              'st depression, probably abnormal',
    'sous-décalage du st; possibilité de lésion sous-endocardique':                     'st depression, possible subendocardial lesion',
    'sous-décalage ascendant du st, probablement normal':                               'st depression, probably normal' ,
    'sous-décalage du segment st(v3,v4,v5)':                                            'st depression in leads st(v3,v4,v5)',
    'anomalies non-spécifiques des segments st-t':                                      'non-specific anomaly of st-t segments',
    'anomalies st-t antérolatérales':                                                   'anterolateral st-t anomalies',
    "anomalie de l'onde t":                                                             "t-wave anomaly",
    "anomalies non-spécifiques de l'onde t":                                            'non-specific t-wave anomalies',
    'ischémie aiguë ou d’hyperkaliémie':                                                'acute ischemia or hyperkalemia',

    #Inversion de l'onde T (Concept)
    "inversion de l'onde t en dérivations inférieures":                                 't-wave inversion in inferior leads',
    "inversion de l'onde t en dérivations latérales":                                   't-wave inversion in lateral leads',
    "inversion de l'onde t en dérivations inférolatérales":                             't-wave inversion in inferolateral leads',
    't négatif(v1,v2,v3,v4,v5,v6)':                                                     'negative t-wave in v1,v2,v3,v4,v5,v6',
    't négatif(v3,v5)':                                                                 'negative t-wave in v3,v5',
    't négatif(v1,v2)':                                                                 'negative t-wave in v1,v2',

    'anomalie diffuse du segment st-t,aspect de lésion/ischémie myocardique':           'diffuse st-t segment abnormality, appearance of myocardial lesion/ischemia',

    #Bloc (Diagnosis)
    'bloc a-v du premier degré':                                                        'first-degree av block',
    'bloc av 2ème degré (type 1)':                                                      'av block 2nd degree (type 1)',
    'bloc de branche droit':                                                            'right bundle branch block',
    'bloc de branche droit incomplet':                                                  'incomplete right bundle branch block',
    'bloc de branche gauche':                                                           'left bundle branch block',
    'bloc de branche gauche incomplet':                                                 'incomplete left bundle branch block',
    'bloc a-v complet':                                                                 'complete av block',
    'bloc a-v variable':                                                                'variable av block',
    'bloc av congénital':                                                               'congenital av block',

    'bloc a-v 4:1':                                                                     '4:1 a-v block',
    'bloc a-v 2:1':                                                                     '2:1 a-v block',
    'bloc a-v 3:1':                                                                     '3:1 a-v block',
    'bloc a-v 5:1':                                                                     '5:1 a-v block',

    'bloc trifasciculaire':                                                             'trifascicular block',

    'hémibloc antérieur gauche':                                                        'left anterior hemiblock',
    'hémibloc postérieur gauche':                                                        'left posterior hemiblock',
    "bloc intraventriculaire non-spécifique":                                           'non-specific intraventricular block',
    'bloc s-a du deuxième degré (mobitz i)':                                                 'second degree sa block (mobitz i)',
    'bloc s-a du deuxième degré (mobitz ii)':                                                'second degree sa block (mobitz ii)',
    
    'bloc a-v du deuxième degré (mobitz i)':                                            'second-degree av block (mobitz i)',

    'bloc bifasiculaire':                                                               'bifasicular block',

    #Extrasystoles
    'extrasystole ventriculaire':                                                       'ventricular extrasystole',
    'extrasystole auriculaire':                                                         'atrial extrasystole',
    'extrasystole(s) supraventriculaire(s)':                                            'supraventricular extrasystole',
    'extrasystole':                                                                     'extrasystole',
    
    'tracé de brugada, type 1':                                                         'brugada type 1 pattern',
    'tracé de brugada, type 2':                                                         'brugada type 2 pattern',

 

    "rsr' ou qr en v1 suggère un retard de conduction ventriculaire droit":             "rsr' or qr in v1 suggests right ventricular conduction delay",
    "retard de conduction intraventriculaire non-spécifique":                           'non-specific intraventricular conduction delay',
    "divergence anormale axes qrs et t; possibilité d'anomalie primaire de l'onde t":   'abnormal divergence of qrs and t axes; possible primary t-wave anomaly',
    # Autre (Concept)
    'q anormal(v1,v2,v3,v4,v5,v6)':           'abnormal q-wave in leads v1,v2,v3,v4,v5,v6',
    'produit cornell':                        'cornell product',
    'sokolow-lyon':                           'sokolow-lyon',
    'romhilt-estes':                          'romhilt-estes',
    'évoque une maladie pulmonaire':          'suggests lung disease',
    'wolff-parkinson-white':                  'wolff-parkinson-white',
    'pouls incertain':                        'uncertain pulse',

    'pattern brugada type 2 (non diagnostic)':'brugada type 2 pattern (non-diagnostic)',

    'complexes intrinsèques':                 'intrinsic complexes',
    'complexes ventriculaires prématurés fréquents': 'frequent premature ventricular complexes',
    'basse tension (fils de coffre)':         'low-voltage (trunk wires)',
    'capture sinusale/auriculaire':           'sinus/atrial capture',
    'basse tension':                          'low-voltage',
    'conduction aberrante':                   'aberrant conduction',
    "voltage de grande amplitude dans les précordiales moyennes": 'large-amplitude voltage in the middle precordials',
    "complexes supraventriculaires":          'supraventricular complexes',
    "grandes ondes t":                        'large t-waves',      

    # Autre (Diagnosis)
    'péricardite':                            'pericarditis',
    '(occulté par un hémibloc ?)':            '(hidden by a hemibloc?)',
    "d'âge indéterminé":                      'of undetermined age',

    #Intervale et ondes (Concept)
    
    'intervalle qt prolongé':                 'extended qt interval',
    'onde p prolongée':                       'extended p-wave',
    'ondes p':                                'p-wave',
    'p bloqués':                              'blocked p-wave',
    'r en avl':                               'r-wave in avl',
    "pr court":                               'short pr interval',
    'pr long':                                'long pr interval',
    'p rétrograde':                           'retrograde p-wave',

    #warnings
    'inversion probable des électrodes':      'probable electrode inversion',

    "*** analyse ecg spécifique de l'âge et du sexe ***":   'age- and gender-specific ecg analysis',

    "pas d'ondes p décelables":               'no detectable p-waves',
    "lente progression de l'onde r en antérieur":'slow progression of the r-wave anteriorly',

    'suspecter une insuffisance du stimulateur cardiaque non précisée':   'suspect unspecified pacemaker failure',
    
    'conduction intermittente':               'intermittent conduction',
    'im aigu/ischémie':                       'acute infarction/ischemia',

    'rythme atrial gauche suspect?':          'suspicious left atrial rhythm?',
    "mauvaise qualité de l'ecg":              'poor ecg quality',
    "demo ecg":                               'demo ecg',
    "analyse pédiatrique de l'ecg":           'pediatric ecg analysis',
    'rythme sinusal normal bas voltage des qrs': 'normal sinus rhythm low voltage qrs',

    'vd systémique':                               'systemic right ventricle',
    'analyse impossible; aucun qrs décelable':     'analysis impossible; no qrs detectable',

    "matériel d'acquisition peut-être défectueux": 'acquisition equipment may be faulty',
    "analyse impossible; moins de 4 qrs détectés": 'analysis impossible; less than 4 qrs detected',
    'positionnement dérivations non standard, interprétation ecg non disponible':  'non-standard positioning, ecg interpretation not available',
    'ecg anormal': 'abnormal ecg',
    'ecg normal':  'normal ecg',
    'intermittent':'intermittent',
    'dextrocardie': 'dextrocardia',
    
    'rythme électro-entraîné':               'pacing rate',
    'ancien':                                'ancient' ,
    'microvoltage derivations precordiales': 'microvoltage precordial derivations',
    'ischemie inferieure non exclue':        'inferior ischemia not excluded',
    "régression de l'onde R de V1 à v6":     'r-wave regression from V1 to v6',
    
    "retard de progression de l'onde r en antéro septal":  'delayed anteroseptal r-wave progression',
    "progression lente de l'onde r de v1 à v3":            'slow progression of the r wave from v1 to v3',
    "progression lente de l'onde r de v1 à v5":            'slow progression of the r wave from v1 to v3',
    "progression lente de l'onde r en v3 à v4":            'slow progression of the r wave from v3 to v4',
    "régression des voltages de v1 à v6":                  'voltage regression from v1 to v6',

    'incomplet':                                           'incomplete',
    'régression onde r de v1 à v6 suggérant une dextrocardie': 'r-wave regression from v1 to v6 suggesting dextrocardia',
    "pauvre progression de l'onde r":           'poorr wave progression',
    # garder isolé?
    'tracé de mauvaise qualité':               'poor-quality ecg',
    "transition précoce":                      'early repolarization',
    "trouble de conduction intraventriculaire":'intraventricular conduction disorder',
    #Hypertrophie (Dignosis)
    'hypertrophie ventriculaire gauche':       'left ventricular hypertrophy',
    'hypertrophie ventriculaire droite':       'right ventricular hypertrophy',
    'hypertrophie biventriculaire':            'biventricular hypertrophy',

    "ischemie anterieure":                     'anterior ischemia',
    'dissociation isorythmique':               'isorhythmic dissociation',
    'tachycardie':                             'tachycardia',
    'sauf légère augmentation du pr':          'except for a slight increase in the pr interval',
    'ondes t pointues':                        'pointed t-waves',
    'ondes q en ii-iii-avf':                   'q-waves in leads ii,iii,avf',
    'echappes supra hissien':                  'supra-hissian escape rhythm',
    'progression lente onde r v1-v3':          'slow progression wave r in leads v1-v3',
    'bigéminé':                                'with bigeminy',
    "d'age indéterminé":                       'of undetermined age',
     #'ventriculaire':                          ['ventriculaire','ventricuolaire'],
     #'auriculaire':                            ['auriculaire'],
    "trop d'artefacts":                        'too many artifacts',
    'sous-jacente':                            'underlying',
    'anévrysme vg':                            'left ventricular aneurysm',
    
    'répolarisation précoce':                  'early repolarization',
    'rythme ectopique':                        'ectopic rhythm',
    #'Rapport R/S augmenté en V1, considérer rotation horaire': ['Rapport R/S augmenté en V1, considérer rotation horaire'],

    'bloqué':                                  'blocked',
    'ectopique':                               'ectopic',
    'erreur v4 v5':                            'error v4 v5',
    'extension postérieur':                    'posterior extension',
    'conduction 2:1':                          '2:1 conduction',
    "peut-être secondaire à l'anomalie du qrs":'may be secondary to qrs anomaly',
    "pacemaker ventriculaire":                 'ventricular pacemaker',
    "(pas d' décelables)":                     '(not detectable)',

    'antérieur':                              'anterior',
    'inférieur':                               'inferior',
    ' latéral':                                 'lateral',
    'antérolatérale':                          'anterolateral',

    'lésion péricarditique':                   'pericardial lesion',
    'suggestif de':                            'suggestive of',
    'compatible':                              'compatible',
    'a éliminer':                              'to eliminate',
    "possiblement aigu":                       'possibly acute',
    "surcharge":                               'overload',
    'avec implication du ventricule droit':    'with involvement of the right ventricle',
    'pr normal':                               'normal pr segment',
    'qtc normal':                              'normal qtc',
    'récent':                                  'recent',
    'nombreuses':                              'frequent',

    #important?
    'échappement':                             'escape',
    'variable':                                'variable',
    'suspecté':                                'suspected',
    'complet':                                 'complete',
    'atypique':                                'atypical',
    'jonctionel':                              'junctional',
    'qui semble être sinusal':                 'that appears sinusal',
    'limite de la normale':                    'limit of normal',
    'conduction aberrante':                    'aberrant conduction',
    'alternant':                               'alternating',
    'aigue':                                   'acute',
    'rare':                                    'rare',
    'consécutif':                              'consecutive',
    ' possible':                                'possible',
    'avec':                                    'with',
    'par ailleurs':                            'otherwise',
    'marqué':                                  'marked',
    'éliminer':                                'eliminate',
    ' ou ' :                                   ' or ',
    ' et ' :                                   ' and ',
    'incomplet':                               'incomplete',
    'ecg limite':                              'borderline ecg',
    'limite':                                  'limit',
    'accéléré(e)':                             'accelerated',
    'plutôt':                                  'rather',
    'inversion electrodes avr-avl':            'avr-avl electrode inversion',
    'pacemaker':                               'pacemaker',
    'cardiostimulateur':                       'cardiostimulator',
    'sinusal':                                 'sinusal',
    'accéléré':                                'accelerated',
    'rapide':                                  'fast',
    'lent':                                    'slow',
    'bradycardie':                             'bradycardia',

    'anomalies de la repolarisation':          'repolarization abnormalities',
    ' en ':                                    ' in ',


    ' normal':                                 'normal',
    ' ventriculaire':                          ' ventricular',
    ' auriculaire':                            ' auricular',
    'sinusale':                                'sinusal',

    'marqué':                                  'marked',
    ' complet':                                 'complete',
    'réponse contrôlée':                       'controlled response',
    'supra':                                   'supra' ,
    'ischémie':                                'ischémie',
    'quelques complexess entrainés':           'some entrained complexes',
    'cardio-stimulateur séquentiel':           'sequential cardio-stimulator',
    'paroxystique':                            'paroxysmal',
    'sus-décalage':                            'elevation',
    'wandering pacemaker':                     'wandering pacemaker',
    'consecutives':                            'consecutive',
    'mais':                                    'but',
    'infarctus':                               'infarct',
    'bas voltage':                             'low voltage',
    'rythme ':                                 'rhythm',
    ',':                                       ',',
    '':                                        '',
    'vs':                                      'versus',
    '.':                                       '.',
    '/':                                       '/',
    ' de ':                                    ' from ',
    '2:1':                                     '2:1',
    ':':                                       ':',
    ';':                                       ';',

    'ischemie antero-laterale':                'antero-lateral ischemia',
    'conduction':                              'conduction',
    'frequentes':                              'frequent',
    'indéterminé':                             'undetermined',
    'jonctionnel':                             'junctional',
    'delai av prolongé':                       'extended av deadline',
    'conduction av':                           'av conduction',
    "possibilité d'":                          'possibility of',
    'ischémie':                                'ischemia',
    'bradycardie jonctionnelle':               'junctional bradycardia',
    'vs':                                      'versus',
    'répolarisation précoce':                  'early repolarization',
    'postérieur':                              'posterior',
    'r proéminent en v1':                      'r-wave prominent in v1',
    'répolarisation précoce':                  'early repolarization',
    'ecg':                                     'ecg',
    'qt-u long':                               'long qt-u',
    'bas':                                     'low',
    'forme trigiminée':                        'trigeminal shaped',
    'isolées':                                 'isolated',
    'cardio-stimulateur':                      'cardio-stimulator',
    'qt':                                      'qt',
    "l'onde t":                                "t-wave",
    'ne peut être exclu':                      'cannot be excluded',
    'cardiostimulateur':                       'cardio-stimulator',
    'aberrance':                               'outlier',
    'gauche':                                  'left',
    'aussi':                                   'also',
    'courant':                                 'current',
    'stimulation':                             'stimulation',
    'dans les dérivations frontales':          'in frontal leads',
    'lente progression onde r en anterieur':   'slow r-wave progression in the anterior leads',
    'Onde':                                    'wave',
    'controlée':                               'controled',
    ' en ':                                    ' in ',
    ' à ':                                     ' with ',
    'variante de la normale':                 'variation from normal',
    'réponse ventriculaire':                  'ventricular response',
    'tachycardia régulière':                  'regular tachycardia',
    'lef':                                    'left',
    'onde':                                   'wave',
    '-':                                      '-'
    
    
    }

In [None]:
from tqdm.notebook import tqdm
from collections import Counter
import operator

# --- Step 1: rewrite every known variant to its canonical label -------------
# dict_equivalence_0 is iterated as {label: iterable of variants}
# (presumably lists of spelling variants -- confirm against its definition).
list_to_tranlate = data['diagnosis'].tolist()
new_list = list()
list_correctly_annotated = list()

for potential in tqdm(list_to_tranlate):
    # `check` is the same text with every known variant deleted; whatever is
    # left besides whitespace is text the equivalence dictionary does not cover.
    check = potential
    for k, v in dict_equivalence_0.items():
        for v_ in v:
            potential = potential.replace(v_, k)
            check = check.replace(v_, '')

    fully_covered = not check.split()
    # Rows with uncovered text are blanked instead of being partially rewritten.
    new_list.append(potential if fully_covered else '')
    list_correctly_annotated.append(fully_covered)


# --- Step 2: translate the canonical text with dict_equivalence_eng ---------
# The text is lowercased before matching, so a key written with capitals could
# never match. Lowercase the keys once here (hoisted out of the row loop) so
# such entries take part in the translation. Dict order is preserved, and it
# matters because replacements are applied sequentially.
eng_pairs = [(k.lower(), v) for k, v in dict_equivalence_eng.items()]

eng_new_list = list()

for potential in tqdm(new_list):
    # Lowercase and collapse runs of whitespace to single spaces.
    potential = " ".join(potential.lower().split())

    for key_lower, translation in eng_pairs:
        if key_lower in potential:
            potential = potential.replace(key_lower, translation)

    eng_new_list.append(potential)

data['diagnosis_eng'] = eng_new_list
data

In [None]:
from tqdm.notebook import tqdm
from collections import Counter
import operator

# Lenient variant of the translation: every row is kept, even when part of its
# text is not covered by dict_equivalence_0.
list_to_tranlate = data['diagnosis'].tolist()
new_list = list()

for potential in tqdm(list_to_tranlate):
    # dict_equivalence_0 is iterated as {label: iterable of variants}
    # (presumably lists of spelling variants -- confirm against its definition).
    # The leftover-text tracker that used to be computed here was never read
    # in this cell, so that per-replacement work has been dropped.
    for k, v in dict_equivalence_0.items():
        for v_ in v:
            potential = potential.replace(v_, k)
    new_list.append(potential)


# The text is lowercased before matching, so a key written with capitals could
# never match. Lowercase the keys once here (hoisted out of the row loop) so
# such entries take part in the translation. Dict order is preserved, and it
# matters because replacements are applied sequentially.
eng_pairs = [(k.lower(), v) for k, v in dict_equivalence_eng.items()]

eng_new_list = list()

for potential in tqdm(new_list):
    # Lowercase and collapse runs of whitespace to single spaces.
    potential = " ".join(potential.lower().split())

    for key_lower, translation in eng_pairs:
        if key_lower in potential:
            potential = potential.replace(key_lower, translation)

    eng_new_list.append(potential)

data['diagnosis_eng'] = eng_new_list
data


In [None]:
from tqdm.notebook import tqdm
from collections import Counter
import operator

list_to_tranlate = data['diagnosis'].tolist()

# --- Step 1: rewrite every known variant to its canonical label -------------
new_list = list()
val_list = list()  # not filled in this cell; kept in case a later cell uses it
for potential in tqdm(list_to_tranlate):
    # `potential_changed` is the same text with every known variant deleted;
    # whatever is left besides whitespace is text the dictionary does not cover.
    potential_changed = potential
    for k, v in dict_equivalence_0.items():
        for v_ in v:
            potential = potential.replace(v_, k)
            potential_changed = potential_changed.replace(v_, '')

    # Rows with uncovered text are blanked instead of being partially rewritten.
    if not potential_changed.split():
        new_list.append(potential)
    else:
        new_list.append('')

# --- Step 2: one 0/1 flag list per phenotype ---------------------------------
# Each phenotype needs its OWN list. Multiplying an outer list
# ([[0] * n] * m) repeats a reference to a single inner list, so setting one
# flag would set it for every phenotype at once.
dict_ = {entry: [0] * len(data) for entry in list_key_phenotype}

for pos, diag_para in enumerate(tqdm(new_list)):
    for entry in list_key_phenotype:
        # Substring match of the phenotype label inside the canonical text.
        if entry in diag_para:
            dict_[entry][pos] = 1


# --- Step 3: translate the canonical text with dict_equivalence_eng ---------
# The text is lowercased before matching, so a key written with capitals could
# never match. Lowercase the keys once here (hoisted out of the row loop) so
# such entries take part in the translation. Dict order is preserved, and it
# matters because replacements are applied sequentially.
eng_pairs = [(k.lower(), v) for k, v in dict_equivalence_eng.items()]

eng_new_list = list()
eng_val_list = list()  # not filled in this cell; kept in case a later cell uses it
for potential in tqdm(new_list):
    # Lowercase and collapse runs of whitespace to single spaces.
    potential = " ".join(potential.lower().split())

    for key_lower, translation in eng_pairs:
        if key_lower in potential:
            potential = potential.replace(key_lower, translation)

    eng_new_list.append(potential)

data['diagnosis_eng'] = eng_new_list
data

In [None]:
# Keep only the rows whose npy_path differs from the 'Error' path below,
# then show the resulting (rows, columns) shape.
npy_path_ok = data['npy_path'].ne('/media/data1/ravram/DeepECG/ekg_waveforms_output/ecg_npy/Error')
data = data.loc[npy_path_ok]
data.shape

In [None]:
# One row per patient: for each patient ID only the first row (in the current
# row order of `data`) is retained -- drop_duplicates defaults to keep='first'.
patient_id_col = 'RestingECG_PatientDemographics_PatientID'
df_ecg_parquet_no_dup = data.drop_duplicates(subset=patient_id_col)
df_ecg_parquet_no_dup