In [None]:
# import 

import os
import pandas as pd
import soundfile as sf
import librosa



In [None]:

def extract_segments_with_metadata(audio_folder, annotation_folder, diagnosis_df, output_segments_folder,
                                   metadata_csv_path="segments_metadata_with_diagnosis.csv",
                                   min_segment_duration=0.2):
    """Cut annotated recordings into per-segment WAV files and log their metadata.

    For every ``.wav`` file in ``audio_folder`` that has a same-named ``.txt``
    annotation file in ``annotation_folder``, each annotation row
    (start, end, crackles flag, wheezes flag) is cut out of the audio, written
    to ``output_segments_folder`` and described by one metadata record.

    File names are expected to contain at least five ``_``-separated fields:
    field 0 is the integer patient ID, field 2 the location and field 4 the
    equipment. Files that do not match are reported and skipped instead of
    aborting the whole run.

    Args:
        audio_folder: Directory containing the ``.wav`` recordings.
        annotation_folder: Directory containing the tab-separated annotation
            files (no header; columns Start, End, Crackles, Wheezes).
        diagnosis_df: DataFrame with ``"Patient ID"`` and ``"Diagnosis"``
            columns; patients without a match get the diagnosis ``"Unknown"``.
        output_segments_folder: Directory receiving the segment WAV files.
            It is created if it does not exist.
        metadata_csv_path: Where the metadata CSV is written.
        min_segment_duration: Segments shorter than this many seconds are
            skipped.

    Returns:
        pandas.DataFrame with one row per saved segment (the same content
        that is written to ``metadata_csv_path``).
    """
    metadata_columns = [
        "Patient ID", "File Name", "Segment File", "Diagnosis", "Location",
        "Equipment", "Start (s)", "End (s)", "Crackles", "Wheezes",
    ]
    segments_metadata = []

    # sf.write does not create missing directories, so make sure the target exists.
    os.makedirs(output_segments_folder, exist_ok=True)

    # Sorted so that the row order of the metadata is reproducible across runs.
    for audio_file in sorted(os.listdir(audio_folder)):
        if not audio_file.endswith(".wav"):
            continue

        # Parse patient ID, location and equipment from the file name.
        stem = os.path.splitext(audio_file)[0]
        name_parts = stem.split('_')
        try:
            patient_id = int(name_parts[0])
            location = name_parts[2]   # e.g. Al, Tc
            equipment = name_parts[4]  # e.g. AKGC417L
        except (ValueError, IndexError):
            print(f"Nom de fichier inattendu, ignoré : {audio_file}")
            continue

        # Look up the diagnosis of this patient, falling back to "Unknown".
        diagnosis_matches = diagnosis_df.loc[diagnosis_df["Patient ID"] == patient_id, "Diagnosis"]
        diagnosis = diagnosis_matches.values[0] if not diagnosis_matches.empty else "Unknown"

        # Recordings without an annotation file are skipped.
        annotation_path = os.path.join(annotation_folder, stem + ".txt")
        if not os.path.exists(annotation_path):
            continue

        # Load the audio at its native sampling rate.
        audio_path = os.path.join(audio_folder, audio_file)
        try:
            y, sr = librosa.load(audio_path, sr=None)
        except Exception as e:
            print(f"Erreur lors du chargement de {audio_file}: {e}")
            continue

        annotations = pd.read_csv(annotation_path, sep='\t', names=["Start", "End", "Crackles", "Wheezes"])

        # itertuples keeps each column's dtype (iterrows would upcast the
        # integer Crackles/Wheezes flags to floats).
        for row in annotations.itertuples(index=True):
            # Clamp at 0 so a negative start cannot wrap around the end of the signal.
            start_sample = max(0, int(row.Start * sr))
            end_sample = int(row.End * sr)
            segment = y[start_sample:end_sample]

            # Skip empty segments and segments shorter than the minimum duration.
            if len(segment) == 0 or len(segment) < sr * min_segment_duration:
                continue

            # Save the segment as its own WAV file.
            segment_filename = f"{patient_id}_{location}_{equipment}_segment_{row.Index+1}_{row.Start:.2f}-{row.End:.2f}.wav"
            segment_path = os.path.join(output_segments_folder, segment_filename)
            sf.write(segment_path, segment, sr)

            segments_metadata.append({
                "Patient ID": patient_id,
                "File Name": audio_file,
                "Segment File": segment_filename,
                "Diagnosis": diagnosis,
                "Location": location,
                "Equipment": equipment,
                "Start (s)": row.Start,
                "End (s)": row.End,
                "Crackles": row.Crackles,
                "Wheezes": row.Wheezes,
            })

    # Explicit columns keep the CSV header stable even when no segment was extracted.
    metadata_df = pd.DataFrame(segments_metadata, columns=metadata_columns)
    metadata_df.to_csv(metadata_csv_path, index=False)

    print(f"Segments extraits et sauvegardés dans : {output_segments_folder}")
    print(f"Métadonnées des segments sauvegardées dans : {metadata_csv_path}")

    return metadata_df



In [None]:
%pip -version

In [None]:
 # NOTE(review): segments_metadata, demographics_df and metadata_df are not
 # defined in this cell — confirm that earlier cells create them before this runs.
 if segments_metadata:
     # Build a table from the collected segment records.
     new_metadata_df = pd.DataFrame(segments_metadata)

     # Left-join the existing demographic data on the patient identifier.
     new_metadata_df = pd.merge(
         new_metadata_df,
         demographics_df,
         how="left",
         on="Patient ID",
     )

     # Append the merged segment rows to the running metadata table.
     metadata_df = pd.concat([metadata_df, new_metadata_df], ignore_index=True)