# Library imports

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.express as px
import glob

# Data loading

In [64]:
def extract_filenames():
    """Build one row per annotated line found in the annotation ``.txt`` files.

    Every ``*.txt`` file under
    ``data/Respiratory_Sound_Database/audio_and_txt_files/`` is read. The
    file name (without directory and extension) is split on ``_`` to obtain
    the recording metadata, and each non-blank, tab-separated line of the
    file contributes one row.

    Returns:
        pandas.DataFrame: columns ``Patient_number``, ``Recording_index``,
        ``Chest_location``, ``Acquisition_mode``, ``Recording_equipment``,
        ``Path`` (the annotation path with a ``.wav`` extension), ``Start``,
        ``End``, ``Crackles`` and ``Wheezes``. All values are kept as the
        strings read from disk. The columns are present even when no file
        is found, so the frame can still be merged on ``Patient_number``.

    Raises:
        IndexError: if a file name has fewer than five ``_``-separated
            parts, or an annotation line has fewer than four tab-separated
            fields.
    """
    pattern = 'data/Respiratory_Sound_Database/audio_and_txt_files/*.txt'
    columns = [
        'Patient_number',
        'Recording_index',
        'Chest_location',
        'Acquisition_mode',
        'Recording_equipment',
        'Path',
        'Start',
        'End',
        'Crackles',
        'Wheezes',
    ]

    rows = []
    # Sorted so the row order does not depend on the directory listing order.
    for txt_path in sorted(glob.glob(pattern)):
        # Strip only the final extension; splitting on the first '.' would
        # truncate any path that contains a dot elsewhere.
        stem_path = os.path.splitext(txt_path)[0]

        # basename uses the platform's path separators, instead of assuming
        # the Windows-only backslash.
        parts = os.path.basename(stem_path).split('_')

        recording = {
            'Patient_number': str(parts[0]),
            'Recording_index': parts[1],
            'Chest_location': parts[2],
            'Acquisition_mode': parts[3],
            'Recording_equipment': parts[4],
            'Path': stem_path + ".wav",
        }

        with open(txt_path, 'r') as handle:
            lines = handle.read().splitlines()

        for line in lines:
            # Skip blank lines rather than blindly dropping the last line,
            # which lost data when the file had no trailing newline.
            if not line.strip():
                continue
            fields = line.split('\t')
            rows.append(recording | {
                'Start': fields[0],
                'End': fields[1],
                'Crackles': fields[2],
                'Wheezes': fields[3],
            })

    return pd.DataFrame(rows, columns=columns)

def extract_demographic_data():
    """Load patient number, age and sex from ``data/demographic_info.txt``.

    The first line of the file is skipped and every remaining non-blank line
    is split on whitespace; the first three fields are kept.

    Returns:
        pandas.DataFrame: columns ``Patient_number``, ``Age`` and ``Sex``,
        all kept as the strings read from disk. The columns are present even
        when the file holds no data rows.

    Raises:
        IndexError: if a non-blank data line has fewer than three fields.
    """
    path = 'data/demographic_info.txt'
    with open(path, 'r') as handle:
        # NOTE(review): the first line is dropped as in the original code,
        # presumably because it is a header; confirm against the actual file,
        # otherwise the first patient is silently lost.
        lines = handle.read().splitlines()[1:]

    records = []
    for line in lines:
        # split() with no argument tolerates tabs and repeated spaces, and
        # returns [] for blank lines (e.g. a trailing newline), which are
        # skipped instead of raising IndexError.
        fields = line.split()
        if not fields:
            continue
        records.append({
            'Patient_number': fields[0],
            'Age': fields[1],
            'Sex': fields[2],
        })

    return pd.DataFrame(records, columns=['Patient_number', 'Age', 'Sex'])

# Diagnosis table: header-less CSV read as (Patient_number, Diagnosis).
# Patient_number is forced to str so it matches the string keys produced by
# extract_filenames() and extract_demographic_data().
diag_df = pd.read_csv(
    'data/Respiratory_Sound_Database/patient_diagnosis.csv',
    header=None,
    names=['Patient_number', 'Diagnosis'],
    dtype={'Patient_number': str, 'Diagnosis': str},
)


# Merge
# pd.merge defaults to an inner join, so rows whose Patient_number is missing
# from either the diagnosis or the demographic table are dropped.
data = pd.merge(extract_filenames(), diag_df, on='Patient_number')
data = pd.merge(data, extract_demographic_data(), on='Patient_number')

# Create the output folder first; to_csv does not create missing directories.
os.makedirs('data/data_generated', exist_ok=True)
data.to_csv('data/data_generated/data_generated.csv', index=False)

