In [5]:
import pandas as pd
import os

# Chargement avec harmonisation des colonnes
def load_and_standardize_split(file_path):
    """Read one split CSV and normalise the name of its binary-label column.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. When it has a ``PHQ_Binary`` column, that column is
        returned under the name ``PHQ8_Binary``; otherwise the frame is
        returned exactly as parsed.
    """
    split_df = pd.read_csv(file_path)
    if 'PHQ_Binary' not in split_df.columns:
        return split_df
    return split_df.rename(columns={'PHQ_Binary': 'PHQ8_Binary'})

# Load the three split files (label column harmonised to PHQ8_Binary)
train_df = load_and_standardize_split("/kaggle/input/daic-woz/train_split_Depression_AVEC2017 (2).csv")
dev_df = load_and_standardize_split("/kaggle/input/daic-woz/dev_split_Depression_AVEC2017.csv")
test_df = load_and_standardize_split("/kaggle/input/daic-woz/full_test_split.csv")

# Merge the splits into a single table
full_df = pd.concat([train_df, dev_df, test_df], ignore_index=True)

# Build the per-participant file paths
base_path = "/kaggle/input/daic-woz"

data = []

# Iterate over the two needed columns directly instead of full_df.iterrows().
# iterrows() packs every row into a single-dtype Series, so as soon as the
# merged frame holds any float column (pd.concat creates NaN-filled float
# columns for fields missing from one of the splits) the ID comes back as
# 300.0 and the folder name becomes "300.0_P". Column-wise iteration keeps
# the integer dtype of Participant_ID.
participant_ids = full_df["Participant_ID"].astype(int)
for pid, label in zip(participant_ids, full_df["PHQ8_Binary"]):
    folder_name = f"{pid}_P"

    audio_path = os.path.join(base_path, folder_name, f"{pid}_AUDIO.wav")
    transcript_path = os.path.join(base_path, folder_name, f"{pid}_TRANSCRIPT.csv")

    data.append({
        "Participant_ID": pid,
        "audio_path": audio_path,
        "transcript_path": transcript_path,
        "PHQ8_Binary": label
    })

# Save the path table next to the notebook
daic_paths_df = pd.DataFrame(data)
daic_paths_df.to_csv("daic_paths.csv", index=False)

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import nltk
from tqdm import tqdm

# Fetch the NLTK 'punkt' resource when this cell runs.
# NOTE(review): no visible cell calls an nltk tokenizer (text is split with
# str.split) — confirm this download is still needed.
nltk.download('punkt')

# === LOAD GLOVE EMBEDDINGS ===
def load_glove_model(file_path):
    """Load a GloVe-style text embedding file into a dictionary.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    file_path : str
        Path to the UTF-8 encoded embedding file.

    Returns
    -------
    dict[str, numpy.ndarray]
        Maps each token to its ``float32`` vector. If a token appears more
        than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    model = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            split_line = line.strip().split()
            # Blank lines (e.g. a trailing empty line at end of file) carry no
            # entry; indexing them would raise IndexError.
            if not split_line:
                continue
            word, *components = split_line
            model[word] = np.asarray(components, dtype='float32')
    return model

# === AUDIO FUNCTIONS ===
def get_patient_timestamps(transcript_path):
    """Return the (start_time, stop_time) pairs of the participant's turns.

    The transcript is read as a tab-separated file. Column names are
    lower-cased before use, and the speaker value is compared
    case-insensitively against ``'participant'``.

    Parameters
    ----------
    transcript_path : str or file-like
        Tab-separated transcript with ``speaker``, ``start_time`` and
        ``stop_time`` columns (any letter case).

    Returns
    -------
    list[tuple]
        One ``(start_time, stop_time)`` tuple per participant row, in file
        order.
    """
    transcript = pd.read_csv(transcript_path, sep="\t")
    transcript.columns = transcript.columns.str.lower()
    is_participant = transcript['speaker'].str.lower() == 'participant'
    spans = transcript.loc[is_participant, ['start_time', 'stop_time']]
    return list(spans.itertuples(index=False, name=None))

def load_patient_audio(audio_path, timestamps):
    """Load a recording and keep only the samples inside the given time spans.

    Parameters
    ----------
    audio_path : str
        Path handed to ``librosa.load`` (requested sample rate: 16000).
    timestamps : iterable of (start, end)
        Spans in seconds; each bound is multiplied by the sample rate and
        truncated to an integer sample index.

    Returns
    -------
    tuple
        ``(audio, sr)`` where ``audio`` is the concatenation of the selected
        slices in the order given and ``sr`` is the rate returned by
        ``librosa.load``.

    Raises
    ------
    ValueError
        Raised by ``numpy.concatenate`` when ``timestamps`` is empty.
    """
    waveform, sr = librosa.load(audio_path, sr=16000)
    chunks = []
    for start, end in timestamps:
        first_sample = int(start * sr)
        last_sample = int(end * sr)
        chunks.append(waveform[first_sample:last_sample])
    return np.concatenate(chunks), sr

def segment_audio(audio, sr, segment_duration=7.6):
    """Cut a signal into consecutive, non-overlapping, full-length segments.

    Parameters
    ----------
    audio : sequence (e.g. 1-D numpy array)
        The samples to cut.
    sr : int or float
        Sample rate; the segment length in samples is
        ``int(segment_duration * sr)``.
    segment_duration : float, default 7.6
        Segment length in seconds.

    Returns
    -------
    list
        Slices of ``audio``, each exactly one segment long. A trailing
        remainder shorter than a full segment is discarded. Audio that is
        empty or shorter than one segment yields ``[]``.

    Raises
    ------
    ValueError
        If the segment length works out to zero or fewer samples.
    """
    segment_samples = int(segment_duration * sr)
    if segment_samples <= 0:
        raise ValueError("segment_duration * sr must be at least one sample")
    # Enumerate full windows only. Inspecting segments[-1] after slicing would
    # raise IndexError for empty audio, because the segment list is then empty.
    n_full = len(audio) // segment_samples
    return [audio[k * segment_samples:(k + 1) * segment_samples] for k in range(n_full)]

def augment_audio(segments, noise_factor=0.005, rng=None):
    """Add scaled standard-normal noise to every segment.

    Parameters
    ----------
    segments : iterable of 1-D arrays
        Segments to perturb; the inputs are not modified.
    noise_factor : float, default 0.005
        Multiplier applied to the standard-normal noise.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Pass a seeded generator for reproducible
        output. When omitted, the global ``numpy.random`` state is used.

    Returns
    -------
    list
        One new array per input segment, same length as its input.
    """
    # Default path keeps drawing from the global numpy.random stream.
    draw = np.random.randn if rng is None else rng.standard_normal
    return [seg + noise_factor * draw(len(seg)) for seg in segments]

def extract_mfcc_audio(segments, sr, n_mfcc=60, n_fft=1024, hop_length=322, win_length=960):
    """Compute a transposed MFCC matrix for every segment.

    Parameters
    ----------
    segments : iterable of 1-D arrays
        Audio segments.
    sr : int
        Sample rate forwarded to ``librosa.feature.mfcc``.
    n_mfcc, n_fft, hop_length, win_length : int
        Forwarded unchanged to ``librosa.feature.mfcc``.

    Returns
    -------
    list
        For each segment, the transpose of the matrix returned by
        ``librosa.feature.mfcc``.
    """
    mfcc_options = dict(sr=sr, n_mfcc=n_mfcc, n_fft=n_fft,
                        hop_length=hop_length, win_length=win_length)
    features = []
    for seg in segments:
        coefficients = librosa.feature.mfcc(y=seg, **mfcc_options)
        features.append(coefficients.T)
    return features

# === TEXT FUNCTIONS ===
def segment_transcripts(transcript_path, segment_duration=7.6):
    """Group transcript utterances into consecutive fixed-length time windows.

    The transcript is read as a tab-separated file. Window ``i`` covers
    ``[i * segment_duration, (i + 1) * segment_duration)`` and every utterance
    is assigned to the window that contains its ``start_time``.

    Parameters
    ----------
    transcript_path : str or file-like
        Tab-separated transcript with ``start_time``, ``stop_time`` and
        ``value`` columns (any letter case).
    segment_duration : float, default 7.6
        Window length in seconds.

    Returns
    -------
    list[str]
        One string per window (utterances joined with a single space, empty
        string for a silent window). The number of windows is
        ``ceil(max(stop_time) / segment_duration)``. ``[]`` when the
        transcript has no usable rows.

    Notes
    -----
    NOTE(review): rows of every speaker are included here and windows follow
    the interview clock, so these windows may not line up with audio segments
    built from a different timeline — confirm against the caller.
    """
    df = pd.read_csv(transcript_path, sep='\t')
    # Accept any header casing.
    df.columns = df.columns.str.lower()
    # Rows without text or without both time stamps cannot be placed.
    df = df.dropna(subset=['value', 'start_time', 'stop_time'])
    if df.empty:
        # max() of nothing is NaN and int(ceil(NaN)) would raise.
        return []

    total_time = df['stop_time'].max()
    n_segments = int(np.ceil(total_time / segment_duration))
    if n_segments <= 0:
        return []

    # Bin by start time in one pass. Requiring the whole utterance to lie
    # inside a window drops every utterance that straddles a boundary or ends
    # exactly on one.
    window_index = np.floor(df['start_time'].to_numpy() / segment_duration).astype(int)
    window_index = np.clip(window_index, 0, n_segments - 1)

    buckets = [[] for _ in range(n_segments)]
    for idx, utterance in zip(window_index, df['value'].astype(str)):
        buckets[idx].append(utterance)

    return [" ".join(bucket) for bucket in buckets]

def text_to_embedding(text, model, max_words=9, emb_size=100, pad_rows=378):
    """Turn a text into a fixed-size matrix of word vectors.

    The text is split on whitespace and the first ``max_words`` tokens are
    looked up (lower-cased) in ``model``. Unknown tokens and missing positions
    become zero vectors. The vectors are laid out column-wise and the result
    is zero-padded along the first axis up to ``pad_rows`` rows.

    Parameters
    ----------
    text : str
        Input text.
    model : dict[str, numpy.ndarray]
        Token-to-vector mapping; vectors must have length ``emb_size``.
    max_words : int, default 9
        Number of word columns in the output.
    emb_size : int, default 100
        Length of each word vector.
    pad_rows : int, default 378
        Minimum number of rows of the output. No rows are removed when
        ``emb_size`` already exceeds it.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max(emb_size, pad_rows), max_words)``.

    Raises
    ------
    ValueError
        From ``numpy.stack`` when ``max_words`` is 0 or a model vector does
        not have length ``emb_size``.
    """
    # One shared zero vector is enough: np.stack copies its inputs.
    zero_vector = np.zeros(emb_size)
    vecs = [model.get(w.lower(), zero_vector) for w in text.split()[:max_words]]
    vecs.extend([zero_vector] * (max_words - len(vecs)))

    emb = np.stack(vecs, axis=0).T  # shape (emb_size, max_words)
    missing_rows = pad_rows - emb.shape[0]
    if missing_rows > 0:
        emb = np.vstack((emb, np.zeros((missing_rows, emb.shape[1]))))
    return emb  # shape (max(emb_size, pad_rows), max_words)

# === TRAITEMENT PAR PATIENT ===
def process_patient(audio_path, transcript_path, glove_model):
    """Build the paired audio and text features of one participant.

    Audio side: participant time spans are read from the transcript, the
    matching audio is loaded and concatenated, cut into segments, perturbed
    with noise and converted to MFCC matrices. Text side: the transcript is cut
    into windows and each window is embedded with ``glove_model``.

    Parameters
    ----------
    audio_path : str
        Path of the participant's recording.
    transcript_path : str
        Path of the participant's transcript.
    glove_model : dict
        Token-to-vector mapping passed to ``text_to_embedding``.

    Returns
    -------
    tuple[list, list]
        ``(audio_features, text_features)``, both truncated to the shorter of
        the two lengths. ``([], [])`` when any step raises; the error is
        printed rather than propagated.

    Notes
    -----
    NOTE(review): noise augmentation is applied to every segment
    unconditionally — confirm that this is intended for data later used for
    evaluation.
    """
    try:
        speech_spans = get_patient_timestamps(transcript_path)
        speech, sr = load_patient_audio(audio_path, speech_spans)
        noisy_chunks = augment_audio(segment_audio(speech, sr))
        mfcc_audio = extract_mfcc_audio(noisy_chunks, sr)

        mfcc_text = [
            text_to_embedding(window_text, glove_model)
            for window_text in segment_transcripts(transcript_path)
        ]

        # Keep only as many segments as both modalities provide.
        keep = min(len(mfcc_audio), len(mfcc_text))
        return mfcc_audio[:keep], mfcc_text[:keep]
    except Exception as e:
        # Best effort: a broken participant is reported and skipped.
        print(f"Erreur patient {audio_path} : {e}")
        return [], []

In [None]:
# === PIPELINE PRINCIPAL ===
def build_dataset(csv_path, glove_path, output_path="processed_daic_dataset.npz"):
    """Extract features for every participant and save them as one .npz file.

    Parameters
    ----------
    csv_path : str
        CSV with ``audio_path``, ``transcript_path``, ``PHQ8_Binary`` and a
        participant identifier column (``Participant_ID``, or ``id`` as a
        fallback).
    glove_path : str
        Embedding file passed to ``load_glove_model``.
    output_path : str, default "processed_daic_dataset.npz"
        Destination of the compressed archive.

    Returns
    -------
    None
        The archive holds four arrays with one entry per segment: ``ids``,
        ``X_audio``, ``X_text`` and ``y`` (the participant's label repeated
        for each of its segments).
    """
    df = pd.read_csv(csv_path)
    glove_model = load_glove_model(glove_path)

    # The path table written earlier in this notebook names the identifier
    # column "Participant_ID"; "id" is still accepted for other tables.
    id_column = "Participant_ID" if "Participant_ID" in df.columns else "id"

    all_audio = []
    all_text = []
    all_labels = []
    all_ids = []

    rows = zip(df[id_column], df["audio_path"], df["transcript_path"], df["PHQ8_Binary"])
    for patient_id, audio_path, transcript_path, label in tqdm(rows, total=len(df)):
        mfcc_audio, mfcc_text = process_patient(audio_path, transcript_path, glove_model)

        for a, t in zip(mfcc_audio, mfcc_text):
            all_audio.append(a)
            all_text.append(t)
            all_labels.append(label)
            all_ids.append(patient_id)

    np.savez_compressed(output_path,
                        ids=np.array(all_ids),
                        X_audio=_stack_features(all_audio),
                        X_text=_stack_features(all_text),
                        y=np.array(all_labels))

    print(f"✅ Dataset sauvegardé : {output_path}")


def _stack_features(matrices):
    """Pack feature matrices into one float32 array (object array if ragged)."""
    try:
        # Equally-shaped matrices become one dense numeric block. An object
        # array would store one Python object per scalar and require pickle.
        return np.asarray(matrices, dtype=np.float32)
    except ValueError:
        ragged = np.empty(len(matrices), dtype=object)
        for position, matrix in enumerate(matrices):
            ragged[position] = matrix
        return ragged

In [None]:
import numpy as np

# Read the arrays under the names build_dataset() saves them with
# (ids / X_audio / X_text / y); any other key raises KeyError.
# allow_pickle is kept so archives holding object arrays still load; only use
# it on files produced by this notebook.
# NOTE(review): no visible cell calls build_dataset(), so the archive must
# already exist when this cell runs.
data = np.load("processed_daic_dataset.npz", allow_pickle=True)
ids = data["ids"]            # participant identifier of each segment
X_audio = data["X_audio"]    # expected (n_samples, 378, 60) with the default MFCC settings — TODO confirm
X_text = data["X_text"]      # expected (n_samples, 378, 9) with the default embedding settings — TODO confirm
y = data["y"]                # (n_samples,): one label per segment

In [None]:
from sklearn.model_selection import train_test_split

# Split by participant, not by segment. Each participant contributes many
# segments, so a segment-level random split places the same speaker in train,
# dev and test and leaks identity information into the evaluation sets.
segment_ids = data["ids"]  # participant identifier of each segment, as saved by build_dataset()
participant_ids, first_rows = np.unique(segment_ids, return_index=True)
# build_dataset() repeats one label for all segments of a table row, so the
# first segment's label is taken as the participant's label.
participant_labels = y[first_rows]

# Participants: train (70%) and temp (30%), stratified on the participant label
train_pids, temp_pids, _, temp_labels = train_test_split(
    participant_ids, participant_labels, test_size=0.3, random_state=42, stratify=participant_labels
)

# Temp participants (30%) into dev (15%) and test (15%)
dev_pids, test_pids = train_test_split(
    temp_pids, test_size=0.5, random_state=42, stratify=temp_labels
)

# Expand each participant group to its segment rows; rows are shuffled with a
# fixed seed so that segments are not ordered by participant.
split_rng = np.random.default_rng(42)
train_rows = split_rng.permutation(np.flatnonzero(np.isin(segment_ids, train_pids)))
temp_rows = split_rng.permutation(np.flatnonzero(np.isin(segment_ids, temp_pids)))
dev_rows = split_rng.permutation(np.flatnonzero(np.isin(segment_ids, dev_pids)))
test_rows = split_rng.permutation(np.flatnonzero(np.isin(segment_ids, test_pids)))

X_audio_train, X_text_train, y_train = X_audio[train_rows], X_text[train_rows], y[train_rows]
X_audio_temp, X_text_temp, y_temp = X_audio[temp_rows], X_text[temp_rows], y[temp_rows]
X_audio_dev, X_text_dev, y_dev = X_audio[dev_rows], X_text[dev_rows], y[dev_rows]
X_audio_test, X_text_test, y_test = X_audio[test_rows], X_text[test_rows], y[test_rows]