In [1]:
import os
import pandas as pd

# Load the three DAIC-WOZ split files and reduce them to a common schema.
def _load_split(path, columns):
    """Read one split CSV and return only ``columns`` as an independent frame.

    The header names are stripped of surrounding whitespace, and the
    ``PHQ8_*`` label columns are renamed to ``PHQ_*`` so that every split
    exposes the same column names. Renaming is a no-op for a file that
    already uses the ``PHQ_*`` names.
    """
    frame = pd.read_csv(path, sep=",")
    frame.columns = [col.strip() for col in frame.columns]
    frame = frame.rename(columns={"PHQ8_Score": "PHQ_Score", "PHQ8_Binary": "PHQ_Binary"})
    # .copy() detaches the selection from the full frame, so adding columns
    # later does not raise pandas' SettingWithCopyWarning.
    return frame[columns].copy()


# Keep only the columns needed downstream.
columns = ['Participant_ID', 'PHQ_Score', 'PHQ_Binary']

train = _load_split("/kaggle/input/daic-woz/train_split_Depression_AVEC2017 (2).csv", columns)
dev = _load_split("/kaggle/input/daic-woz/dev_split_Depression_AVEC2017.csv", columns)
test = _load_split("/kaggle/input/daic-woz/full_test_split.csv", columns)

# Directories used to build the audio and transcript file paths.
# These are relative paths; the files are assumed to live there.
audio_dir = 'audio_files/'
transcript_dir = 'transcripts/'

# Function to generate file paths based on Participant_ID
def get_file_paths(df, audio_dir, transcript_dir):
    """Return a copy of ``df`` with audio and transcript path columns added.

    Args:
        df: DataFrame containing a ``Participant_ID`` column.
        audio_dir: Directory joined with ``<Participant_ID>_AUDIO.wav``.
        transcript_dir: Directory joined with
            ``<Participant_ID>_TRANSCRIPT.csv``.

    Returns:
        A new DataFrame with every column of ``df`` followed by
        ``audio_path`` and ``transcript_path``. The input frame is left
        unmodified, which also avoids pandas' SettingWithCopyWarning when
        ``df`` is a selection taken from another frame.

    Raises:
        KeyError: If ``df`` has no ``Participant_ID`` column.
    """
    participant_ids = df['Participant_ID']
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(
        audio_path=[
            os.path.join(audio_dir, f"{pid}_AUDIO.wav") for pid in participant_ids
        ],
        transcript_path=[
            os.path.join(transcript_dir, f"{pid}_TRANSCRIPT.csv") for pid in participant_ids
        ],
    )

# Attach the audio/transcript path columns to each split, in order.
train, dev, test = [
    get_file_paths(split, audio_dir, transcript_dir)
    for split in (train, dev, test)
]

# Stack the three splits into one frame with a fresh 0..n-1 index.
splits = [train, dev, test]
full_df = pd.concat(splits, ignore_index=True)

# Show the first rows of the combined frame.
print(full_df.head())

   Participant_ID  PHQ_Score  PHQ_Binary                 audio_path  \
0             303          0           0  audio_files/303_AUDIO.wav   
1             304          6           0  audio_files/304_AUDIO.wav   
2             305          7           0  audio_files/305_AUDIO.wav   
3             310          4           0  audio_files/310_AUDIO.wav   
4             312          2           0  audio_files/312_AUDIO.wav   

                  transcript_path  
0  transcripts/303_TRANSCRIPT.csv  
1  transcripts/304_TRANSCRIPT.csv  
2  transcripts/305_TRANSCRIPT.csv  
3  transcripts/310_TRANSCRIPT.csv  
4  transcripts/312_TRANSCRIPT.csv  
