In [1]:
!pip install librosa --quiet

In [2]:
import os
import re
import librosa
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api
from scipy.io import wavfile


In [3]:
!pip install nltk



In [4]:
import nltk

# Fetch the NLTK resources needed by the tokenizer, the stop-word list and
# the WordNet lemmatizer used in the text preprocessing below.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
embedding_model = api.load("glove-wiki-gigaword-50")
# Read the dimensionality from the loaded vectors instead of hard-coding 50,
# so the zero-padding always matches whichever embedding model is loaded.
vector_size = embedding_model.vector_size

def preprocess_text(text, max_len=9):
    """Turn a sentence into a fixed-length sequence of word embeddings.

    The text is stripped of punctuation and digits, lower-cased, tokenized,
    filtered against the stop-word set and lemmatized. At most ``max_len``
    lemmas are kept; each one is replaced by its vector from
    ``embedding_model``, or by a zero vector when the model does not contain
    it. Shorter sequences are padded with zero vectors up to ``max_len``.

    Args:
        text: raw sentence (a string).
        max_len: number of embedding rows in the result.

    Returns:
        np.ndarray with one row of ``vector_size`` values per kept or padded
        position.
    """
    without_punct = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'\d+', '', without_punct)
    normalized = without_digits.lower().strip()

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(normalized)
        if token not in stop_words
    ]

    # Out-of-vocabulary lemmas are represented by an all-zero row.
    rows = [
        embedding_model[lemma] if lemma in embedding_model else np.zeros(vector_size)
        for lemma in lemmas[:max_len]
    ]
    # Right-pad with zero rows so every call yields max_len rows.
    rows.extend(np.zeros(vector_size) for _ in range(max_len - len(rows)))
    return np.array(rows)

def extract_audio_features(audio_path, start, end, sr=16000):
    """Return the time-averaged MFCC vector of one segment of an audio file.

    Only the requested window is decoded (through librosa's ``offset`` and
    ``duration`` arguments) rather than loading the whole recording on every
    call and slicing it afterwards. This function is called once per
    utterance, so decoding the full file each time is very costly.

    Note: if the file's native rate differs from ``sr``, resampling is applied
    to the window alone, so samples at the very edges of the window may differ
    marginally from slicing a fully resampled recording.

    Args:
        audio_path: path of the audio file.
        start: window start, in seconds. Negative values are clamped to 0.
        end: window end, in seconds.
        sr: sampling rate used for loading and for the MFCC computation.

    Returns:
        np.ndarray of shape (13,): the mean over time frames of each of the
        13 MFCC coefficients. A zero vector is returned when the window is
        empty (``end <= start``, NaN bounds, or no audio in that range).
    """
    n_mfcc = 13

    # A negative start would otherwise index from the end of the recording.
    offset = max(0.0, float(start))
    duration = float(end) - offset
    # `not duration > 0` also rejects NaN durations.
    if not duration > 0:
        return np.zeros(n_mfcc, dtype=np.float32)

    segment, _ = librosa.load(audio_path, sr=sr, offset=offset, duration=duration)
    if segment.size == 0:
        # The window lies entirely past the end of the recording.
        return np.zeros(n_mfcc, dtype=np.float32)

    # mfcc has shape (n_mfcc, time_frames); average over the time axis to get
    # a fixed-size descriptor regardless of the segment length.
    mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfcc, axis=1)

def _find_file_with_suffix(folder_path, filenames, suffix):
    """Return the path of the first entry of ``filenames`` ending with ``suffix``."""
    for name in filenames:
        if name.endswith(suffix):
            return os.path.join(folder_path, name)
    raise FileNotFoundError(f"No file ending with '{suffix}' found in {folder_path}")


def preprocess_daic_sample(folder_path, window_size=7.6):
    """Build one multimodal sample per participant utterance of a session folder.

    The folder must contain a tab-separated ``*_TRANSCRIPT.csv`` (columns
    ``speaker``, ``start_time``, ``stop_time``, ``value``) and an
    ``*_AUDIO.wav`` file. Rows without a ``value`` are dropped. For every
    remaining row whose speaker is "participant" (case-insensitive) a sample
    is produced from:

    * the embedded utterance text,
    * the embedded text of the row just before it, when that row was not
      spoken by the participant (empty context otherwise),
    * the mean MFCCs of the audio between ``start_time`` and
      ``min(start_time + window_size, stop_time)``.

    Args:
        folder_path: directory of one participant session.
        window_size: maximum length, in seconds, of the audio window.

    Returns:
        list of dicts with keys ``text_embedding``, ``context_embedding``,
        ``mfcc``, ``start``, ``end``, ``text`` and ``context``.

    Raises:
        FileNotFoundError: if the transcript or the audio file is missing.
    """
    # List the directory once; sorting makes the pick deterministic should
    # several files share a suffix.
    filenames = sorted(os.listdir(folder_path))
    transcript_path = _find_file_with_suffix(folder_path, filenames, '_TRANSCRIPT.csv')
    audio_path = _find_file_with_suffix(folder_path, filenames, '_AUDIO.wav')

    df = pd.read_csv(transcript_path, sep='\t')
    df = df.dropna(subset=["value"])

    # Plain dict rows avoid building a pandas Series for every row access.
    rows = df.to_dict('records')

    def is_participant(row):
        # str() keeps a missing speaker (NaN) from raising AttributeError.
        return str(row['speaker']).lower() == "participant"

    samples = []

    for i, row in enumerate(rows):
        if not is_participant(row):
            continue

        start = row['start_time']
        end = min(start + window_size, row['stop_time'])
        sentence = row['value']

        # Utterance text converted to an embedding sequence
        text_embed = preprocess_text(sentence)

        # Audio MFCC features for the utterance window
        mfcc_feat = extract_audio_features(audio_path, start, end)

        # Interviewer text immediately preceding the utterance, if any
        prev_text = ""
        if i > 0 and not is_participant(rows[i - 1]):
            prev_text = rows[i - 1]['value']
        context_embed = preprocess_text(prev_text)

        # Store everything for this utterance
        samples.append({
            "text_embedding": text_embed,
            "context_embedding": context_embed,
            "mfcc": mfcc_feat,
            "start": start,
            "end": end,
            "text": sentence,
            "context": prev_text
        })

    return samples


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [5]:
def preprocess_all_participants(root_path):
    """Collect the samples of every participant folder found under ``root_path``.

    A participant folder is a sub-directory whose name ends with ``_P``.
    Folders are visited in sorted name order: ``os.listdir`` returns entries
    in arbitrary order, which would make the order of the collected samples
    (and of any arrays saved from them) vary between runs and machines.

    Processing is best effort: if a folder raises, the error is printed and
    the remaining folders are still processed.

    Args:
        root_path: directory containing the participant folders.

    Returns:
        list with the samples of all successfully processed folders, grouped
        by folder in sorted folder-name order.
    """
    all_samples = []

    for folder_name in sorted(os.listdir(root_path)):
        folder_path = os.path.join(root_path, folder_name)
        if not (os.path.isdir(folder_path) and folder_name.endswith('_P')):
            continue

        print(f"Traitement de : {folder_name}")
        try:
            samples = preprocess_daic_sample(folder_path)
            all_samples.extend(samples)
        except Exception as e:
            # Deliberately broad: one unreadable session must not abort the run.
            print(f"Erreur avec {folder_name} : {e}")

    return all_samples


In [6]:
# Root directory that is scanned for participant folders (names ending in "_P").
root_data_path = "/kaggle/input/daic-woz/"
# Flat list of per-utterance sample dicts gathered from every participant folder.
all_data = preprocess_all_participants(root_data_path)

# Report how many segments were collected in total.
print("Total de segments collectés :", len(all_data))


Traitement de : 357_P
Traitement de : 362_P
Traitement de : 385_P
Traitement de : 360_P
Traitement de : 354_P
Traitement de : 393_P
Total de segments collectés : 441


In [7]:
def save_embeddings(samples, output_path="/kaggle/working/daic_preprocessed.npz"):
    """Stack the numeric features of all samples and store them in one .npz file.

    The archive holds three arrays: ``text`` (from ``text_embedding``),
    ``context`` (from ``context_embedding``) and ``mfcc`` (from ``mfcc``),
    each with one entry per sample along the first axis.

    Args:
        samples: iterable of sample dicts containing the three feature keys.
        output_path: destination of the archive.
    """
    # Archive key -> key of the corresponding feature inside each sample dict.
    archive_fields = {
        "text": "text_embedding",
        "context": "context_embedding",
        "mfcc": "mfcc",
    }
    stacked = {
        archive_key: np.array([sample[sample_key] for sample in samples])
        for archive_key, sample_key in archive_fields.items()
    }

    np.savez(output_path, **stacked)
    print("Embeddings sauvegardés :", output_path)

In [8]:
# Save the text, context and MFCC arrays of all collected samples to the default .npz path.
save_embeddings(all_data)

Embeddings sauvegardés : /kaggle/working/daic_preprocessed.npz


In [9]:
import pandas as pd

def save_as_dataframe(samples, output_path="/kaggle/working/daic_preprocessed.csv"):
    """Write the sample records to a CSV file, one row per sample.

    Args:
        samples: list of sample dicts; the dict keys become the CSV columns.
        output_path: destination of the CSV file (written without an index
            column).

    NOTE(review): array-valued fields are presumably written as their string
    form rather than as numbers - confirm they can be parsed back if the CSV
    is meant to be reloaded.
    """
    pd.DataFrame(samples).to_csv(output_path, index=False)
    print(f"Data saved as CSV at {output_path}")

# Export every collected sample record to the default CSV path.
save_as_dataframe(all_data)

Data saved as CSV at /kaggle/working/daic_preprocessed.csv
