In [36]:
!pip install language-tool-python



In [37]:
import os
import re
from functools import lru_cache

import gensim.downloader as api
import language_tool_python
import librosa
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models.fasttext import load_facebook_vectors
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [38]:
def load_audio(audio_path, sr=16000):
    """Read an audio file through ``librosa.load`` at the requested sampling rate.

    Args:
        audio_path: Path of the audio file to read.
        sr: Sampling rate forwarded to ``librosa.load`` (default 16000).

    Returns:
        The ``(waveform, sampling_rate)`` pair produced by ``librosa.load``.
    """
    waveform, rate = librosa.load(audio_path, sr=sr)
    return waveform, rate

In [39]:
def slice_patient_audio(y, sr, timestamps, segment_length=7.6):
    """Keep only the patient's speech and cut it into fixed-length segments.

    The spans of ``y`` selected by ``timestamps`` are concatenated, then split
    into consecutive chunks of ``int(segment_length * sr)`` samples. A trailing
    chunk shorter than a full segment is dropped.

    Args:
        y: 1-D waveform (NumPy array).
        sr: Sampling rate of ``y`` in Hz.
        timestamps: Iterable of ``(start, end)`` pairs in seconds.
        segment_length: Segment duration in seconds (default 7.6).

    Returns:
        List of arrays, each exactly ``int(segment_length * sr)`` samples long.
        Empty when ``timestamps`` is empty or the selected speech is shorter
        than one segment.

    Raises:
        ValueError: If ``segment_length * sr`` is less than one sample.
    """
    segment_samples = int(segment_length * sr)
    if segment_samples <= 0:
        raise ValueError("segment_length * sr must be at least one sample")

    spans = [y[int(start * sr):int(end * sr)] for start, end in timestamps]
    if not spans:
        # np.concatenate raises on an empty list; no speech means no segments.
        return []

    patient_audio = np.concatenate(spans)
    full_segments = len(patient_audio) // segment_samples
    return [
        patient_audio[k * segment_samples:(k + 1) * segment_samples]
        for k in range(full_segments)
    ]


In [40]:
def add_noise(y, noise_factor, rng=None):
    """Return ``y`` plus standard-normal noise scaled by ``noise_factor``.

    The noise is drawn with the same shape as ``y``, so multi-dimensional
    input is supported as well as 1-D waveforms.

    Args:
        y: Input signal (array-like).
        noise_factor: Multiplier applied to the standard-normal noise.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) used to
            draw the noise, for reproducible augmentation. When omitted the
            global NumPy random state is used.

    Returns:
        A new array ``y + noise_factor * noise``; ``y`` itself is not modified.
    """
    signal = np.asarray(y)
    if rng is None:
        noise = np.random.standard_normal(signal.shape)
    else:
        noise = rng.standard_normal(signal.shape)
    return signal + noise_factor * noise

In [41]:
def pitch_shift(y, sr, steps):
    """Shift the pitch of ``y`` via ``librosa.effects.pitch_shift``.

    Args:
        y: Waveform to transform.
        sr: Sampling rate of ``y``.
        steps: Value forwarded as ``n_steps`` to librosa.

    Returns:
        The pitch-shifted waveform returned by librosa.
    """
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=steps)
    return shifted

In [42]:
def extract_mfcc(y, sr=16000, n_mfcc=19, win_length=0.06):
    """Compute MFCCs of ``y`` with a Hamming window and 50 % overlap.

    Args:
        y: Waveform to analyse.
        sr: Sampling rate of ``y`` in Hz.
        n_mfcc: Number of cepstral coefficients per frame.
        win_length: Analysis window length in seconds; the hop is half of it.

    Returns:
        Array of shape ``(frames, n_mfcc)`` (librosa's output, transposed).
    """
    hop_length = int(sr * win_length / 2)
    win_len = int(sr * win_length)
    # librosa's STFT requires win_length <= n_fft. Keep librosa's default of
    # 2048 whenever the window fits (so the default settings give the same
    # result as before) and otherwise grow to the next power of two, so that
    # higher sampling rates or longer windows do not raise.
    n_fft = max(2048, 1 << max(win_len - 1, 0).bit_length())
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,
                                n_fft=n_fft,
                                hop_length=hop_length, win_length=win_len,
                                window='hamming')
    return mfcc.T  # shape (frames, n_mfcc)

In [43]:
import torch.nn as nn
# -------- CNN MODEL --------

class AudioCNNEncoder(nn.Module):
    """Small CNN that turns a single-channel 2-D feature map into a 128-d vector.

    Two 3x3 convolutions (1 -> 16 -> 32 channels, padding 1, ReLU after each)
    are followed by dropout (p=0.3), flattening and one linear layer. The
    linear layer is sized from ``input_shape``, so inputs must be
    ``(batch, 1, input_shape[0], input_shape[1])``.
    """

    def __init__(self, input_shape):
        """Build the layers for feature maps of size ``input_shape`` (height, width)."""
        super().__init__()
        height, width = input_shape[0], input_shape[1]
        self.conv1 = nn.Conv2d(1, 16, kernel_size=(3, 3), padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=(3, 3), padding=1)
        self.dropout = nn.Dropout(0.3)
        self.flatten = nn.Flatten()
        # padding=1 with 3x3 kernels keeps height/width, so 32 maps of H x W remain.
        self.fc = nn.Linear(32 * height * width, 128)

    def forward(self, x):
        """Encode ``x`` and return a ``(batch, 128)`` tensor."""
        for conv in (self.conv1, self.conv2):
            x = F.relu(conv(x))
        return self.fc(self.flatten(self.dropout(x)))


In [44]:
def process_all_patients_audio(dataset_dir, cnn_model, sr=16000, segment_len=7.6, device='cpu',
                               timestamps_by_patient=None):
    """Compute one averaged CNN embedding per participant folder.

    For every sub-folder of ``dataset_dir`` whose name ends in ``_P``, the file
    ``<prefix>_AUDIO.wav`` (``prefix`` is the folder name up to the first
    underscore) is loaded, reduced to the patient's speech, cut into segments,
    augmented (3 noise levels + 3 pitch shifts in addition to the original),
    converted to MFCCs and passed through ``cnn_model``. All embeddings of a
    participant are averaged.

    Args:
        dataset_dir: Directory containing the ``*_P`` participant folders.
        cnn_model: Encoder applied to tensors of shape ``(1, 1, frames, n_mfcc)``.
        sr: Sampling rate used for loading and feature extraction.
        segment_len: Segment duration in seconds.
        device: Torch device the model and inputs are moved to.
        timestamps_by_patient: Mapping from participant prefix to a list of
            ``(start, end)`` pairs in seconds. When omitted, the module-level
            ``patient_timestamps_dict`` is used, as the original code did.
            Keys are looked up with the string prefix taken from the folder
            name — TODO confirm the mapping uses string keys.

    Returns:
        Dict mapping participant prefix to the mean of its embeddings
        (``np.mean(..., axis=0)`` over the per-segment model outputs).
        Participants without an audio file or without a single full segment
        are omitted.

    Raises:
        KeyError: If a participant with audio has no entry in the timestamps
            mapping.
    """
    if timestamps_by_patient is None:
        # Backward-compatible fallback to the notebook-level global.
        timestamps_by_patient = patient_timestamps_dict

    cnn_model.to(device)
    cnn_model.eval()

    # Sorted for a deterministic processing order.
    participant_dirs = sorted(p for p in os.listdir(dataset_dir) if p.endswith('_P'))
    processed_data = {}

    for patient_id in participant_dirs:
        patient_prefix = patient_id.split('_')[0]
        audio_path = os.path.join(dataset_dir, patient_id, f"{patient_prefix}_AUDIO.wav")
        if not os.path.exists(audio_path):
            continue

        y, _ = load_audio(audio_path, sr)
        segments = slice_patient_audio(y, sr, timestamps_by_patient[patient_prefix], segment_len)

        embeddings = []
        for seg in segments:
            augmented_versions = [seg]
            # Noise injection
            augmented_versions.extend(add_noise(seg, alpha) for alpha in (0.01, 0.02, 0.03))
            # Pitch shifting
            # NOTE(review): step values kept exactly as in the original — confirm they are intended.
            augmented_versions.extend(pitch_shift(seg, sr, steps) for steps in (-0.5, -2, -2.5))

            for aug in augmented_versions:
                mfcc = extract_mfcc(aug, sr)  # shape: (frames, n_mfcc)
                mfcc_tensor = torch.tensor(mfcc).unsqueeze(0).unsqueeze(0).float().to(device)
                with torch.no_grad():
                    embedding = cnn_model(mfcc_tensor)
                embeddings.append(embedding.cpu().numpy())

        if not embeddings:
            # No full segment: averaging an empty list would yield NaN.
            continue

        # Average all embeddings for the patient
        processed_data[patient_prefix] = np.mean(embeddings, axis=0)

    return processed_data

In [45]:
# Segment settings; these match the defaults of the audio functions above
# (sr=16000, segment length 7.6 s).
SAMPLE_RATE = 16000
SEGMENT_SECONDS = 7.6

# Run the MFCC extraction on a silent clip of one segment length to obtain the
# (frames, n_mfcc) shape the encoder's linear layer has to be sized for.
dummy = extract_mfcc(np.zeros(int(SEGMENT_SECONDS * SAMPLE_RATE)), SAMPLE_RATE)
cnn_model = AudioCNNEncoder(input_shape=dummy.shape)

In [46]:
# Download the NLTK resources (only needs to be done once in a Kaggle notebook)
nltk.download('punkt')
# Recent NLTK releases (3.8.2+) make word_tokenize look up 'punkt_tab' instead
# of 'punkt'; fetching both keeps the notebook working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [47]:
# Setup: English stop-word set and WordNet lemmatizer used by the text
# preprocessing functions below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [48]:
# Load the lightweight GloVe model through gensim's downloader API (`api`).
print("Chargement du modèle GloVe...")
model = api.load("glove-wiki-gigaword-50")  # 50-dimensional vectors
print("Modèle chargé.")


Chargement du modèle GloVe...
Modèle chargé.


In [49]:
# Text preprocessing function
def preprocess_text(text):
    """Clean ``text`` and return its lemmatized, stop-word-free tokens.

    Punctuation and digits are stripped, the text is lower-cased and trimmed,
    tokenized with ``word_tokenize``, filtered against ``stop_words`` and
    lemmatized with ``lemmatizer``.

    Note: a later cell of this notebook defines another ``preprocess_text``
    that returns an embedding matrix instead of tokens; once that cell has
    run, it replaces this definition.

    Args:
        text: Input string.

    Returns:
        List of token strings.
    """
    cleaned = re.sub(r'\d+', '', re.sub(r'[^\w\s]', '', text))
    cleaned = cleaned.lower().strip()

    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

In [50]:
# Convert a sentence into a matrix of word vectors
def _embedding_tokens(text):
    """Clean, tokenize, stop-word-filter and lemmatize ``text`` into a token list."""
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)      # remove digits
    text = text.lower().strip()
    return [lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stop_words]


def text_to_embedding_matrix(text, vector_size=50, max_len=9):
    """Embed the first ``max_len`` tokens of ``text`` with the GloVe ``model``.

    Tokenization is done by a private helper rather than by the notebook-level
    ``preprocess_text``: that name is redefined in a later cell to return an
    embedding matrix, which would break this function once that cell has run.

    Args:
        text: Input string.
        vector_size: Length of the zero vector used for unknown words and
            padding; should equal the dimensionality of ``model``.
        max_len: Number of rows of the result.

    Returns:
        Array with ``max_len`` rows of ``vector_size`` values. Tokens missing
        from ``model`` and unused trailing rows are all-zero.
    """
    matrix = []
    for word in _embedding_tokens(text)[:max_len]:
        if word in model:
            matrix.append(model[word])
        else:
            matrix.append(np.zeros(vector_size))  # unknown word -> zero vector

    # Pad with zero vectors when there are fewer than `max_len` tokens
    matrix.extend(np.zeros(vector_size) for _ in range(max_len - len(matrix)))

    return np.array(matrix)

In [51]:
import nltk
nltk.download('punkt')
# Recent NLTK releases (3.8.2+) make word_tokenize look up 'punkt_tab' instead
# of 'punkt'; fetching both keeps the notebook working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
embedding_model = api.load("glove-wiki-gigaword-50")
# Read the dimensionality from the loaded vectors (50 for this model) so the
# zero vectors used for padding always match the embedding size.
vector_size = embedding_model.vector_size

def preprocess_text(text, max_len=9):
    """Turn ``text`` into a fixed-height matrix of word embeddings.

    The text is stripped of punctuation and digits, lower-cased, tokenized,
    filtered against ``stop_words`` and lemmatized. The first ``max_len``
    tokens are looked up in ``embedding_model``; unknown tokens and missing
    rows are filled with zero vectors of length ``vector_size``.

    Note: this replaces the earlier ``preprocess_text`` defined in this
    notebook, which returned the token list rather than a matrix.

    Args:
        text: Input string.
        max_len: Number of rows of the result.

    Returns:
        ``np.array`` built from ``max_len`` vectors of length ``vector_size``.
    """
    cleaned = re.sub(r'\d+', '', re.sub(r'[^\w\s]', '', text)).lower().strip()
    lemmas = [lemmatizer.lemmatize(tok) for tok in word_tokenize(cleaned) if tok not in stop_words]

    rows = [
        embedding_model[lemma] if lemma in embedding_model else np.zeros(vector_size)
        for lemma in lemmas[:max_len]
    ]
    missing = max_len - len(rows)
    if missing > 0:
        rows.extend(np.zeros(vector_size) for _ in range(missing))
    return np.array(rows)

@lru_cache(maxsize=1)
def _load_full_audio(audio_path, sr):
    """Decode ``audio_path`` at ``sr``; repeated calls for the same file reuse the array.

    Only the most recently used file is kept, so memory use stays at one
    decoded recording. The cached array must be treated as read-only.
    """
    audio, _ = librosa.load(audio_path, sr=sr)
    return audio


def extract_audio_features(audio_path, start, end, sr=16000, n_mfcc=13):
    """Return the time-averaged MFCC vector of one slice of an audio file.

    The file is decoded once and cached, so calling this function for every
    utterance of the same recording no longer re-reads the whole file each
    time.

    Args:
        audio_path: Path of the audio file (must be hashable, e.g. ``str``).
        start: Slice start in seconds.
        end: Slice end in seconds.
        sr: Sampling rate used for loading and for the MFCC computation.
        n_mfcc: Number of MFCC coefficients (default 13).

    Returns:
        Array of shape ``(n_mfcc,)``: the mean of each coefficient over the
        frames of the slice. An all-zero vector is returned when the slice
        contains no samples (e.g. ``end <= start`` or ``start`` beyond the end
        of the file), where the MFCC computation would otherwise fail.
    """
    audio = _load_full_audio(audio_path, sr)
    segment = audio[int(start * sr):int(end * sr)]
    if segment.size == 0:
        return np.zeros(n_mfcc, dtype=np.float32)

    # MFCCs shape = (n_mfcc, time_frames)
    mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc)
    # Collapse the time axis to get a fixed-size vector
    return np.mean(mfcc, axis=1)

def _find_file_with_suffix(folder_path, suffix):
    """Return the path of the first (sorted) file in ``folder_path`` ending in ``suffix``."""
    matches = sorted(f for f in os.listdir(folder_path) if f.endswith(suffix))
    if not matches:
        # IndexError is what the original `[...][0]` lookup raised; keep the
        # type but say what is missing.
        raise IndexError(f"no file ending in {suffix!r} found in {folder_path!r}")
    return os.path.join(folder_path, matches[0])


def preprocess_daic_sample(folder_path, window_size=7.6):
    """Build one multimodal sample per participant utterance of a session folder.

    The folder must contain a tab-separated ``*_TRANSCRIPT.csv`` with the
    columns ``start_time``, ``stop_time``, ``speaker`` and ``value``, and a
    ``*_AUDIO.wav`` file. Rows without a ``value`` are dropped. For every
    remaining row whose speaker is ``participant`` (case-insensitive) the
    function computes:

    * the text embedding of the utterance,
    * the text embedding of the immediately preceding row if that row was not
      spoken by the participant (otherwise of the empty string),
    * the mean MFCC vector of the audio from ``start_time`` to
      ``min(start_time + window_size, stop_time)``.

    Rows with a missing speaker are treated as non-participant rows, and
    ``value`` cells are converted to ``str`` before text preprocessing.

    Args:
        folder_path: Session folder to process.
        window_size: Maximum audio duration per utterance, in seconds.

    Returns:
        List of dicts with the keys ``text_embedding``, ``context_embedding``,
        ``mfcc``, ``start``, ``end``, ``text`` and ``context``.

    Raises:
        IndexError: If the transcript or audio file is missing.
    """
    transcript_path = _find_file_with_suffix(folder_path, '_TRANSCRIPT.csv')
    audio_path = _find_file_with_suffix(folder_path, '_AUDIO.wav')

    df = pd.read_csv(transcript_path, sep='\t')
    df = df.dropna(subset=["value"]).reset_index(drop=True)
    # A missing speaker is read as NaN (a float), which has no .lower().
    speakers = df['speaker'].fillna('').astype(str).str.lower()

    samples = []

    for i in range(len(df)):
        if speakers.iloc[i] != "participant":
            continue

        row = df.iloc[i]
        start = row['start_time']
        end = min(start + window_size, row['stop_time'])
        sentence = str(row['value'])

        # Preprocessed text as an embedding matrix
        text_embed = preprocess_text(sentence)

        # Audio MFCC
        mfcc_feat = extract_audio_features(audio_path, start, end)

        # Text of the row just before, if it was not spoken by the participant
        prev_text = ""
        if i > 0 and speakers.iloc[i - 1] != "participant":
            prev_text = str(df.iloc[i - 1]['value'])
        context_embed = preprocess_text(prev_text)

        samples.append({
            "text_embedding": text_embed,
            "context_embedding": context_embed,
            "mfcc": mfcc_feat,
            "start": start,
            "end": end,
            "text": sentence,
            "context": prev_text
        })

    return samples


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
