In [1]:
pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.9.3-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading language_tool_python-2.9.3-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: language-tool-python
Successfully installed language-tool-python-2.9.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import os
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import language_tool_python
from gensim.models.fasttext import load_facebook_vectors
from keras.preprocessing.sequence import pad_sequences

2025-05-08 10:01:54.638849: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746698514.855954      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746698514.916955      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def load_audio(audio_path, sr=16000):
    """Read an audio file through librosa at the requested sampling rate.

    Args:
        audio_path: Path of the audio file to read.
        sr: Sampling rate forwarded to ``librosa.load`` (default 16000).

    Returns:
        The result of ``librosa.load`` unchanged; callers in this notebook
        unpack it as a two-element ``(samples, sampling_rate)`` pair.
    """
    loaded = librosa.load(audio_path, sr=sr)
    return loaded

In [4]:
def slice_patient_audio(y, sr, timestamps, segment_length=7.6):
    """Keep only the patient's speech and split it into fixed-length segments.

    The ``(start, end)`` spans in ``timestamps`` are cut out of ``y``,
    concatenated in the given order, and the result is chopped into
    consecutive chunks of ``segment_length`` seconds. A trailing chunk that
    is shorter than a full segment is dropped.

    Args:
        y: 1-D array of audio samples.
        sr: Sampling rate used to convert seconds to sample indices.
        timestamps: Iterable of ``(start, end)`` pairs, in seconds.
        segment_length: Segment duration in seconds (default 7.6).

    Returns:
        A list of arrays, each exactly ``int(segment_length * sr)`` samples
        long. The list is empty when ``timestamps`` is empty or when the
        patient audio is shorter than one segment.

    Raises:
        ValueError: If ``segment_length * sr`` is less than one sample.
    """
    segment_samples = int(segment_length * sr)
    if segment_samples <= 0:
        raise ValueError("segment_length * sr must amount to at least one sample")

    pieces = [y[int(start * sr):int(end * sr)] for start, end in timestamps]
    if not pieces:
        # np.concatenate refuses an empty list; no patient speech means no segments.
        return []
    patient_audio = np.concatenate(pieces)

    # Stopping at len - segment_samples + 1 yields only full-length chunks.
    last_start = len(patient_audio) - segment_samples + 1
    return [patient_audio[i:i + segment_samples]
            for i in range(0, last_start, segment_samples)]


In [5]:
def add_noise(y, noise_factor):
    """Return ``y`` with scaled standard-normal noise added to every sample.

    Args:
        y: 1-D sequence of samples; one noise value is drawn per element.
        noise_factor: Multiplier applied to the noise before it is added.

    Returns:
        ``y + noise_factor * noise`` where ``noise`` comes from NumPy's
        global random state.
    """
    gaussian = np.random.standard_normal(len(y))
    scaled_noise = noise_factor * gaussian
    return y + scaled_noise

In [6]:
def pitch_shift(y, sr, steps):
    """Shift the pitch of ``y`` by ``steps`` using librosa.

    Args:
        y: Audio samples to transform.
        sr: Sampling rate of ``y``.
        steps: Value forwarded as ``n_steps`` to ``librosa.effects.pitch_shift``.

    Returns:
        The result of ``librosa.effects.pitch_shift`` unchanged.
    """
    shift_options = {"sr": sr, "n_steps": steps}
    return librosa.effects.pitch_shift(y, **shift_options)

In [7]:
def extract_mfcc(y, sr=16000, n_mfcc=19, win_length=0.06):
    """Compute MFCCs of ``y`` with a Hamming window and 50 % frame overlap.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y`` (default 16000).
        n_mfcc: Number of cepstral coefficients to keep (default 19).
        win_length: Analysis window duration in seconds (default 0.06).

    Returns:
        The transposed librosa MFCC matrix, i.e. shape ``(frames, n_mfcc)``.
    """
    win_len = int(sr * win_length)
    hop_length = int(sr * win_length / 2)  # half a window -> 50 % overlap
    # librosa's default FFT size is 2048 and the window must fit inside it.
    # Growing n_fft only when the window is longer keeps the default
    # configuration (960-sample window at 16 kHz) numerically unchanged while
    # letting higher sampling rates / longer windows work instead of failing.
    n_fft = max(2048, win_len)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc,
                                n_fft=n_fft,
                                hop_length=hop_length, win_length=win_len,
                                window='hamming')
    return mfcc.T  # shape (frames, n_mfcc)

In [8]:
# -------- CNN MODEL --------

class AudioCNNEncoder(nn.Module):
    """Two-layer 2-D CNN that maps a single-channel feature map to a 128-d vector.

    Both convolutions use 3x3 kernels with padding 1, so the spatial size of
    the input is preserved and the final linear layer is sized from
    ``input_shape``.
    """

    def __init__(self, input_shape):
        """Build the layers.

        Args:
            input_shape: Indexable whose first two entries are the height and
                width of the feature map fed to ``forward``.
        """
        super().__init__()
        height, width = input_shape[0], input_shape[1]
        out_channels = 32
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, out_channels, kernel_size=3, padding=1)
        self.dropout = nn.Dropout(p=0.3)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(out_channels * height * width, 128)

    def forward(self, x):
        """Encode ``x`` of shape ``(batch, 1, height, width)`` into ``(batch, 128)``."""
        hidden = x
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden))
        flat = self.flatten(self.dropout(hidden))
        return self.fc(flat)


In [9]:
def process_all_patients(dataset_dir, patient_timestamps_dict, cnn_model, sr=16000, segment_len=7.6, device='cpu'):
    """Compute one averaged CNN embedding per patient recording.

    For every key in ``patient_timestamps_dict`` the file
    ``<dataset_dir>/<key>.wav`` is loaded, reduced to the patient's speech,
    split into fixed-length segments, augmented (three noise levels and three
    pitch shifts per segment, plus the original), converted to MFCCs and
    passed through ``cnn_model``. The model outputs of all versions are
    averaged along axis 0.

    Patients are skipped (no entry in the result) when the audio file does
    not exist, when they have no timestamps, or when their speech is shorter
    than one full segment; averaging an empty list would otherwise store NaN.

    Args:
        dataset_dir: Directory containing the ``.wav`` files.
        patient_timestamps_dict: Maps a file prefix to an iterable of
            ``(start, end)`` pairs in seconds.
        cnn_model: Torch module called on tensors of shape
            ``(1, 1, frames, n_mfcc)``; it is moved to ``device`` and put in
            eval mode.
        sr: Sampling rate used for loading and feature extraction.
        segment_len: Segment duration in seconds.
        device: Torch device on which the model is run.

    Returns:
        Dict mapping each processed prefix to the mean of its embeddings.
    """
    cnn_model.to(device)
    cnn_model.eval()

    processed_data = {}

    for patient_prefix in tqdm(patient_timestamps_dict):
        audio_path = os.path.join(dataset_dir, f"{patient_prefix}.wav")
        if not os.path.exists(audio_path):
            continue

        timestamps = list(patient_timestamps_dict[patient_prefix])
        if not timestamps:
            # No patient speech marked: nothing to slice or embed.
            continue

        y, _ = load_audio(audio_path, sr)
        segments = slice_patient_audio(y, sr, timestamps, segment_len)
        if not segments:
            # Less than one full segment of speech; np.mean([]) would yield NaN.
            continue

        embeddings = []

        for seg in segments:
            augmented_versions = [seg]
            # Noise injection
            augmented_versions.extend(add_noise(seg, alpha) for alpha in [0.01, 0.02, 0.03])
            # Pitch shifting
            augmented_versions.extend(pitch_shift(seg, sr, steps) for steps in [-0.5, -2, -2.5])

            for aug in augmented_versions:
                mfcc = extract_mfcc(aug, sr)  # shape: (frames, n_mfcc)
                # Add batch and channel axes -> (1, 1, frames, n_mfcc)
                mfcc_tensor = torch.tensor(mfcc).unsqueeze(0).unsqueeze(0).float().to(device)
                with torch.no_grad():
                    embedding = cnn_model(mfcc_tensor)
                embeddings.append(embedding.cpu().numpy())

        # Average all embeddings for the patient
        processed_data[patient_prefix] = np.mean(embeddings, axis=0)

    return processed_data

In [10]:
# Download the required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialisation: lemmatizer instance and the set of English stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [11]:
# State for the transcript windowing loop.
window_size = 7.6  # same value as the default audio segment length; presumably seconds — TODO confirm
windows = []  # receives one dict per window
current = 0.0  # start of the next window to build

In [12]:
# Path to the data folder
base_dir = '/kaggle/input/daic-woz'
all_vectors = []  # NOTE(review): not referenced by any cell visible here — confirm it is still needed

In [13]:
pip install compress-fasttext

Collecting compress-fasttext
  Downloading compress-fasttext-0.1.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting scipy<1.14.0,>=1.7.0 (from gensim>=4.0.0->compress-fasttext)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: compress-fasttext
  Building wheel for compress-fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for compress-fasttext: filename=compress_fasttext-0.1.5-py3-none-any.whl size=16097 sha256=308f1a11d88e9142d3552bf226688e4bf6158fb8fe9b1dec734fd550e6be3cbb
  Stored in directory: /root/.c

In [14]:
import gensim.downloader as api

# Load the GloVe model (100-dimensional, small size)
# NOTE(review): `glove_model` is not referenced by any later cell visible here
# (those use the fastText `model`) — confirm it is still needed.
glove_model = api.load("glove-wiki-gigaword-100")  # Only ~130MB, works fine on Kaggle




IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=10000.0 (msgs/sec)
NotebookApp.rate_limit_window=1.0 (secs)



In [15]:
# Walk over the transcript in consecutive windows of `window_size` and collect,
# for each window, the text spoken by the participant and by the interviewer.
# NOTE(review): `data` is not defined in any cell visible here (the recorded run
# fails with NameError) — the transcript table must be loaded before this cell.
while current < data['stop'].max():
    # Rows that overlap the window [current, current + window_size); a row that
    # straddles a window boundary is therefore included in both windows.
    win_data = data[(data['start'] < current + window_size) & (data['stop'] > current)]
    participant_text = " ".join(win_data[win_data['speaker'] == 'Participant']['text'])
    interviewer_text = " ".join(win_data[win_data['speaker'] == 'Interviewer']['text'])
    windows.append({
        'start': current,
        'end': current + window_size,
        'participant_text': participant_text,
        'interviewer_text': interviewer_text
    })
    current += window_size

NameError: name 'data' is not defined

NameError: name 'data' is not defined

In [None]:
# Load the pre-trained fastText vectors used to embed transcript tokens.
model = load_facebook_vectors('cc.en.300.bin')  # change the path if needed

def text_to_matrix(text, max_words=9):
    """Embed the first ``max_words`` tokens of ``text`` as a matrix of word vectors.

    The text is tokenised with ``preprocess_text``, truncated or right-padded
    to exactly ``max_words`` tokens, and each token is looked up in the
    module-level ``model``. Padding positions and tokens without a vector
    become all-zero rows.

    Args:
        text: Raw text to embed.
        max_words: Number of rows in the returned matrix (default 9).

    Returns:
        Array of shape ``(max_words, model.vector_size)``.
    """
    pad_token = '<pad>'
    dim = model.vector_size
    # float32 matches the dtype of the model's own vectors, so the matrix has
    # the same dtype whether or not padding was needed.
    zero_vector = np.zeros(dim, dtype=np.float32)

    tokens = preprocess_text(text)
    tokens = tokens[:max_words] + [pad_token] * (max_words - len(tokens))

    matrix = []
    for token in tokens:
        # Padding is tested explicitly: fastText vectors built with character
        # n-grams report every string as "in" the model and would synthesise a
        # non-zero vector for '<pad>' instead of the intended zero row.
        if token != pad_token and token in model:
            matrix.append(model[token])
        else:
            matrix.append(zero_vector)  # zero vector for padding / unknown words
    return np.array(matrix)

In [None]:
def resize_matrix_to_target(embedding_matrix, target_shape=(378, 9)):
    """Crop and/or zero-pad a 2-D matrix so it has ``target_shape``.

    The top-left corner of ``embedding_matrix`` is copied into a zero-filled
    array of ``target_shape``; rows or columns beyond the target are
    discarded, and missing ones stay zero.

    Args:
        embedding_matrix: 2-D array to resize.
        target_shape: Desired ``(rows, cols)`` (default ``(378, 9)``).

    Returns:
        A new float array of shape ``target_shape``.
    """
    src_rows, src_cols = embedding_matrix.shape[0], embedding_matrix.shape[1]
    keep_rows = min(target_shape[0], src_rows)
    keep_cols = min(target_shape[1], src_cols)
    corner = (slice(0, keep_rows), slice(0, keep_cols))

    output = np.zeros(target_shape)
    output[corner] = embedding_matrix[corner]
    return output

In [None]:
# === 5. Build the matrices (one per segment) ===
text_matrices = []
for win in windows:
    # Participant text first, then interviewer text, joined by a single space
    combined_text = win['participant_text'] + " " + win['interviewer_text']
    mat = text_to_matrix(combined_text)
    text_matrices.append(mat)

In [None]:
# === Final result ===
# Stack the per-window matrices along a new leading axis.
text_matrices = np.stack(text_matrices)  # shape: (nb_segments, 9, 300)
print("Shape finale des features textuels :", text_matrices.shape)