In [1]:
# Mount Google Drive at /content/gdrive; the paths used later in this notebook live under it.
# force_remount=False leaves an existing mount in place instead of remounting.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

Mounted at /content/gdrive


In [None]:
# Extra packages installed into the Colab runtime. None of them is imported directly
# in the cells shown here; presumably they are dependencies of the local
# `process_file` module and of whisperx (TODO confirm).
!pip install spafe
!pip install praat-parselmouth
!pip install textstat
!pip install pocketsphinx
# NOTE(review): ctranslate2 is the only pinned package; the reason for 4.4.0 is not
# recorded in this notebook, so confirm it before changing the pin.
!pip install ctranslate2==4.4.0

In [None]:
# Install WhisperX from the head of its GitHub repository. No tag or commit is pinned,
# so the installed version can differ from one runtime to the next.
!pip install git+https://github.com/m-bain/whisperx.git


In [1]:
import os
import pandas as pd

# Work from the project folder on the mounted Drive: relative paths used later
# (e.g. the output CSV) resolve against it. Presumably this is also where the
# local `process_file` module lives (TODO confirm).
os.chdir("/content/gdrive/MyDrive/speech_analysis")

In [None]:
from process_file import process_file, process_file_model

In [3]:
import whisperx
import gc
import torch

# Restrict PyTorch intra-op parallelism to a single CPU thread.
torch.set_num_threads(1)

# Silero voice-activity-detection model plus its helper utilities.
# force_reload=False reuses the copy cached by torch.hub instead of downloading
# the repository again on every run of this cell; on a fresh runtime with no
# cache it is still downloaded once.
vad_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=False)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 8 # reduce if low on GPU mem
# float16 inference needs a GPU; int8 is used on CPU (and can also be chosen on
# GPU when memory is tight, at some cost in accuracy).
compute_type = "float16" if device == "cuda" else "int8"

# Load the WhisperX "large-v3" transcription model. Nothing is transcribed here;
# the model object is passed to process_file_model later.
transcription_model = whisperx.load_model("large-v3", device, compute_type=compute_type)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /root/.cache/torch/hub/master.zip
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

vocabulary.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:02<00:00, 6.49MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [4]:
def add_group_names(output_file):
    """Add a feature-group header row above the column names of a feature CSV.

    The CSV at ``output_file`` is read, every column is paired with a group
    name chosen purely by its position (see ``ranges``), and the file is
    rewritten in place with a two-row header: group name on the first row,
    original column name on the second. Read the result back with
    ``pd.read_csv(output_file, header=[0, 1])``.

    Args:
        output_file: Path of the CSV file to rewrite in place.

    Notes:
        * If the first row of the file already consists only of group names
          (and blanks), the file is left untouched, so calling this function
          twice does not stack a second group row on top of the first.
        * The group ranges describe a 6861-column layout. A file with fewer
          columns is labelled by position as far as it goes; columns beyond
          the last range get an empty group name instead of triggering a
          length-mismatch error.
    """
    import csv  # local import so this cell does not depend on imports made in later cells

    names = ["Pausing behavior", "Speech behavior", "Frequency Parameters", "Spectral Domain", "Voice Quality",
             "Loudness and Intensity", "Complexity", "Info"]

    # Half-open column-index ranges [start, stop), one per entry in `names`.
    ranges = [(0, 11), (11, 137),(137, 377), (377, 6331), (6331, 6475), (6475, 6715), (6715, 6859), (6859, 6861)]

    # Idempotency guard: a first row made up solely of group names means the
    # group header has already been written.
    with open(output_file, newline="") as handle:
        first_row = next(csv.reader(handle), [])
    labelled_cells = {cell for cell in first_row if cell}
    if labelled_cells and labelled_cells <= set(names):
        return

    train_df = pd.read_csv(output_file)

    # Size the group list from the actual number of columns rather than a
    # hard-coded length, clipping each range to the columns that exist.
    n_columns = len(train_df.columns)
    group_names = [""] * n_columns
    for name, (start, stop) in zip(names, ranges):
        for j in range(start, min(stop, n_columns)):
            group_names[j] = name

    train_df.columns = pd.MultiIndex.from_tuples(list(zip(group_names, train_df.columns)))

    train_df.to_csv(output_file, index=False)

In [5]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import csv


def _read_existing_output(output_file):
    """Inspect an existing feature CSV so an interrupted run can be resumed.

    Returns:
        A tuple ``(fieldnames, done, grouped)`` where ``fieldnames`` is the list
        of column names (``None`` when the file is missing or empty), ``done``
        is the set of values already present in the ``filename`` column, and
        ``grouped`` is True when the file starts with an extra group-name row
        (the layout produced by ``add_group_names``) above the column names.

    Raises:
        KeyError: If neither of the first two rows contains a ``filename`` column.
    """
    if not os.path.exists(output_file):
        return None, set(), False

    with open(output_file, newline="") as handle:
        rows = csv.reader(handle)
        header = next(rows, None)
        if header is None:  # the file exists but holds nothing yet
            return None, set(), False

        # After add_group_names has run, the real column names sit on the second row.
        grouped = "filename" not in header
        if grouped:
            header = next(rows, None)
            if header is None or "filename" not in header:
                raise KeyError("filename")

        name_idx = header.index("filename")
        done = {row[name_idx] for row in rows if len(row) > name_idx}

    return header, done, grouped


def calculate_all_features(input_dir, output_file, label):
    """Extract features for every ``.wav`` file in a directory and append them to a CSV.

    For each ``.wav`` file in ``input_dir`` that is not yet listed in
    ``output_file``, the results of ``process_file_model`` and ``process_file``
    are merged into one row (keys from ``process_file`` win on collision),
    ``filename`` and ``label`` are added, and the row is appended to
    ``output_file``. When extraction raises, the error and the file name are
    printed and a row containing only ``filename`` and ``label`` is written.
    Finally ``add_group_names`` adds the group header, unless the file already
    had one when this call started.

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``.wav`` files.
        output_file: CSV file to create or extend. Rows already present are kept
            and their files are skipped, so an interrupted run can be resumed.
        label: Value written to the ``label`` column of every new row.

    Raises:
        KeyError: If ``output_file`` exists but has no ``filename`` column.
        ValueError: If a row contains keys that are not columns of the CSV.
    """
    fieldnames, previously_calculated, grouped = _read_existing_output(output_file)

    # Failed files seen before any column layout is known. They are written as
    # soon as the first successful file defines the header, so an early failure
    # no longer locks the header to just `filename`/`label`.
    pending = []

    # `with` guarantees the file is closed even if an exception escapes the loop;
    # newline="" is what the csv module expects of files it writes to.
    with open(output_file, "a", newline="") as csvfile:
        writer = None
        if fieldnames is not None:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        for file in tqdm(os.listdir(input_dir), position=0, leave=True):
            if file in previously_calculated or not file.endswith('.wav'):
                continue

            file_path = os.path.join(input_dir, file)
            failed = False
            try:
                acoustic_features = process_file(file_path)
                features = process_file_model(file_path, vad_model, utils, transcription_model)
                features.update(acoustic_features)
            except Exception as e:
                # Best effort: report the problem and still record the file.
                print(e)
                print(file)
                features = {}
                failed = True
            features['filename'] = file
            features['label'] = label

            if writer is None:
                if failed:
                    pending.append(features)
                    continue
                writer = csv.DictWriter(csvfile, fieldnames=list(features.keys()))
                writer.writeheader()
                writer.writerows(pending)
                pending.clear()

            writer.writerow(features)
            # Push the row to disk so finished work survives an interrupted session.
            csvfile.flush()

        if writer is None and pending:
            # Every file failed: the only columns known are filename and label.
            writer = csv.DictWriter(csvfile, fieldnames=['filename', 'label'])
            writer.writeheader()
            writer.writerows(pending)

    # Skip the group header when it is already in place (resumed, grouped file)
    # or when nothing was written at all (the file is still empty).
    if writer is not None and not grouped:
        add_group_names(output_file)


In [9]:
# Input folders on the mounted Drive, one per class. The AD folder is processed
# below with label 1; the CN folder appears only in a commented-out call with label 0.
directory_path_ad = "/content/gdrive/MyDrive/Data/2021/Audio-denoise_new/test_data_2021/AD-test-denoise"
directory_path_cn = "/content/gdrive/MyDrive/Data/2021/Audio-denoise_new/test_data_2021/CN-denoise"

In [7]:
import nltk
# Fetch NLTK's 'punkt_tab' tokenizer data. It is not used directly in the cells
# shown here; presumably the local `process_file` module needs it (TODO confirm).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Extract features for every .wav file in the AD folder, label the rows 1, and
# write them to oo17.csv in the current working directory.
calculate_all_features(directory_path_ad, "oo17.csv", 1)
# Matching run for the CN folder (label 0), currently disabled:
# calculate_all_features(directory_path_cn, "oo1.csv", 0)

In [15]:
import pandas as pd

# Read the result back using its two-row header (group name, column name).
df = pd.read_csv("oo17.csv", header=[0, 1])
# Drop columns that are empty in every row.
df = df.dropna(axis=1, how='all')

# Report (rows, columns) of what is left.
print(f"Dimension: {df.shape}")


Dimension: (4, 6848)
