# Определение моделей

In [1]:
import os

import torch
import whisperx
from whisperx.diarize import DiarizationPipeline
# Hugging Face access token, passed to DiarizationPipeline as use_auth_token.
# It is read from the HF_TOKEN environment variable so the secret never lives
# in the notebook; the token that used to be hardcoded here must be treated as
# leaked and revoked. The value is None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")



def transcribe_audio(audio_file, model_name="large-v3", compute_type="float16"):
    """Transcribe an audio file with a WhisperX model.

    Args:
        audio_file: Audio input, handed unchanged to ``model.transcribe``.
        model_name: Model identifier passed to ``whisperx.load_model``.
        compute_type: Requested compute type. When CUDA is unavailable and
            "float16" is requested, "int8" is used instead, because the CPU
            backend rejects float16 and the call would otherwise fail.

    Returns:
        The object returned by ``model.transcribe``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu" and compute_type == "float16":
        # float16 is a GPU-only compute type; fall back instead of crashing.
        compute_type = "int8"
    model = whisperx.load_model(model_name, device=device, compute_type=compute_type)
    return model.transcribe(audio_file)

def perform_diarization(audio_file):
    """Run WhisperX speaker diarization on an audio file.

    Builds a ``DiarizationPipeline`` authenticated with the module-level
    ``HF_TOKEN`` and applies it to ``audio_file``. The device is "cuda" when
    CUDA is available and "cpu" otherwise, the same rule ``transcribe_audio``
    uses, so the function no longer fails on machines without a GPU.

    Args:
        audio_file: Audio input, handed unchanged to the pipeline.

    Returns:
        The pipeline result, unmodified.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    diarization_pipeline = DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
    return diarization_pipeline(audio_file)

def count_unique_speakers(diarize_df):
    """Return the number of distinct values in the 'speaker' column of ``diarize_df``."""
    speaker_labels = diarize_df['speaker']
    return speaker_labels.nunique()

  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")


# Тестовая обработка аудио

In [2]:
import time

def _join_segment_texts(transcription_result):
    """Collapse a transcription result into one whitespace-normalized string."""
    if isinstance(transcription_result, list):
        pieces = [item['text'] for item in transcription_result]
    elif isinstance(transcription_result, dict) and 'segments' in transcription_result:
        pieces = [segment['text'] for segment in transcription_result['segments']]
    else:
        pieces = [transcription_result.get('text', '')]
    # Segment texts may carry their own leading/trailing whitespace; strip each
    # piece and drop empty ones so the joined text has single spaces only.
    return " ".join(piece.strip() for piece in pieces if piece and piece.strip())


def base(audio_file):
    """Transcribe and diarize one audio file, printing how long each step took.

    Args:
        audio_file: Audio input forwarded to ``transcribe_audio`` and
            ``perform_diarization``.

    Returns:
        tuple: ``(text, num_speakers)`` where ``text`` is the transcription
        joined into a single string and ``num_speakers`` is the value returned
        by ``count_unique_speakers`` for the diarization result.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    transcription_started = time.perf_counter()
    transcription_result = transcribe_audio(audio_file)
    processing_time = time.perf_counter() - transcription_started
    print(f'Transcription time: {processing_time}')

    diarization_started = time.perf_counter()
    diarize_df = perform_diarization(audio_file)
    processing_time1 = time.perf_counter() - diarization_started
    print(f'Diarization time: {processing_time1}')

    num_speakers = count_unique_speakers(diarize_df)
    text = _join_segment_texts(transcription_result)

    return text, num_speakers

# Smoke test on a single recording: run the full pipeline and show both results.
text, num_speakers = base('аудио/Встреча 8. .m4a')
print(f'Transcribed text: {text}', f'Number of speakers: {num_speakers}', sep='\n')


config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

vocabulary.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: ru (0.98) in first 30s of audio...
Transcription time: 281.39105129241943
Diarization time: 2.883897304534912
Transcribed text:  Вы знаете, что в Государственную Думу у меня вынесено предложение о назначении вас на должность председателя правительства Российской Федерации. Совсем недавно мы встречались с коллегами и оценивали работу правительства за предыдущие годы. Сделано в сложных условиях немало, и мне кажется, что было бы правильно, если бы  Мы продолжили с вами работу, и вы продолжили работу в качестве председателя правительства.  Мы с вами говорили и о структуре, говорили о персонале. В целом, думаю, мы на правиль

# Генерация submission с временем работы

In [3]:
import pandas as pd
import os

def create_dataset_from_audio_folder(folder_path, output_path='dataset.csv'):
    """Process every audio file in a folder and save the results as a CSV file.

    Files are selected by extension (.wav, .mp3, .m4a, .ogg, matched
    case-insensitively) and processed in sorted name order so the rows come
    out in the same order on every run. A file whose processing raises is
    still listed, with a placeholder text and 0 speakers, so one bad
    recording does not abort the whole batch.

    Args:
        folder_path: Directory whose entries are scanned (not recursively).
        output_path: Destination CSV path; defaults to 'dataset.csv'.

    Returns:
        None. The CSV is written with a UTF-8 BOM and no index column; the
        header row is written even when no audio file was found.
    """
    audio_extensions = (".wav", ".mp3", ".m4a", ".ogg")
    columns = ['Наименование аудиозаписи', 'Транскрибированный текст', 'Число спикеров']
    data = []

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(audio_extensions):
            continue  # not an audio file
        file_path = os.path.join(folder_path, file_name)

        # Best effort: report the failure and keep a placeholder row.
        try:
            transcribed_text, number_of_speakers = base(file_path)
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
            transcribed_text, number_of_speakers = "Ошибка при обработке", 0

        data.append({
            'Наименование аудиозаписи': file_name,
            'Транскрибированный текст': transcribed_text,
            'Число спикеров': number_of_speakers
        })

    # Explicit columns keep the header present even for an empty result.
    df = pd.DataFrame(data, columns=columns)

    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"Dataset has been created and saved to '{output_path}'")

# Example run: build the dataset from every recording in the test folder.
folder_path = 'audio_test/'
create_dataset_from_audio_folder(folder_path=folder_path)


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: ru (1.00) in first 30s of audio...
Transcription time: 32.58378529548645
Diarization time: 10.450380086898804


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: ru (1.00) in first 30s of audio...
Transcription time: 25.41658878326416
Diarization time: 7.72376012802124


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: ru (0.99) in first 30s of audio...
Transcription time: 33.45371389389038
Diarization time: 10.222927570343018


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: ru (1.00) in first 30s of audio...
Transcription time: 13.42027997970581
Diarization time: 4.556212425231934
Dataset has been created and saved to 'dataset.csv'


In [None]:
# Convert the comma-separated 'dataset.csv' written by
# create_dataset_from_audio_folder into the semicolon-separated submission
# file. Reading from 'dataset.csv' rather than overwriting 'submission.csv'
# in place keeps this cell re-runnable and lets it work on a fresh kernel.
dataset = pd.read_csv("dataset.csv", encoding="utf-8-sig")
dataset.to_csv("submission.csv", index=False, sep=";")