# Preparation

In [None]:
!pip install pyctcdecode
!pip install https://github.com/kpu/kenlm/archive/master.zip

In [None]:
!pip install vosk
!wget https://alphacephei.com/vosk/models/vosk-model-small-ru-0.22.zip
!unzip vosk-model-small-ru-0.22.zip

In [None]:
!wget https://alphacephei.com/vosk/models/vosk-model-ru-0.42.zip
!unzip vosk-model-ru-0.42.zip

In [None]:
!pip install pydub

In [None]:
!pip install audioread

In [None]:
import pandas as pd
import IPython.display as ipd
import librosa
import requests
import time
import tqdm
import torch
from scipy.signal import resample
from vosk import Model, KaldiRecognizer
import wave
import json
from pydub import AudioSegment
import audioread

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Dataset configuration shared by the cells below.
# NOTE(review): `format` shadows the built-in format(); the name is kept
# because a later cell passes it to audio_data().
format = '.flac'  # extension appended to every recording name
path_to_data = ''  # prefix prepended to every file name
labels_file = path_to_data + 'text.xlsx'  # spreadsheet loaded into `labels`

In [None]:
# Load the label spreadsheet; audio_data reads its `name` and `label` columns.
labels = pd.read_excel(labels_file)
labels

In [None]:
class audio_data:
    """Position-addressable view over a table of recordings.

    The table must provide a ``name`` column (file name without extension)
    and a ``label`` column. Rows are addressed by position (``iloc``), not
    by index label.
    """

    def __init__(self, data_frame, path_to_data, format):
        """Store the table, the path prefix and the file extension."""
        self.format = format
        self.path_to_data = path_to_data
        self.data_frame = data_frame
        # Not populated by any method of this class.
        self.asr_systems = dict()

    def get_file_path(self, index):
        """Return prefix + name + extension for the row at position *index*."""
        record = self.data_frame.iloc[index]
        stem = record['name']
        return self.path_to_data + stem + self.format

    def get_file_text_label(self, index):
        """Return the ``label`` value of the row at position *index*."""
        record = self.data_frame.iloc[index]
        return record['label']

    def get_audio_file(self, index):
        """Load the recording at position *index* with ``librosa.load``.

        ``sr=None`` is passed so librosa does not resample; whatever
        ``librosa.load`` returns is handed back unchanged.
        """
        audio_path = self.get_file_path(index)
        return librosa.load(audio_path, sr=None)

In [None]:
# Wrap the label table so paths and labels can be looked up by row position.
data = audio_data(labels, path_to_data, format)
data.get_file_path(0)  # show the path built for the first row

In [None]:
ipd.Audio(data.get_file_path(10))

# Audio Data Length

In [None]:
# Add up the duration audioread reports for every recording in the table.
length = 0

for i in range(len(data.data_frame)):
    clip_path = data.get_file_path(i)
    with audioread.audio_open(clip_path) as f:
        length = length + f.duration
length

# Speech Flow

In [None]:
file_id = 10

In [None]:
import requests
import time

# SpeechFlow credentials and request settings (the key values are blank here).
API_KEY_ID = ""
API_KEY_SECRET = ""
LANG = "ru"  # appended as the `lang` query parameter by create()

FILE_PATH = data.get_file_path(file_id)  # recording used for the single-file check

# Result type requested from the query endpoint:
# 1 (the service default) - JSON with sentences and words, each with begin and end time.
# 2 - JSON with generated subtitles, each with begin and end time.
# 3 - SRT with generated subtitles, each with begin and end time.
# 4 - plain-text transcription without timing information.
RESULT_TYPE = 4

# Authentication headers; create() and query() read this global at call time.
headers = {"keyId": API_KEY_ID, "keySecret": API_KEY_SECRET}


def create(path_to_file):
    """Upload an audio file to SpeechFlow and start a transcription task.

    Uses the module-level ``LANG`` and ``headers``.

    Args:
        path_to_file: Path of the local audio file to upload.

    Returns:
        The ``taskId`` reported by the service, or ``""`` when the HTTP
        request or the service-level call fails (the reason is printed).
    """
    create_url = "https://api.speechflow.io/asr/file/v1/create"
    create_url += "?lang=" + LANG

    # Open the file only for the duration of the upload so the handle is
    # always closed, even when the request raises.
    with open(path_to_file, "rb") as audio_file:
        response = requests.post(create_url, headers=headers, files={'file': audio_file})

    if response.status_code != 200:
        print('create request failed: ', response.status_code)
        return ""

    create_result = response.json()
    if create_result["code"] != 10000:
        print("create error:")
        print(create_result["msg"])
        return ""

    return create_result["taskId"]


def query(task_id):
    """Poll SpeechFlow until the transcription task has a final answer.

    Uses the module-level ``RESULT_TYPE`` and ``headers``.

    Args:
        task_id: Task id returned by ``create``.

    Returns:
        The ``result`` field of the response once the service answers with
        code 11000, or ``None`` when the service reports any other error or
        the HTTP request fails (the reason is printed).
    """
    query_url = "https://api.speechflow.io/asr/file/v1/query?taskId=" + task_id + "&resultType=" + str(RESULT_TYPE)
    while True:
        response = requests.get(query_url, headers=headers)
        if response.status_code != 200:
            # A non-200 answer used to fall through and retry immediately,
            # forever; report it and give up instead.
            print('query request failed: ', response.status_code)
            return None

        query_result = response.json()
        code = query_result["code"]
        if code == 11000:
            return query_result["result"]
        if code == 11001:
            # No final answer yet: wait before asking again.
            time.sleep(3)
            continue

        print("query error:")
        print(query_result["msg"])
        return None


def speech_flow_transcribe(path_to_file):
    """Transcribe one file with SpeechFlow: create a task, then poll for it.

    Returns whatever ``query`` returns, or ``None`` when ``create`` did not
    produce a task id.
    """
    task_id = create(path_to_file)
    if task_id == "":
        return None
    return query(task_id)

In [None]:
# Single-file check of the SpeechFlow pipeline before running the whole set.
res = speech_flow_transcribe(FILE_PATH)
print(res)

In [None]:
df = data.data_frame

In [None]:
# Transcribe every recording with SpeechFlow, in table order.
transcriptions = []

# get_file_path() addresses rows by position (iloc), so iterate over
# positions; iterrows() yields index labels, which only coincide with
# positions for a default RangeIndex.
for position in tqdm.tqdm(range(len(df))):
    transcription = speech_flow_transcribe(data.get_file_path(position))
    transcriptions.append(transcription)

In [None]:
transcriptions

# SaluteSpeech

In [None]:
# Request an OAuth access token for SaluteSpeech.
# Cell-specific names are used so this cell does not rebind the global
# `headers` that SpeechFlow's create()/query() read, nor the global `url`
# that sber_transcribe() reads.
oauth_url = "https://ngw.devices.sberbank.ru:9443/api/v2/oauth"

payload = {
    'scope': 'SALUTE_SPEECH_PERS'
}
oauth_headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'application/json',
    'RqUID': '',
    'Authorization': 'Basic '
}

# NOTE(review): verify=False disables TLS certificate verification for this
# request; confirm it is still required for this host.
response = requests.post(oauth_url, headers=oauth_headers, data=payload, verify=False)

print(response.text)

In [None]:
# Single-file check of the SaluteSpeech recognition endpoint.
# `token` and `url` are read as globals by sber_transcribe().
token = ''
url = "https://smartspeech.sber.ru/rest/v1/speech:recognize"

from pydub import AudioSegment

audio_file_path = data.get_file_path(10)

audio = AudioSegment.from_file(audio_file_path, format="flac")

# Mono input is upmixed to two channels before encoding.
if audio.channels == 1:
    audio = audio.set_channels(2)

# Anything that is not already 8 kHz or 16 kHz is resampled to 16 kHz.
if audio.frame_rate not in [8000, 16000]:
    audio = audio.set_frame_rate(16000)

audio.export("temp_audio.mp3", format="mp3")

# Named `audio_bytes`, not `audio_data`, so the audio_data class defined
# earlier in the notebook is not overwritten at module level.
with open("temp_audio.mp3", "rb") as audio_file:
    audio_bytes = audio_file.read()

# Cell-specific name so the global `headers` used by SpeechFlow's
# create()/query() is left alone.
sber_headers = {
    'Content-Type': 'audio/mpeg',
    'Authorization': f'Bearer {token}',
}

# NOTE(review): verify=False disables TLS certificate verification.
response = requests.post(url, headers=sber_headers, data=audio_bytes, verify=False)

In [None]:
ipd.Audio("temp_audio.mp3")

In [None]:
def sber_transcribe(file_path, url="https://smartspeech.sber.ru/rest/v1/speech:recognize"):
    """Transcribe a FLAC file with the SaluteSpeech recognition endpoint.

    The file is re-encoded to ``temp_audio.mp3`` (overwritten on every call)
    and uploaded with the module-level ``token`` as bearer token.

    Args:
        file_path: Path of the FLAC recording.
        url: Recognition endpoint. It is a parameter with a default so the
            function does not depend on the module-level ``url``, which
            other cells rebind to different services.

    Returns:
        The entries of the response's ``result`` field joined with spaces.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    audio = AudioSegment.from_file(file_path, format="flac")

    # Mono input is upmixed to two channels before encoding.
    if audio.channels == 1:
        audio = audio.set_channels(2)

    if audio.frame_rate not in [8000, 16000]:
        audio = audio.set_frame_rate(16000)

    audio.export("temp_audio.mp3", format="mp3")

    with open("temp_audio.mp3", "rb") as audio_file:
        audio_bytes = audio_file.read()

    headers = {
        'Content-Type': 'audio/mpeg',
        'Authorization': f'Bearer {token}',
    }

    params = {
        'language': 'ru-RU'
    }

    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.post(url, headers=headers, data=audio_bytes, params=params, verify=False)
    # Fail with the HTTP error instead of an opaque KeyError on 'result'.
    response.raise_for_status()

    return ' '.join(response.json()['result'])

In [None]:
# Transcribe every recording with SaluteSpeech, in table order.
transcriptions = []

# get_file_path() addresses rows by position (iloc), so iterate over
# positions; iterrows() yields index labels, which only coincide with
# positions for a default RangeIndex.
for position in tqdm.tqdm(range(len(df))):
    transcription = sber_transcribe(data.get_file_path(position))
    transcriptions.append(transcription)

In [None]:
transcriptions

# Wav2Vec2

In [None]:
from transformers import AutoProcessor, AutoModelForCTC

# Processor and CTC model for the Russian wav2vec2 checkpoint;
# wave2vec2_transcribe() reads both as globals.
# NOTE(review): the Vosk cells further down rebind `model`, so this cell has
# to be re-run before wave2vec2_transcribe() is used again.
processor = AutoProcessor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-russian")
model = AutoModelForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-russian")

In [None]:
def wave2vec2_transcribe(audio_file_path):
    """Transcribe one audio file with the global wav2vec2 ``processor``/``model``.

    The file is loaded at its native rate, resampled to 16 kHz with
    ``scipy.signal.resample`` and run through the model without gradients.
    The raw logits are handed to ``processor.batch_decode`` and the first
    entry of the result's ``.text`` is returned.

    NOTE(review): reading ``.text`` presumably relies on the LM-backed
    processor (pyctcdecode is installed at the top of the notebook) - confirm.
    """
    target_rate = 16_000

    waveform, native_rate = librosa.load(audio_file_path, sr=None)
    n_target = int(len(waveform) * target_rate / native_rate)
    waveform_16k = resample(waveform, n_target)

    encoded = processor(waveform_16k, sampling_rate=target_rate, return_tensors="pt")

    with torch.no_grad():
        output = model(encoded.input_values)

    decoded = processor.batch_decode(output.logits.cpu().numpy())
    return decoded.text[0]

# Single-file check of wave2vec2_transcribe() before running the whole set.
file_id = 2
audio_file_path = data.get_file_path(file_id)
transcription = wave2vec2_transcribe(audio_file_path)
transcription

In [None]:
# Transcribe every recording with wav2vec2, in table order.
transcriptions = []

# get_file_path() addresses rows by position (iloc), so iterate over
# positions; iterrows() yields index labels, which only coincide with
# positions for a default RangeIndex.
for position in tqdm.tqdm(range(len(df))):
    transcription = wave2vec2_transcribe(data.get_file_path(position))
    transcriptions.append(transcription)

In [None]:
transcriptions

# Whisper

In [None]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Processor/model pair for the "openai/whisper-tiny" checkpoint.
processor_tiny = AutoProcessor.from_pretrained("openai/whisper-tiny")
wh_model_tiny = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny")

In [None]:
# Processor/model pair for the "openai/whisper-small" checkpoint.
processor_small = AutoProcessor.from_pretrained("openai/whisper-small")
wh_model_small = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")

In [None]:
# Processor/model pair for the "openai/whisper-large" checkpoint.
processor_large = AutoProcessor.from_pretrained("openai/whisper-large")
wh_model_large = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")

In [None]:
def whisper_transcribe(audio_file_path, processor, model, prev_text="за за за. привет.", language="ru"):
    """Transcribe one audio file with a Whisper processor/model pair.

    Args:
        audio_file_path: Path of the recording; loaded at its native rate
            and resampled to 16 kHz with ``scipy.signal.resample``.
        processor: Object used to extract input features, build prompt ids
            and decode the generated ids.
        model: Object whose ``generate`` produces the token ids.
        prev_text: Text converted to prompt ids and passed to ``generate``.
        language: Language passed to ``generate``.

    Returns:
        The first decoded string, with special tokens skipped.
    """
    target_rate = 16000

    waveform, native_rate = librosa.load(audio_file_path, sr=None)
    n_target = int(len(waveform) * target_rate / native_rate)
    waveform_16k = resample(waveform, n_target)

    encoded = processor(waveform_16k, sampling_rate=target_rate, return_tensors="pt")
    prompt_ids = torch.tensor(processor.get_prompt_ids(prev_text))

    with torch.no_grad():
        predicted_ids = model.generate(
            encoded.input_features,
            language=language,
            prompt_ids=prompt_ids,
        )

    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return decoded[0]

# Single-file check of whisper_transcribe().
file_id = 1
audio_file_path = data.get_file_path(file_id)
# `processor` and `model` are required parameters; the call used to omit
# them, which raises TypeError. The tiny checkpoint is used for this check.
transcription = whisper_transcribe(audio_file_path, processor_tiny, wh_model_tiny, language="ru")
print(transcription)

In [None]:
# Transcribe every recording with the whisper-tiny checkpoint.
transcriptions = []

# Rows are addressed by position because get_file_path() and
# get_file_text_label() both use iloc.
for position in tqdm.tqdm(range(len(labels))):
    # The signature is (path, processor, model, ...); the arguments used to
    # be passed as (model, processor), so keywords are used here.
    # NOTE(review): the row's own label is passed as the decoding prompt -
    # confirm this is intended for the comparison.
    transcription = whisper_transcribe(
        data.get_file_path(position),
        processor=processor_tiny,
        model=wh_model_tiny,
        prev_text=data.get_file_text_label(position),
    )
    transcriptions.append(transcription)

In [None]:
# Transcribe every recording with the whisper-small checkpoint.
transcriptions = []

# Rows are addressed by position because get_file_path() and
# get_file_text_label() both use iloc.
for position in tqdm.tqdm(range(len(labels))):
    # The signature is (path, processor, model, ...); the arguments used to
    # be passed as (model, processor), so keywords are used here.
    # NOTE(review): the row's own label is passed as the decoding prompt -
    # confirm this is intended for the comparison.
    transcription = whisper_transcribe(
        data.get_file_path(position),
        processor=processor_small,
        model=wh_model_small,
        prev_text=data.get_file_text_label(position),
    )
    transcriptions.append(transcription)

In [None]:
# Transcribe every recording with the whisper-large checkpoint.
transcriptions = []

# Rows are addressed by position because get_file_path() and
# get_file_text_label() both use iloc.
for position in tqdm.tqdm(range(len(labels))):
    # The signature is (path, processor, model, ...); the arguments used to
    # be passed as (model, processor), so keywords are used here.
    # NOTE(review): the row's own label is passed as the decoding prompt -
    # confirm this is intended for the comparison.
    transcription = whisper_transcribe(
        data.get_file_path(position),
        processor=processor_large,
        model=wh_model_large,
        prev_text=data.get_file_text_label(position),
    )
    transcriptions.append(transcription)

In [None]:
transcriptions

# Vosk models

In [None]:
# Row used for the single-file Vosk checks below; show its table entry.
file_id = 10
df.iloc[file_id]

In [None]:
# Single-file check with the small Vosk model.
# NOTE: this rebinds the global `model`, which vosk_transcribe() reads.
model = Model("vosk-model-small-ru-0.22")

# Convert the recording to a mono 16 kHz WAV file.
segment = AudioSegment.from_file(data.get_file_path(file_id))
segment = segment.set_channels(1).set_frame_rate(16000)
segment.export("audio.wav", format="wav")

# The context manager closes the WAV file once all frames have been read.
with wave.open("audio.wav", "rb") as wf:
    if (wf.getnchannels() != 1
        or wf.getsampwidth() != 2
        or wf.getframerate() not in [8000, 16000]
        ):

        raise ValueError("Аудиофайл должен быть моно, 16-бит, 8кГц или 16кГц")

    rec = KaldiRecognizer(model, wf.getframerate())

    while True:
        d = wf.readframes(4000)
        if len(d) == 0:
            break
        # Print each result as soon as the recognizer accepts the waveform.
        if rec.AcceptWaveform(d):
            print(json.loads(rec.Result())["text"])

print(json.loads(rec.FinalResult())["text"])

In [None]:
def vosk_transcribe(file_path):
    """Transcribe one audio file with the Vosk model bound to the global ``model``.

    The file is converted to a mono 16 kHz WAV (``audio.wav``, overwritten on
    every call) and fed to a ``KaldiRecognizer`` in chunks of 4000 frames.

    Args:
        file_path: Path of the recording to transcribe.

    Returns:
        All recognized pieces of text joined with single spaces.

    Raises:
        ValueError: If the converted file is not mono, 16-bit, 8 or 16 kHz.
    """
    segment = AudioSegment.from_file(file_path)
    segment = segment.set_channels(1).set_frame_rate(16000)
    segment.export("audio.wav", format="wav")

    pieces = []
    # The context manager closes the WAV file even when validation fails.
    with wave.open("audio.wav", "rb") as wf:
        if (wf.getnchannels() != 1
            or wf.getsampwidth() != 2
            or wf.getframerate() not in [8000, 16000]
            ):

            raise ValueError("Аудиофайл должен быть моно, 16-бит, 8кГц или 16кГц")

        rec = KaldiRecognizer(model, wf.getframerate())

        while True:
            d = wf.readframes(4000)
            if len(d) == 0:
                break
            # Collect every intermediate result; returning here would drop
            # all audio after the first accepted chunk.
            if rec.AcceptWaveform(d):
                pieces.append(json.loads(rec.Result())["text"])

        pieces.append(json.loads(rec.FinalResult())["text"])

    return " ".join(piece for piece in pieces if piece)

In [None]:
# Transcribe every recording with the Vosk model currently bound to `model`.
transcriptions = []

# get_file_path() addresses rows by position (iloc), so iterate over
# positions; iterrows() yields index labels, which only coincide with
# positions for a default RangeIndex.
for position in tqdm.tqdm(range(len(df))):
    transcription = vosk_transcribe(data.get_file_path(position))
    transcriptions.append(transcription)

In [None]:
transcriptions

In [None]:
# Single-file check with the large Vosk model.
# NOTE: this rebinds the global `model`, which vosk_transcribe() reads.
model = Model("vosk-model-ru-0.42")

# Convert the recording to a mono 16 kHz WAV file.
segment = AudioSegment.from_file(data.get_file_path(file_id))
segment = segment.set_channels(1).set_frame_rate(16000)
segment.export("audio.wav", format="wav")

# The context manager closes the WAV file once all frames have been read.
with wave.open("audio.wav", "rb") as wf:
    if (wf.getnchannels() != 1
        or wf.getsampwidth() != 2
        or wf.getframerate() not in [8000, 16000]
        ):

        raise ValueError("Аудиофайл должен быть моно, 16-бит, 8кГц или 16кГц")

    rec = KaldiRecognizer(model, wf.getframerate())

    while True:
        d = wf.readframes(4000)
        if len(d) == 0:
            break
        # Print each result as soon as the recognizer accepts the waveform.
        if rec.AcceptWaveform(d):
            print(json.loads(rec.Result())["text"])

print(json.loads(rec.FinalResult())["text"])

In [None]:
def vosk_transcribe(file_path):
    """Transcribe one audio file with the Vosk model bound to the global ``model``.

    The file is converted to a mono 16 kHz WAV (``audio.wav``, overwritten on
    every call) and fed to a ``KaldiRecognizer`` in chunks of 4000 frames.

    Args:
        file_path: Path of the recording to transcribe.

    Returns:
        All recognized pieces of text joined with single spaces.

    Raises:
        ValueError: If the converted file is not mono, 16-bit, 8 or 16 kHz.
    """
    segment = AudioSegment.from_file(file_path)
    segment = segment.set_channels(1).set_frame_rate(16000)
    segment.export("audio.wav", format="wav")

    pieces = []
    # The context manager closes the WAV file even when validation fails.
    with wave.open("audio.wav", "rb") as wf:
        if (wf.getnchannels() != 1
            or wf.getsampwidth() != 2
            or wf.getframerate() not in [8000, 16000]
            ):

            raise ValueError("Аудиофайл должен быть моно, 16-бит, 8кГц или 16кГц")

        rec = KaldiRecognizer(model, wf.getframerate())

        while True:
            d = wf.readframes(4000)
            if len(d) == 0:
                break
            # Collect every intermediate result; returning here would drop
            # all audio after the first accepted chunk.
            if rec.AcceptWaveform(d):
                pieces.append(json.loads(rec.Result())["text"])

        pieces.append(json.loads(rec.FinalResult())["text"])

    return " ".join(piece for piece in pieces if piece)

In [None]:
# Transcribe every recording with the Vosk model currently bound to `model`.
transcriptions = []

# get_file_path() addresses rows by position (iloc), so iterate over
# positions; iterrows() yields index labels, which only coincide with
# positions for a default RangeIndex.
for position in tqdm.tqdm(range(len(df))):
    transcription = vosk_transcribe(data.get_file_path(position))
    transcriptions.append(transcription)

In [None]:
transcriptions

# Google API

In [None]:
!pip install SpeechRecognition

In [None]:
import speech_recognition as sr

In [None]:
def google_transcribe(audio_path):
    """Transcribe one audio file through ``Recognizer.recognize_google``.

    Args:
        audio_path: Path of the recording to transcribe.

    Returns:
        The recognized text. Failures are also returned as strings: a fixed
        message when the speech could not be understood, and a message
        containing the error when the request to the service failed, so the
        caller cannot tell them apart from a transcription by type.
    """
    recognizer = sr.Recognizer()
    try:
        # Read the whole audio file into memory.
        with sr.AudioFile(audio_path) as source:
            recording = recognizer.record(source)
        # Recognize the speech; change 'ru-RU' to target another language.
        return recognizer.recognize_google(recording, language='ru-RU')
    except sr.RequestError as e:
        return f"Ошибка запроса к сервису распознавания: {e}"
    except sr.UnknownValueError:
        return "Не удалось распознать речь."

In [None]:
google_transcribe(data.get_file_path(3))

In [None]:
# Transcribe every recording with google_transcribe(), in table order.
transcriptions = []

# get_file_path() addresses rows by position (iloc), so iterate over
# positions; iterrows() yields index labels, which only coincide with
# positions for a default RangeIndex.
for position in tqdm.tqdm(range(len(df))):
    transcription = google_transcribe(data.get_file_path(position))
    transcriptions.append(transcription)

In [None]:
transcriptions

# Whisper API

In [None]:
from openai import OpenAI

In [None]:
import os

# Read the key from the environment so it never has to be pasted into (and
# later scrubbed from) the notebook. An unset variable yields the same empty
# string that was hard-coded here before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

def whisperAPI_transcribe(file_path, prompt):
    """Transcribe one audio file with the ``whisper-1`` model via ``client``.

    Args:
        file_path: Path of the recording to upload.
        prompt: Text passed as the ``prompt`` of the transcription request.

    Returns:
        The ``text`` of the response, or ``None`` when anything raises
        (the error is printed).
    """
    request_options = {
        "model": "whisper-1",
        "prompt": prompt,
        "language": "ru",
    }
    try:
        with open(file_path, "rb") as audio_file:
            response = client.audio.transcriptions.create(file=audio_file, **request_options)
        return response.text
    except Exception as e:
        print(f"Error during transcription: {e}")
        return None

In [None]:
# Single-file check of whisperAPI_transcribe() with a fixed prompt.
file_id = 1
audio_file_path = data.get_file_path(file_id)
transcription = whisperAPI_transcribe(audio_file_path, "За за за. Привет. Катя")
print(transcription)

In [None]:
# Transcribe every recording with the Whisper API.
transcriptions = []

# Rows are addressed by position because get_file_path() and
# get_file_text_label() both use iloc; iterrows() yields index labels,
# which only coincide with positions for a default RangeIndex.
for position in tqdm.tqdm(range(len(labels))):
    # NOTE(review): the row's own label is passed as the prompt - confirm
    # this is intended for the comparison.
    transcription = whisperAPI_transcribe(
        data.get_file_path(position),
        prompt=data.get_file_text_label(position),
    )
    transcriptions.append(transcription)

In [None]:
transcriptions

# Yandex SpeechKit

In [None]:
import os

# Read the key from the environment so it never has to be pasted into (and
# later scrubbed from) the notebook. An unset variable yields the same empty
# string that was hard-coded here before.
api_key = os.environ.get("YANDEX_API_KEY", "")

In [None]:
def convert_flac_to_lpcm(input_path, output_path):
    """Write a FLAC file's samples to *output_path* as headerless PCM.

    The audio is converted to 16000 Hz, one channel and a sample width of
    two bytes before its raw sample data is written.
    """
    source = AudioSegment.from_file(input_path, format="flac")
    resampled = source.set_frame_rate(16000)
    mono = resampled.set_channels(1)
    pcm16 = mono.set_sample_width(2)

    with open(output_path, "wb") as out_file:
        out_file.write(pcm16.raw_data)

In [None]:
# Single-file check of the Yandex SpeechKit recognition endpoint.
# `url` and `audio_file_path` are kept as globals because later cells read them.
url = 'https://stt.api.cloud.yandex.net/speech/v1/stt:recognize?lang=ru-RU&topic=general&format=lpcm&sampleRateHertz=16000'
audio_file_path = data.get_file_path(10)

convert_flac_to_lpcm(audio_file_path, "tmp.lpcm")

# Named `audio_bytes`, not `audio_data`, so the audio_data class defined
# earlier in the notebook is not overwritten at module level.
with open("tmp.lpcm", "rb") as audio_file:
    audio_bytes = audio_file.read()

# Cell-specific name so the global `headers` used by SpeechFlow's
# create()/query() is left alone.
yandex_headers = {
    'Authorization': f'Api-Key {api_key}',
}

# NOTE(review): verify=False disables TLS certificate verification.
response = requests.post(url, headers=yandex_headers, data=audio_bytes, verify=False)
res = response
# `result` is used as-is, exactly as yandex_transcribe() returns it; joining
# a string with ' ' would put a space between every character.
json.loads(response.text)['result']

In [None]:
import numpy as np

# tmp.lpcm is headerless PCM as written by convert_flac_to_lpcm() (16000 Hz,
# one channel, two bytes per sample), so a player cannot read it by file
# name; hand the samples and the rate to the widget instead.
with open('tmp.lpcm', 'rb') as pcm_file:
    pcm_samples = np.frombuffer(pcm_file.read(), dtype='<i2')

ipd.Audio(pcm_samples, rate=16000)

In [None]:
def yandex_transcribe(
    file_path,
    url='https://stt.api.cloud.yandex.net/speech/v1/stt:recognize?lang=ru-RU&topic=general&format=lpcm&sampleRateHertz=16000',
):
    """Transcribe a FLAC file with the Yandex SpeechKit recognition endpoint.

    The file is converted to headerless PCM in ``tmp.lpcm`` (overwritten on
    every call) and uploaded with the module-level ``api_key``.

    Args:
        file_path: Path of the FLAC recording.
        url: Recognition endpoint including its query parameters. It is a
            parameter with a default so the function does not depend on the
            module-level ``url``, which other cells rebind to different
            services.

    Returns:
        The ``result`` field of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    convert_flac_to_lpcm(file_path, "tmp.lpcm")

    with open("tmp.lpcm", "rb") as audio_file:
        audio_bytes = audio_file.read()

    headers = {
        'Authorization': f'Api-Key {api_key}',
    }

    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.post(url, headers=headers, data=audio_bytes, verify=False)
    # Fail with the HTTP error instead of an opaque KeyError on 'result'.
    response.raise_for_status()
    return response.json()['result']

In [None]:
yandex_transcribe(audio_file_path)

In [None]:
# Transcribe every recording with Yandex SpeechKit, in table order.
transcriptions = []

# get_file_path() addresses rows by position (iloc), so iterate over
# positions; iterrows() yields index labels, which only coincide with
# positions for a default RangeIndex.
for position in tqdm.tqdm(range(len(labels))):
    transcription = yandex_transcribe(data.get_file_path(position))
    transcriptions.append(transcription)

In [None]:
transcriptions