In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
import wave
from scipy.fft import rfft, irfft
import sys
import soundfile as sf
import pyloudnorm as pyln
from pydub import AudioSegment
from scipy.io import wavfile
from dataclasses import (
    dataclass,
)
from typing import (
    Optional,
)
from bdw.check import Check

sys.path.append('..')
from audio import Audio
from text.profanity import (
    PROFANITY_WORD_FILTER_LANG_NAME,
)
from volume.human_speech import (
    HUMAN_SPEECH_FREQ_BOTTOM,
    HUMAN_SPEECH_FREQ_TOP,
    HIGH_FREQUENCY_SPEECH_THRESHOLD,
)
from configs.base import (
    RB_FILE_READING_MODE,
)
from processing.text.normalization import (
    normalized_tokens_2_normalized_text,
    text_2_normalized_text,
)
from high_level_feature_extractor.text.profanity import (
    text_2_is_contain_swear_words,
)

In [2]:
# Show every column and every row when a DataFrame is displayed
# (None removes pandas' truncation limits).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# IPython autoreload: re-import modules before running each cell so edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload all

# volume

In [3]:
# Directory holding the crowd_train WAV files. This is a machine-specific
# absolute path: change this one constant to point the notebook at another
# copy of the data.
DUSHA_CROWD_TRAIN_WAVS_DIR:Path = Path('/data/vkiselev/data/other/univer/deploma/dusha/crowd/crowd_train/wavs')
# Recording used as the running example in the cells below.
EXAMPLE_AUDIO_PATH:Path = DUSHA_CROWD_TRAIN_WAVS_DIR / '00000d522439136554c888f4cfd92131.wav'

In [4]:
# Load the example WAV through the project's Audio.wav_file_path_init
# constructor and display the resulting object.
audio_example:Audio = Audio.wav_file_path_init(path=EXAMPLE_AUDIO_PATH)
audio_example

Audio(sample_width=2, sr=16000, n_frames=82560, data=array([  0,   0,   0, ..., -17, -18, -16], dtype=int16), n_channels=1)

In [5]:
def speech_filter(
    audio:Audio, 
    low_freq=HUMAN_SPEECH_FREQ_BOTTOM, 
    high_freq=HUMAN_SPEECH_FREQ_TOP,
    )->Audio:
    """Band-pass ``audio`` to ``[low_freq, high_freq]`` Hz by zeroing FFT bins.

    The samples are transformed with a real FFT, every bin whose frequency is
    below ``low_freq`` or above ``high_freq`` is set to zero (both bounds are
    kept), and the result is transformed back and cast to the audio's sample
    dtype.

    Args:
        audio: Source audio; ``audio.data`` holds the samples and ``audio.sr``
            the sampling rate in Hz.
        low_freq: Lowest frequency (Hz) to keep.
        high_freq: Highest frequency (Hz) to keep.

    Returns:
        A new ``Audio`` built with ``audio.new_data_copy`` that carries the
        filtered samples; ``audio`` itself is not modified here.
    """
    samples:np.ndarray = np.asarray(audio.data)
    # rfft works along the last axis, so that axis defines the signal length.
    n_samples:int = samples.shape[-1]

    spectrum:np.ndarray = rfft(samples)
    # rfftfreq yields exactly one non-negative frequency per rfft bin, so the
    # mask lines up with the spectrum without slicing or abs().
    bin_freqs:np.ndarray = np.fft.rfftfreq(n_samples, d=1.0/audio.sr)
    out_of_band:np.ndarray = (bin_freqs > high_freq) | (bin_freqs < low_freq)
    spectrum[..., out_of_band] = 0

    # Pass n explicitly: without it irfft assumes an even length and would
    # return one sample too few for odd-length input.
    filtered_signal:np.ndarray = irfft(spectrum, n=n_samples)

    sample_dtype:np.dtype = np.dtype(audio.sample_dtype())
    if np.issubdtype(sample_dtype, np.integer):
        # A brick-wall filter can ring past the integer range. A bare astype
        # would wrap such values around and truncate the rest toward zero, so
        # round to nearest and saturate first.
        limits = np.iinfo(sample_dtype)
        filtered_signal = np.clip(np.rint(filtered_signal), limits.min, limits.max)
    filtered_signal = filtered_signal.astype(sample_dtype)
    return audio.new_data_copy(data=filtered_signal)

# Filter once and display the stored result; calling speech_filter a second
# time just to show the output would repeat the whole FFT round trip.
audio_filtered:Audio = speech_filter(audio=audio_example)
audio_filtered


Audio(sample_width=2, sr=16000, n_frames=82560, data=array([  6,   6,   6, ..., -10, -11,  -9], dtype=int16), n_channels=1)

In [6]:
def audio_volume(
    audio_path:Path,
    )->Optional[np.float64]:
    """Measure the integrated loudness of an audio file (best effort).

    The file is read with ``soundfile`` and measured by a ``pyloudnorm`` meter
    created for the file's sampling rate.

    Args:
        audio_path: Path of the audio file to measure.

    Returns:
        The value returned by ``Meter.integrated_loudness`` (LUFS according to
        the pyloudnorm documentation), or ``None`` if reading or measuring the
        file raised. The error is printed instead of being propagated.
    """
    try:
        samples, sample_rate = sf.read(audio_path)
        meter:pyln.meter.Meter = pyln.Meter(sample_rate)
        return meter.integrated_loudness(samples)

    except Exception as exc:
        # Deliberate best effort: one unreadable or too-short file must not
        # abort the caller, so report the problem and signal it with None.
        print(f"Error: {exc}")
        return None

# Loudness of the example recording; audio_volume returns None if it failed.
loudness = audio_volume(EXAMPLE_AUDIO_PATH)

In [7]:
# Display the value measured for the example recording.
loudness

np.float64(-27.88527535926921)

In [8]:
# Inspect what soundfile.read returns for the example file: the sample data
# and the sampling rate (their Python types are shown below).
data, rate = sf.read(EXAMPLE_AUDIO_PATH)
type(data), type(rate)

(numpy.ndarray, int)

In [9]:
def wav_path_2_HF_power_ratio(
    file_path:Path,
    HF_threshold:int = HIGH_FREQUENCY_SPEECH_THRESHOLD,
    )->np.float64:
    """Return the share of a WAV file's spectral power above ``HF_threshold``.

    The signal is peak-normalised, Hann-windowed and transformed with a real
    FFT. The ratio is ``sum(power[freq > HF_threshold]) / sum(power)`` taken
    over the first ``n // 2`` non-negative-frequency bins, where ``n`` is the
    number of samples.

    Args:
        file_path: WAV file readable by ``scipy.io.wavfile.read``.
        HF_threshold: Frequency in Hz; bins strictly above it count as
            high-frequency.

    Returns:
        The high-frequency power ratio, or ``nan`` when it is undefined
        (empty file, all-zero signal, or no spectral energy).
    """
    sampling_rate, raw_signal = wavfile.read(file_path)

    # Work in float64 from the start: np.abs on int16 wraps for -32768, which
    # would corrupt the peak used for normalisation.
    signal:np.ndarray = np.asarray(raw_signal, dtype=np.float64)
    if signal.ndim > 1:
        # wavfile.read returns (n_samples, n_channels) for multi-channel
        # files; average the channels so the 1-D window below applies.
        signal = signal.mean(axis=1)

    undefined_ratio:np.float64 = np.float64(np.nan)
    if signal.size == 0:
        return undefined_ratio

    peak:np.float64 = np.max(np.abs(signal))
    if peak == 0:
        # Silent recording: there is no energy to split into bands.
        return undefined_ratio

    # Normalize to [-1, 1]
    signal = signal / peak

    # Apply Hann window
    n:int = len(signal)
    signal_windowed:np.ndarray = signal * np.hanning(n)

    # The input is real, so rfft gives the non-negative half of the spectrum
    # directly. Only the first n // 2 bins are used.
    positive_freqs:np.ndarray = np.fft.rfftfreq(n, d=1/sampling_rate)[:n//2]
    positive_magnitudes:np.ndarray = np.abs(np.fft.rfft(signal_windowed))[:n//2]

    # Convert magnitudes to power (energy)
    power_spectrum:np.ndarray = positive_magnitudes ** 2

    total_energy:np.float64 = np.sum(power_spectrum)
    if total_energy == 0:
        return undefined_ratio

    high_freq_mask:np.ndarray = positive_freqs > HF_threshold
    high_freq_energy:np.float64 = np.sum(power_spectrum[high_freq_mask])

    return high_freq_energy / total_energy

# High-frequency power ratio of the example recording, using the default threshold.
wav_path_2_HF_power_ratio(EXAMPLE_AUDIO_PATH)

np.float64(0.027629564595984295)

In [10]:
@dataclass
class HighLevelSpeechFeatures:
    """High-level speech features computed for a single recording."""
    # Value returned by audio_volume(); None when that measurement failed.
    loudness: Optional[np.float64]
    # Value returned by wav_path_2_HF_power_ratio().
    HF_power_ratio:np.float64

    @classmethod
    def wav_path_init(
        cls,
        path:Path,
        transcription:Optional[str] = None,
        )->'HighLevelSpeechFeatures':
        """Build the features for the WAV file at ``path``.

        Args:
            path: WAV file to analyse; passed to ``audio_volume`` and
                ``wav_path_2_HF_power_ratio``.
            transcription: Optional transcript of the recording. Accepted for
                interface compatibility but not used yet, because no
                text-based field exists on this class.
                TODO(review): confirm which text feature it should feed.

        Returns:
            A ``HighLevelSpeechFeatures`` instance with both audio features
            filled in.
        """
        # `cls` must be the first parameter of a classmethod; without it the
        # class itself is bound to `path` and keyword calls raise TypeError.
        return cls(
            loudness=audio_volume(path),
            HF_power_ratio=wav_path_2_HF_power_ratio(path),
        )


In [24]:
# Sample sentence used to exercise the project's swear-word check.
text:str = 'Нормально пизды сегодня!'

# Run the check on the sample; the cell output shows the result.
text_2_is_contain_swear_words(text)


True