In [1]:
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
import librosa
import numpy as np
import io

from scipy.signal import butter, lfilter
from IPython.display import Audio as IPythonAudio



In [2]:
# Parameters for recording
duration = 5  # seconds
sample_rate = 22050  # Hz, the default for librosa or your model



In [22]:
audio_path = r'C:\Users\Administrator\Desktop\Backend-Algorithm-LLM\Algorithm\audio_text_llm\SenseVoiceSmall\audio_sample\North vs. South Chinese Accent.mp3'
speech, sample_rate = librosa.load(audio_path, sr = None)

# If the sample rate is not 16,000 Hz, resample the audio
target_sample_rate = 16000
if sample_rate != target_sample_rate:
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=target_sample_rate)
    sample_rate = target_sample_rate  # Update the sample rate

speech = speech[:int(900000)]

In [5]:
# Function to create a band-pass filter
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

# Function to apply the band-pass filter
def bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

# Set your low and high cut-off frequencies (in Hz)
lowcut = 1000.0
highcut = 3400.0

# Apply the band-pass filter
filtered_speech = bandpass_filter(speech, lowcut, highcut, target_sample_rate)


In [6]:
# Print details
print("Sample Rate:", sample_rate)
print("Audio Array:", filtered_speech)
print("Audio Array datatype is Array: ", isinstance(speech,np.ndarray) )

Sample Rate: 16000
Audio Array: [ 0.          0.          0.         ... -0.01914331 -0.0403462
 -0.03032975]
Audio Array datatype is Array:  True


In [7]:
chunk_size = [0, 10, 5]  # [0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4  # number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 1  # number of encoder chunks to lookback for decoder cross-attention

model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")

New version available: 1.1.12. Your current version is 1.1.2.
Please use the command "pip install -U funasr" to upgrade.


  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


2024-10-21 15:03:06,665 - modelscope - INFO - Use user-specified model revision: v2.0.4
  src_state = torch.load(path, map_location=map_location)


In [74]:
chunk_stride = chunk_size[1] * 960  # 600ms
transcribed_text = []

cache = {}
total_chunk_num = int(len((filtered_speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
    speech_chunk = filtered_speech[i * chunk_stride:(i + 1) * chunk_stride]
    is_final = i == total_chunk_num - 1
    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size,
                         encoder_chunk_look_back=encoder_chunk_look_back,
                         decoder_chunk_look_back=decoder_chunk_look_back)
    for entry in res:
        transcribed_text.append(entry['text'])
final_transcript = ''.join(transcribed_text)
print(final_transcript)

rtf_avg: 0.144: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 11.49it/s]                                                                                          
rtf_avg: 0.119: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 13.79it/s]                                                                                          
rtf_avg: 0.123: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 13.34it/s]                                                                                          
rtf_avg: 0.126: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 13.14it/s]                                                                                          
rtf_avg: 0.165: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  9.98it/s]                                                                                          
rtf_avg: 0.118: 100%|[34m██████████[0m| 1/1 [00:00<00:00, 13.90it/s]                                                                                          
rtf_avg: 0.122: 100%|[34m████████

张晓静二十九岁她来上海五年了这天早晨他向往常一样站在卫生间的洗手池前一边快速的刷着牙一边浏览着手机上客户刚刚发来的消息他的脑子里飞快地计划着今天的工作你恐怕不会相信他是那种接到工作从不抱怨而且计划和学习能力极强从不怕吃苦的人毫不夸张的说任何老板遇到他都会对他的工作能力非常满意他就职职于海的一家设计公司司





# Manual Splitting method

In [None]:
audio_path = r'C:\Users\Administrator\Desktop\Backend-LLM\audio_text_llm\SenseVoiceSmall\audio_sample\听故事学中文 Learn Chinese with 12 Stories - The Easiest Way to Improve Chinese.mp3'
audio_array, sampling_rate = librosa.load(audio_path, sr = None)
# List to hold transcriptions
transcriptions = []
model = AutoModel(
    model='paraformer-zh',
    # init_param = pretrained_model_path
    )
for start in range(0, total_samples, chunk_samples):
    end = min(start + chunk_samples, total_samples)
    chunk = audio_array[start:end]

    # Convert chunk to 16kHz if necessary
    # Note: Ensure your model accepts the sample rate of 16kHz or resample if needed
    chunk_resampled = librosa.resample(chunk, orig_sr = sampling_rate, target_sr = 16000)

    # Process the chunk
    res = model.generate(
        input=chunk_resampled,
        cache={},
        language="auto",
        use_itn=True,
        batch_size_s=60,
        merge_vad=True,
        merge_length_s=15,
    )

    # Get the transcribed text
    text = rich_transcription_postprocess(res[0]["text"])
    transcriptions.append(text)
    # Combine all transcriptions into one
full_transcription = " ".join(transcriptions)
# Save the transcription to a text file
with open("transcription_'paraformer-zh.txt", "w", encoding="utf-8") as file:
    file.write(full_transcription)


In [None]:
# Resampling the Audio so it matches to the requirement of the model
audio_16KHz = librosa.resample(audio_array,
                               orig_sr=sampling_rate,
                
                               target_sr=16000)

# paraformer-zh model with vad_model


In [3]:
audio_path = r'C:\Users\Administrator\Desktop\Backend-Algorithm-LLM\Audio_transcription_files\tmpw19f5zrc.wav'
speech, sample_rate = librosa.load(audio_path, sr = None)

# If the sample rate is not 16,000 Hz, resample the audio
target_sample_rate = 16000
if sample_rate != target_sample_rate:
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=target_sample_rate)
    sample_rate = target_sample_rate  # Update the sample rate

  speech, sample_rate = librosa.load(audio_path, sr = None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [14]:
model = AutoModel(
    model='paraformer-zh',
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},
    spk_model="cam++", spk_model_revision="v2.0.2",
    punc_model = "ct-punc"
)



text = rich_transcription_postprocess(res[0]["text"])
print(text)

com = []
for i in res:
   for j in i['sentence_info']:
      print("Text: ", j['text'])
      print( "Speaker: ", j['spk'])
      


New version available: 1.1.12. Your current version is 1.1.2.
Please use the command "pip install -U funasr" to upgrade.


  src_state = torch.load(path, map_location=map_location)
2024-10-24 17:20:16,188 - modelscope - INFO - Use user-specified model revision: v2.0.2


Detect model requirements, begin to install it: C:\Users\Administrator\.cache\modelscope\hub\iic\speech_campplus_sv_zh-cn_16k-common\requirements.txt
install model requirements successfully


rtf_avg: 0.009: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  2.03it/s]                                                                                          
  with autocast(False):
rtf_avg: 0.111: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  2.38it/s]
rtf_avg: 0.015: 100%|[34m██████████[0m| 5/5 [00:00<00:00, 43.05it/s]
rtf_avg: 0.074: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.93it/s]
rtf_avg: 0.010: 100%|[34m██████████[0m| 9/9 [00:00<00:00, 66.47it/s]
rtf_avg: 0.054: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.48it/s]
rtf_avg: 0.009: 100%|[34m██████████[0m| 16/16 [00:00<00:00, 72.34it/s]
rtf_avg: 0.043: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.02s/it]
rtf_avg: 0.008: 100%|[34m██████████[0m| 31/31 [00:00<00:00, 87.57it/s]
rtf_avg: 0.041: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.24s/it]
rtf_avg: 0.008: 100%|[34m██████████[0m| 40/40 [00:00<00:00, 86.55it/s]
rtf_avg: 0.042: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.25s/it]
rtf_avg: 0.007: 100%|[34m██

嗯，小明，谢谢你，今天能来自上次见面，以一来你感觉怎么样？唉，我最近真的很焦虑，胸口像压着一块石头，呼吸都觉得很困难哦，我明白了，能告诉我是什么让你感到如此焦虑吗？主要就是工作上的事情，我有一个重要的项目即将到期了，我现在担心自己没办法拿到预期，听起来真的很有压力。你在想这个项目时，脑海中会出现什么样的想法呢？我总是想象自己会失败，脑海里就像循环播放着各种糟糕的结果，这听起来让人疲惫。你有没有找到一些方法来应对这些困扰你的想法？我试过深呼吸，但是有时候我觉得这只会让我更加沮丧，而不是帮助。这很正常。有些方法对每个人效果不一样，你愿意一起探讨一些其他的技巧吗？也许我们可以找到对你更有效的方法。好呀，我愿意试试。呃，那么除了工作上的压力，你在生活中还有其他让你感到困扰的事情。嗯，其实还有一些我和朋友的关系也有点紧张，感觉大家都忙着自己的生活，很很少有时间见面。嗯，那听起来有些孤独，你是否有尝试跟他们沟通，表达你的感受呢？我有试过，但是每次都不知道怎么开口，感觉他们怕他们觉得我是在抱怨哦，我理解沟通有时确实很困难，或许我们可以一起练习一下，找找合适的表达方式。好的，这样我会觉得很有信心。嗯，太好了，我们也可以探讨如何在工作和人际关系中找到平衡，帮助你减轻压力。我很期待能学到这些。谢谢您。李医生啊，不用谢小明，你的勇气值得赞上，我们会一起努力的。
Text:  嗯，
Speaker:  0
Text:  小明，
Speaker:  0
Text:  谢谢你，
Speaker:  0
Text:  今天能来自上次见面，
Speaker:  0
Text:  以一来你感觉怎么样？
Speaker:  0
Text:  唉，
Speaker:  1
Text:  我最近真的很焦虑，
Speaker:  1
Text:  胸口像压着一块石头，
Speaker:  1
Text:  呼吸都觉得很困难哦，
Speaker:  0
Text:  我明白了，
Speaker:  0
Text:  能告诉我是什么让你感到如此焦虑吗？
Speaker:  1
Text:  主要就是工作上的事情，
Speaker:  1
Text:  我有一个重要的项目即将到期了，
Speaker:  1
Text:  我现在担心自己没办法拿到预期，
Speaker:  1
Text:  听起来真的很有压力。





In [12]:
res

[{'key': 'rand_key_2yW4Acq9GFz6Y',
  'text': '嗯，小明，谢谢你，今天能来自上次见面，以一来你感觉怎么样？唉，我最近真的很焦虑，胸口像压着一块石头，呼吸都觉得很困难哦，我明白了，能告诉我是什么让你感到如此焦虑吗？主要就是工作上的事情，我有一个重要的项目即将到期了，我现在担心自己没办法拿到预期，听起来真的很有压力。你在想这个项目时，脑海中会出现什么样的想法呢？我总是想象自己会失败，脑海里就像循环播放着各种糟糕的结果，这听起来让人疲惫。你有没有找到一些方法来应对这些困扰你的想法？我试过深呼吸，但是有时候我觉得这只会让我更加沮丧，而不是帮助。这很正常。有些方法对每个人效果不一样，你愿意一起探讨一些其他的技巧吗？也许我们可以找到对你更有效的方法。好呀，我愿意试试。呃，那么除了工作上的压力，你在生活中还有其他让你感到困扰的事情。嗯，其实还有一些我和朋友的关系也有点紧张，感觉大家都忙着自己的生活，很很少有时间见面。嗯，那听起来有些孤独，你是否有尝试跟他们沟通，表达你的感受呢？我有试过，但是每次都不知道怎么开口，感觉他们怕他们觉得我是在抱怨哦，我理解沟通有时确实很困难，或许我们可以一起练习一下，找找合适的表达方式。好的，这样我会觉得很有信心。嗯，太好了，我们也可以探讨如何在工作和人际关系中找到平衡，帮助你减轻压力。我很期待能学到这些。谢谢您。李医生啊，不用谢小明，你的勇气值得赞上，我们会一起努力的。',
  'timestamp': [[2040, 2280],
   [2460, 2640],
   [2640, 2880],
   [3220, 3460],
   [3460, 3640],
   [3640, 3760],
   [3760, 3880],
   [3880, 4020],
   [4020, 4180],
   [4180, 4420],
   [4720, 4960],
   [5040, 5200],
   [5200, 5420],
   [5420, 5560],
   [5560, 5800],
   [5860, 6100],
   [6200, 6340],
   [6340, 6560],
   [6560, 6700],
   [6700, 6820],
   [6820, 7000],
   

In [15]:
res = model.generate(
    input=speech,
    cache={},
    language="zn",  # "zn", "en", "yue", "ja", "ko", "nospeech"
    use_itn=True,
    batch_size_s=60,
    merge_vad=True,  #
    merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)

com = []
for i in res:
   for j in i['sentence_info']:
    #   print("Text: ", j['text'])
    #   print( "Speaker: ", j['spk'])
      com.append(f'Speaker {j['spk']}: {j['text']}')

com
      

rtf_avg: 0.010: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.91it/s]                                                                                          
  with autocast(False):
rtf_avg: 0.299: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.92it/s]
rtf_avg: 0.036: 100%|[34m██████████[0m| 2/2 [00:00<00:00, 17.95it/s]
rtf_avg: 0.117: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  2.25it/s]
rtf_avg: 0.014: 100%|[34m██████████[0m| 5/5 [00:00<00:00, 44.44it/s]
rtf_avg: 0.078: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.74it/s]
rtf_avg: 0.011: 100%|[34m██████████[0m| 9/9 [00:00<00:00, 59.89it/s]
rtf_avg: 0.059: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.67it/s]
rtf_avg: 0.009: 100%|[34m██████████[0m| 13/13 [00:00<00:00, 75.84it/s]
rtf_avg: 0.060: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.65it/s]
rtf_avg: 0.009: 100%|[34m██████████[0m| 13/13 [00:00<00:00, 74.71it/s]
rtf_avg: 0.058: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.70it/s]
rtf_avg: 0.009: 100%|[34m████

嗯，小明，谢谢你，今天能来自上次见面，以一来你感觉怎么样？哎，我最近真的很焦虑，胸口像压着一块石头，呼吸都觉得很困难哦，我明白了，能告诉我是什么让你感到如此焦虑吗？主要就是工作上的事情，我有一个重要的项目即将到期了，我现在担心自己没办法拿到预期，听起来真的很有压力。你在想这个项目时，脑海中会出现什么样的想法呢？我总是想象自己会失败，脑海里就像循环播放着各种糟糕的结果，这听起来让人疲惫。你有没有找到一些方法来应对这些困扰你的想法？我试过深呼吸，但是有时候我觉得这只会让我更加觉上，而不是帮助。这很正常。有些方法对每个人效果不一样，你愿意一起探讨一些其他的技巧吗？也许我们可以找到对你更有效的方法。好呀，我愿意试试。呃，那么除了工作上的压力，你在生活中还有其他让你感到困扰的事情吗？嗯，其实还有一些我和朋友的关系也有点紧张，感觉大家都忙着自己的生活，很很少有时间见面。嗯，那听起来有些孤独，你是否有尝试跟他们沟通，表达你的感受呢？我有试过，但是每次都不知道怎么开口感觉他们怕他们觉得我是在抱怨哦，我理解沟通有时确实很困难。或许我们可以一起练习一下，找找合适的表达方式。好的，这样我会觉得很有信心。嗯，太好了，我们也可以探讨如何在工作和人际关系中找到平衡，帮助你减轻压力。我很期待能学到这些。谢谢您。李医生啊，不用谢小明，你的勇气值得赞上，我们会一起努力的。





['Speaker 0: 嗯，',
 'Speaker 0: 小明，',
 'Speaker 0: 谢谢你，',
 'Speaker 0: 今天能来自上次见面，',
 'Speaker 0: 以一来你感觉怎么样？',
 'Speaker 1: 哎，',
 'Speaker 1: 我最近真的很焦虑，',
 'Speaker 1: 胸口像压着一块石头，',
 'Speaker 0: 呼吸都觉得很困难哦，',
 'Speaker 0: 我明白了，',
 'Speaker 0: 能告诉我是什么让你感到如此焦虑吗？',
 'Speaker 1: 主要就是工作上的事情，',
 'Speaker 1: 我有一个重要的项目即将到期了，',
 'Speaker 1: 我现在担心自己没办法拿到预期，',
 'Speaker 0: 听起来真的很有压力。',
 'Speaker 0: 你在想这个项目时，',
 'Speaker 1: 脑海中会出现什么样的想法呢？',
 'Speaker 1: 我总是想象自己会失败，',
 'Speaker 1: 脑海里就像循环播放着各种糟糕的结果，',
 'Speaker 0: 这听起来让人疲惫。',
 'Speaker 0: 你有没有找到一些方法来应对这些困扰你的想法？',
 'Speaker 1: 我试过深呼吸，',
 'Speaker 1: 但是有时候我觉得这只会让我更加觉上，',
 'Speaker 0: 而不是帮助。',
 'Speaker 0: 这很正常。',
 'Speaker 0: 有些方法对每个人效果不一样，',
 'Speaker 0: 你愿意一起探讨一些其他的技巧吗？',
 'Speaker 1: 也许我们可以找到对你更有效的方法。',
 'Speaker 1: 好呀，',
 'Speaker 1: 我愿意试试。',
 'Speaker 0: 呃，',
 'Speaker 0: 那么除了工作上的压力，',
 'Speaker 1: 你在生活中还有其他让你感到困扰的事情吗？',
 'Speaker 1: 嗯，',
 'Speaker 1: 其实还有一些我和朋友的关系也有点紧张，',
 'Speaker 1: 感觉大家都忙着自己的生活，',
 'Speaker 1: 很很少有时间见面。',
 'Speaker 0: 嗯，',
 'Spea

In [18]:
type(com[0][0])

str

# SenseVoiceSmall Model with vad model

In [12]:
model = AutoModel(
    model='iic/SenseVoiceSmall',
     vad_model="fsmn-vad",
     vad_kwargs={"max_single_segment_time": 30000},
    device="cuda:0",
    #punc_model = "ct-punc"
)

# en
res = model.generate(
    input=audio_16KHz,
    cache={},
    language="zn",  # "zn", "en", "yue", "ja", "ko", "nospeech"
    use_itn=True,
    batch_size_s=60,
    merge_vad=True,  #
    merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)

New version available: 1.1.12. Your current version is 1.1.2.
Please use the command "pip install -U funasr" to upgrade.




KeyboardInterrupt: 

In [75]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the processor and model
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [1]:
import librosa
import numpy as np

# Set the sampling rate
sample_rate = 16000

def record_audio(duration=5):
    # Record audio for a specified duration
    audio = librosa.record(duration=duration, sr=sample_rate)
    return audio

# Example usage
audio_data = record_audio(duration=5)


AttributeError: No librosa attribute record

In [2]:
while True:
    audio_data = record_audio(duration=5)  # Record for 5 seconds

    # Convert audio data to the format required by the model
    audio_input = processor(audio_data, sampling_rate=sample_rate, return_tensors="pt")

    # Use the model to transcribe the audio
    with torch.no_grad():
        generated_ids = model.generate(**audio_input)

    # Decode the generated ids to get the transcription
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(transcription)


AttributeError: No librosa attribute record