In [1]:
import requests
from PIL import Image

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration


In [2]:
# Local checkpoint directory of the model to load.
model_id = "/home/ubuntu/Align-DS-V"

# Load the weights as float16 with `low_cpu_mem_usage` enabled, then move the
# model onto device 0.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
model = model.to(0)

In [3]:
processor = AutoProcessor.from_pretrained(model_id)

# Chat history with a single user turn: one text question and one image slot.
# `apply_chat_template` renders it into the prompt string; every "content"
# value has to be a list of dicts whose type is "text" or "image".
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the result of this problem?"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Demo picture stored inside the model repository.
image_file = "/home/ubuntu/Align-DS-V/assets/demo.jpg"
raw_image = Image.open(image_file)

# Build the model inputs from prompt + image, then move them to device 0 as float16.
inputs = processor(images=raw_image, text=prompt, return_tensors='pt')
inputs = inputs.to(0, torch.float16)

# Sampling is disabled; generation may add up to 4096 new tokens.
output = model.generate(**inputs, do_sample=False, max_new_tokens=4096)
print(processor.decode(output[0], skip_special_tokens=True))


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


<｜User｜>What is the result of this problem?<｜Assistant｜><think>To solve the problem, I will first interpret the image to understand what mathematical operation is being represented. Then, I will perform the calculation based on the numbers provided in the image and confirm the result.
The image shows a chalkboard with the equation \(18 + 23 = 41\) written on it. The numbers 18 and 23 are in light blue, and the result 41 is in light green.
The equation \(18 + 23 = 41\) is presented on the chalkboard. To solve this, I will add the two numbers on the left side of the equation: 18 and 23. Adding these together, \(18 + 23\), I calculate that the sum is 41. This matches the number on the right side of the equation, confirming its correctness.</think>41


In [None]:
# <think>To solve the problem, I will first interpret the image to understand what
# mathematical operation is being represented. Then, I will perform the calculation
# based on the numbers provided in the image and confirm the result. The image shows
# a chalkboard with the equation \(18 + 23 = 41\) written on it. The numbers 18 and
# 23 are in light blue, and the result 41 is in light green. The equation \(18 + 23 = 41\)
# is presented on the chalkboard. To solve this, I will add the two numbers on the
# left side of the equation: 18 and 23. Adding these together, \(18 + 23\), I calculate
# that the sum is 41. This matches the number on the right side of the equation,
# confirming its correctness.</think>41

In [1]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier
import torchaudio

# Identifier of the pretrained speaker model (fetched from the Hugging Face Hub).
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# `savedir` points at ./tmp/<model name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("./tmp", spk_model_name),
    run_opts={"device": device},
)

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
  from speechbrain.pretrained import EncoderClassifier
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'spe

In [14]:
def extract_speaker_embedding(audio_file):
    """Extract a speaker embedding from an audio input.

    Args:
        audio_file: Either something ``torchaudio.load`` accepts (e.g. a file
            path) or a ``(sample_rate, samples)`` tuple whose second item can
            be converted with ``torch.as_tensor``.

    Returns:
        The output of ``speaker_model.encode_batch`` normalised along dim 2
        with dim 1 squeezed, placed on ``speaker_model.device``. For the
        single-file call recorded in this notebook the size is ``[1, 512]``.
    """
    # Load the audio.
    if isinstance(audio_file, tuple):
        sample_rate, waveform = audio_file
        waveform = torch.as_tensor(waveform)
    else:
        waveform, sample_rate = torchaudio.load(audio_file)

    # torch.mean rejects integer tensors, so bring in-memory samples to float32
    # (a no-op for tensors that are already float32).
    # NOTE(review): integer PCM is cast, not rescaled to [-1, 1] - confirm what
    # the tuple-style caller actually supplies.
    waveform = waveform.to(torch.float32)

    # Make sure the audio is mono. Only a 2-D tensor has a channel axis; a 1-D
    # waveform is already mono and must not be averaged down to a scalar.
    # NOTE(review): 2-D input is treated as [channels, time], as torchaudio.load
    # returns it - confirm the layout of 2-D arrays passed in via the tuple form.
    if waveform.dim() == 2 and waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0)

    # Resample to 16 kHz if needed.
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        waveform = resampler(waveform)

    # Make sure the waveform is laid out as [batch, time].
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)

    # Move the waveform to the device the speaker model runs on.
    waveform = waveform.to(speaker_model.device)

    # Extract the speaker embedding with SpeechBrain.
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(waveform)
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        # Stay on the model's device instead of a hard-coded "cuda", which
        # would fail on hosts where the model was loaded on the CPU.
        speaker_embeddings = speaker_embeddings.squeeze(1).to(speaker_model.device)

    return speaker_embeddings

In [15]:
# Embed a local test recording and show the size of the resulting tensor.
audio_file = '/home/ubuntu/test.wav'
emb = extract_speaker_embedding(audio_file)
emb.shape


torch.Size([1, 512])

In [8]:
# Show the embedding tensor as this cell's output.
# NOTE(review): the recorded output below is a 3-D tensor (it starts with `[[[`),
# while the size printed by the previous cell is [1, 512]. This cell's execution
# count (8) is lower than that of the function definition (14), so the output
# looks stale - re-run to confirm.
emb

tensor([[[-0.0642,  0.0324,  0.0326,  0.0216,  0.0052, -0.0316, -0.0429,
           0.0404,  0.0553,  0.0140, -0.0615, -0.0702,  0.0583,  0.0149,
           0.0469,  0.0464,  0.0163,  0.0404,  0.0019,  0.0217,  0.0249,
           0.0308, -0.0144, -0.0471, -0.0636, -0.0090, -0.0636, -0.0030,
           0.0459,  0.0480, -0.0029,  0.0272,  0.0392, -0.0120,  0.0304,
          -0.0361,  0.0237,  0.0463,  0.0162, -0.0654,  0.0482,  0.0080,
           0.0346,  0.0387,  0.0313, -0.0813, -0.0200,  0.0135, -0.0932,
           0.0548,  0.0181,  0.0327,  0.0242,  0.0364, -0.0808, -0.0199,
           0.0115,  0.0201,  0.0340,  0.0225,  0.0056, -0.0066, -0.0113,
          -0.0148,  0.0274,  0.0656,  0.0374, -0.0467, -0.0651, -0.0464,
           0.0293,  0.0078,  0.0184,  0.0083,  0.0252,  0.0376,  0.0190,
           0.0253, -0.0715, -0.0929, -0.0880, -0.0374, -0.0708, -0.0834,
          -0.0318, -0.0661, -0.0441,  0.0272,  0.0064, -0.0421,  0.0286,
          -0.0868,  0.0129, -0.0762,  0.0460, -0.00

In [11]:
from datasets import load_dataset

# Load the voice-similarity embedding vectors (optional).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# Take entry 7306, add a leading batch dimension and move it to the GPU.
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"])
speaker_embeddings = speaker_embeddings.unsqueeze(0).to("cuda")

# Display the shape together with the values.
(speaker_embeddings.shape, speaker_embeddings)

(torch.Size([1, 512]),
 tensor([[-7.5731e-02, -2.7370e-02,  1.4933e-02,  4.5861e-02,  8.3840e-03,
          -2.7535e-02, -5.1030e-02, -6.1435e-02,  1.4576e-02,  1.9632e-02,
          -7.7323e-02, -7.8355e-02,  5.8233e-02,  3.7577e-02,  1.4377e-02,
           1.7147e-02, -1.3966e-02,  1.3549e-03,  9.4501e-03,  9.6230e-03,
           3.8752e-02,  2.5284e-03, -1.5207e-02, -4.5730e-02, -7.0040e-02,
          -8.4035e-03, -5.4758e-02,  4.7528e-03,  5.4306e-02,  1.8867e-02,
          -2.7039e-03,  2.1273e-02,  3.8547e-02, -4.7406e-02,  1.2328e-02,
          -6.9829e-02,  2.7079e-02,  5.5035e-02, -6.0107e-02, -6.4483e-02,
           6.6905e-03, -5.0482e-02,  4.0781e-02,  3.7543e-03,  3.2528e-02,
          -1.2350e-01, -1.8370e-02,  1.1340e-02, -5.8363e-02,  4.8635e-02,
           1.9366e-02,  3.4130e-02,  2.4126e-02,  1.6346e-02, -8.6061e-02,
           2.4379e-03,  1.1567e-02,  3.0723e-02,  3.2634e-02,  1.7792e-02,
           3.7105e-02, -1.0155e-02, -1.4131e-02,  4.0054e-02,  2.5886e-03,
  

In [1]:
def number_to_english_words(text):
    """Replace the numbers in ``text`` with their English words.

    Integers and decimals that stand on their own (delimited by word
    boundaries) are converted; digits embedded in a word such as ``v2`` are
    left alone. Integers are spelled without hyphens and with "and" after the
    hundreds (``120`` -> ``one hundred and twenty``). Decimals are read digit
    by digit after "point" (``3.05`` -> ``three point zero five``).

    Signs, thousands separators, dates and times are not interpreted: each
    digit run in such text is converted separately.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every matched number replaced by English words.
    """
    import re

    def int_to_english(n):
        """Spell out a non-negative integer; 0 becomes 'zero'."""
        ones = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
                'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen',
                'seventeen', 'eighteen', 'nineteen']
        tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']

        def helper(num):
            # Returns '' for 0 so that callers can skip an empty remainder.
            if num == 0:
                return ''
            elif num < 20:
                return ones[num]
            elif num < 100:
                return tens[num // 10] + (' ' + ones[num % 10] if num % 10 != 0 else '')
            elif num < 1000:
                return ones[num // 100] + ' hundred' + (' and ' + helper(num % 100) if num % 100 != 0 else '')
            elif num < 1000000:
                return helper(num // 1000) + ' thousand' + (' ' + helper(num % 1000) if num % 1000 != 0 else '')
            elif num < 1000000000:
                return helper(num // 1000000) + ' million' + (' ' + helper(num % 1000000) if num % 1000000 != 0 else '')
            else:
                return helper(num // 1000000000) + ' billion' + (' ' + helper(num % 1000000000) if num % 1000000000 != 0 else '')

        # The original never called helper(), so this function returned None.
        # Zero is handled here because helper() maps it to the empty string.
        return helper(n) if n != 0 else 'zero'

    def convert_number(match):
        number = match.group()
        integer_part, _, decimal_part = number.partition('.')
        words = int_to_english(int(integer_part))
        if decimal_part:
            # Decimal digits are read one at a time, zeros included.
            words += ' point ' + ' '.join(int_to_english(int(d)) for d in decimal_part)
        return words

    # A dot only belongs to the number when digits follow it, so a
    # sentence-ending "3." or a "3.a" never yields an empty decimal part.
    pattern = r'\b\d+(?:\.\d+)?\b'
    return re.sub(pattern, convert_number, text)

In [4]:
import re
import inflect

def replace_numbers_with_words(text):
    """Spell out every run of digits in ``text`` in English via ``inflect``.

    NOTE: a later cell in this notebook defines a function with the same name
    (based on ``num2words``); whichever cell ran last wins.
    """
    engine = inflect.engine()
    # Each digit run is parsed as an int and replaced by the engine's wording.
    return re.sub(r'\d+', lambda m: engine.number_to_words(int(m.group())), text)

# Example
text = "41"
converted_text = replace_numbers_with_words(text)
print(converted_text)  # Output: forty-one


forty-one


In [5]:
import re
from num2words import num2words

def replace_numbers_with_words(text):
    """Spell out every run of digits in ``text`` in English via ``num2words``.

    Text without digits is returned unchanged.
    """
    digit_run = re.compile(r'\d+')
    # Each digit run is parsed as an int and handed to num2words.
    return digit_run.sub(lambda m: num2words(int(m.group())), text)

# Example
text = "He bought 3 books for 25 dollars."
converted_text = replace_numbers_with_words(text)
print(converted_text)  # Output: He bought three books for twenty-five dollars.


He bought three books for twenty-five dollars.
