In [1]:
import sounddevice as sd
import torch
import transformers
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, AutoTokenizer, TFAutoModelForSeq2SeqLM
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq, AdamWeightDecay
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm
2024-11-11 11:29:18.744137: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-11 11:29:18.791200: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731304758.860642   33241 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731304758.881271   33241 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-11 11:29:18.952053: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [15]:
duration = 15
sample_rate = 16000
print(f"Recording for {duration} seconds...")
audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float32')
sd.wait()
print("Recording complete.")

Recording for 15 seconds...
Recording complete.


In [16]:
audio = audio.flatten()

In [17]:
speech_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
speech_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
input_values = speech_tokenizer(audio, return_tensors="pt", sampling_rate=sample_rate).input_values
logits = speech_model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = speech_tokenizer.decode(predicted_ids[0])
print("Transcription:", transcription)

Transcription: I AM BACK MAN WELCOME CON YOU HELLO


In [19]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
dataset = load_dataset("cfilt/iitb-english-hindi")

In [20]:
trans_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
trans_model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

2024-11-11 11:34:32.175948: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [21]:
def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["hi"] for ex in examples["translation"]]
    model_inputs = trans_tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    with trans_tokenizer.as_target_tokenizer():
        labels = trans_tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(trans_tokenizer, model=trans_model, return_tensors="tf")

Map: 100%|██████████| 1659083/1659083 [08:32<00:00, 3239.94 examples/s]
Map: 100%|██████████| 520/520 [00:00<00:00, 2510.54 examples/s]
Map: 100%|██████████| 2507/2507 [00:00<00:00, 2918.80 examples/s]


In [23]:
train_dataset = trans_model.prepare_tf_dataset(
    tokenized_datasets["train"],
    collate_fn=data_collator,
    batch_size=16,
    shuffle=True,
)

In [24]:
validation_dataset = trans_model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    batch_size=16,
    shuffle=False,
)

In [25]:
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
trans_model.compile(optimizer=optimizer)

In [26]:
trans_model.fit(train_dataset, validation_data=validation_dataset, epochs=1)

   186/103692 [..............................] - ETA: 182:22:48 - loss: 3.2613

KeyboardInterrupt: 

In [27]:
trans_model.save_pretrained("trained_en_hi_translation_model")
trans_tokenizer.save_pretrained("trained_en_hi_translation_model")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]]}


('trained_en_hi_translation_model/tokenizer_config.json',
 'trained_en_hi_translation_model/special_tokens_map.json',
 'trained_en_hi_translation_model/vocab.json',
 'trained_en_hi_translation_model/source.spm',
 'trained_en_hi_translation_model/target.spm',
 'trained_en_hi_translation_model/added_tokens.json')

In [28]:
tokenized_input = trans_tokenizer([transcription], return_tensors="tf")

In [29]:
translated_ids = trans_model.generate(**tokenized_input, max_length=128)
translated_text = trans_tokenizer.decode(translated_ids[0], skip_special_tokens=True)

In [30]:
print("Translation in Hindi:", translated_text)

Translation in Hindi: मैं वापस आदमी आप नरकी में शामिल हो गया है
