In [1]:
# Mount Google Drive at /content/drive (Colab-only); the fine-tuned checkpoint
# referenced by `model_path` is read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install jiwer==2.2.0
!pip install datasets
!pip install lang_trans
!pip install transformers

Collecting jiwer==2.2.0
  Downloading jiwer-2.2.0-py3-none-any.whl (13 kB)
Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[?25l[K     |██████▌                         | 10 kB 44.3 MB/s eta 0:00:01[K     |█████████████                   | 20 kB 35.3 MB/s eta 0:00:01[K     |███████████████████▌            | 30 kB 20.3 MB/s eta 0:00:01[K     |██████████████████████████      | 40 kB 7.1 MB/s eta 0:00:01[K     |████████████████████████████████| 50 kB 4.2 MB/s 
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149857 sha256=c0363a2b7655200a8f5256af6c3e4cac8e8c60a63d8036886946de6d429caef2
  Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d
Successfully built python-Levenshtein
Installing collected package

In [3]:
import os
import jiwer
import torch
import random
import torchaudio
import pandas as pd
from datasets import ClassLabel
from datasets import load_dataset
from lang_trans.arabic import buckwalter
from IPython.display import display, HTML
from transformers import set_seed, Wav2Vec2ForCTC, Wav2Vec2Processor

In [4]:
%%capture
# Download and unpack the Arabic Common Voice archive (cv-corpus-6.1-2020-12-11).
# %%capture suppresses the verbose wget/tar output of this cell.
# NOTE(review): the cells visible in this notebook obtain the test split through
# load_dataset("common_voice", "ar"), which performs its own download; nothing
# visible reads the files extracted here -- confirm this cell is still needed.
!wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ar.tar.gz
!tar -xzvf ar.tar.gz

In [5]:
# Fine-tuned checkpoint stored on the mounted Drive; loaded later with
# Wav2Vec2ForCTC.from_pretrained(model_path).
model_path = (
    "/content/drive/MyDrive/KFUPM-Master/ICS606/Models"
    "/SinaiFineTuned/checkpoint-1600"
)

In [6]:
# set_seed(42)
# Test split of the Arabic Common Voice dataset, obtained through `datasets`.
test_split = load_dataset("common_voice", "ar", split="test")

# One resampler per source rate, each converting to 16 kHz
# (all three sampling rates exist in test split).
resamplers = {
    source_rate: torchaudio.transforms.Resample(source_rate, 16000)
    for source_rate in (48000, 44100, 32000)
}

Downloading:   0%|          | 0.00/4.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice/ar (download: 1.64 GiB, generated: 1.41 GiB, post-processed: Unknown size, total: 3.05 GiB) to /root/.cache/huggingface/datasets/common_voice/ar/6.1.0/b879a355caa529b11f2249400b61cadd0d9433f334d5c60f8c7216ccedfecfe1...


Downloading:   0%|          | 0.00/1.76G [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/common_voice/ar/6.1.0/b879a355caa529b11f2249400b61cadd0d9433f334d5c60f8c7216ccedfecfe1. Subsequent calls will reuse this data.


In [7]:
def prepare_example(example):
    """Load one clip and store it as a 16 kHz waveform under ``example["speech"]``.

    Args:
        example: dataset row; ``example['audio']['path']`` is the audio file to read.

    Returns:
        The same ``example`` mapping with a ``"speech"`` NumPy array added.
    """
    speech, sampling_rate = torchaudio.load(example['audio']['path'])
    if sampling_rate != 16000:
        # Resample from the clip's actual rate. Rates that are not yet in the
        # module-level `resamplers` cache get their own resampler instead of
        # going through a fixed fallback key (the old one, 4800, did not exist
        # in the cache and raised KeyError).
        if sampling_rate not in resamplers:
            resamplers[sampling_rate] = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech = resamplers[sampling_rate](speech)
    # Clips already at 16 kHz are passed through without resampling.
    example["speech"] = speech.squeeze().numpy()
    return example

In [8]:
# Run prepare_example over every test example; it stores the loaded audio
# under the "speech" key of each row.
test_split = test_split.map(prepare_example)

  0%|          | 0/7622 [00:00<?, ?ex/s]

In [9]:
# Baseline run: processor and CTC model both come from the same hub checkpoint.
processor = Wav2Vec2Processor.from_pretrained("bakrianoo/sinai-voice-ar-stt")
model = Wav2Vec2ForCTC.from_pretrained("bakrianoo/sinai-voice-ar-stt")
model = model.to("cuda").eval()  # move to the GPU and switch to evaluation mode

Downloading:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/138 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

In [10]:
def predict(batch):
    """Transcribe a batch of waveforms with the currently loaded `model`.

    Reads ``batch["speech"]``, runs it through the module-level `processor`
    and `model` on the GPU, and writes the decoded strings to
    ``batch["predicted"]``. Returns the same ``batch`` mapping.
    """
    features = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
    input_values = features.input_values.to("cuda")
    # NOTE(review): only input_values is forwarded; whether this checkpoint also
    # expects the processor's attention_mask for padded batches should be confirmed.
    with torch.no_grad():
        logits = model(input_values).logits
        token_ids = torch.argmax(logits, dim=-1)
    token_ids[token_ids == -100] = processor.tokenizer.pad_token_id  # see fine-tuning script
    batch["predicted"] = processor.batch_decode(token_ids)
    return batch

In [11]:
# Transcribe the test split in batches of 16 with the model loaded above;
# predict writes its output under the "predicted" key.
test_split = test_split.map(predict, batched=True, batch_size=16)

  0%|          | 0/477 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


In [12]:
# Text normalisation applied identically to references and hypotheses
# (both are Buckwalter-transliterated before this runs).
transformation = jiwer.Compose([
    jiwer.SubstituteRegexes({
        # normalize some diacritics and remove punctuation
        r'[auiFNKo\~_،؟»\?;:\-,\.؛«!"]': "",
        "\u06D6": "",
        r"[\|\{]": "A",
        "p": "h",
        # replace Persian letters with Arabic ones
        "ک": "k",
        "ی": "y",
    }),
    # default transformation below
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveEmptyStrings(),
])

# Score the "predicted" column against the reference sentences.
metrics = jiwer.compute_measures(
    truth=list(map(buckwalter.trans, test_split["sentence"])),  # Buckwalter transliteration
    hypothesis=list(map(buckwalter.trans, test_split["predicted"])),
    truth_transform=transformation,
    hypothesis_transform=transformation,
)
print(f"WER: {metrics['wer']:.2%}")

WER: 23.70%


In [13]:
# Fine-tuned run: the processor still comes from the hub checkpoint, while the
# CTC weights are read from the checkpoint directory in `model_path`.
processor = Wav2Vec2Processor.from_pretrained("bakrianoo/sinai-voice-ar-stt")
model = Wav2Vec2ForCTC.from_pretrained(model_path)
model = model.to("cuda").eval()  # move to the GPU and switch to evaluation mode

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# Run predict over the test split again, now with the checkpoint from
# `model_path` loaded as `model`; predict writes to the "predicted" key.
test_split = test_split.map(predict, batched=True, batch_size=16)

  0%|          | 0/477 [00:00<?, ?ba/s]

In [15]:
# Same text normalisation as used for the baseline score, applied identically
# to references and hypotheses (both are Buckwalter-transliterated first).
transformation = jiwer.Compose([
    jiwer.SubstituteRegexes({
        # normalize some diacritics and remove punctuation
        r'[auiFNKo\~_،؟»\?;:\-,\.؛«!"]': "",
        "\u06D6": "",
        r"[\|\{]": "A",
        "p": "h",
        # replace Persian letters with Arabic ones
        "ک": "k",
        "ی": "y",
    }),
    # default transformation below
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveEmptyStrings(),
])

# Score the "predicted" column produced by the fine-tuned checkpoint.
metrics = jiwer.compute_measures(
    truth=list(map(buckwalter.trans, test_split["sentence"])),  # Buckwalter transliteration
    hypothesis=list(map(buckwalter.trans, test_split["predicted"])),
    truth_transform=transformation,
    hypothesis_transform=transformation,
)
print(f"WER: {metrics['wer']:.2%}")

WER: 59.22%
