# Loading Collected Dataset

A dataset must be a JSON file containing the following fields:

- ID: a unique identifier for a sample.
- PCM array values in signed 16-bit integer format.
- Duration in seconds.
- Size/Number of Frames in bytes.
- Gender can take the following values :
  - Male.
  - Female.
  - Prefer not to say.
- Nativity can take the following values :
  - Native.
  - nonNative.
- Owner : Specifying a valid email address (will be dropped).
- Arabic script transcription.
- Latin script transcription.
- IPA script transcription.

## Code

This code section loads the dataset into a **pandas DataFrame** and prints a summary of it: **column names, data types, and the number of non-null values for each column**. The tabular view makes the dataset easier to inspect.

In [None]:
import pandas as pd
import numpy as np
import json

# The file holds Arabic-script transcriptions, so read it with an explicit encoding
# instead of relying on the platform default.
with open('dataset.json', 'r', encoding='utf-8') as dataset_json:
  dataset_dict = json.load(dataset_json)

# Drop the identifier columns and the owner's e-mail address. errors="ignore" drops
# whichever of them are present; previously a single missing column aborted the whole
# drop (leaving the e-mail addresses in the frame) and the error was only printed.
columns_drop = ["_id", "owner", "id"]
dataset_df = pd.DataFrame.from_dict(dataset_dict).drop(columns=columns_drop, errors="ignore")

# info() prints the summary itself and returns None, so it is not wrapped in print().
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 0 to 7
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   arabic    8 non-null      object 
 1   latin     8 non-null      object 
 2   ipa       8 non-null      object 
 3   pcm       8 non-null      object 
 4   size      8 non-null      int64  
 5   duration  8 non-null      float64
 6   gender    8 non-null      object 
 7   isNative  8 non-null      object 
dtypes: float64(1), int64(1), object(6)
memory usage: 576.0+ bytes
None


In [None]:
!pip install datasets
!pip install scikit-learn



In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import torch
import numpy as np

# Map categorical columns to numerical values; anything outside the mapping becomes -1.
# assign() builds a new frame instead of overwriting dataset_df, so this cell can be
# re-run safely: mapping the already-encoded numbers a second time would turn every
# value into -1.
gender_codes = {'Male': 0, 'Female': 1, 'Prefer not to say': 3}
nativity_codes = {'Native': 1, 'nonNative': 0}
dataset_encoded_df = dataset_df.assign(
    gender=dataset_df['gender'].map(gender_codes).fillna(-1),
    isNative=dataset_df['isNative'].map(nativity_codes).fillna(-1),
)

# Find the longest PCM sequence. The column is addressed by name rather than by its
# position, so the check keeps working if the column order changes.
pcm_column = dataset_encoded_df['pcm']
for i, pcm_values in enumerate(pcm_column):
    if not isinstance(pcm_values, list):
        raise ValueError(f"Unexpected data type at row {i}: {type(pcm_values)}")
max_length = max((len(pcm_values) for pcm_values in pcm_column), default=0)
print("Maximum length:", max_length)

# Split into training and testing datasets (fixed seed for a reproducible split)
train_df, test_df = train_test_split(dataset_encoded_df, test_size=0.2, random_state=42)

# Create DatasetDict for Hugging Face's `datasets` library
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df.reset_index(drop=True)),
})

print(dataset)


Maximum length: 99096
DatasetDict({
    train: Dataset({
        features: ['arabic', 'latin', 'ipa', 'pcm', 'size', 'duration', 'gender', 'isNative'],
        num_rows: 6
    })
    test: Dataset({
        features: ['arabic', 'latin', 'ipa', 'pcm', 'size', 'duration', 'gender', 'isNative'],
        num_rows: 2
    })
})


In [None]:
!pip install transformers



In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor

model_name = "masoudmzb/wav2vec2-xlsr-multilingual-53-fa"

# Forcing a download on every run defeats the local cache and makes the notebook
# fail without network access. Flip this to True once if the cached tokenizer files
# need to be refreshed.
force_tokenizer_download = False

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name, force_download=force_tokenizer_download)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# The processor bundles audio feature extraction and text tokenization.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Keep the feature encoder's weights fixed during fine-tuning.
model.freeze_feature_encoder()

tokenizer_config.json:   0%|          | 0.00/307 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/307 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



In [None]:
# Show the token -> id mapping used by the tokenizer that was loaded above.
tokenizer_vocab = processor.tokenizer.get_vocab()
print(tokenizer_vocab)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'آ': 5, 'ئ': 6, 'ا': 7, 'ب': 8, 'ت': 9, 'ث': 10, 'ج': 11, 'ح': 12, 'خ': 13, 'د': 14, 'ذ': 15, 'ر': 16, 'ز': 17, 'س': 18, 'ش': 19, 'ص': 20, 'ض': 21, 'ط': 22, 'ظ': 23, 'ع': 24, 'غ': 25, 'ف': 26, 'ق': 27, 'ل': 28, 'م': 29, 'ن': 30, 'ه': 31, 'و': 32, 'پ': 33, 'چ': 34, 'ژ': 35, 'ک': 36, 'گ': 37, 'ی': 38, '\u200c': 39}


In [None]:
import torch
from torch.nn.functional import pad

def preprocessing(batch):
    """Convert one dataset row into model inputs and CTC labels.

    The function is applied with an unbatched ``dataset.map`` call, so ``batch``
    is a single example rather than a batch of rows.

    Args:
        batch: mapping with at least ``"pcm"`` (the PCM samples of one recording)
            and ``"arabic"`` (its Arabic-script transcription).

    Returns:
        The same mapping with two added entries:
        ``"input_values"`` - the processed audio of this example, and
        ``"labels"`` - ``{"arabic": token_ids}`` for the transcription.
    """
    features = processor(
        batch['pcm'],
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )
    # Index 0 selects this example's values from the processor output. The result is
    # stored as-is: re-wrapping it in torch.tensor() only copied the data and
    # triggered a UserWarning.
    batch["input_values"] = features.input_values[0]

    # No padding/truncation here: without a max_length both options are ignored by
    # the tokenizer and only produce warnings. Padding is done per batch at collate time.
    encoded = processor.tokenizer(batch["arabic"], return_tensors="pt")
    batch["labels"] = {'arabic': encoded.input_ids[0]}
    return batch
# TODO: planned extensions that are not implemented yet:
#   - tokenize the "latin" and "ipa" transcriptions as additional label sets
#     (padded to a common length with the tokenizer's pad token);
#   - expose gender / isNative / duration / size as a float "metadata" tensor.

# Apply preprocessing to dataset and remove original columns
dataset = dataset.map(preprocessing, remove_columns=dataset["train"].column_names)

# Check the dataset
print(dataset)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

  batch["input_values"] = torch.tensor(input_values[0])  # Store audio input values
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 6
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 2
    })
})


In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import pad
import torch

def custom_collate(batch):
    """Pad a list of preprocessed examples into batch tensors.

    Args:
        batch: list of examples, each with ``"input_values"`` (audio values) and
            ``"labels"`` (a mapping holding the ``"arabic"`` token ids).

    Returns:
        dict with
        ``"input_values"`` - tensor of shape (batch, longest audio), padded with 0.0;
        ``"labels"`` - tensor of shape (batch, longest label), padded with -100.
    """
    # Audio is padded with silence (0.0), not with a tokenizer id.
    audio = [torch.as_tensor(item['input_values']) for item in batch]
    input_values = pad_sequence(audio, batch_first=True, padding_value=0.0)

    # Wav2Vec2ForCTC ignores label positions set to -100. Padding with the pad token
    # id instead makes the padding count as real targets in the CTC loss.
    targets = [torch.as_tensor(item['labels']['arabic']) for item in batch]
    labels_arabic = pad_sequence(targets, batch_first=True, padding_value=-100)

    return {'input_values': input_values, 'labels': labels_arabic}

# Build the loaders directly. Catching and printing an error here would only postpone
# the failure to a NameError in the training loop, where the cause is harder to see.
train_loader = DataLoader(dataset['train'], batch_size=6, shuffle=True, collate_fn=custom_collate) # Use custom_collate
test_loader = DataLoader(dataset['test'], batch_size=6, collate_fn=custom_collate) # Use custom_collate

# beta2 was 0.09, which looks like a typo: it is set to the PyTorch default 0.999 here.
# NOTE(review): confirm the intended value if 0.09 was deliberate.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.05, betas=(0.9, 0.999))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
epochs = 15

for epoch in range(epochs):
    # ---- training ----
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()

        input_values = batch["input_values"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_values, labels=labels)

        loss = outputs.loss
        if torch.isnan(loss).any():
            # Stop this epoch's training pass; the validation pass below still runs.
            print("NaN loss detected!")
            break

        loss.backward()
        # Clip after backward() and before step(): gradients only exist at this
        # point, so clipping right after zero_grad() had no effect.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()

    # Report once per epoch, after all batches contributed to the average.
    print(f"Epoch {epoch + 1}, Training Loss: {train_loss / len(train_loader)}")

    # ---- validation ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['input_values'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_values=inputs, labels=labels)
            # Greedy decoding: most likely token per frame, then decode to text.
            predictions = outputs.logits.argmax(dim=-1)
            transcription = processor.batch_decode(predictions)
            val_loss += outputs.loss.item()

            print("transcritpion : ", transcription)

    val_loss /= len(test_loader)  # Calculate average validation loss

    print(f"Epoch {epoch + 1}/{epochs} | Validation Loss: {val_loss:.4f}")


Epoch 1, Training Loss: 1.2963021993637085
transcritpion :  ['', 'سلام کیدر نور زدید']
Epoch 1/15 | Validation Loss: 2.8168
Epoch 2, Training Loss: 1.0022311210632324
transcritpion :  ['', 'سلام کی در نور زدید']
Epoch 2/15 | Validation Loss: 2.7169
Epoch 3, Training Loss: 1.0501456260681152
transcritpion :  ['', 'سلام کی در نور زدید']
Epoch 3/15 | Validation Loss: 2.6387
Epoch 4, Training Loss: 0.8695451617240906
transcritpion :  ['', 'سلام کی در نور زدید']
Epoch 4/15 | Validation Loss: 2.5341
Epoch 5, Training Loss: 1.0774097442626953
transcritpion :  ['ف', 'سلام کی در نور زدید']
Epoch 5/15 | Validation Loss: 2.4457
Epoch 6, Training Loss: 0.8747596144676208
transcritpion :  ['ف', 'سلام کی درچ نور زدید']
Epoch 6/15 | Validation Loss: 2.3403
Epoch 7, Training Loss: 0.7984883189201355
transcritpion :  ['ف', 'سلام کی درچ نور زدید']
Epoch 7/15 | Validation Loss: 2.2516
Epoch 8, Training Loss: 0.7033762335777283
transcritpion :  ['ف', 'سلام کی درچ نور زدید']
Epoch 8/15 | Validation Loss: 2

In [None]:
# Persist the fine-tuned model and its processor, each into its own directory.
for artifact, target_dir in (
    (model, "phonetic_alignment_model"),
    (processor, "phonetic_alignment_processor"),
):
    artifact.save_pretrained(target_dir)

print("Model training completed and saved!")

Model training completed and saved!


In [None]:
# Print the reference transcription and the model's prediction for every test batch.
model.eval()
pad_id = processor.tokenizer.pad_token_id

with torch.no_grad():  # inference only: no gradients needed
  for batch in test_loader:
    # The collate function returns only "input_values" and "labels"; reading a
    # "metadata" key here raised KeyError, so that lookup was removed.
    input_values = batch["input_values"].to(device)

    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    # Negative ids (e.g. -100 used to mask padding) cannot be decoded; map them to
    # the pad token first. group_tokens=False keeps repeated characters in the
    # reference text, which CTC-style grouping would otherwise collapse.
    label_ids = batch["labels"].masked_fill(batch["labels"] < 0, pad_id)
    print(processor.tokenizer.batch_decode(label_ids, group_tokens=False))
    print(transcription)

['السلام عل<unk>م عفا<unk> نسول<unk> ف<unk>ن جات طر<unk>ق المارش<unk>', 'سلام <unk>ف دا<unk>ر<unk> شنو جد<unk>د<unk>']
['لف', 'سلام کی دقرچ نول زدید']
