In [None]:
# remove first if exists
!rm -rf arabic-poetry-speech-classification.git
!git clone https://github.com/MagedSaeed/arabic-poetry-speech-classification.git

Cloning into 'arabic-poetry-speech-classification'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 25 (delta 8), reused 21 (delta 8), pack-reused 0[K
Unpacking objects: 100% (25/25), done.


In [None]:
# Mount Google Drive at /content/drive. The dataset archive, the test-set CSV
# and the model checkpoint used further down are all read from paths under
# this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# put the task path you want to work on here
%cd arabic-poetry-speech-classification/signal-classification

/content/arabic-poetry-speech-classification/signal-classification


In [None]:
!pip install torchaudio
!pip install transformers
!pip install datasets
!pip install lang_trans
!pip install arabic_reshaper
!pip install python-bidi
!pip install pydub
!pip install soundfile
!pip install jiwer
!pip install PyArabic



In [None]:
import os
import re
import sys
import json
import torch
import jiwer
import logging
import librosa
import datasets
import itertools
import torchaudio
import numpy as np
import transformers
import pandas as pd
from torch import nn
import seaborn as sns
import torch.nn as nn
import soundfile as sf
import arabic_reshaper
from pyarabic import araby
from packaging import version
from pydub import AudioSegment
from dataclasses import asdict
from trainer import CTCTrainer
import matplotlib.pyplot as plt
from pydub.utils import mediainfo
from argparse import ArgumentParser
from collections import defaultdict
from torch.nn import functional as F
from contextlib import contextmanager
from bidi.algorithm import get_display
from lang_trans.arabic import buckwalter
from dataclasses import dataclass, field
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score
from models import Wav2Vec2ClassificationModel
from processors import CustomWav2Vec2Processor
from typing import Any, Dict, List, Optional, Union
from transformers import HfArgumentParser,TrainingArguments
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers import is_apex_available,set_seed ,Trainer,Wav2Vec2FeatureExtractor
from arg_parsers import DataTrainingArguments, ModelArguments, DataCollatorCTCWithPadding
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2Model,Wav2Vec2PreTrainedModel
% matplotlib inline

In [None]:
# Copy the poems archive from the mounted Drive into the working directory.
!cp -r /content/drive/MyDrive/KFUPM-Master/ICS606/Dataset/All_poems.zip .

In [None]:
# Extract the archive only when the `dataset` folder is missing or empty, so
# an already-populated folder is left untouched on re-runs.
if not os.path.exists('dataset') or not os.listdir('dataset'):
  os.system('unzip All_poems.zip -d dataset')

In [None]:
# Create the output folder for the WAV copies of the audio files
# (`-p`: no error if the folder already exists).
!mkdir -p dataset_wav

In [None]:
# Copy the test-set metadata CSV from the mounted Drive into the working directory.
!cp /content/drive/MyDrive/KFUPM-Master/ICS606/Dataset/testset.csv .

In [None]:
# Locations, relative to the working directory, used by the cells below.
dataset_folder = 'dataset'            # audio files extracted from the archive
dataset_wav_folder = 'dataset_wav'    # WAV copies written by the conversion loop
metadata_test_path = 'testset.csv'    # test-set metadata

In [None]:
# Load the test-set metadata; later cells read its "Utterance name" and
# "Bahr" columns.
test_metadata = pd.read_csv(metadata_test_path)

In [None]:
# Convert every test utterance to WAV and collect the sample rates that occur,
# so it is visible which rates have to be resampled afterwards.
sample_rates = set()
for file_path in test_metadata['Utterance name']:
  complete_path = f'{dataset_folder}/{file_path}'
  complete_wav_path = f'{dataset_wav_folder}/{file_path}'
  audio = AudioSegment.from_file(complete_path)
  sample_rates.add(audio.frame_rate)
  # The output keeps the source file name unchanged because the loading step
  # looks files up by "Utterance name"; only the audio format becomes WAV.
  audio.export(complete_wav_path, format='wav')
sample_rates

{44100, 48000}

## dataset processing

In [None]:
# Reload the test metadata and wrap it in a Hugging Face `Dataset`.
test_metadata = pd.read_csv(metadata_test_path)
test_dataset = Dataset.from_pandas(test_metadata)

# Preprocessing the datasets.
# We need to read the audio files as arrays and turn each Bahr into a class index.
# One resampler per source sample rate handled here (48 kHz and 44.1 kHz),
# each converting to 16 kHz.
resamplers = {
    source_rate: torchaudio.transforms.Resample(source_rate, 16000)
    for source_rate in (48000, 44100)
}

# Class index = position of the Bahr name in the sorted list of distinct names.
# NOTE(review): the mapping is derived from the test set alone; presumably it
# has to match the indices the checkpoint was trained with -- confirm.
sorted_bahrs = sorted(set(test_metadata["Bahr"]))
labels = dict(zip(sorted_bahrs, range(len(sorted_bahrs))))
print("labels are:", labels)
print("len:", len(labels))

def speech_file_to_array_fn(batch):
    """Load one utterance, bring it to 16 kHz and attach its class index.

    Args:
        batch: a single dataset row (mapping) providing at least
            "Utterance name" (file name inside the WAV folder) and "Bahr".

    Returns:
        The same mapping with three keys added: "speech" (numpy array taken
        from the first row of the loaded tensor, at 16 kHz), "sampling_rate"
        (always 16000) and "label" (index from the module-level `labels`).

    Raises:
        KeyError: if the row's "Bahr" is not a key of `labels`.
    """
    target_rate = 16_000
    speech_array, sampling_rate = torchaudio.load(
        f'{dataset_wav_folder}/{batch["Utterance name"]}'
    )
    # Keep only the first row of the loaded tensor (the first channel with
    # torchaudio's default channels-first layout).
    speech_array = speech_array[0]
    if sampling_rate != target_rate:
        # Reuse a cached resampler; build and cache one for any rate that was
        # not anticipated instead of failing with a KeyError.
        resampler = resamplers.get(sampling_rate)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
            resamplers[sampling_rate] = resampler
        speech_array = resampler(speech_array)
    batch["speech"] = speech_array.squeeze().numpy()
    batch["sampling_rate"] = target_rate
    batch["label"] = labels[batch["Bahr"]]
    return batch

# Swap the metadata columns for the decoded audio, its sampling rate and the
# class index, then display the resulting dataset summary.
metadata_columns = test_dataset.column_names
test_dataset = test_dataset.map(
    speech_file_to_array_fn,
    num_proc=1,
    remove_columns=metadata_columns,
)
test_dataset

labels are: {'البسيط': 0, 'الخفيف': 1, 'الرجز': 2, 'الرمل': 3, 'السريع': 4, 'الطويل': 5, 'الكامل': 6, 'المتدارك': 7, 'المتقارب': 8, 'المجتث': 9, 'المديد': 10, 'المضارع': 11, 'المقتضب': 12, 'المنسرح': 13, 'الهزج': 14, 'الوافر': 15}
len: 16


  0%|          | 0/367 [00:00<?, ?ex/s]

Dataset({
    features: ['speech', 'sampling_rate', 'label'],
    num_rows: 367
})

## prepare the model and predict

In [None]:
model_dir = "/content/drive/MyDrive/KFUPM-Master/ICS606/Models/GeneratedModels/checkpoint-7000"
# Use the GPU when one is available; fall back to the CPU instead of crashing
# on `.to('cuda')` when the runtime has no accelerator.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Wav2Vec2ClassificationModel.from_pretrained(model_dir).to(device)
# Inference only: switch to evaluation mode explicitly (assumes the model is
# a torch `nn.Module`, as its `.to(...)` call suggests).
model.eval()
processor = CustomWav2Vec2Processor.from_pretrained(model_dir)

In [None]:
def predict(batch):
  """Predict a class index for every utterance in a batch.

  Args:
      batch: a batched mapping whose "speech" entry holds the audio arrays.

  Returns:
      The same mapping with "pred_label" added: a list holding the argmax of
      the model's logits for each utterance.
  """
  sampling_rate = 16000
  max_samples = 320000  # 20 seconds at 16 kHz
  # Follow whatever device the model lives on rather than assuming a GPU
  # (assumes the model is a torch `nn.Module` with at least one parameter).
  device = next(model.parameters()).device
  features = processor(
      batch["speech"],
      max_length=max_samples,
      sampling_rate=sampling_rate,
      pad_to_multiple_of=max_samples,
      padding=True,
      return_tensors="pt",
  )
  input_values = features.input_values.to(device)
  attention_mask = features.attention_mask.to(device)
  with torch.no_grad():
    logits = model(input_values, attention_mask=attention_mask)['logits']
  batch['pred_label'] = torch.argmax(logits, dim=-1).tolist()
  return batch

## Test Results

In [None]:
# Run inference once. The map drops the 'speech' column, so re-running this
# cell on the already-predicted dataset would fail inside `predict`; the guard
# turns a re-run into a no-op instead.
if 'pred_label' not in test_dataset.column_names:
  test_dataset = test_dataset.map(predict, batched=True, batch_size=16, remove_columns=['speech'])

  0%|          | 0/23 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  return (input_length - kernel_size) // stride + 1


In [None]:
# Collect the ground-truth and predicted class indices row by row, then print
# the per-class metrics and the confusion matrix.
# NOTE: `labels` is rebound here from the Bahr-name -> index dict to a list of
# integer class indices.
labels, pred_labels = [], []
for row in test_dataset:
  labels.append(row['label'])
  pred_labels.append(row['pred_label'])
print(classification_report(labels, pred_labels))
print(confusion_matrix(labels, pred_labels))

              precision    recall  f1-score   support

           0       0.83      0.71      0.77        14
           1       1.00      0.79      0.88        19
           2       0.57      1.00      0.73         4
           3       0.95      0.97      0.96        38
           4       0.90      1.00      0.95        18
           5       0.94      0.96      0.95       102
           6       0.86      0.90      0.88        20
           7       1.00      0.95      0.98        22
           8       1.00      0.90      0.95        10
           9       1.00      0.62      0.77         8
          10       1.00      1.00      1.00        14
          11       0.90      1.00      0.95        18
          12       1.00      1.00      1.00         2
          13       1.00      0.96      0.98        23
          14       0.94      1.00      0.97        15
          15       0.97      0.97      0.97        40

    accuracy                           0.94       367
   macro avg       0.93   