In [None]:
from transformers import WhisperForConditionalGeneration, WhisperTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline
from collections import defaultdict
from transformers import BertTokenizerFast
from transformers import AutoModelForTokenClassification
from transformers import WhisperFeatureExtractor
import torch
import os
import json

In [None]:
# Local checkpoint directories under /kaggle/input/gp-voice-models.
whisper_model_path = "/kaggle/input/gp-voice-models/whisperModel/whisperModel"
classification_path = '/kaggle/input/gp-voice-models/Classification_Model/Classification_Model/arabic-text-classification-model'
NER_model_path = '/kaggle/input/gp-voice-models/NER_Model/NER_Model/Model'
# The tokenizer path has its own name: previously the path string and the
# loaded tokenizer shared the name `NER_tokenizer`, so re-running this cell
# passed a tokenizer object (not a path) to `from_pretrained`.
NER_tokenizer_path = '/kaggle/input/gp-voice-models/NER_Model/NER_Model/ModelTokenizer'

# Speech-to-text: model and tokenizer are read from the local checkpoint.
whisperModel = WhisperForConditionalGeneration.from_pretrained(whisper_model_path)
whisperTokenizer = WhisperTokenizer.from_pretrained(whisper_model_path)
# NOTE(review): the feature extractor is loaded by hub id rather than from the
# local checkpoint — presumably this needs internet access or a warm cache,
# and should match the size of the local Whisper model; confirm.
whisper_feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")

# Text classification: tokenizer and model share one directory.
classificationTokenizer = AutoTokenizer.from_pretrained(classification_path)
classificationModel = AutoModelForSequenceClassification.from_pretrained(classification_path)

# Named-entity recognition: model and tokenizer live in separate directories.
NER_model = AutoModelForTokenClassification.from_pretrained(NER_model_path)
NER_tokenizer = BertTokenizerFast.from_pretrained(NER_tokenizer_path)

In [None]:
def speechToText(voice_path):
    """Transcribe one audio file with the Whisper speech-recognition pipeline.

    Args:
        voice_path: Path of the audio file to transcribe.

    Returns:
        The ``'text'`` entry of the pipeline result.
    """
    pipe = pipeline(
        "automatic-speech-recognition",
        model=whisperModel,
        tokenizer=whisperTokenizer,
        feature_extractor=whisper_feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
    )

    # Read the raw audio bytes inside a context manager so the file handle is
    # always closed; the bare open(...).read() used before leaked it.
    with open(f"{voice_path}", "rb") as audio_file:
        sample = audio_file.read()

    result = pipe(sample, generate_kwargs={"language": "arabic"})

    return result['text']

In [None]:
def classify(text):
    """Predict the class label of a single piece of text.

    Args:
        text: The text to classify. The function returns one label, so it is
            meant to be called with a single string.

    Returns:
        The label that ``classificationModel.config.id2label`` maps the
        highest-scoring class index to.
    """
    inputs = classificationTokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    # Put the input tensors on the model's device; a no-op while the model
    # stays on CPU, and avoids a device mismatch if it is moved to a GPU.
    inputs = inputs.to(classificationModel.device)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = classificationModel(**inputs)

    # outputs[0] holds the logits. Softmax is monotonic, so taking the argmax
    # of the raw logits selects the same class as argmax of the probabilities.
    pred_label_idx = outputs[0].argmax().item()

    return classificationModel.config.id2label[pred_label_idx]

In [None]:
def _merge_wordpieces(ner_results):
    """Fold token-level NER predictions into whole words.

    A piece whose text starts with ``'##'`` is appended to the word being
    built only when it directly follows the previous piece (judged by the
    ``'index'`` field when the result provides one). Any other piece starts a
    new word and contributes its ``'entity'`` label.
    """
    entities = defaultdict(list)
    current_word = ""
    current_labels = []
    previous_index = None

    for result in ner_results:
        piece = result['word']
        token_index = result.get('index')
        is_continuation = piece.startswith('##')

        # Without index information fall back to trusting the '##' marker.
        is_adjacent = (
            token_index is None
            or previous_index is None
            or token_index == previous_index + 1
        )

        if is_continuation and current_word and is_adjacent:
            # Same word as the previous piece: extend it, keep its labels.
            current_word += piece[2:]
        else:
            # A new word begins: store the finished one first.
            if current_word:
                entities[current_word].extend(current_labels)
            # An orphan '##' piece (its preceding piece is missing from the
            # results) becomes a word of its own instead of being glued onto
            # an unrelated earlier entity.
            current_word = piece[2:] if is_continuation else piece
            current_labels = [result['entity']]

        previous_index = token_index

    # Store the word that was still being built when the results ran out.
    if current_word:
        entities[current_word].extend(current_labels)

    return entities


def get_ner_entities(text):
    """Run the NER pipeline on ``text`` and group the predictions by word.

    Args:
        text: The text to tag.

    Returns:
        A ``defaultdict(list)`` mapping each reconstructed word to the list of
        entity labels collected for it; a word that occurs more than once
        accumulates the labels of every occurrence.
    """
    nlp = pipeline("ner", model=NER_model, tokenizer=NER_tokenizer)

    ner_results = nlp(text)

    return _merge_wordpieces(ner_results)

In [None]:
voice_path = "/kaggle/input/gp-voices"
# Sorted so recordings are processed (and reported) in a reproducible order;
# os.listdir makes no ordering guarantee.
file_names = sorted(os.listdir(voice_path))

# Open the report once for the whole run. It used to be opened with "w"
# inside the loop, which truncated it for every recording and left only the
# last recording's result in the file.
with open("/kaggle/working/output.txt", "w", encoding="utf-8") as f:
    for fileName in file_names:
        output = speechToText(os.path.join(voice_path, fileName))
        # Keep only the text before the first '.'.
        output = output.split('.')[0]
        print(output)
        className = classify(output)
        print(f'class Name : {className}')

        entities = get_ner_entities(output)
        # Each recording contributes one class line followed by its
        # "Word: ..." lines.
        f.write(className + "\n")
        for word, labels in entities.items():
            print(f"Word: {word}, Labels: {', '.join(labels)}")
            f.write(f"Word: {word}, Labels: {', '.join(labels)}" + "\n")