In [1]:
import torch
# Report whether CUDA is usable. The device name is only queried when a GPU is
# present, because torch.cuda.get_device_name(0) raises on a CPU-only machine.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
print("GPU:", torch.cuda.get_device_name(0) if cuda_available else "none (CPU only)")


CUDA available: True
GPU: NVIDIA GeForce RTX 2060 SUPER


In [2]:
import whisper

# Load the "medium" Whisper checkpoint. With no explicit device,
# whisper.load_model picks CUDA when it is available and the CPU otherwise,
# so this cell no longer fails on a machine without a GPU.
model = whisper.load_model("medium")


In [90]:
# Audio file to transcribe; the SRT cells below also derive the output file name from it.
input_file = "Quran_recitations/001.mp3"

In [91]:
# Transcribe audio file as Arabic, requesting word-level timestamps
# (the cells below read each segment's "words" list).
result = model.transcribe(input_file, language="ar", word_timestamps=True)
# Show the full recognized text.
print(result["text"])

 الحمد لله رب العالمين الرحمن الرحيم مالك يوم الدين إياك نعبد وإياك نستعين إهدنا الصراط المستقيم صراط الذين أنعمت عليهم غير المغضوب عليهم ولا الضالين آمين


In [92]:
# Print every recognized word with its start and end time, segment by segment.
timed_words = (entry for seg in result["segments"] for entry in seg["words"])
for word in timed_words:
    print(f"Word: {word['word']}, Start: {word['start']}, End: {word['end']}")

Word:  الحمد, Start: 0.0, End: 0.78
Word:  لله, Start: 0.78, End: 1.56
Word:  رب, Start: 1.56, End: 1.92
Word:  العالمين, Start: 1.92, End: 4.28
Word:  الرحمن, Start: 4.28, End: 5.48
Word:  الرحيم, Start: 5.48, End: 6.44
Word:  مالك, Start: 6.44, End: 7.2
Word:  يوم, Start: 7.2, End: 7.68
Word:  الدين, Start: 7.68, End: 9.1
Word:  إياك, Start: 9.1, End: 10.02
Word:  نعبد, Start: 10.02, End: 10.62
Word:  وإياك, Start: 10.62, End: 11.72
Word:  نستعين, Start: 11.72, End: 14.04
Word:  إهدنا, Start: 14.04, End: 14.8
Word:  الصراط, Start: 14.8, End: 15.6
Word:  المستقيم, Start: 15.6, End: 17.96
Word:  صراط, Start: 17.96, End: 19.06
Word:  الذين, Start: 19.06, End: 19.82
Word:  أنعمت, Start: 19.82, End: 20.72
Word:  عليهم, Start: 20.72, End: 22.02
Word:  غير, Start: 22.02, End: 22.76
Word:  المغضوب, Start: 22.76, End: 23.86
Word:  عليهم, Start: 23.86, End: 24.92
Word:  ولا, Start: 24.92, End: 25.34
Word:  الضالين, Start: 25.34, End: 32.16
Word:  آمين, Start: 32.16, End: 34.2


In [93]:
# Dump the raw segment dicts (ids, tokens, scores, word timings) for inspection.
print(result['segments'])

[{'id': 0, 'seek': 0, 'start': np.float64(0.0), 'end': np.float64(4.28), 'text': ' الحمد لله رب العالمين', 'tokens': [50364, 21542, 2304, 3215, 24976, 3224, 12602, 3555, 18863, 45340, 9957, 50564], 'temperature': 0.0, 'avg_logprob': -0.13650521738775845, 'compression_ratio': 1.6826923076923077, 'no_speech_prob': 0.21959486603736877, 'words': [{'word': ' الحمد', 'start': np.float64(0.0), 'end': np.float64(0.78), 'probability': np.float64(0.8539891441663107)}, {'word': ' لله', 'start': np.float64(0.78), 'end': np.float64(1.56), 'probability': np.float64(0.9775266647338867)}, {'word': ' رب', 'start': np.float64(1.56), 'end': np.float64(1.92), 'probability': np.float64(0.970398873090744)}, {'word': ' العالمين', 'start': np.float64(1.92), 'end': np.float64(4.28), 'probability': np.float64(0.9965699712435404)}]}, {'id': 1, 'seek': 0, 'start': np.float64(4.28), 'end': np.float64(9.1), 'text': ' الرحمن الرحيم مالك يوم الدين', 'tokens': [50564, 34892, 5016, 27842, 34892, 5016, 32640, 3714, 6027

In [94]:
# Expose the transcription under the name the alignment cells below read.
whisper_output = result

In [83]:
import re
import json
from difflib import SequenceMatcher

def normalize_arabic(text):
    """Reduce Arabic text to a bare form suitable for word-by-word comparison.

    Steps, in order: trim surrounding whitespace, drop a leading number with
    optional punctuation (such as the "9." counters that show up in the
    Whisper transcript), delete tatweel and combining marks, and collapse
    whitespace runs to single spaces.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. It is empty when the input held nothing but
        whitespace, a leading number and/or marks.
    """
    # Strip leading/trailing whitespace first
    text = text.strip()
    # Remove leading numbers with optional punctuation and spaces
    text = re.sub(r'^[0-9]+[.:،]?\s*', '', text)
    # Remove tatweel (U+0640)
    text = re.sub(r'[\u0640]', '', text)
    # Remove diacritics. U+064B-U+065F alone misses the other Arabic combining
    # marks: U+0610-U+061A, the superscript alef U+0670 and the Quranic
    # annotation marks U+06D6-U+06DC, U+06DF-U+06E4, U+06E7-U+06E8,
    # U+06EA-U+06ED. Left in place, they keep otherwise equal words from
    # comparing equal.
    text = re.sub(
        r'[\u0610-\u061a\u064b-\u065f\u0670\u06d6-\u06dc\u06df-\u06e4\u06e7\u06e8\u06ea-\u06ed]',
        '',
        text,
    )
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def similar(a, b):
    """Return difflib's similarity ratio of ``a`` and ``b`` (a float in [0, 1])."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def find_best_match(verse_words, whisper_words, start_idx, threshold=0.7):
    """Find the best matching sequence in Whisper words using fuzzy matching.

    Slides a window of ``len(verse_words)`` words over ``whisper_words``,
    starting at ``start_idx``, and scores each position by the mean
    ``similar()`` ratio between the window's words and the verse's words.

    Args:
        verse_words: List of verse words (strings) to look for.
        whisper_words: List of dicts; only the ``'text'`` key is read here.
        start_idx: First window position to consider.
        threshold: Mean similarity a window must exceed to be accepted.
            Defaults to 0.7, the value that used to be hard-coded.

    Returns:
        The index in ``whisper_words`` of the earliest highest-scoring window,
        or ``None`` when no window scores above ``threshold`` or when
        ``verse_words`` is empty.
    """
    window = len(verse_words)
    if window == 0:
        # No words to compare: the mean below would divide by zero.
        return None

    best_match_idx = None
    best_score = 0
    for i in range(start_idx, len(whisper_words) - window + 1):
        # Compute average similarity across words
        candidate = whisper_words[i:i + window]
        match_score = sum(
            similar(spoken['text'], expected)
            for spoken, expected in zip(candidate, verse_words)
        ) / window
        # Strict '>' keeps the earliest window among equal scores.
        if match_score > best_score:
            best_score = match_score
            best_match_idx = i
    return best_match_idx if best_score > threshold else None

def to_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds once and then split with
    integer arithmetic. Formatting the float seconds field directly could
    round up to "60,000" (e.g. 59.9996 -> "00:00:60,000"), which is not a
    valid timestamp; here the carry goes into the minutes and hours.

    Args:
        seconds: Offset in seconds (int, float or NumPy float). Negative
            values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"00:01:00,000"``.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

# Read the verse texts and the translations from their JSON files.
with open('quran.json', encoding='utf-8') as quran_fh:
    quran = json.loads(quran_fh.read())
with open('translation.json', encoding='utf-8') as translation_fh:
    translations = json.loads(translation_fh.read())

# Flatten the per-segment word lists of whisper_output (must already be
# defined) into one list of normalized words with their timings. Tokens whose
# normalized text is empty are left out.
whisper_words = []
for seg in whisper_output['segments']:
    for info in seg['words']:
        cleaned = normalize_arabic(info['word'])
        if not cleaned:
            continue
        whisper_words.append({'text': cleaned, 'start': info['start'], 'end': info['end']})


# Align verses
# Walk the verses in (surah, verse) order. Each verse is searched for in the
# Whisper words from the end of the previous match onward; verses without an
# acceptable match produce no subtitle.
current_idx = 0
subtitles = []

# Skip verse 1:1 (basmala) by filtering it out
sorted_verses = sorted(quran.keys(), key=lambda x: (int(x.split(':')[0]), int(x.split(':')[1])))
filtered_verses = [vk for vk in sorted_verses if vk != "1:1"]

for verse_key in filtered_verses:
    verse_ar = normalize_arabic(quran[verse_key])
    verse_words = verse_ar.split()
    if not verse_words:
        # A verse that normalizes to nothing has no window to score, so skip
        # it instead of calling find_best_match with zero words.
        continue

    match_start = find_best_match(verse_words, whisper_words, current_idx)
    if match_start is None:
        continue

    # The match spans len(verse_words) consecutive Whisper words.
    match_end = match_start + len(verse_words) - 1
    subtitles.append({
        'verse': verse_key,
        'start': whisper_words[match_start]['start'],
        'end': whisper_words[match_end]['end'],
        'translation': translations.get(verse_key, "Translation not found")
    })
    # The next verse is searched for after this match.
    current_idx = match_end + 1

# (Optional) Close gaps: when a subtitle ends before the next one starts,
# stretch its end to that start.
for current, following in zip(subtitles, subtitles[1:]):
    if following['start'] > current['end']:
        current['end'] = following['start']

# Generate SRT file
import os  # local import: only this step needs it

# One entry per subtitle: running number, time range, translation text.
srt_content = []
for idx, sub in enumerate(subtitles, 1):
    start = to_srt_time(sub['start'])
    end = to_srt_time(sub['end'])
    srt_content.append(f"{idx}\n{start} --> {end}\n{sub['translation']}\n")

# Name the output after the audio file: base name without its final extension.
# Splitting on the first '.' would cut a dotted name such as "a.b.mp3" to "a".
stem = os.path.splitext(os.path.basename(input_file))[0]
output_file = f"subtitles/{stem}.srt"
# Create the target directory if needed; open() does not create it.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    # Every entry already ends in '\n', so joining with '\n' leaves the blank
    # line between entries.
    f.write('\n'.join(srt_content))

print(f"SRT file generated successfully: {output_file}")


SRT file generated successfully: subtitles/078.srt


In [84]:
# Inspect the normalized Whisper word list (text plus start/end times).
print(whisper_words)

[{'text': 'عن', 'start': np.float64(0.7), 'end': np.float64(1.06)}, {'text': 'ما', 'start': np.float64(1.06), 'end': np.float64(1.36)}, {'text': 'يتساءلون', 'start': np.float64(1.36), 'end': np.float64(5.32)}, {'text': 'عن', 'start': np.float64(5.32), 'end': np.float64(5.74)}, {'text': 'النبأ', 'start': np.float64(5.74), 'end': np.float64(6.72)}, {'text': 'العظيم', 'start': np.float64(6.72), 'end': np.float64(8.28)}, {'text': 'الذي', 'start': np.float64(8.7), 'end': np.float64(9.12)}, {'text': 'هم', 'start': np.float64(9.12), 'end': np.float64(9.66)}, {'text': 'فيه', 'start': np.float64(9.66), 'end': np.float64(10.0)}, {'text': 'مختلفون', 'start': np.float64(10.0), 'end': np.float64(11.82)}, {'text': 'كلا', 'start': np.float64(12.06), 'end': np.float64(12.6)}, {'text': 'سيعلمون', 'start': np.float64(12.6), 'end': np.float64(14.46)}, {'text': 'ثم', 'start': np.float64(14.68), 'end': np.float64(15.26)}, {'text': 'كلا', 'start': np.float64(15.26), 'end': np.float64(15.9)}, {'text': 'سيعلم

In [85]:
# Display the raw transcript text held in result.
result['text']

' 9. عن ما يتساءلون 10. عن النبأ العظيم 11. الذي هم فيه مختلفون 12. كلا سيعلمون 13. ثم كلا سيعلمون 14. ألم نجعل الأرض مهدادا 15. والجبال أوتادا 16. وخنقناكم أزواجا 17. وجعنا نومكم سباتا 18. وجعنا الليل نباسا 19. وجعنا النهار معاشا 20. وبنينا فوقكم سبعا شدادا 21. وجعنا سراجا وهاجا 22. وأنزلنا من المحصرات ماء ثجاجا 23. لنخرج به حبا ونباتا 24. جنات ألفافا 25. إن يوم الفصل كان ميقوة 26. يوم ينفخ في الصور 27. فتأتون أفواجا 28. وختحت السماء 29. فكانت أبوابا 30. سيرت الجبال فكانت سرابا 31. إن جهنم كانت مرصادا 32. للطاذين مآبا 33. لابثين فيها أحقابا 34. لا يذوقون فيها برد ولا شرابا 35. إلا حميم أو غساقا 36. جزاء أو وفاقا 37. إنهم كانوا لا يرجون حسابا 38. وكذبوا بآياتنا كذابا 39. وكل شيء نحصيناه كتابا 40. فذوقوا فلم نزيدكم إلا عذابا 41. إن للمتقين مفازا 42. هدائق وأعنابا 43. وكواعب أترابا 44. وكأسا دهاقا 45. لا يسمعون فيها لغوا ولا كذابا 46. جزاء من ربك عطاء حسابا 47. رب السماوات والأرض وما بينهما الرحمن لا يملكون منه خطابا 48. يوم يقوم الروح والملائك تصفا لا يتكلمون 49. إلا من أدن له الرحمن وق

In [86]:
# Inspect the normalized Whisper word list (text plus start/end times).
print(whisper_words)

[{'text': 'عن', 'start': np.float64(0.7), 'end': np.float64(1.06)}, {'text': 'ما', 'start': np.float64(1.06), 'end': np.float64(1.36)}, {'text': 'يتساءلون', 'start': np.float64(1.36), 'end': np.float64(5.32)}, {'text': 'عن', 'start': np.float64(5.32), 'end': np.float64(5.74)}, {'text': 'النبأ', 'start': np.float64(5.74), 'end': np.float64(6.72)}, {'text': 'العظيم', 'start': np.float64(6.72), 'end': np.float64(8.28)}, {'text': 'الذي', 'start': np.float64(8.7), 'end': np.float64(9.12)}, {'text': 'هم', 'start': np.float64(9.12), 'end': np.float64(9.66)}, {'text': 'فيه', 'start': np.float64(9.66), 'end': np.float64(10.0)}, {'text': 'مختلفون', 'start': np.float64(10.0), 'end': np.float64(11.82)}, {'text': 'كلا', 'start': np.float64(12.06), 'end': np.float64(12.6)}, {'text': 'سيعلمون', 'start': np.float64(12.6), 'end': np.float64(14.46)}, {'text': 'ثم', 'start': np.float64(14.68), 'end': np.float64(15.26)}, {'text': 'كلا', 'start': np.float64(15.26), 'end': np.float64(15.9)}, {'text': 'سيعلم

In [95]:
import re
import json
from difflib import SequenceMatcher

def normalize_arabic(text):
    """Reduce Arabic text to a bare form suitable for word-by-word comparison.

    Steps, in order: trim surrounding whitespace, drop a leading number with
    optional punctuation, delete tatweel and combining marks, and collapse
    whitespace runs to single spaces.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. It is empty when the input held nothing but
        whitespace, a leading number and/or marks.
    """
    # Remove leading/trailing whitespace
    text = text.strip()
    # Remove optional leading numbers with punctuation and spaces
    text = re.sub(r'^\s*[0-9]+[.:،]?\s*', '', text)
    # Remove tatweel (U+0640)
    text = re.sub(r'[\u0640]', '', text)
    # Remove diacritics. U+064B-U+065F alone misses the other Arabic combining
    # marks: U+0610-U+061A, the superscript alef U+0670 and the Quranic
    # annotation marks U+06D6-U+06DC, U+06DF-U+06E4, U+06E7-U+06E8,
    # U+06EA-U+06ED. Left in place, they keep otherwise equal words from
    # comparing equal.
    text = re.sub(
        r'[\u0610-\u061a\u064b-\u065f\u0670\u06d6-\u06dc\u06df-\u06e4\u06e7\u06e8\u06ea-\u06ed]',
        '',
        text,
    )
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def to_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds once and then split with
    integer arithmetic. Formatting the float seconds field directly could
    round up to "60,000" (e.g. 59.9996 -> "00:00:60,000"), which is not a
    valid timestamp; here the carry goes into the minutes and hours.

    Args:
        seconds: Offset in seconds (int, float or NumPy float). Negative
            values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"00:01:00,000"``.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

# === Load data ===
# quran.json maps "<surah>:<verse>" keys to verse text, e.g. { "78:1": "text", "78:2": "text", ... }.
# translation.json is looked up with the same keys further down.
with open('quran.json', encoding='utf-8') as quran_fh:
    quran = json.loads(quran_fh.read())
with open('translation.json', encoding='utf-8') as translation_fh:
    translations = json.loads(translation_fh.read())

# whisper_output must already be defined and carry word-level timestamps:
# whisper_output['segments'] is a list of segments, each with a "words" list.
# Flatten those lists into one list of {'text', 'start', 'end'} dicts, leaving
# out tokens whose normalized text is empty.
whisper_words = [
    {'text': cleaned, 'start': info['start'], 'end': info['end']}
    for seg in whisper_output['segments']
    for info in seg['words']
    for cleaned in (normalize_arabic(info['word']),)  # bind the normalized text once
    if cleaned
]

# === Build the expected sequence for the surah ===
# Select one surah's verses by their "<surah>:" key prefix and order them by
# verse number. Change `surah` to align a different surah.
surah = "1"
surah_prefix = surah + ":"
filtered_verses = [vk for vk in quran.keys() if vk.startswith(surah_prefix)]
filtered_verses.sort(key=lambda key: int(key.split(':')[1]))

expected_words = []    # normalized words of the whole surah, in order
verse_boundaries = {}  # verse key -> (first index, last index) into expected_words

for vk in filtered_verses:
    verse_tokens = normalize_arabic(quran[vk]).split()
    first = len(expected_words)
    expected_words += verse_tokens
    # Inclusive bounds; a verse with no tokens gets (first, first - 1).
    verse_boundaries[vk] = (first, len(expected_words) - 1)

# === Build the transcribed word sequence from Whisper ===
transcribed_words = [w['text'] for w in whisper_words]

# === Global alignment using SequenceMatcher ===
# autojunk=False: by default SequenceMatcher marks items that are frequent in
# a second sequence of 200+ items as junk. For a long recitation that would
# sideline common words and weaken the alignment. Shorter inputs are unaffected.
matcher = SequenceMatcher(None, expected_words, transcribed_words, autojunk=False)
matching_blocks = matcher.get_matching_blocks()
# matching_blocks is a list of triples (i, j, n)
# where expected_words[i:i+n] == transcribed_words[j:j+n]

# Map each exactly matched expected-word index to its transcribed-word index.
# Expected words that fall outside every matching block get no entry.
mapping = {}
for block in matching_blocks:
    for offset in range(block.size):
        mapping[block.a + offset] = block.b + offset

# === Now, for each verse, determine the corresponding timestamps ===
# A verse's subtitle runs from the start of its earliest matched word to the
# end of its latest matched word; verses with no matched word are reported
# and skipped.
subtitles = []
for vk, (exp_start, exp_end) in verse_boundaries.items():
    trans_indices = [mapping[i] for i in range(exp_start, exp_end + 1) if i in mapping]
    if not trans_indices:
        print(f"Warning: No alignment for verse {vk}")
        continue

    t_start = min(trans_indices)
    t_end = max(trans_indices)
    # Guard the lookups into whisper_words.
    if t_start >= len(whisper_words) or t_end >= len(whisper_words):
        print(f"Index error for verse {vk}")
        continue

    subtitles.append({
        'verse': vk,
        'start': whisper_words[t_start]['start'],
        'end': whisper_words[t_end]['end'],
        'translation': translations.get(vk, "Translation not found")
    })

# === Generate SRT file ===
import os  # local import: only this step needs it

# One entry per subtitle: running number, time range, translation text.
srt_lines = []
for idx, sub in enumerate(subtitles, 1):
    start = to_srt_time(sub['start'])
    end = to_srt_time(sub['end'])
    srt_lines.append(f"{idx}\n{start} --> {end}\n{sub['translation']}\n")

# Name the output after the audio file: base name without its final extension.
# Splitting on the first '.' would cut a dotted name such as "a.b.mp3" to "a".
stem = os.path.splitext(os.path.basename(input_file))[0]
output_file = f"subtitles/{stem}.srt"
# Create the target directory if needed; open() does not create it.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    # Every entry already ends in '\n', so joining with '\n' leaves the blank
    # line between entries.
    f.write('\n'.join(srt_lines))

print(f"SRT file generated successfully: {output_file}")


SRT file generated successfully: subtitles/001.srt
