In [1]:
from pytube import YouTube
from pytube import Playlist
import os
import subprocess
import re
from vals import (
                  RECITERS,
                  rprint,
                  gprint,
                  colored_tqdm,
                  download_youtube_audio,
                  split_audio)
from glob import glob
from tqdm import tqdm
import pandas as pd
import random

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Empty log of processed videos: one row per (reciter, sura) pair seen so far.
LOG_COLUMNS = ["reciter", "sura"]
df = pd.DataFrame(columns=LOG_COLUMNS)

In [3]:
def get_thresh(df, reciter, sura, limit=4):
    """Tell whether a reciter/sura pair was logged more than ``limit`` times.

    Args:
        df: DataFrame holding a 'reciter' and a 'sura' column, one row per
            logged video.
        reciter: Value to look for in the 'reciter' column.
        sura: Value to look for in the 'sura' column.
        limit: Highest number of matching rows that is still acceptable.
            Defaults to 4, the value that used to be hard-coded.

    Returns:
        bool: True when strictly more than ``limit`` rows match both
        ``reciter`` and ``sura``; False otherwise (the previous version
        fell off the end and returned None in that case).
    """
    # One boolean mask for both conditions instead of two chained filters.
    same_pair = (df['reciter'] == reciter) & (df['sura'] == sura)
    # bool(...) so callers get a plain Python bool rather than numpy.bool_.
    return bool(same_pair.sum() > limit)

In [4]:
def get_reciter(RECITERS, string):
    """Return the first name in ``RECITERS`` that occurs in ``string``.

    The comparison is case-insensitive and follows the order of
    ``RECITERS``. ``None`` is returned when no name is contained in
    ``string``.
    """
    for candidate in RECITERS:
        if candidate.lower() in string.lower():
            return candidate

    # Nothing matched.
    return None

def get_sura(title):
    """Extract the sura name from a video title.

    Searches ``title`` for the word "سورة" followed by one whitespace
    character and a run of Arabic letters, and returns the matched text.
    Returns ``None`` when the title contains no such pattern.
    """
    found = re.search(r"سورة\s[الء-ي]+", title)
    return found.group(0) if found is not None else None

In [5]:
def logging_in(reciter, title, df):
    """Record a reciter and the sura found in ``title`` in the log frame.

    The sura is extracted from ``title`` with ``get_sura`` (it may be
    ``None``). A new row is written at label ``len(df)``, so ``df`` is
    modified in place; the same frame is returned for convenience.
    """
    sura = get_sura(title)
    print(f"logging in {reciter} for {sura}")
    df.loc[len(df)] = {'reciter': reciter, 'sura': sura}
    return df

In [6]:
def random_slicer(input_list, num_items):
    """Return up to ``num_items`` randomly chosen items of ``input_list``.

    The input is copied into a new list, so the caller's object is never
    modified. The copy is shuffled with ``random.shuffle`` and its first
    ``num_items`` entries are returned; fewer items come back when the
    input is shorter than ``num_items``.
    """
    pool = [*input_list]      # private copy; accepts any iterable
    random.shuffle(pool)
    return pool[:num_items]

In [7]:
def get_title(url):
    """Return the title exposed by the audio-only mp4 stream of a YouTube video."""
    audio_only = YouTube(url).streams.get_audio_only("mp4")
    return audio_only.title

In [9]:
def download_audio_from_playlist(url, output_path, df, max_rounds=None):
    """Download randomly sampled videos of a playlist as audio and log them.

    Each round draws 10 random videos from the playlist. For every video
    the reciter is looked up in the video title, falling back to the
    playlist title; the sura is looked up in the video title. The pair is
    logged in ``df``. Once a reciter/sura pair exceeds the ``get_thresh``
    limit the round is abandoned and a new random sample is drawn;
    otherwise the video's audio is downloaded to ``output_path``.

    Args:
        url: URL of the YouTube playlist.
        output_path: Directory handed to ``download_youtube_audio``.
        df: Log frame with 'reciter' and 'sura' columns; rows are appended
            to it in place by ``logging_in``.
        max_rounds: Optional upper bound on the number of sampling rounds.
            ``None`` (the default) keeps the historical behaviour: sample
            until the run is interrupted or an error occurs.

    Returns:
        tuple: ``(title, reciter, sura, df_new)`` describing the last video
        that was looked at. ``title``, ``reciter`` and ``sura`` are ``None``
        if no video was reached (e.g. the playlist could not be loaded).
    """
    # Bind everything that is returned up front so that an early failure
    # still produces a well-formed tuple instead of an UnboundLocalError.
    title = reciter = sura = None
    df_new = df
    rounds_done = 0

    try:
        playlist = Playlist(url)
        urls = list(playlist.video_urls)

        if not urls:
            # Without this guard an empty playlist would spin forever.
            print("Couldn't return playlist: no videos found")
            return title, reciter, sura, df_new

        while max_rounds is None or rounds_done < max_rounds:
            rounds_done += 1

            # `video_url` rather than `url`, so the parameter is not shadowed.
            for video_url in random_slicer(urls, 10):
                title = get_title(video_url)
                reciter = get_reciter(RECITERS, title)

                if reciter is None:
                    # The video title does not name a reciter: use the
                    # playlist title instead (the fallback used to be
                    # computed and then thrown away).
                    reciter = get_reciter(RECITERS, playlist.title)
                if reciter is None:
                    continue

                sura = get_sura(title)
                df_new = logging_in(reciter, title, df)

                if sura is None:
                    continue

                # Argument order must follow get_thresh(df, reciter, sura).
                if get_thresh(df_new, reciter, sura):
                    print('Reslicing the playlist')
                    break  # abandon this sample and draw a new one

                try:
                    download_youtube_audio(video_url, output_path)
                    print(f"Downloading {sura} for {reciter}\n Title of the video is {title}")
                except Exception as e:
                    print(f"Error processing video: {title} - {e}")

    except KeyboardInterrupt:
        # A manual interrupt is the usual way to stop an unbounded run;
        # hand the progress made so far back to the caller.
        print("Interrupted; returning the last processed video")
    except Exception as e:
        print(f"Couldn't return playlist: {e}")

    return title, reciter, sura, df_new


In [11]:
# Show the reciter names imported from `vals`; get_reciter matches them against video titles.
RECITERS

['احمد الشلبي',
 'علاء عقل',
 'اسماعيل القاضي',
 'علاء ياسر',
 'محمد حجازي',
 'محمد نصر الدين',
 'مختار الحاج',
 'إرويانتو',
 'زين ابو كوثر',
 'Zain Abu Kautsar',
 'حسين عبد الظاهر',
 'طارق محمد',
 'عبد الباسط',
 'المنشاوي',
 'عبد العزيز سحيم']

In [None]:
# Playlists to process, keyed by reciter name (the key is also the name of the
# output directory under data/). Entries are switched on by uncommenting them.
video_urls = {
        #  "عبد الباسط":[
        #                "https://www.youtube.com/playlist?list=PLdgJ3Z9AFIkVxK28C8vDmeMXDHvpCTwGi",
        #                "https://www.youtube.com/playlist?list=PL7FQ8_TtkWWGPqGeX1t9Y7Ls4c_n6Fx8_"],

        #  "المنشاوي":["https://www.youtube.com/playlist?list=PLr2rWP0a1jdHTsYPbz0hiDfa0-VeRFeJ2",
        #              "https://www.youtube.com/playlist?list=PLMrC4AI4wdRrSQLGtK4nmSJXlC1sZLhy8",
        #              "https://www.youtube.com/playlist?list=PLr2rWP0a1jdHQWhdLZDW5Gsri4_cPqD8_"],

    #    "طارق محمد":[

    #                 "https://www.youtube.com/playlist?list=PLZ8mb5KlWBIyEfWHl2cvH-817c630_dcr"]

}

for name, playlists in video_urls.items():
    os.makedirs(f"data/{name}", exist_ok=True)
    print(f'creating directory for {name}')
    for playlist in playlists:

        relative_output_dir_path = os.path.relpath(f"data/{name}")

        title, reciter, sura, _ = download_audio_from_playlist(playlist, relative_output_dir_path, df)

        # The title's last character is dropped, as before. Guard against a
        # missing title so that building the prefix cannot raise.
        title_part = title[:-1] if title else "unknown"

        videos_for_conv_paths = glob(os.path.join(relative_output_dir_path, "*.mp4"))
        # Wrap the list itself (not the enumerate object) so tqdm knows the
        # total and can show a percentage and an ETA.
        for i, video in enumerate(tqdm(videos_for_conv_paths)):

            prefix = f"{title_part}_{reciter}_{sura}_{i + 1}"

            split_audio(video, prefix, relative_output_dir_path)

        # Remove the source videos only after every one of them was split.
        for file in videos_for_conv_paths:
            os.remove(file)
            print(f"Deleted: {file}")
        print("Restarting")
        

In [None]:
# Inspect the reciter/sura log frame that is passed to download_audio_from_playlist.
df

# Mass download from individual video URLs

In [16]:
# Global settings for the single-video workflow: the reciter being collected
# and the directory his clips are written to (kept apart from the playlist
# directories).
NAME = "طارق محمد"
path = f"data/single/{NAME}"

# Individual video URLs for this reciter (aim for nine diverse ones per reciter).
links = [
    "https://www.youtube.com/watch?v=GqbV7P9ynn8&t=267s",
    "https://www.youtube.com/watch?v=jVkWlaP1BSM",
    "https://www.youtube.com/watch?v=pJxZcVp03TE",
    "https://www.youtube.com/watch?v=YOjqQ4x9ow4",
]

# Make sure the target directory exists, then keep its relative form for the
# cells below.
os.makedirs(path, exist_ok=True)
relative_output_dir_path = os.path.relpath(path)

In [None]:
# Fetch the audio of every URL in `links` into the reciter's directory.
for link in links:
    bar_label = f"Downloading: {link}"
    with tqdm(desc=bar_label, unit="file") as progress_bar:
        # The bar's update method is handed to the downloader as its progress callback.
        download_youtube_audio(
            link,
            relative_output_dir_path,
            progress_callback=progress_bar.update,
        )

In [None]:
# Split every downloaded .mp4 in the reciter's directory into segments, then
# delete the source video.
# glob returns paths in arbitrary order, so sort them: the index that goes
# into the segment prefix is then the same on every run.
videos_for_conv_paths = sorted(glob(os.path.join(relative_output_dir_path, "*.mp4")))
for i, video in enumerate(videos_for_conv_paths):
    # The 1-based index identifies which video a segment came from.
    prefix = f"{NAME}_URL({i + 1})"
    split_audio(video, prefix, relative_output_dir_path)
    gprint(f"{video}: Audio Split Done")
    os.remove(video)
    rprint(f"{video}: Video Deleted")

In [17]:
# Convert every .mp3 segment in the reciter's directory to 16-bit PCM WAV at
# 44.1 kHz with ffmpeg, deleting each .mp3 once its conversion succeeded.
segments_for_deletion = glob(os.path.join(relative_output_dir_path, "*.mp3"))
progress_bar = colored_tqdm(segments_for_deletion, color_code="\033[91m", desc="Deleted after conversion to wav", unit="file")
for segment in progress_bar:
    try:
        # Check if the file has .mp3 extension
        if segment.lower().endswith(".mp3"):
            wav_path = f"{segment[:-4]}.wav"
            # check=True makes a failed ffmpeg run raise CalledProcessError,
            # so the source .mp3 is kept (and reported below) instead of
            # being deleted without a usable .wav.
            subprocess.run(
                ['ffmpeg', '-y', '-loglevel', 'panic', '-i', segment, '-acodec', 'pcm_s16le', '-ar', '44100', wav_path],
                check=True,
            )
            os.remove(segment)
            progress_bar.set_description(f"{os.path.basename(segment)} Deleted after conversion to wav")
        else:
            rprint(f"Skipping non-MP3 file: {os.path.basename(segment)}")

    except Exception as e:
        rprint(f"Error processing {os.path.basename(segment)}: {e}")

progress_bar.close()
print(".mp3's Deleted")

[91m██████████| 20256/20256 [18:19<00:00, 18.42file/s][0m

.mp3's Deleted





In [6]:
import os
# Scratch check: str.removesuffix (Python 3.9+) drops the trailing ".wav" from a path string.
"test_examples/alaa.wav".removesuffix(".wav")

'test_examples/alaa'