# Songs Scraping

### Install Required Libraries

In [None]:
!pip install yt-dlp
!pip install pandas

Collecting yt-dlp
  Downloading yt_dlp-2025.2.19-py3-none-any.whl.metadata (171 kB)
Downloading yt_dlp-2025.2.19-py3-none-any.whl (3.2 MB)
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
   ------ --------------------------------- 0.5/3.2 MB 1.6 MB/s eta 0:00:02
   --------- ------------------------------ 0.8/3.2 MB 1.7 MB/s eta 0:00:02
   ------------- -------------------------- 1.0/3.2 MB 1.4 MB/s eta 0:00:02
   ------------------- -------------------- 1.6/3.2 MB 1.6 MB/s eta 0:00:01
   -------------------------- ------------- 2.1/3.2 MB 1.8 MB/s eta 0:00:01
   -------------------------------- ------- 2.6/3.2 MB 1.9 MB/s eta 0:00:01
   ---------------------------------------  3.1/3.2 MB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 3.2/3.2 MB 2.0 MB/s eta 0:00:00
Installing collected packages: yt-dlp
Successfully installed yt-dlp-2025.2.19


## Scrape Video Metadata from YouTube (Title, Channel, Channel's Subscriber Count, Views, Likes, URL)

**Problem with the Scraper:**

I have added a random sleep time of 5 to 10 seconds, but you can still get Error 429 (Too Many Requests) from the server, and the server will block your IP for sending too many requests. So try to run this code only once or twice. If you have a static IP, this is a serious issue: you will not be able to access YouTube because your IP is blocked by YouTube's servers. If you have a dynamic IP, don't worry: you will be able to access YouTube again as soon as your IP changes.

**Solutions:**
1. Use a VPN.
2. Use Kaggle or Colab to run this code (if the Kaggle/Colab IP gets blocked, further scraping can't be done there, but your own local IP stays safe).
3. Increase the sleep time range, e.g. to between 20 and 30 seconds, in the search_youtube_videos() function (this will make scraping take longer).

I have automated everything here, so just run the code and go back to your lazy life :)

In [None]:
import yt_dlp
import csv
from difflib import SequenceMatcher

def parse_views(view_str):
    """Turn a view/like/subscriber count into an ``int``.

    Numeric input is truncated with ``int()``. String input such as
    ``'1,234,567'`` has its thousands separators stripped before conversion.
    Any falsy value (``None``, ``''``, ``0``) yields ``0``.
    """
    # Nothing to convert: missing or empty counts are reported as zero.
    if not view_str:
        return 0

    # Already a number; just normalise it to an int.
    if isinstance(view_str, (int, float)):
        return int(view_str)

    without_separators = view_str.replace(',', '')
    return int(without_separators)

def get_best_match_ratio(small: str, large: str) -> float:
    """Return the best similarity between the shorter string and any window of the longer one.

    A window of ``len(small)`` characters is slid over ``large`` and each
    window is scored against ``small`` with ``SequenceMatcher.ratio()``.
    The highest score seen is returned.

    Args:
        small: The string expected to be the shorter one.
        large: The string expected to be the longer one. If the arguments
            arrive in the wrong order they are swapped, so the result does
            not depend on which one the caller passes first.

    Returns:
        A ratio in ``[0.0, 1.0]``. An empty string only matches another
        empty string (1.0); against any non-empty text it scores 0.0.
    """
    # Tolerate misordered arguments instead of silently returning 0.0.
    if len(small) > len(large):
        small, large = large, small

    window = len(small)
    if window == 0:
        # SequenceMatcher rates '' vs '' as 1.0, so without this guard an
        # empty string would be reported as a perfect match for any text.
        return 1.0 if not large else 0.0

    best_ratio = 0.0
    for start in range(len(large) - window + 1):
        segment = large[start:start + window]
        ratio = SequenceMatcher(None, small, segment).ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            if best_ratio == 1.0:
                # A perfect window cannot be beaten; stop scanning.
                break

    return best_ratio

def title_similarity(str1, str2, threshold=0.80):
    """Tell whether the shorter string is at least `threshold` similar to part of the longer one.

    The two strings are ordered by length (on a tie `str1` counts as the
    shorter one) and handed to `get_best_match_ratio`; the result is True
    when that ratio reaches `threshold`.
    """
    if len(str2) < len(str1):
        shorter, longer = str2, str1
    else:
        shorter, longer = str1, str2

    return get_best_match_ratio(shorter, longer) >= threshold

def get_video_engagement(video):
    """Extract the engagement metrics and basic details of one video.

    Args:
        video: Metadata mapping for a single video (must support ``.get``).

    Returns:
        dict with the keys ``views``, ``subscribers``, ``likes`` (ints via
        `parse_views`), ``duration`` (0 when unknown), and ``title``,
        ``url``, ``channel`` (empty string when unknown).
    """
    return {
        'views': parse_views(video.get('view_count', 0)),
        'subscribers': parse_views(video.get('channel_follower_count', 0)),
        'likes': parse_views(video.get('like_count', 0)),
        # `dict.get(key, default)` only falls back when the key is absent.
        # A key that is present with the value None would otherwise leak
        # through and break later comparisons such as `duration > max_duration`
        # or `title.lower()`, so None is coerced to the fallback here.
        'duration': video.get('duration') or 0,
        'title': video.get('title') or '',
        'url': video.get('webpage_url') or '',
        'channel': video.get('uploader') or '',
    }

def is_better_candidate(current, new_candidate):
    """Decide whether `new_candidate` outranks `current` on engagement.

    Views are compared first; subscribers break a tie on views, and likes
    break a tie on both. Returns False when all three are equal.
    """
    for metric in ('views', 'subscribers', 'likes'):
        # The first metric on which the two differ settles the comparison.
        if new_candidate[metric] != current[metric]:
            return new_candidate[metric] > current[metric]
    return False

def filter_videos(videos, max_duration, min_views, min_subs, exclude_keywords):
    """Apply the quality filters to `videos` and collapse near-duplicate titles.

    A video is dropped when it is longer than `max_duration`, has fewer than
    `min_views` views or `min_subs` channel subscribers, or when its
    lower-cased title contains any of `exclude_keywords` (compared in lower
    case). Among videos whose titles are similar according to
    `title_similarity`, only the one preferred by `is_better_candidate`
    is kept.

    Returns the surviving engagement dicts sorted by views, then
    subscribers, then likes, all descending.
    """
    banned_words = {word.lower() for word in (exclude_keywords or [])}
    kept = {}  # title -> engagement dict of the best video seen for that title

    for entry in videos:
        if not entry:
            continue

        stats = get_video_engagement(entry)

        # Basic filters, checked one at a time
        if stats['duration'] > max_duration:
            continue
        if stats['views'] < min_views:
            continue
        if stats['subscribers'] < min_subs:
            continue
        if any(word in stats['title'].lower() for word in banned_words):
            continue

        # Duplicate handling: look for the first stored title that resembles this one
        twin = next(
            (title for title in kept if title_similarity(title, stats['title'])),
            None,
        )
        if twin is None:
            kept[stats['title']] = stats
        elif is_better_candidate(kept[twin], stats):
            # The newcomer wins: swap it in under its own title.
            del kept[twin]
            kept[stats['title']] = stats

    ranked = list(kept.values())
    ranked.sort(key=lambda item: (-item['views'], -item['subscribers'], -item['likes']))
    return ranked

def search_youtube_videos(query,csv_path,max_results=10, max_duration=1200, 
                         min_views=100000, min_subs=100000, exclude_keywords=None):
    """Search YouTube for `query`, filter the hits and save them to `<csv_path>.csv`.

    Args:
        query: Free-text search phrase.
        csv_path: Output path without extension; ".csv" is appended.
        max_results: Number of search results requested (goes into the
            ``ytsearchN:`` prefix).
        max_duration: Longest accepted video duration, in seconds.
        min_views: Minimum view count a video must have.
        min_subs: Minimum subscriber count the video's channel must have.
        exclude_keywords: Words that disqualify a video when found in its
            title; ``None`` disables the keyword filter.

    Returns:
        tuple: ``(filtered, csv_file)`` - the list of engagement dicts that
        survived `filter_videos`, and the path of the CSV that was written.
    """
    search_query = f"ytsearch{max_results}:{query}"  # Search for videos

    # NOTE(review): the yt-dlp docs describe `sleep_interval` and
    # `max_sleep_interval` as pauses before each *download*. This function
    # only extracts metadata (download=False), so they probably do not
    # throttle the search requests; `sleep_interval_requests` looks like the
    # option meant for that - confirm before relying on the sleep.
    ydl_opts = {
        "quiet": True,
        "default_search": "ytsearch",   # Search YouTube
        "extract_flat": False,          # Get full metadata
        "sleep_interval": 5,            # Wait 5 seconds between downloads
        "max_sleep_interval": 10        # Randomize wait up to 10 seconds
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(search_query, download=False)

    # `entries` can be missing or None; treat both as "no results".
    videos = (info.get('entries') or []) if info else []

    filtered = filter_videos(videos, max_duration, min_views,
                            min_subs, exclude_keywords)

    # Column order of the CSV. Kept independent of the data so that a search
    # with no surviving results still produces a file with a proper header
    # row (the original wrote an empty header line in that case).
    fieldnames = ['Title', 'Channel', 'Subscribers', 'Views', 'Likes',
                  'Duration (seconds)', 'URL']

    # Prepare CSV data
    csv_data = [{
        'Title': v['title'],
        'Channel': v['channel'],
        'Subscribers': f"{v['subscribers']:,}",
        'Views': f"{v['views']:,}",
        'Likes': f"{v['likes']:,}",
        'Duration (seconds)': v['duration'],
        'URL': v['url']
    } for v in filtered]

    # Write to CSV
    csv_file = csv_path + ".csv"
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(csv_data)

    return filtered, csv_file

if __name__ == "__main__":
    # One search phrase per artist.
    Search_Queries = [
        "Nayyara Noor Songs",
        "Tina Sani Songs",
        "Lata Mangeshkar Solo Songs",
        "Muhammad Rafi Solo Songs",
        "Asha Bhosle Solo Songs",
    ]
    # CSV base names, in the same order as the queries above
    # (search_youtube_videos appends ".csv").
    CSV_Paths = [
        "Nayyara_Noor_Songs",
        "Tina_Sani_Songs",
        "Lata_Mangeshkar_Songs",
        "Muhammad_Rafi_Songs",
        "Asha_Bhosle_Songs",
    ]

    # Run one filtered search per (query, output file) pair.
    for query, csv_path in zip(Search_Queries, CSV_Paths):
        videos, csv_path = search_youtube_videos(
            query,
            csv_path,
            max_results=200,      # ask for the top 200 results
            max_duration=1200,    # longest accepted video, in seconds
            min_views=100000,     # minimum views
            min_subs=100000,      # minimum channel subscribers
            exclude_keywords=["remix", "cover", "mashup", "tribute"],
        )

## Scrape Audio from URLs

##### Extract URL from CSV Files and Save into Text Files

In [None]:
import os
import pandas as pd

# Folder containing the metadata CSV files
folder_path = "Data-Scrapping/Dataset_Metadata/"
# Folder that receives one text file of links per CSV file
target_path = "Data-Scrapping/Extracted_links/"
# Name of the CSV column that holds the video links
url_column = "URL"
csv_suffix = ".csv"

# Ensure the output folder exists
os.makedirs(target_path, exist_ok=True)

# Process all CSV files in the folder
for file in os.listdir(folder_path):
    if not file.endswith(csv_suffix):
        continue

    file_path = os.path.join(folder_path, file)
    # Swap only the trailing ".csv" for ".txt"; str.replace would also
    # rewrite a ".csv" that happens to appear earlier in the file name.
    output_name = file[:-len(csv_suffix)] + ".txt"
    output_file_path = os.path.join(target_path, output_name)

    # Best-effort: a broken file is reported and skipped so the remaining
    # files are still processed.
    try:
        df = pd.read_csv(file_path)
        if url_column in df.columns:
            # One link per line, without index or header row
            df[url_column].dropna().to_csv(output_file_path, index=False, header=False)
            print(f"Extracted links saved to: {output_file_path}")
        else:
            print(f"'{url_column}' column not found in {file}")
    except Exception as e:
        print(f"Error processing {file}: {e}")

Extracted links saved to: Data-Scrapping/Extracted_links/Asha_Bhosle_Songs.txt
Extracted links saved to: Data-Scrapping/Extracted_links/Lata_Mangeshkar_Songs.txt
Extracted links saved to: Data-Scrapping/Extracted_links/mehdi_hasan_songs.txt
Extracted links saved to: Data-Scrapping/Extracted_links/Muhammad_Rafi_Songs.txt
Extracted links saved to: Data-Scrapping/Extracted_links/Nayyara_Noor_Songs.txt
Extracted links saved to: Data-Scrapping/Extracted_links/Tina_Sani_Songs.txt


##### Scrape Audio from Links and Save in Respective Folders

In [32]:
!mkdir "Mehdi Hassan Songs"

In [None]:
# Download every link in mehdi_hasan_songs.txt as MP3 into "Mehdi Hassan Songs", sleeping 5-15 s between downloads.
# The batch-file path uses "Extracted_links" exactly as the extraction step created it (case matters on Linux).
!yt-dlp -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 \
-o "Mehdi Hassan Songs/%(title)s.%(ext)s" \
-a "Data-Scrapping/Extracted_links/mehdi_hasan_songs.txt" \
--sleep-interval 5 --max-sleep-interval 15

In [None]:
!mkdir "Asha Bhosle Songs"

In [None]:
# Download every link in Asha_Bhosle_Songs.txt as MP3 into "Asha Bhosle Songs", sleeping 5-15 s between downloads.
!yt-dlp -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 \
-o "/kaggle/working/Asha Bhosle Songs/%(title)s.%(ext)s" \
-a "/kaggle/input/songs-links/Asha_Bhosle_Songs.txt" \
--sleep-interval 5 --max-sleep-interval 15


In [None]:
!mkdir "Lata Mangeshkar Songs"

In [None]:
# Download every link in Lata_Mangeshkar_Songs.txt as MP3 into "Lata Mangeshkar Songs", sleeping 5-15 s between downloads.
!yt-dlp -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 \
-o "/kaggle/working/Lata Mangeshkar Songs/%(title)s.%(ext)s" \
-a "/kaggle/input/songs-links/Lata_Mangeshkar_Songs.txt" \
--sleep-interval 5 --max-sleep-interval 15


In [None]:
!mkdir "Muhammad Rafi Songs"

In [None]:
# Download every link in Muhammad_Rafi_Songs.txt as MP3 into "Muhammad Rafi Songs", sleeping 5-15 s between downloads.
!yt-dlp -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 \
-o "/kaggle/working/Muhammad Rafi Songs/%(title)s.%(ext)s" \
-a "/kaggle/input/songs-links/Muhammad_Rafi_Songs.txt" \
--sleep-interval 5 --max-sleep-interval 15


In [None]:
!mkdir "Nayyar Noor Songs"

In [None]:
# Download every link in Nayyara_Noor_Songs.txt as MP3, sleeping 5-15 s between downloads.
# Output goes to the "Nayyar Noor Songs" folder created for this artist, not the Muhammad Rafi folder.
!yt-dlp -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 \
-o "/kaggle/working/Nayyar Noor Songs/%(title)s.%(ext)s" \
-a "/kaggle/input/songs-links/Nayyara_Noor_Songs.txt" \
--sleep-interval 5 --max-sleep-interval 15


In [None]:
!mkdir "Tina Sani Songs"

In [None]:
# Download every link in Tina_Sani_Songs.txt as MP3 into "Tina Sani Songs", sleeping 5-15 s between downloads.
!yt-dlp -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 \
-o "/kaggle/working/Tina Sani Songs/%(title)s.%(ext)s" \
-a "/kaggle/input/songs-links/Tina_Sani_Songs.txt" \
--sleep-interval 5 --max-sleep-interval 15
