In [1]:
#Install needed libraries
!pip install librosa
!pip install numpy
!pip install scipy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install torch
!pip install yt-dlp
!pip install google-api-python-client
!pip install isodate
!pip install tempfile



ERROR: Could not find a version that satisfies the requirement tempfile (from versions: none)
ERROR: No matching distribution found for tempfile


In [2]:
#Import Libraries
import os  #for saving my key in an env var
import random
import sys
import time
from tempfile import NamedTemporaryFile  #for Streamlit app

import isodate
import joblib  #streamlit/parallel jobs if needed
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st  #streamlit app
import yt_dlp
from dotenv import load_dotenv  #to load that env var
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from scipy.signal import find_peaks
from scipy.spatial.distance import cosine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
#Get credentials for youtube data API
load_dotenv()  #load variables from a local .env file (if present) into the environment
API_KEY = os.getenv("YOUTUBE_API_KEY")
if not API_KEY:
    #Surface a missing key here instead of as a confusing API error for every composer later on
    print("⚠️ YOUTUBE_API_KEY is not set; YouTube API requests are expected to fail until it is provided.")
YOUTUBE_API_SERVICE_NAME = "youtube"  #service name handed to googleapiclient's build()
YOUTUBE_API_VERSION = "v3"  #API version handed to build(); the API must be enabled for the Google Cloud project

In [4]:
#Composers to search for on YouTube (one name per line; order is the processing order)
composers = [
    "Hans Zimmer",
    "John Williams",
    "James Newton Howard",
    "Howard Shore",
    "Danny Elfman",
    "Ennio Morricone",
    "Alexandre Desplat",
    "Thomas Newman",
    "Michael Giacchino",
    "John Powell",
    "Harry Gregson-Williams",
    "Rupert Gregson-Williams",
    "James Horner",
    "Lorne Balfe",
    "Kevin Kiner",
    "Henry Jackman",
    "Dominic Lewis",
    "Patrick Doyle",
    "Nicholas Hooper",
    "Joe Kraemer",
    "David Arnold",
    "John Barry",
    "Eric Serra",
    "John Debney",
    "Brian Tyler",
    "Alan Silvestri",
    "Mark Mothersbaugh",
    "Christophe Beck",
    "Joel P West",
    "Ludwig Goransson",
    "Randy Newman",
    "Daniel Pemberton",
    "Jerry Goldsmith",
    "Randy Edelman",
    "John Ottman",
    "Mark Isham",
    "Alan Menken",
    "Klaus Badelt",
    "Michael Kamen",
    "Marc Shaiman",
    "Adolph Deutsch",
    "Trevor Rabin",
    "Gavin Greenaway",
    "Justin Hurwitz",
    "Trent Reznor and Atticus Ross",
    "Trevor Jones",
    "Elmer Bernstein",
    "Carlos Rafael Rivera",
    "Don Davis",
    "Joe Hisaishi",
    "Tan Dun",
    "Dave Grusin",
    "Dario Marianelli",
    "Steve Jablonsky",
    "Bill Conti",
    "Ramin Djawadi",
    "Anthony Gonzalez",
    "Mychael Danna",
    "Johnny Klimek",
    "Tom Tykwer",
    "Michael Paraskevas",
    "Harold Faltermeyer",
    "Natalie Holt",
    "James Shearman",
    "Robin Carolan",
    "Tyler Bates",
    "Bear McCreary",
    "Nicholas Britell",
    "Carter Burwell",
    "Martin Phipps",
    "Naoki Sato",
    "Takeshi Furukawa",
    "John Paesano",
    "Benjamin Wallfisch",
]

In [5]:
#Project folder: set the PERSONAL_PROJECTS_PATH env var to override, otherwise fall back to my Drive folder
personal_projects_path = os.getenv("PERSONAL_PROJECTS_PATH", r"G:\My Drive\Personal Projects")
#Put the folder on the import path once (re-running the cell must not add it again)
if personal_projects_path not in sys.path:
    sys.path.append(personal_projects_path)

#Relative paths used later in the notebook resolve against this folder
os.chdir(personal_projects_path)
print(f"✅ Now working in: {os.getcwd()}")

✅ Now working in: G:\My Drive\Personal Projects


In [None]:
#Info on YouTube Data API v3 quota:
#The default daily quota is 10,000 units per project.
#A search.list call (one page of search results) costs 100 units.
#A videos.list call costs 1 unit, no matter how many video IDs are batched into it.
#requests_made is the running tally that fetch_composer_ost_links() compares against MAX_DAILY_REQUESTS.


#Request Tracker
MAX_DAILY_REQUESTS = 10000
requests_made = 0

#File that stores the collected video urls, one "composer | url" entry per line
URL_FILE = r"video_urls.txt"
full_path = os.path.abspath(URL_FILE)

#Max number of videos per composer (raise it after a full pass: once everyone has 5, move to 10, etc.)
MAX_VIDEOS = 5

#Per-composer state rebuilt from the URL file on every run
video_counts = {composer: 0 for composer in composers}
existing_videos = {}
#Count the unique urls already saved for each composer
if os.path.exists(URL_FILE):
    with open(URL_FILE, "r", encoding="utf-8") as f:
        for raw_line in f:
            entry = raw_line.strip()
            if not entry:
                continue  #blank lines (e.g. a trailing newline) carry no data
            saved_composer, separator, saved_url = entry.partition(" | ")
            if not separator or not saved_url:
                print(f"⚠️ Skipping malformed line in {URL_FILE}: {entry!r}")
                continue
            known_urls = existing_videos.setdefault(saved_composer, [])
            if saved_url in known_urls:
                continue  #the same url saved twice only counts once
            known_urls.append(saved_url)
            #.get() keeps composers that were removed from the list from raising KeyError
            video_counts[saved_composer] = video_counts.get(saved_composer, 0) + 1

#Create a function to fetch videos
def fetch_composer_ost_links(composer, max_videos=5):
    """Collect up to ``max_videos`` new YouTube soundtrack URLs for ``composer``.

    Runs a paged search for "<composer> ost", keeps results whose title passes
    the keyword filters, then looks up the durations of those candidates with
    one batched videos.list call per page and keeps videos that are between
    1 and 12 minutes long. URLs already stored in ``existing_videos`` for this
    composer are skipped.

    Quota tracking: the module-level ``requests_made`` tally is advanced by the
    quota-unit cost of every call (100 units per search.list, 1 unit per
    videos.list), and no call is issued when it would push the tally past
    ``MAX_DAILY_REQUESTS``.

    Args:
        composer: Composer name used to build the search query.
        max_videos: Maximum number of URLs to return.

    Returns:
        list: Watch URLs in the order they were accepted; may hold fewer than
        ``max_videos`` entries when results or quota run out.

    Raises:
        Exceptions from building the client or from the search request
        propagate to the caller. A failing video-details request is printed
        and that page of results is skipped.
    """
    global requests_made

    #Quota-unit cost of each call type (see the YouTube Data API quota cost table)
    search_cost = 100
    details_cost = 1
    #Always ask for a full page: a search costs the same whatever maxResults is
    page_size = 25

    query = f"{composer} ost"
    links = []
    next_page_token = None
    known_urls = set(existing_videos.get(composer, []))

    #Titles are lower-cased before matching, so only lower-case keywords are needed
    #NOTE(review): these are substring matches, so "live" also rejects titles containing
    #"oliver"/"deliver" and "ost" also accepts "lost"/"most" - consider word-boundary matching
    exclude_keywords = (
        "live", "interview", "trailer", "behind the scenes", "making of",
        "creating", "composing", "inspiration", "mix", "remix", "medley",
        "suite", "score analysis", "cover of", "analysis", "analyzing",
        "producing", "orchestrating", "orchestration", "recording", "concert",
        "top 5", "top 10", "best of", "boston pops",
        "greatest hits", "best pieces", "best works",
    )
    include_terms = ("ost", "original motion picture", "soundtrack", "original score", "score")

    youtube = None
    while len(links) < max_videos:
        #Stop before a search that would exceed the daily quota
        if requests_made + search_cost > MAX_DAILY_REQUESTS:
            print("🚨 API limit reached, stopping collection.")
            break

        #Build the client only once we know at least one request will be made
        if youtube is None:
            youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)

        search_response = youtube.search().list(
            q=query,
            part="id,snippet",
            type="video",
            maxResults=page_size,
            pageToken=next_page_token
        ).execute()
        requests_made += search_cost

        time.sleep(random.uniform(1, 3))  # Sleep timer between requests

        #Advance the page cursor right away so no later branch can retry the same page
        next_page_token = search_response.get("nextPageToken")

        video_ids = []
        for item in search_response.get("items", []):
            title = item["snippet"]["title"].lower()
            video_id = item["id"]["videoId"]
            video_url = f"https://www.youtube.com/watch?v={video_id}"

            if video_url in known_urls or video_url in links:
                continue  #No duplicate tracks

            if any(keyword in title for keyword in exclude_keywords):
                continue
            if any(term in title for term in include_terms):
                video_ids.append(video_id)  #Collect video IDs for the batch details request

        if not video_ids:
            if not next_page_token:
                break  #Stop if no more pages
            continue

        if requests_made + details_cost > MAX_DAILY_REQUESTS:
            print("🚨 API limit reached, stopping collection.")
            break

        #One batched details request for every candidate on this page
        try:
            video_response = youtube.videos().list(
                part="contentDetails",
                id=",".join(video_ids)
            ).execute()
            requests_made += details_cost
        except Exception as e:
            #Best effort: report the failure and move on to the next page of results
            print(f"⚠️ Error fetching video details: {e}")
            if not next_page_token:
                break
            continue

        for item in video_response.get("items", []):
            if len(links) >= max_videos:
                break  #a full page can hold more matches than we still need

            video_id = item["id"]
            video_url = f"https://www.youtube.com/watch?v={video_id}"
            content_details = item.get("contentDetails", {})

            if "duration" not in content_details:
                print(f"⚠️ Skipping {video_url} (Missing duration info)")
                continue

            duration = content_details["duration"]
            duration_seconds = isodate.parse_duration(duration).total_seconds()

            if 60 <= duration_seconds <= 12 * 60:  #keep tracks between 1 and 12 minutes
                links.append(video_url)
                print(f"✅ Added: {video_url} ({duration_seconds // 60} min)")

        if not next_page_token:
            break  #if you run out of pages stop

        if len(links) < max_videos:
            time.sleep(random.uniform(5, 15))  #longer pause before requesting the next page

    return links


#Find videos for composers that haven't reached the max yet
for composer in composers:
    if video_counts[composer] >= MAX_VIDEOS:
        print(f"⏭️ Skipping {composer}, already has {MAX_VIDEOS} videos.")
        continue  #skip composers that already have the max

    #Once the tracked quota is spent there is nothing left to fetch for anyone
    if requests_made >= MAX_DAILY_REQUESTS:
        print("🚨 API limit reached, stopping collection.")
        break

    remaining = MAX_VIDEOS - video_counts[composer]
    try:
        print(f"🎼 Fetching {remaining} tracks for {composer}...") #tell me when you move onto a new artist
        composer_links = fetch_composer_ost_links(composer, max_videos=remaining)

        new_urls = [f"{composer} | {url}" for url in composer_links]

        #Append new URLs to the file (skip touching the file when nothing was found)
        if new_urls:
            with open(URL_FILE, "a", encoding="utf-8") as f:
                f.writelines(entry + "\n" for entry in new_urls)

        #Only count the tracks once they are safely written to disk
        video_counts[composer] += len(new_urls)

        print(f"✅ Added {len(new_urls)} tracks for {composer}") #tell me you're adding tracks for each artist

    except Exception as e:
        #Best effort: report the failure and carry on with the next composer
        print(f"⚠️ Error with {composer}: {e}")
#Tell me where we are saving data at the end
print(f"📂 URLs saved to: {full_path}")


⏭️ Skipping Hans Zimmer, already has 5 videos.
⏭️ Skipping John Williams, already has 5 videos.
⏭️ Skipping James Newton Howard, already has 5 videos.
⏭️ Skipping Howard Shore, already has 5 videos.
⏭️ Skipping Danny Elfman, already has 5 videos.
⏭️ Skipping Ennio Morricone, already has 5 videos.
⏭️ Skipping Alexandre Desplat, already has 5 videos.
🎼 Fetching 5 tracks for Thomas Newman...
✅ Added: https://www.youtube.com/watch?v=gqo46lt-8Q4 (3.0 min)
✅ Added: https://www.youtube.com/watch?v=kZbf9T_PCgs (3.0 min)
✅ Added: https://www.youtube.com/watch?v=cc-O24c-q9M (3.0 min)
✅ Added: https://www.youtube.com/watch?v=CPPEw17VmcY (5.0 min)
