In [None]:
!pip install youtube-search-python yt-dlp
!pip install --force-reinstall "httpx<0.28"

In [None]:
import csv
import subprocess
import os
import re
import time
from youtubesearchpython import VideosSearch

# Paths to the input and output CSV files (relative to the working directory).
# input_csv is read by the main script; output_csv is created, counted and
# appended to by the helper functions below.
input_csv = "tracks_features.csv"
output_csv = "new_songs_with_details.csv"

# Folder to store downloaded songs; created at import time if it is missing.
songs_folder = "songs"
os.makedirs(songs_folder, exist_ok=True)

# Additional columns appended after the input CSV's own columns in the output.
# process_row() fills these in for every row (empty strings on failure).
ADDITIONAL_COLUMNS = ["video_title", "video_url", "file_path"]

# ---------- Utility Functions ----------

def clean_song_data(name, artist):
    """Strip and clean the 'name' and 'artist' fields.

    An artist value that looks like a stringified Python list (for example
    ``"['Artist A', 'Artist B']"`` or ``'["Guns N\\' Roses"]'``) is parsed and
    its entries are joined with ``", "``. If the value cannot be parsed as a
    literal, the historical behaviour is kept: a leading ``['`` and trailing
    ``']`` are sliced off.

    Args:
        name: Raw track title.
        artist: Raw artist field, either a plain name or a stringified list.

    Returns:
        A ``(clean_name, clean_artist)`` tuple of strings.
    """
    import ast  # local import: only this helper needs it

    clean_name = name.strip()
    clean_artist = artist.strip()

    if clean_artist.startswith("[") and clean_artist.endswith("]"):
        try:
            parsed = ast.literal_eval(clean_artist)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a valid Python literal (e.g. "[Live]" or an unescaped quote).
            parsed = None

        if isinstance(parsed, (list, tuple)):
            artist_names = [str(item).strip() for item in parsed]
            clean_artist = ", ".join(item for item in artist_names if item)
        elif clean_artist.startswith("['") and clean_artist.endswith("']"):
            # Fallback for malformed list strings: drop the wrapper only.
            clean_artist = clean_artist[2:-2].strip()

    return clean_name, clean_artist

def sanitize_filename(s):
    """Turn arbitrary text into a token usable inside a file name.

    Surrounding whitespace is trimmed, every character that is neither a word
    character nor whitespace is dropped, and each remaining space character is
    replaced by an underscore.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', s.strip())
    # Splitting on a literal space and re-joining maps every single space to "_".
    return "_".join(without_punctuation.split(" "))

def initialize_output_csv(fieldnames):
    """Create the output CSV with just a header row, unless the file already exists."""
    if os.path.exists(output_csv):
        return  # never overwrite an existing output file
    with open(output_csv, mode="w", newline="", encoding="utf-8") as handle:
        csv.DictWriter(handle, fieldnames=fieldnames).writeheader()

def get_processed_count():
    """Count the data rows (header excluded) in the output CSV; 0 if it is absent."""
    if os.path.exists(output_csv):
        with open(output_csv, mode="r", newline="", encoding="utf-8") as handle:
            row_total = 0
            for _ in csv.DictReader(handle):
                row_total += 1
            return row_total
    return 0

def append_row_to_csv(row, fieldnames):
    """Append one row to the output CSV straight away.

    Only the columns named in ``fieldnames`` are written, in that order; a
    column missing from ``row`` is written as an empty string. A failure while
    writing is printed rather than raised.
    """
    ordered_values = {}
    for column in fieldnames:
        ordered_values[column] = row.get(column, "")

    try:
        with open(output_csv, mode="a", newline="", encoding="utf-8") as handle:
            csv.DictWriter(handle, fieldnames=fieldnames).writerow(ordered_values)
    except Exception as write_error:
        print(f"Error writing row to CSV: {write_error}")

def _blank_video_fields(row):
    """Set the three output columns on *row* to empty strings and return it."""
    row["video_title"] = ""
    row["video_url"] = ""
    row["file_path"] = ""
    return row


def process_row(row, fieldnames, clip_seconds=30):
    """Search YouTube for a track, download its audio, and annotate the row.

    The row is mutated in place: ``name`` and ``artists`` are replaced by their
    cleaned values, and ``video_title``, ``video_url`` and ``file_path`` are
    always set (to empty strings when the corresponding step fails).

    Args:
        row: Mapping for one input CSV row; must provide ``name`` and
            ``artists`` to be processed.
        fieldnames: Output column names. Not used here; kept so existing
            callers keep working.
        clip_seconds: Length of the audio clip to keep, counted from the start
            of the track. ``None`` (or a non-positive value) keeps the full
            track.

    Returns:
        The same ``row`` object with the extra fields filled in. Errors are
        printed, never raised.
    """
    try:
        # `or ""` covers keys that are present but hold None (short CSV rows).
        name_raw = (row.get("name") or "").strip()
        artist_raw = (row.get("artists") or "").strip()
        if not name_raw or not artist_raw:
            print("Skipping row due to missing data:", row)
            return _blank_video_fields(row)

        # Clean the song name and artist
        name, artist = clean_song_data(name_raw, artist_raw)
        row["name"] = name
        row["artists"] = artist

        # Build the YouTube search query
        query = f"{name} by {artist}"
        print(f"Searching for: {query}")

        # Perform YouTube search; any search failure just blanks the row.
        try:
            result = VideosSearch(query, limit=1).result()
        except Exception as e:
            print(f"Error searching for {query}: {e}")
            return _blank_video_fields(row)

        matches = result.get("result")
        if not matches:
            print(f"No results found for {query}")
            return _blank_video_fields(row)

        video_info = matches[0]
        video_title = video_info.get("title", "")
        video_url = video_info.get("link", "")
        print(f"Found video: {video_title} - {video_url}")
        row["video_title"] = video_title
        row["video_url"] = video_url

        if not video_url:
            # Nothing to hand to yt-dlp; keep the title but record no file.
            print(f"No video URL returned for {query}; skipping download.")
            row["file_path"] = ""
            return row

        # Build the output filename (MP3 format)
        safe_song = sanitize_filename(name)
        safe_artist = sanitize_filename(artist)
        output_filename = os.path.join(songs_folder, f"{safe_song}_{safe_artist}.mp3")

        # "-ss 0" alone only sets the start position; "-t" is what actually
        # limits the clip to the requested duration.
        ffmpeg_args = "ffmpeg:-ss 0"
        if clip_seconds is not None and clip_seconds > 0:
            ffmpeg_args += f" -t {clip_seconds}"

        command = [
            "yt-dlp",
            "--extract-audio",
            "--audio-format", "mp3",
            "--postprocessor-args", ffmpeg_args,
            "-o", output_filename,
            video_url,
        ]
        print(f"Downloading audio to {output_filename} ...")
        try:
            subprocess.run(command, check=True)
            row["file_path"] = os.path.abspath(output_filename)
        except subprocess.CalledProcessError as e:
            print(f"Error downloading {query}: {e}")
            if os.path.exists(output_filename):
                # yt-dlp can exit non-zero after the file was already written.
                print("Audio file exists despite error; updating CSV accordingly.")
                row["file_path"] = os.path.abspath(output_filename)
            else:
                row["file_path"] = ""
    except Exception as general_error:
        # Last-resort guard so one bad row never stops the batch.
        print(f"Unexpected error processing row {row}: {general_error}")
        _blank_video_fields(row)
    return row

# ---------- Main Script ----------

sleep_interval = 5  # Time in seconds to wait before re-checking for new rows

# Number of input data rows to skip before processing starts. It is used as a
# zero-based index into the list of rows, so 1000 skips the first 1000 rows and
# starts with the 1001st (matching the "Resuming from row" message below).
DESIRED_START_ROW = 100000

# 1) Read the input CSV to get original fieldnames in order and load rows.
#    Only the fieldnames from this first read are used to build the output
#    header; all_rows is loaded again inside the loop before it is used.
with open(input_csv, mode="r", newline="", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    original_fieldnames = reader.fieldnames or []  # The original column order from the input CSV
    all_rows = list(reader)

# 2) Create the final fieldnames list by appending new columns to the original order.
fieldnames = list(original_fieldnames)
for col in ADDITIONAL_COLUMNS:
    if col not in fieldnames:
        fieldnames.append(col)

# 3) Initialize the output CSV (write header if it doesn't exist)
initialize_output_csv(fieldnames)

# 4) Determine how many rows have already been processed
#    (this is the number of data rows currently in the output CSV).
processed_count = get_processed_count()
print(f"Already processed {processed_count} rows.")

# 5) If processed_count is less than DESIRED_START_ROW, override it so that processing starts there.
# NOTE(review): the count above is the number of rows in the output CSV, not an
# index into the input CSV. After a partial run that started at
# DESIRED_START_ROW, a restart lands here again and begins at
# DESIRED_START_ROW once more, which looks like it would append duplicate
# rows to the output -- confirm whether that is intended.
if processed_count < DESIRED_START_ROW:
    print(f"Setting starting row to {DESIRED_START_ROW}.")
    processed_count = DESIRED_START_ROW

print(f"Resuming from row {processed_count + 1} of the input CSV.")

# The loop below has no exit condition: it runs until the process is stopped.
print("Starting infinite processing loop...")
while True:
    try:
        with open(input_csv, mode="r", newline="", encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile)
            # Re-read original fieldnames in case the input CSV changes (optional).
            # The output `fieldnames` list is not rebuilt from this value.
            original_fieldnames = reader.fieldnames or []
            all_rows = list(reader)
        total_rows = len(all_rows)
    except Exception as e:
        # A failed read reports zero rows, which sends this pass to the wait branch.
        print(f"Error reading input CSV: {e}")
        total_rows = 0

    if processed_count < total_rows:
        # Process each new row one by one from the processed_count index onward.
        # Each row is appended to the output immediately after it is processed.
        for row in all_rows[processed_count:]:
            updated_row = process_row(row, fieldnames)
            append_row_to_csv(updated_row, fieldnames)
            processed_count += 1
    else:
        print(f"No new rows (processed {processed_count} of {total_rows}). Waiting {sleep_interval} seconds...")
        time.sleep(sleep_interval)
