In [None]:
import re
import time
import pandas as pd

from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException
)


def get_first_youtube_link(driver, query):
    """Return the link of the first acceptable YouTube search result for ``query``.

    Loads the YouTube results page for the URL-quoted query in ``driver``,
    waits up to 15 seconds for an element with id ``contents``, pauses two
    more seconds, and then walks the ``ytd-video-renderer`` elements in page
    order. A result is passed over when it contains an ad overlay tag, a span
    whose text contains "Shorts", or a thumbnail link pointing at
    ``youtube.com/shorts/``.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        query: Free-text search string.

    Returns:
        The ``href`` of the first accepted result, or ``None`` when the wait
        times out, no result qualifies, or scanning the results fails.
    """
    driver.get("https://www.youtube.com/results?search_query=" + quote(query))

    try:
        results_present = EC.presence_of_all_elements_located((By.ID, "contents"))
        WebDriverWait(driver, 15).until(results_present)
    except TimeoutException:
        print(f" timed out for: {query}")
        return None

    # Fixed pause after the container shows up, before reading the result list.
    time.sleep(2)

    # Descendants that disqualify a result, checked in this order:
    # the ad overlay tag first, then the "Shorts" label.
    skip_markers = (
        ".//ytd-thumbnail-overlay-ad-tag-renderer",
        ".//span[contains(text(), 'Shorts')]",
    )

    def _contains(element, xpath):
        # True when `element` has a descendant matching `xpath`. Only the
        # "not found" case is absorbed; any other error propagates to the caller.
        try:
            element.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return False
        return True

    try:
        for candidate in driver.find_elements(By.XPATH, '//ytd-video-renderer'):
            if any(_contains(candidate, marker) for marker in skip_markers):
                continue

            # Some Shorts are rendered like regular videos; they are only
            # recognisable by the /shorts/ path of the thumbnail link.
            try:
                thumbnail = candidate.find_element(By.XPATH, ".//a[@id='thumbnail']")
                href = thumbnail.get_attribute("href")
                if href and "youtube.com/shorts/" not in href.lower():
                    return href
            except Exception as e:
                print(f"[ERROR] Retrieving link for query='{query}': {e}")
                continue

    except Exception as e:
        print(f"[ERROR] Unexpected error scanning search results for '{query}': {e}")

    return None


def parse_youtube_metrics(driver, video_url, wait_seconds=3):
    """Open a video page and extract its view count and like text from the page source.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` string
            (a Selenium WebDriver in this notebook).
        video_url: URL of the video page to load.
        wait_seconds: Pause after ``driver.get`` before the page source is
            read. Defaults to 3; ``0`` or ``None`` skips the pause.

    Returns:
        ``(view_count, like_count)``. ``view_count`` is the digit string that
        precedes "view"/"views" in the embedded ``viewCount.simpleText`` value
        (separators such as "," are kept). ``like_count`` is the ``title``
        string that directly follows ``"iconName":"LIKE"``. Either element is
        ``None`` when its pattern is not found; both are ``None`` when loading
        or parsing raises.
    """
    try:
        driver.get(video_url)

        # Give the page time to load before reading its source.
        if wait_seconds and wait_seconds > 0:
            time.sleep(wait_seconds)

        page_source = driver.page_source

        # "views?" also accepts the singular form shown for a single view
        # ("1 view"), which the plural-only pattern missed.
        view_match = re.search(
            r'"viewCount"\s*:\s*\{\s*"simpleText"\s*:\s*"([\d,\.]+)\s+views?"', page_source
        )
        view_count = view_match.group(1) if view_match else None

        like_match = re.search(r'"iconName":"LIKE"\s*,\s*"title":"([^"]+)"', page_source)
        like_count = like_match.group(1) if like_match else None

        return view_count, like_count

    except Exception as e:
        # Best effort: one bad page must not abort the surrounding batch run.
        print(f" error {video_url}: {e}")
        return None, None


def _cell_text(value):
    """Render a CSV cell as stripped text; the text "nan" (any case) becomes ""."""
    text = str(value).strip()
    return "" if text.lower() == "nan" else text


def main():
    """Fill in a YouTube URL, view count and like text for every row of the input CSV.

    For each row without a ``full_youtube_uri`` a search query is built from
    ``MAL_Title_JP`` (falling back to ``MAL_Title_EN``) plus the ``song``
    column when that column exists. The first acceptable search result is
    stored in ``full_youtube_uri`` and its scraped metrics in
    ``youtube_views`` / ``youtube_likes``. A backup CSV is written whenever a
    processed row index is a positive multiple of ``save_every``, and the
    final table is written at the end.

    The headless Chrome driver is always shut down, including when the run is
    interrupted or fails part-way.
    """
    # Input file: the output of the anime_data_collection notebook. Change as needed.
    input_csv = "trial3_copy.csv"
    df = pd.read_csv(input_csv)

    for column in ("full_youtube_uri", "youtube_views", "youtube_likes"):
        if column not in df.columns:
            df[column] = None
        # A column that is entirely empty in the CSV is read as float; cast to
        # object so the string values written below do not hit a dtype mismatch.
        df[column] = df[column].astype(object)

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--mute-audio")
    try:
        driver = webdriver.Chrome(options=chrome_options)
    except Exception as e:
        print(" driver : ", e)
        return

    # For batch runs: set start_index to resume after rows already processed.
    start_index = 0

    total_rows = len(df)
    save_every = 50
    has_song_column = "song" in df.columns

    try:
        # Row labels are positions here: read_csv without index_col yields 0..n-1.
        for i in range(start_index, total_rows):
            if pd.notnull(df.loc[i, "full_youtube_uri"]):
                print(f"[INFO] Row {i} already has a URL, skipping.")
                continue

            mal_title_jp = _cell_text(df.loc[i, "MAL_Title_JP"])
            mal_title_en = _cell_text(df.loc[i, "MAL_Title_EN"])
            # Prefer the Japanese title; fall back to the English one.
            anime_title = mal_title_jp or mal_title_en

            if has_song_column:
                song_part = _cell_text(df.loc[i, "song"])
                query = f"{anime_title} {song_part}".strip()
            else:
                query = anime_title

            print(f"\n {i}  '{query}'")
            if not query:
                print(" skipping ")
                df.loc[i, "full_youtube_uri"] = None
                continue

            try:
                video_url = get_first_youtube_link(driver, query)
                if video_url:
                    print(f" url :  {video_url}")
                    df.loc[i, "full_youtube_uri"] = video_url

                    views, likes = parse_youtube_metrics(driver, video_url)
                    df.loc[i, "youtube_views"] = views
                    df.loc[i, "youtube_likes"] = likes
                    print(f" Views: {views}, Likes: {likes}")
                else:
                    print(f" no video : '{query}'")
                    df.loc[i, "full_youtube_uri"] = None

            except Exception as ex:
                # Keep going: one failing row must not end the whole batch.
                print(f" error {i}: {ex}")

            # Short pause between rows.
            time.sleep(1)

            if i > 0 and i % save_every == 0:
                backup_csv_name = f"partial_progress_cleaned_{i}.csv"
                df.to_csv(backup_csv_name, index=False)
                print(f" partial {i} -> '{backup_csv_name}'")

        # Output file name. Change as needed.
        output_csv = "youtubeuri_updated_trial3.csv"
        df.to_csv(output_csv, index=False)
        print(f" saved csv '{output_csv}'")
    finally:
        # Always release the browser process, even on errors or Ctrl-C.
        driver.quit()


# Entry-point guard: start the scrape only when this code is executed directly
# (as a script or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
# Load the CSV into `df` and print pandas' summary of it
# (columns, dtypes, non-null counts) as a quick sanity check.
df = pd.read_csv("upto300anime_songs_cleaned_updated.csv")
df.info()

In [None]:
import urllib.parse

def parse_video_id(uri):
    """Extract the YouTube video id from a URL.

    Recognised forms:
      * any URL with a ``v=<id>`` query parameter (``.../watch?v=<id>``)
      * ``youtu.be/<id>``
      * ``youtube.com`` / ``youtube-nocookie.com`` paths of the form
        ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` or ``/v/<id>``

    Args:
        uri: The URL to parse. Non-string values (e.g. NaN from pandas) are
            treated as missing.

    Returns:
        The video id, or ``""`` when ``uri`` is not a string, cannot be
        parsed, or carries no recognisable id.
    """
    if not isinstance(uri, str):
        return ""

    try:
        parts = urllib.parse.urlparse(uri.strip())
        host = (parts.hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. an unbalanced IPv6 bracket): treat as "no id".
        return ""

    video_id = urllib.parse.parse_qs(parts.query).get("v", [""])[0]
    if video_id:
        return video_id

    segments = [segment for segment in parts.path.split("/") if segment]

    # Short links carry the id as the first path segment.
    if host == "youtu.be":
        return segments[0] if segments else ""

    is_youtube_host = any(
        host == domain or host.endswith("." + domain)
        for domain in ("youtube.com", "youtube-nocookie.com")
    )
    if is_youtube_host and len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
        return segments[1]

    return ""

# Read the scraped table, derive a video-id column from each stored URL,
# and write the result to a new CSV (without the index column).
df = pd.read_csv("youtubeuri_updated_trial3_faster.csv")
df = df.assign(youtube_video_id=df["full_youtube_uri"].map(parse_video_id))

output_csv = "songs_faster.csv"
df.to_csv(output_csv, index=False)

In [None]:
import numpy as np

In [None]:

# popularity_score = log(1 + views).
# The scraper stores view counts as text such as "1,234,567", which np.log1p
# cannot take directly. Drop the thousands separators and coerce to numbers
# first; anything unparseable (including missing values) becomes NaN.
# Columns that are already numeric pass through with the same values.
df["popularity_score"] = np.log1p(
    pd.to_numeric(
        df["youtube_views"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )
)

In [None]:
# Write the current `df` to CSV, omitting the index column.
output_csv = "songs_faster_preprocessed.csv"
df.to_csv(output_csv, index=False)

In [None]:
import pandas as pd


# Reload the preprocessed table and keep only rows that have a usable URL:
# first drop rows where full_youtube_uri is missing, then rows where it is
# empty after stripping whitespace.
df = (
    pd.read_csv("songs_faster_preprocessed.csv")
    .loc[lambda frame: frame['full_youtube_uri'].notna()]
    .loc[lambda frame: frame['full_youtube_uri'].str.strip().ne('')]
)

df.to_csv("songs_faster_preprocessed_cleaned_run1.csv", index=False)

print("Cleaned dataframe:", df, sep="\n")
