# In [None]:
import requests
import os
import json
import random
import time
import subprocess
from urllib.parse import urlparse, unquote
from pathlib import Path
import concurrent.futures

# Define constants
# SECURITY NOTE(review): an API key literal is committed in this file. Supply
# the key through the PEXELS_API_KEY environment variable instead; the literal
# below is kept only as a backward-compatible fallback and should be rotated
# and removed once every caller sets the environment variable.
PEXELS_API_KEY = (
    os.environ.get("PEXELS_API_KEY")
    or "1wcNYkUdXROazyDGUmiAPqOW1jSGnHH5cOcR8kShhOWFhvuvOdywx9EF"
)
STORAGE_DIR = "Storage"
# Directory scanned for book_<n>.json files holding the search terms.
BOOKS_DIR = os.path.join(STORAGE_DIR, "temp_texts")
# Root directory under which one sub-folder per book receives the clips.
VIDEOS_DIR = os.path.join(STORAGE_DIR, "downloaded_videos")
# JSON file recording every successfully downloaded clip.
DOWNLOAD_LOG = os.path.join(STORAGE_DIR, "video_download_log.json")

# Create directories if they don't exist
os.makedirs(VIDEOS_DIR, exist_ok=True)
os.makedirs(BOOKS_DIR, exist_ok=True)

def get_search_terms_from_books(books_dir=None):
    """Extract Pexels search terms from every ``book_<n>.json`` file.

    Args:
        books_dir: Directory to scan. Defaults to the module-level
            ``BOOKS_DIR`` when omitted.

    Returns:
        A dict mapping the integer book number (parsed from the filename) to
        ``{'title': ..., 'search_terms': [...]}``. Books without a
        ``pexels_videos`` entry are left out. A file that cannot be read or
        does not have the expected layout is reported and skipped; it no
        longer prevents the remaining files from being processed.
    """
    directory = BOOKS_DIR if books_dir is None else books_dir
    search_terms_by_book = {}

    try:
        # Sorted so the processing order does not depend on the filesystem.
        filenames = sorted(os.listdir(directory))
    except OSError as e:
        print(f"Error reading book files: {e}")
        return search_terms_by_book

    for filename in filenames:
        if not (filename.startswith('book_') and filename.endswith('.json')):
            continue
        file_path = os.path.join(directory, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                book_data = json.load(f)
            # "book_12.json" -> 12; a non-numeric suffix raises ValueError.
            book_num = int(filename.split('_')[1].split('.')[0])
            if 'pexels_videos' not in book_data:
                continue
            search_terms = [
                video_data['search_term']
                for video_data in book_data['pexels_videos']
            ]
            search_terms_by_book[book_num] = {
                'title': book_data['title'],
                'search_terms': search_terms,
            }
        except (OSError, ValueError, LookupError, TypeError) as e:
            # Isolate the failure to this file so other books still load.
            print(f"Error reading book file '{filename}': {e}")

    return search_terms_by_book

def search_pexels_videos(query, per_page=5, orientation="landscape", timeout=30):
    """Search the Pexels video API and return the matching video records.

    Args:
        query: Free-text search term.
        per_page: Maximum number of results requested from the API.
        orientation: Orientation filter forwarded to the API.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The list found under the ``"videos"`` key of the JSON response, or an
        empty list when the request fails, the body is not valid JSON, or the
        payload does not contain a list of videos.
    """
    url = "https://api.pexels.com/videos/search"
    params = {
        "query": query,
        "per_page": per_page,
        "orientation": orientation,
        # NOTE(review): Pexels appears to document `size` as a minimum-size
        # filter rather than a preference — confirm "large" is not too strict.
        "size": "large",
    }
    headers = {"Authorization": PEXELS_API_KEY}
    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        results = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error searching Pexels API for '{query}': {e}")
        return []
    except ValueError as e:
        # Older requests versions raise a plain ValueError for invalid JSON.
        print(f"Error searching Pexels API for '{query}': {e}")
        return []

    # Guard against `"videos": null` or a non-object payload so callers can
    # always treat the result as a list.
    videos = results.get("videos") if isinstance(results, dict) else None
    return videos if isinstance(videos, list) else []

def get_best_video_file(video_data):
    """Pick the tallest downloadable file from a Pexels video record.

    Only entries whose ``file_type`` starts with ``"video/"`` are considered.
    Files at least 720 pixels tall are preferred; if there are none, every
    ``video/*`` file is eligible. Among eligible files the one with the
    greatest height wins, ties going to the earliest entry.

    Args:
        video_data: A video record containing a ``"video_files"`` list of
            dicts. May be ``None`` or empty.

    Returns:
        The chosen file dict, or ``None`` when no suitable file exists.
    """
    if not video_data or "video_files" not in video_data:
        return None

    def height_of(file):
        # A missing or null height (e.g. on streaming entries) counts as 0
        # instead of raising TypeError when compared with an int.
        height = file.get("height")
        return height if isinstance(height, (int, float)) else 0

    candidates = [
        file
        for file in video_data["video_files"] or []
        # `file_type` may be absent or null; treat both as "not a video".
        if str(file.get("file_type") or "").startswith("video/")
    ]
    hd_candidates = [file for file in candidates if height_of(file) >= 720]
    pool = hd_candidates or candidates
    # max() returns the first maximal element, matching a stable descending sort.
    return max(pool, key=height_of) if pool else None

def download_video_segment(url, output_path, duration=10):
    """Download only the first `duration` seconds of a video using ffmpeg.

    The stream is copied without re-encoding. ffmpeg must be installed and
    available on PATH.

    Args:
        url: Input location handed to ffmpeg's ``-i`` option.
        output_path: Destination file; overwritten if it already exists.
        duration: Length of the segment to keep, in seconds.

    Returns:
        True when ffmpeg exits successfully, False when ffmpeg reports an
        error or the ffmpeg executable cannot be found.
    """
    command = [
        "ffmpeg",
        "-y",                   # Overwrite output file if it exists
        "-i", url,              # Input URL
        "-t", str(duration),    # Duration (in seconds)
        "-c", "copy",           # Copy codec to avoid re-encoding
        output_path
    ]
    try:
        # stdin is detached so concurrent ffmpeg processes cannot consume the
        # parent's standard input while running in worker threads.
        subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            check=True,
        )
        return True
    except FileNotFoundError:
        # Raised when the ffmpeg binary itself is missing.
        print("Error downloading video segment: ffmpeg executable not found in PATH")
        return False
    except subprocess.CalledProcessError as e:
        # ffmpeg output is not guaranteed to be valid UTF-8.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace")
        print(f"Error downloading video segment: {stderr_text}")
        return False

def get_file_extension_from_url(url):
    """Return the file extension (with leading dot) of a URL's path.

    The path component is percent-decoded first; query string and fragment
    are ignored. Falls back to ".mp4" when the path has no extension.
    """
    decoded_path = unquote(urlparse(url).path)
    _, extension = os.path.splitext(decoded_path)
    return extension or ".mp4"

def load_download_log(log_path=None):
    """Load the download log used to avoid re-downloading videos.

    Args:
        log_path: Path of the JSON log file. Defaults to the module-level
            ``DOWNLOAD_LOG`` when omitted.

    Returns:
        The parsed log dict, always containing a ``"downloaded_videos"`` key.
        A fresh ``{"downloaded_videos": {}}`` is returned when the file is
        missing, is not valid UTF-8 JSON, or does not hold a JSON object.
    """
    path = DOWNLOAD_LOG if log_path is None else log_path
    if not os.path.exists(path):
        return {"downloaded_videos": {}}

    try:
        with open(path, 'r', encoding='utf-8') as f:
            log_data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return {"downloaded_videos": {}}

    # Valid JSON that is not an object (e.g. a list) cannot serve as the log.
    if not isinstance(log_data, dict):
        return {"downloaded_videos": {}}
    log_data.setdefault("downloaded_videos", {})
    return log_data

def save_download_log(log_data, log_path=None):
    """Save the download log as indented JSON.

    The data is written to a temporary sibling file and then moved over the
    real log, so a failure while serializing or writing leaves the previous
    log intact instead of truncated.

    Args:
        log_data: JSON-serializable log structure.
        log_path: Destination path. Defaults to the module-level
            ``DOWNLOAD_LOG`` when omitted.

    Raises:
        TypeError: If ``log_data`` is not JSON-serializable.
        OSError: If the file cannot be written or replaced.
    """
    path = DOWNLOAD_LOG if log_path is None else log_path
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(log_data, f, indent=4)
        os.replace(tmp_path, path)
    finally:
        # Only present here if something failed before the replace.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def _logged_video_indices(entries):
    """Return the set of N values used by logged ``vid_N.<ext>`` filenames."""
    indices = set()
    for entry in entries:
        stem = os.path.splitext(str(entry.get("filename") or ""))[0]
        number = stem[len("vid_"):]
        if stem.startswith("vid_") and number.isdecimal():
            indices.add(int(number))
    return indices


def _reserve_video_index(used_indices):
    """Reserve and return the smallest positive index not in *used_indices*."""
    index = 1
    while index in used_indices:
        index += 1
    used_indices.add(index)
    return index


def _collect_download_tasks(search_terms_by_book, downloaded, max_videos_per_book):
    """Search Pexels for each book's terms and build the list of download tasks."""
    # Built once; also receives URLs queued during this run so the same clip
    # is never scheduled twice.
    known_urls = {
        entry.get("url")
        for entries in downloaded.values()
        for entry in entries
    }

    download_tasks = []
    for book_num, book_data in search_terms_by_book.items():
        # Create a subfolder for each book
        book_folder = os.path.join(VIDEOS_DIR, f"book_{book_num}")
        os.makedirs(book_folder, exist_ok=True)

        existing_entries = downloaded.get(str(book_num), [])
        # Logged videos plus videos queued in this run count toward the limit.
        video_count = len(existing_entries)
        if video_count >= max_videos_per_book:
            print(f"Already have {max_videos_per_book} videos for book {book_num}. Skipping.")
            continue

        # Filenames are numbered from the indices actually present in the log,
        # not from the log length: after an earlier failed download the two
        # differ, and numbering by length could overwrite a logged file.
        used_indices = _logged_video_indices(existing_entries)

        for search_term in book_data["search_terms"]:
            if video_count >= max_videos_per_book:
                break
            print(f"Searching for videos with term: '{search_term}' for book {book_num}")
            videos = list(search_pexels_videos(search_term) or [])
            random.shuffle(videos)
            for video in videos:
                video_file = get_best_video_file(video)
                video_url = video_file.get("link") if video_file else None
                if not video_url:
                    continue
                if video_url in known_urls:
                    print("Video URL already downloaded. Skipping.")
                    continue
                known_urls.add(video_url)

                ext = get_file_extension_from_url(video_url)
                # Use a naming scheme: vid_1, vid_2, etc.
                vid_index = _reserve_video_index(used_indices)
                output_path = os.path.join(book_folder, f"vid_{vid_index}{ext}")
                download_tasks.append({
                    "url": video_url,
                    "output_path": output_path,
                    "book_num": book_num,
                    "video_id": video.get("id", "unknown"),
                    "video_url": video.get("url", ""),
                    "width": video_file.get("width", 0),
                    "height": video_file.get("height", 0),
                    "search_term": search_term,
                })
                video_count += 1
                # Stop after finding one suitable video per search term
                break
    return download_tasks


def _run_download_tasks(download_tasks, download_log, max_concurrent_downloads):
    """Download the queued tasks concurrently and log each success."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent_downloads) as executor:
        future_to_task = {
            executor.submit(
                download_video_segment,
                task["url"],
                task["output_path"],
                10,  # duration in seconds
            ): task
            for task in download_tasks
        }
        # Results are handled in the calling thread, so the log is only
        # mutated and saved from one thread.
        for future in concurrent.futures.as_completed(future_to_task):
            task = future_to_task[future]
            book_num_str = str(task["book_num"])
            try:
                if not future.result():
                    print(f"Failed to download video for book {task['book_num']}")
                    continue
                print(f"Successfully downloaded video for book {task['book_num']}")
                download_log["downloaded_videos"].setdefault(book_num_str, []).append({
                    "filename": os.path.basename(task["output_path"]),
                    "url": task["url"],
                    "video_id": task["video_id"],
                    "pexels_url": task["video_url"],
                    "width": task["width"],
                    "height": task["height"],
                    "search_term": task["search_term"],
                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                })
                # Persist after every success so progress survives a crash.
                save_download_log(download_log)
            except Exception as e:
                # Best effort: one failing task must not abort the others.
                print(f"Error processing download for book {task['book_num']}: {e}")


def download_videos_for_books(max_videos_per_book=3, max_concurrent_downloads=3):
    """Download videos for all books with search terms.

    For each book, search terms are tried in order and at most one video is
    queued per term until the book has ``max_videos_per_book`` videos,
    counting those already in the download log. URLs already logged, or
    already queued in this run, are skipped. Queued clips are then fetched in
    parallel and each success is appended to the log and saved at once.

    Args:
        max_videos_per_book: Upper bound on videos per book, including videos
            recorded by earlier runs.
        max_concurrent_downloads: Number of worker threads used for downloads.

    Returns:
        The download log dict, updated with any newly downloaded videos.
    """
    search_terms_by_book = get_search_terms_from_books()
    download_log = load_download_log()
    downloaded = download_log.setdefault("downloaded_videos", {})

    download_tasks = _collect_download_tasks(
        search_terms_by_book, downloaded, max_videos_per_book
    )
    if not download_tasks:
        print("No new videos to download.")
        return download_log

    print(f"Found {len(download_tasks)} videos to download.")
    _run_download_tasks(download_tasks, download_log, max_concurrent_downloads)
    return download_log

def main():
    """Run the downloader and print a per-book summary of the download log.

    Requests up to 5 videos per book, then reports the total number of videos
    recorded in the log followed by one line per book that has at least one.
    """
    print("🎬 PEXELS VIDEO DOWNLOADER 🎬")
    print("Downloading first 10-second segments for book recommendations...")

    # Cap each book at five clips.
    log = download_videos_for_books(max_videos_per_book=5)
    videos_by_book = log["downloaded_videos"]

    grand_total = sum(map(len, videos_by_book.values()))
    print(f"\nSummary: Downloaded {grand_total} videos in total")

    for book_id in videos_by_book:
        entries = videos_by_book[book_id]
        if not entries:
            continue
        print(f"Book {book_id}: {len(entries)} videos")

# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
