In [1]:
import logging
from urllib.parse import parse_qs, urlparse

import gridfs
from clearml import Task
from pymongo import MongoClient
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

# Configure logging: emit INFO-and-above records through the root logger and
# create a module-level logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# MongoDB connection setup. The URI targets localhost:27017 and carries no
# credentials. NOTE: these objects are created at module import time and are
# shared by every YouTubeCrawler instance.
client = MongoClient("mongodb://localhost:27017/")
db = client['clearml']  # Use the ClearML database or create a new one
fs = gridfs.GridFS(db)  # Set up GridFS to store large files

# YouTubeCrawler class for scraping YouTube transcripts
class YouTubeCrawler:
    """Scrape YouTube transcripts and persist them in MongoDB.

    The transcript text is written to GridFS; a small metadata document
    (link, GridFS file id, platform, video id, transcript length) is written
    to the ``youtube_transcripts`` collection. Creating an instance also
    initialises a ClearML task for the ingestion run.
    """

    # URL path prefixes whose *next* path segment is the video ID,
    # e.g. https://www.youtube.com/shorts/<id>.
    _PATH_ID_PREFIXES = ("embed", "shorts", "live", "v")

    def __init__(self):
        # Reuse the module-level database / GridFS handles.
        self.db = db
        self.fs = fs
        self.task = Task.init(project_name='ETL Pipeline', task_name='YouTube Data Ingestion', task_type=Task.TaskTypes.data_processing)

    @staticmethod
    def _extract_video_id(link: str) -> str:
        """Return the video ID embedded in a YouTube *link*.

        Handles ``watch?v=<id>`` (with the ``v`` parameter in any position),
        ``youtu.be/<id>`` short links and ``/embed/``, ``/shorts/``, ``/live/``
        and ``/v/`` paths. Anything else falls back to the legacy split-based
        heuristic, so a bare video ID is returned unchanged.
        """
        parsed = urlparse(link)

        # Standard watch URLs: the ID is the ``v`` query parameter.
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids and query_ids[0]:
            return query_ids[0]

        segments = [segment for segment in parsed.path.split("/") if segment]
        host = (parsed.hostname or "").lower()

        # Short links: https://youtu.be/<id>
        if host.endswith("youtu.be") and segments:
            return segments[0]

        # Path-style links: https://www.youtube.com/shorts/<id>, /embed/<id>, ...
        if len(segments) >= 2 and segments[0] in YouTubeCrawler._PATH_ID_PREFIXES:
            return segments[1]

        # Legacy behaviour for inputs that are not recognisable URLs.
        return link.split("v=")[-1].split("&")[0]

    def extract(self, link: str) -> None:
        """Fetch the transcript for *link* and store it, unless already stored.

        If the video has no transcript (disabled or not found), a placeholder
        message is stored instead and the metadata records
        ``transcript_available: False`` so consumers can tell the two apart.
        Errors raised while scraping or storing are logged (with traceback)
        and swallowed; errors from the initial duplicate check propagate.
        """
        collection = self.db["youtube_transcripts"]

        # Check if data is already ingested to avoid duplicates
        if collection.find_one({"link": link}):
            logger.info(f"Transcript already exists in the database for: {link}")
            return

        logger.info(f"Scraping YouTube video: {link}")
        try:
            video_id = self._extract_video_id(link)

            # Fetch transcript using YouTube Transcript API
            transcript_available = True
            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id)
                transcript_text = " ".join(entry["text"] for entry in transcript)
            except (TranscriptsDisabled, NoTranscriptFound) as e:
                transcript_available = False
                logger.warning(f"No transcript available for {link}; storing placeholder text.")
                transcript_text = f"Transcript not available for this video. Error: {str(e)}"

            # Save the transcript text into GridFS
            file_id = self.fs.put(transcript_text.encode("utf-8"), filename="youtube_transcript", link=link)
            logger.info(f"Stored transcript in GridFS with file_id: {file_id}")

            # Store metadata of the transcript in MongoDB (e.g., link, file_id)
            try:
                collection.insert_one({
                    "link": link,
                    "file_id": file_id,
                    "platform": "youtube",
                    "metadata": {
                        "video_id": video_id,
                        "transcript_length": len(transcript_text),
                        "transcript_available": transcript_available,
                    },
                })
            except Exception:
                # Do not leave an orphaned GridFS file behind when the
                # metadata write fails; the outer handler logs the error.
                self.fs.delete(file_id)
                raise
            logger.info(f"Metadata for {link} saved successfully.")
        except Exception as e:
            # Best-effort ingestion: log with traceback and carry on.
            logger.exception(f"Error during extraction: {e}")

    def fetch_transcript(self, link: str):
        """Return the stored transcript text for *link*, or ``None``.

        ``None`` is returned when no metadata document exists for the link or
        when that document carries no GridFS ``file_id``.
        """
        document = self.db["youtube_transcripts"].find_one({"link": link})
        if not document:
            logger.info(f"No transcript found for {link}")
            return None

        file_id = document.get("file_id")
        if file_id is None:
            logger.warning(f"Metadata for {link} has no GridFS file_id")
            return None

        transcript_file = self.fs.get(file_id)
        return transcript_file.read().decode("utf-8")  # Return transcript as a string

    def print_all_urls(self) -> None:
        """Print the ``link`` of every document in ``youtube_transcripts``."""
        urls = self.db["youtube_transcripts"].find({}, {"link": 1})
        for url in urls:
            print(url["link"])

# Example usage
if __name__ == "__main__":
    # Sample YouTube video URLs to run through the pipeline
    youtube_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # Sample video 1
        "https://www.youtube.com/watch?v=3JZ_D3ELwOQ",  # Sample video 2
    ]

    youtube_crawler = YouTubeCrawler()

    # extract() skips links that are already stored, so re-running is safe
    for url in youtube_urls:
        youtube_crawler.extract(url)

    # Show every link currently recorded in the collection
    youtube_crawler.print_all_urls()


ClearML Task: created new task id=027068805eaf435c921665c875ecd97f
ClearML results page: http://localhost:8080/projects/19bbaae1f62b4cabb5b71164309d5cd5/experiments/027068805eaf435c921665c875ecd97f/output/log
2024-12-07 02:07:00,749 - clearml.Task - INFO - Storing jupyter notebook directly as code


INFO:__main__:Scraping YouTube video: https://www.youtube.com/watch?v=dQw4w9WgXcQ


CLEARML-SERVER new package available: UPGRADE to v1.17.0 is recommended!
Release Notes:
### New Features 
- New ClearML Model dashboard: View all live model endpoints in a single location, complete with real time metrics reporting.
- New UI pipeline run table comparative view: compare plots and scalars of selected pipeline runs
- Improve services agent behavior: If no credentials are specified, agent uses default credentials ([ClearML Server GitHub issue #140](https://github.com/allegroai/clearml-server/issues/140))
- Add UI re-enqueue of failed tasks
- Add UI experiment scalar results table view
- Add "Block running user's scripts in the browser" UI setting option for added security
- Add UI "Reset" to set task installed packages to originally recorded values 
- Add UI edit of default Project default output destination

### Bug Fixes
- Fix broken download links to artifacts stored in Azure ([ClearML Server GitHub issue #247](https://github.com/allegroai/clearml-server/issues/247))
- F

INFO:__main__:Stored transcript in GridFS with file_id: 6753f4156bb21546b08ab1ae
INFO:__main__:Metadata for https://www.youtube.com/watch?v=dQw4w9WgXcQ saved successfully.
INFO:__main__:Scraping YouTube video: https://www.youtube.com/watch?v=3JZ_D3ELwOQ
INFO:__main__:Stored transcript in GridFS with file_id: 6753f4166bb21546b08ab1b1
INFO:__main__:Metadata for https://www.youtube.com/watch?v=3JZ_D3ELwOQ saved successfully.


https://www.youtube.com/watch?v=dQw4w9WgXcQ
https://www.youtube.com/watch?v=3JZ_D3ELwOQ


