In [None]:
from dotenv import load_dotenv
import os

# Load the .env file. override=True asks python-dotenv to let values from
# .env replace variables that are already set in the process environment.
load_dotenv(override=True)


def _redact_credentials(uri):
    """Return ``uri`` with any ``userinfo@`` part replaced by ``***@``.

    Connection strings may embed a username and password, and everything a
    cell prints is saved with the notebook, so credentials must never be
    echoed verbatim. URIs without a scheme or without userinfo are returned
    unchanged.
    """
    scheme, separator, remainder = uri.partition("://")
    if not separator:
        return uri
    # Only the authority (everything before the first "/") can hold userinfo.
    authority, slash, tail = remainder.partition("/")
    _userinfo, at_sign, hosts = authority.rpartition("@")
    if not at_sign:
        return uri
    return f"{scheme}://***@{hosts}{slash}{tail}"


# Access environment variables; each one falls back to a localhost default.
SELENIUM_URL = os.getenv("SELENIUM_URL", "http://localhost:4444/wd/hub")
print(f"SELENIUM_URL at '{_redact_credentials(SELENIUM_URL)}'")

MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://localhost:27017/")
print(f"MONGODB_URI at '{_redact_credentials(MONGODB_URI)}'")

DATABASE = os.getenv("DATABASE", "video_feed_crawler")
print(f"DATABASE at '{DATABASE}'")


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By


# Browser options for the Chrome session started by the remote Selenium server.
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")  # start Chrome with its sandbox disabled
# Headless mode is currently off; uncomment to enable it:
# options.add_argument('headless')

# Open a session on the Selenium server whose address comes from SELENIUM_URL
# (set in the configuration cell, default http://localhost:4444/wd/hub).
driver = webdriver.Remote(options=options, command_executor=SELENIUM_URL)


In [None]:
# Page to crawl: the "videos" listing of the @EonUpdates YouTube channel.
url = "https://www.youtube.com/@EonUpdates/videos"

In [None]:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver.get(url)

# The video grid may be filled in by JavaScript after navigation returns.
# Wait (up to 20 s) until at least one grid item exists so that the DOM
# snapshot taken in the next cell is not empty. A timeout is reported but not
# fatal, so the notebook keeps its previous best-effort behaviour.
try:
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "ytd-rich-item-renderer"))
    )
except TimeoutException:
    print("Timed out waiting for the video grid; continuing with the page as loaded.")

# Read the title after the wait so it reflects the rendered page.
title = driver.title

print(title)

In [None]:
# Ask the browser for the current contents of <body> as an HTML string, so the
# page can be parsed locally after the browser session is closed.
body=driver.execute_script("return document.body.innerHTML;")

In [None]:
# End the remote browser session; the cells below only use the HTML captured
# in `body`.
driver.quit()

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the captured HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(body, "html.parser")

In [None]:
# Collect every <ytd-rich-item-renderer> element in the page.
# Presumably one element per video tile — verify against the live markup.
videos_soup=soup.find_all('ytd-rich-item-renderer')

In [None]:
# Inspect the raw matches.
# NOTE(review): this renders the full HTML of every matched element; consider
# showing only a slice (e.g. the first item) before sharing the notebook.
videos_soup

In [None]:
# Number of matched elements; 0 means no <ytd-rich-item-renderer> was present
# in the captured HTML.
len(videos_soup)

In [None]:
# Accumulator for the video records built from `videos_soup` in the next cell.
videos=[]

In [None]:
from urllib.parse import parse_qs, urlsplit


def _extract_video(video_soup):
    """Build one video record from a grid-item element, or return None.

    Args:
        video_soup: a parsed element exposing BeautifulSoup's ``find``.

    Returns:
        A dict with ``_id`` (the ``v`` query parameter of the title link),
        ``site``, ``title``, ``cat`` and ``img``; or ``None`` when the element
        has no title, no link, no image source, or its link carries no ``v``
        parameter (i.e. it is not a ``/watch?v=...`` URL).
    """
    title_tag = video_soup.find(id="video-title")
    link_tag = video_soup.find(id="video-title-link")
    img_tag = video_soup.find("img")
    # Skip items that do not have the expected structure instead of raising
    # AttributeError on a missing tag.
    if title_tag is None or link_tag is None or img_tag is None:
        return None

    link = link_tag.get("href")
    img_src = img_tag.get("src")
    if not link or not img_src:
        return None

    # Read the id from the query string instead of splitting on fixed text,
    # so extra parameters or a different parameter order cannot break it.
    video_ids = parse_qs(urlsplit(link).query).get("v")
    if not video_ids:
        return None

    return {
        "_id": video_ids[0],
        "site": 0,  # NOTE(review): meaning of the 0 code is not visible here
        "title": title_tag.text,
        "cat": 0,  # NOTE(review): meaning of the 0 code is not visible here
        # "link": link,
        "img": img_src,
    }


# Rebuild the list from scratch so that re-running this cell does not append
# the same records a second time.
videos = [
    record
    for record in (_extract_video(video_soup) for video_soup in videos_soup)
    if record is not None
]
    
    

In [None]:
# Print each extracted record on its own line for a quick visual check.
for video in videos:
    print(video)

In [None]:
from pymongo import MongoClient
# Create the MongoDB client for the URI read in the configuration cell.
client = MongoClient(MONGODB_URI)

In [None]:
# Handle to the database named by DATABASE (default "video_feed_crawler").
db = client[DATABASE]

In [None]:
# Get or create the "videos" collection and its compound index
from pymongo import ASCENDING, DESCENDING


collection_name = "videos"
index_name = "cat_1_created_at_-1"

# Create the collection explicitly when the database does not list it yet.
if not (collection_name in db.list_collection_names()):
    db.create_collection(collection_name)
videos_collection = db[collection_name]

# Add the compound index (cat ascending, created_at descending) unless an
# index with this name is already reported for the collection.
existing_indexes = videos_collection.index_information()
if not (index_name in existing_indexes):
    videos_collection.create_index(
        [("cat", ASCENDING), ("created_at", DESCENDING)],
        name=index_name,
    )

print(f"Collection '{collection_name}' and index are ready.")

In [None]:
import datetime


In [None]:
from pymongo.errors import DuplicateKeyError

# Store each record with a UTC creation timestamp. The video id is used as
# `_id`, so a video stored by an earlier run raises DuplicateKeyError; report
# and skip it instead of aborting the remaining inserts.
for video in videos:

    video["created_at"] = datetime.datetime.now(tz=datetime.timezone.utc)

    try:
        video_id = videos_collection.insert_one(video).inserted_id
    except DuplicateKeyError:
        print(f"{video['_id']} already stored, skipping")
        continue
    print(video_id)

In [None]:
# Get or create the "pages" collection and its compound index
collection_name = "pages"
index_name = "cat_1_created_at_-1"

# Create the collection explicitly when the database does not list it yet.
if not (collection_name in db.list_collection_names()):
    db.create_collection(collection_name)
pages_collection = db[collection_name]

# Add the compound index (cat ascending, created_at descending) unless an
# index with this name is already reported for the collection.
existing_indexes = pages_collection.index_information()
if not (index_name in existing_indexes):
    pages_collection.create_index(
        [("cat", ASCENDING), ("created_at", DESCENDING)],
        name=index_name,
    )

print(f"Collection '{collection_name}' and index are ready.")

In [None]:
# Record the crawled page itself in the "pages" collection.
title = "Eon Updates"
link = "https://www.youtube.com/@EonUpdates/videos"
page = {
    "site": 0,  # NOTE(review): meaning of the 0 code is not visible here
    "title": title,
    "cat": 0,  # NOTE(review): meaning of the 0 code is not visible here
    "link": link,
    "created_at": datetime.datetime.now(tz=datetime.timezone.utc),
}

# Page documents get an auto-generated _id, so a plain insert would add one
# more copy of the same page on every run. Reuse the document already stored
# for this link when there is one.
existing_page = pages_collection.find_one({"link": link}, {"_id": 1})
if existing_page is None:
    page_id = pages_collection.insert_one(page).inserted_id
else:
    page_id = existing_page["_id"]
print(page_id)