# Web screenshot crawler

## Selenium setup

In [None]:
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

from selenium import webdriver

# Setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', options=options)

## List of websites

In [None]:
# Pages to capture, in visiting order. The position of each entry is the
# index that ends up in its screenshot file name.
urls = [
    "https://www.nytimes.com/es/",
    "https://www.bbc.com/mundo",
    "https://www.nbc.com/?lang=es",
]


## OneDrive access

In [None]:
!pip install msal
from msal import PublicClientApplication

client_id = 'your-client-id'
tenant_id = 'your-tenant-id'
authority_url = 'https://login.microsoftonline.com/' + tenant_id
client_secret = 'your-client-secret'

app = PublicClientApplication(client_id, authority=authority_url, client_credential=client_secret)

token_response = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
access_token = token_response['access_token']


## Screenshots and OneDrive upload

In [None]:
import requests

def upload_to_onedrive(access_token, file_path, file_name, timeout=60):
    """Upload a local file into the OneDrive ``Screenshots`` folder via Microsoft Graph.

    Args:
        access_token: Bearer token sent in the ``Authorization`` header.
        file_path: Path of the local file whose bytes are uploaded.
        file_name: Name the file gets inside ``Screenshots``. It is
            percent-encoded before being placed in the request URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The JSON body of the Graph response, parsed into Python objects.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
        RuntimeError: If Graph answers with an HTTP error status; the
            message carries the status code and the response body.

    NOTE(review): the endpoint uses the ``/me`` alias, which Graph resolves
    from a signed-in user. Confirm the token passed in is a delegated one;
    an app-only (client-credentials) token is expected to be rejected here.
    """
    # Local import keeps the function usable without touching other cells.
    from urllib.parse import quote

    headers = {
        "Authorization": "Bearer " + access_token,
        "Content-Type": "application/octet-stream"
    }
    # Encode the name so characters such as spaces, '#', '?' or ':' cannot
    # truncate or re-route the path-addressed URL.
    graph_endpoint = (
        "https://graph.microsoft.com/v1.0/me/drive/root:/Screenshots/"
        f"{quote(file_name)}:/content"
    )
    with open(file_path, "rb") as f:
        data = f.read()
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.put(graph_endpoint, headers=headers, data=data, timeout=timeout)
    if not response.ok:
        raise RuntimeError(
            f"OneDrive upload of {file_name} failed with HTTP "
            f"{response.status_code}: {response.text}"
        )
    item = response.json()
    print(item)
    return item

# Loop through URLs, take screenshots, and upload.
# A failure on one page is reported and the loop moves on to the next URL.
try:
    for i, url in enumerate(urls):
        try:
            driver.get(url)
            screenshot_path = f"/content/screenshot_{i}.png"
            # save_screenshot() signals an I/O failure by returning False
            # rather than raising, so check it before trying to upload.
            if not driver.save_screenshot(screenshot_path):
                raise OSError(f"could not write screenshot to {screenshot_path}")
            upload_to_onedrive(access_token, screenshot_path, f"screenshot_{i}.png")
        except Exception as e:
            print(f"Failed to process {url}: {e}")
finally:
    # Always shut the browser down, even if the cell is interrupted,
    # so no headless Chrome process is left running.
    driver.quit()
