In [1]:
import requests
from bs4 import BeautifulSoup
import time
import json

In [2]:
BASE_URL = "https://www.tensorflow.org/tutorials"
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [3]:
def get_tutorial_links():
    res = requests.get(BASE_URL, headers=HEADERS)
    soup = BeautifulSoup(res.content, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/tutorials/") and "overview" not in href:
            full_url = "https://www.tensorflow.org" + href
            links.append(full_url)
    return list(set(links))


In [7]:
def scrape_tutorial(url):
    res = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(res.content, "html.parser")
    title = soup.find("h1").text.strip() if soup.find("h1") else ""

    code_snippets = []
    for pre in soup.find_all("pre"):
        code_block = pre.text.strip()
        # Keep only likely Python code
        if any(code_block.startswith(kw) for kw in ("import", "def", "class")) or "tf." in code_block:
            code_snippets.append(code_block)

    return {
        "url": url,
        "title": title,
        "code_snippets": code_snippets
    }

In [8]:
def scrape_all_tutorials():
    tutorial_links = get_tutorial_links()
    all_data = []
    for link in tutorial_links:
        print(f"Scraping: {link}")
        data = scrape_tutorial(link)
        all_data.append(data)
        time.sleep(1.5)
    return all_data

In [10]:
def scrape_tensorflow():
    base_url = "https://www.tensorflow.org/tutorials"
    res = requests.get(base_url, headers=HEADERS)
    soup = BeautifulSoup(res.content, "html.parser")

    links = {
        "https://www.tensorflow.org" + a["href"]
        for a in soup.find_all("a", href=True)
        if a["href"].startswith("/tutorials/") and "overview" not in a["href"]
    }

    data = []
    for url in links:
        res = requests.get(url, headers=HEADERS)
        page = BeautifulSoup(res.content, "html.parser")
        title = page.find("h1").text.strip() if page.find("h1") else ""
        code_blocks = [pre.text.strip() for pre in page.find_all("pre") if "tf." in pre.text or pre.text.strip().startswith(("import", "def", "class"))]
        data.append({"url": url, "title": title, "code_snippets": code_blocks})
        time.sleep(1.2)
    return data


In [9]:
tutorials = scrape_all_tutorials()
with open("tensorflow_tutorials.json", "w") as f:
    json.dump(tutorials, f, indent=2)
print(f"Scraped {len(tutorials)} tutorials.")

Scraping: https://www.tensorflow.org/tutorials/generative/pix2pix
Scraping: https://www.tensorflow.org/tutorials/keras/text_classification_with_hub
Scraping: https://www.tensorflow.org/tutorials/keras/classification
Scraping: https://www.tensorflow.org/tutorials/load_data/text
Scraping: https://www.tensorflow.org/tutorials/keras/overfit_and_underfit
Scraping: https://www.tensorflow.org/tutorials/distribute/save_and_load
Scraping: https://www.tensorflow.org/tutorials/video/transfer_learning_with_movinet
Scraping: https://www.tensorflow.org/tutorials/distribute/dtensor_keras_tutorial
Scraping: https://www.tensorflow.org/tutorials/generative/cyclegan
Scraping: https://www.tensorflow.org/tutorials/generative/dcgan
Scraping: https://www.tensorflow.org/tutorials/distribute/parameter_server_training
Scraping: https://www.tensorflow.org/tutorials/generative/autoencoder
Scraping: https://www.tensorflow.org/tutorials/keras/text_classification
Scraping: https://www.tensorflow.org/tutorials/optimi