In [1]:
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import time
import json

In [2]:
HEADERS = {"User-Agent": "Mozilla/5.0"}

def save_to_json(data, filename):
    with open(filename, "w") as f:
        json.dump(data, f, indent=2)
    print(f"Saved {len(data)} entries to {filename}")

In [3]:
from urllib.parse import urljoin

def scrape_sklearn():
    base_url = "https://scikit-learn.org/stable/tutorial/index.html"
    try:
        res = requests.get(base_url, headers=HEADERS)
        soup = BeautifulSoup(res.content, "html.parser")

        links = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if "tutorial/" in href and href.endswith(".html"):
                full_url = urljoin(base_url, href)
                print(f"Found link: {full_url}")
                links.add(full_url)

    except Exception as e:
        print(f"Error accessing scikit-learn index page: {e}")
        return []

    print(f"Total scikit-learn tutorial links found: {len(links)}")

    data = []
    for url in links:
        try:
            print(f"Scraping: {url}")
            res = requests.get(url, headers=HEADERS)
            page = BeautifulSoup(res.content, "html.parser")
            title = page.find("h1").text.strip() if page.find("h1") else "No title"

            code_blocks = []
            for div in page.find_all("div", class_="highlight-python notranslate"):
                pre = div.find("pre")
                if pre:
                    code = pre.text.strip()
                    if any(kw in code for kw in ["sklearn.", "import", "def", "class"]):
                        code_blocks.append(code)

            print(f"{title}: {len(code_blocks)} code blocks found")
            if code_blocks:
                data.append({
                    "url": url,
                    "title": title,
                    "code_snippets": code_blocks
                })
            time.sleep(1.2)
        except Exception as e:
            print(f"Error scraping {url}: {e}")
    return data


In [6]:
sklearn_data = scrape_sklearn()
save_to_json(sklearn_data, "sklearn_tutorials.json")

Total scikit-learn tutorial links: 4
Scraping: https://scikit-learn.org/stable/tutorial/basic/tutorial.html
404: 0 code blocks found
Scraping: https://scikit-learn.org/stable/tutorial/statistical_inference/index.html
404: 0 code blocks found
Scraping: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
404: 0 code blocks found
Scraping: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
No title: 0 code blocks found


KeyboardInterrupt: 

In [5]:
import requests
from bs4 import BeautifulSoup
import time
import json

def scrape_sklearn():
    links = [
        "https://scikit-learn.org/stable/modules/linear_model.html#",
        "https://scikit-learn.org/stable/modules/lda_qda.html",
        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html",
        "https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html"
    ]

    print(f"Total scikit-learn tutorial links: {len(links)}")
    data = []

    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    for url in links:
        try:
            print(f"Scraping: {url}")
            res = requests.get(url, headers=headers)
            page = BeautifulSoup(res.content, "html.parser")
            title_tag = page.find("h1")
            title = title_tag.text.strip() if title_tag else "No title"

            code_blocks = []
            for div in page.find_all("div", class_="highlight"):
                pre = div.find("pre")
                if pre:
                    code = pre.text.strip()
                    if any(kw in code for kw in ["sklearn.", "import", "def", "class"]):
                        code_blocks.append(code)

            print(f"{title}: {len(code_blocks)} code blocks found")
            if code_blocks:
                data.append({
                    "url": url,
                    "title": title,
                    "code_snippets": code_blocks
                })
            time.sleep(1.2)
        except Exception as e:
            print(f"Error scraping {url}: {e}")
    return data

def save_to_json(data, filename):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"Saved {len(data)} entries to {filename}")
