In [51]:
import requests
from bs4 import BeautifulSoup
import csv

# Scrape the scikit-learn user guide table of contents and store every
# (chapter title, section title, link) triple in scikit_learn.csv.
url = "https://scikit-learn.org/stable/user_guide.html"
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the cell indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page and silently
# writing a CSV that contains only the header row.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

data = []

# Each "toctree-l1" <li> is a numbered chapter; the "toctree-l2" <li> items
# nested inside it are that chapter's sections.
for chapter_li in soup.find_all("li", class_="toctree-l1"):
    title_tag = chapter_li.find("a")  # look the anchor up once, not twice
    title = title_tag.text.strip() if title_tag else ""
    for sub_li in chapter_li.find_all("li", class_="toctree-l2"):
        link_tag = sub_li.find("a")
        if link_tag:
            inner_title = link_tag.text.strip()
            # href is still relative to the user guide page at this point.
            href = link_tag.get("href")
            data.append([title, inner_title, href])

# newline="" lets the csv module control line endings itself.
with open("scikit_learn.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Inner Title", "Link"])
    writer.writerows(data)


In [52]:
import pandas as pd

# Load the scraped table of contents back from disk and display it.
df = pd.read_csv(filepath_or_buffer="scikit_learn.csv")
df

Unnamed: 0,Title,Inner Title,Link
0,1. Supervised learning,1.1. Linear Models,modules/linear_model.html
1,1. Supervised learning,1.2. Linear and Quadratic Discriminant Analysis,modules/lda_qda.html
2,1. Supervised learning,1.3. Kernel ridge regression,modules/kernel_ridge.html
3,1. Supervised learning,1.4. Support Vector Machines,modules/svm.html
4,1. Supervised learning,1.5. Stochastic Gradient Descent,modules/sgd.html
...,...,...,...
108,10. Common pitfalls and recommended practices,10.3. Controlling randomness,common_pitfalls.html#controlling-randomness
109,11. Dispatching,11.1. Array API support (experimental),modules/array_api.html
110,"13. External Resources, Videos and Talks",13.1. New to Scientific Python?,presentations.html#new-to-scientific-python
111,"13. External Resources, Videos and Talks",13.2. External Tutorials,presentations.html#external-tutorials


In [53]:
# Turn the relative links scraped from the table of contents into absolute
# URLs. Only rows that do not already start with the prefix are rewritten, so
# re-running this cell does not prepend the prefix a second time.
base_url = 'https://scikit-learn.org/stable/'
# na=False treats a missing link as "not yet prefixed"; prefix + NaN is still
# NaN, so missing links stay missing.
needs_prefix = ~df['Link'].str.startswith(base_url, na=False)
df.loc[needs_prefix, 'Link'] = base_url + df.loc[needs_prefix, 'Link']
df

Unnamed: 0,Title,Inner Title,Link
0,1. Supervised learning,1.1. Linear Models,https://scikit-learn.org/stable/modules/linear...
1,1. Supervised learning,1.2. Linear and Quadratic Discriminant Analysis,https://scikit-learn.org/stable/modules/lda_qd...
2,1. Supervised learning,1.3. Kernel ridge regression,https://scikit-learn.org/stable/modules/kernel...
3,1. Supervised learning,1.4. Support Vector Machines,https://scikit-learn.org/stable/modules/svm.html
4,1. Supervised learning,1.5. Stochastic Gradient Descent,https://scikit-learn.org/stable/modules/sgd.html
...,...,...,...
108,10. Common pitfalls and recommended practices,10.3. Controlling randomness,https://scikit-learn.org/stable/common_pitfall...
109,11. Dispatching,11.1. Array API support (experimental),https://scikit-learn.org/stable/modules/array_...
110,"13. External Resources, Videos and Talks",13.1. New to Scientific Python?,https://scikit-learn.org/stable/presentations....
111,"13. External Resources, Videos and Talks",13.2. External Tutorials,https://scikit-learn.org/stable/presentations....


In [54]:
from urllib.parse import urljoin

def fetch_html_content(link, timeout=30):
    """Download a documentation page and return its main article as HTML.

    Args:
        link: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            requests applies no timeout by default, so one is always passed.

    Returns:
        The prettified HTML of the page's ``<article class="bd-article">``
        element, with every ``<img>`` ``src`` rewritten to an absolute URL.
        Returns the string "No content found" when the server answers with an
        HTTP error status or the page has no such element.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    page_response = requests.get(link, timeout=timeout)

    # A 4xx/5xx response is an error page, not the requested section; do not
    # store its markup as if it were the section's content.
    if not page_response.ok:
        return "No content found"

    page_soup = BeautifulSoup(page_response.text, "html.parser")
    content_div = page_soup.find("article", class_="bd-article")

    if content_div is None:
        return "No content found"

    # Image paths in the page are relative to the page itself; make them
    # absolute so the extracted HTML still renders outside the site.
    for img in content_div.find_all('img'):
        img_src = img.get("src")
        if img_src:
            img['src'] = urljoin(link, img_src)

    return content_div.prettify()

# Fetch the article HTML for every link (one HTTP request per row), then
# overwrite the CSV so it also carries the new "Content" column.
df["Content"] = df["Link"].apply(func=fetch_html_content)
df.to_csv(path_or_buf="scikit_learn.csv", encoding="utf-8", index=False)


In [55]:
# Display the frame to confirm that the "Content" column was added.
df

Unnamed: 0,Title,Inner Title,Link,Content
0,1. Supervised learning,1.1. Linear Models,https://scikit-learn.org/stable/modules/linear...,"<article class=""bd-article"">\n <section id=""li..."
1,1. Supervised learning,1.2. Linear and Quadratic Discriminant Analysis,https://scikit-learn.org/stable/modules/lda_qd...,"<article class=""bd-article"">\n <section id=""li..."
2,1. Supervised learning,1.3. Kernel ridge regression,https://scikit-learn.org/stable/modules/kernel...,"<article class=""bd-article"">\n <section id=""ke..."
3,1. Supervised learning,1.4. Support Vector Machines,https://scikit-learn.org/stable/modules/svm.html,"<article class=""bd-article"">\n <section id=""su..."
4,1. Supervised learning,1.5. Stochastic Gradient Descent,https://scikit-learn.org/stable/modules/sgd.html,"<article class=""bd-article"">\n <section id=""st..."
...,...,...,...,...
108,10. Common pitfalls and recommended practices,10.3. Controlling randomness,https://scikit-learn.org/stable/common_pitfall...,"<article class=""bd-article"">\n <section id=""co..."
109,11. Dispatching,11.1. Array API support (experimental),https://scikit-learn.org/stable/modules/array_...,"<article class=""bd-article"">\n <section id=""ar..."
110,"13. External Resources, Videos and Talks",13.1. New to Scientific Python?,https://scikit-learn.org/stable/presentations....,"<article class=""bd-article"">\n <section id=""ex..."
111,"13. External Resources, Videos and Talks",13.2. External Tutorials,https://scikit-learn.org/stable/presentations....,"<article class=""bd-article"">\n <section id=""ex..."


In [56]:
# Print the raw HTML stored for the row labelled 0 as a spot check.
print(df.loc[0, 'Content'])

<article class="bd-article">
 <section id="linear-models">
  <span id="linear-model">
  </span>
  <h1>
   <span class="section-number">
    1.1.
   </span>
   Linear Models
   <a class="headerlink" href="#linear-models" title="Link to this heading">
    #
   </a>
  </h1>
  <p>
   The following are a set of methods intended for regression in which
the target value is expected to be a linear combination of the features.
In mathematical notation, if
   <span class="math notranslate nohighlight">
    \(\hat{y}\)
   </span>
   is the predicted
value.
  </p>
  <div class="math notranslate nohighlight">
   \[\hat{y}(w, x) = w_0 + w_1 x_1 + ... + w_p x_p\]
  </div>
  <p>
   Across the module, we designate the vector
   <span class="math notranslate nohighlight">
    \(w = (w_1,
..., w_p)\)
   </span>
   as
   <code class="docutils literal notranslate">
    <span class="pre">
     coef_
    </span>
   </code>
   and
   <span class="math notranslate nohighlight">
    \(w_0\)
   </span>
   as
   