In [None]:
import requests
from bs4 import BeautifulSoup
import csv

url = "https://javascript.info/document"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

data = []
for section in soup.find_all("ol", class_="lessons-list__lessons"):  
    for link in section.find_all("a"):  # Extract <a> tags
        title = link.text.strip()  # Extract title text
        href = link.get("href")  # Extract href attribute
        data.append([title, href])  # Append to list

with open("java.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Link"])  
    writer.writerows(data)

Data saved to java_learning_data.csv


In [33]:
import pandas as pd

df = pd.read_csv("java_learning_data.csv")
df

Unnamed: 0,Title,Link
0,"Browser environment, specs",/browser-environment
1,DOM tree,/dom-nodes
2,Walking the DOM,/dom-navigation
3,"Searching: getElement*, querySelector*",/searching-elements-dom
4,"Node properties: type, tag and contents",/basic-dom-node-properties
5,Attributes and properties,/dom-attributes-and-properties
6,Modifying the document,/modifying-document
7,Styles and classes,/styles-and-classes
8,Element size and scrolling,/size-and-scroll
9,Window sizes and scrolling,/size-and-scroll-window


In [None]:
df['Link'] = 'https://javascript.info' + df['Link']

Unnamed: 0,Title,Link
0,"Browser environment, specs",https://javascript.info/browser-environment
1,DOM tree,https://javascript.info/dom-nodes
2,Walking the DOM,https://javascript.info/dom-navigation
3,"Searching: getElement*, querySelector*",https://javascript.info/searching-elements-dom
4,"Node properties: type, tag and contents",https://javascript.info/basic-dom-node-properties
5,Attributes and properties,https://javascript.info/dom-attributes-and-pro...
6,Modifying the document,https://javascript.info/modifying-document
7,Styles and classes,https://javascript.info/styles-and-classes
8,Element size and scrolling,https://javascript.info/size-and-scroll
9,Window sizes and scrolling,https://javascript.info/size-and-scroll-window


In [None]:
from urllib.parse import urljoin  

def crawl_website(url, visited=set()):
    if url in visited:  
        return None
    visited.add(url)

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        content = []

        header = soup.find('h1', class_='main__header-title')
        if header:
            content.append(str(header))  

        content_div = soup.find('div', class_='content')
        if content_div:
            for img in content_div.find_all('img'):
                src = img.get('src')
                if src and src.startswith('/'): 
                    img['src'] = urljoin(url, src)  

            for a_tag in content_div.find_all('a', href=True):
                href = a_tag['href']
                if href.startswith('/'):  
                    a_tag['href'] = urljoin(url, href)  
            content.append(str(content_div))  
        return "\n".join(content)    
    except requests.RequestException as e:
        print(f"Error processing {url}: {e}")
        return None

df["Elements"] = df["Link"].apply(crawl_website)
df = df.drop(columns='Link')
df.to_csv("java_learning_data.csv", index=False)


In [36]:
df

Unnamed: 0,Title,Elements
0,"Browser environment, specs","<h1 class=""main__header-title"">Browser environ..."
1,DOM tree,"<h1 class=""main__header-title"">DOM tree</h1>\n..."
2,Walking the DOM,"<h1 class=""main__header-title"">Walking the DOM..."
3,"Searching: getElement*, querySelector*","<h1 class=""main__header-title"">Searching: getE..."
4,"Node properties: type, tag and contents","<h1 class=""main__header-title"">Node properties..."
5,Attributes and properties,"<h1 class=""main__header-title"">Attributes and ..."
6,Modifying the document,"<h1 class=""main__header-title"">Modifying the d..."
7,Styles and classes,"<h1 class=""main__header-title"">Styles and clas..."
8,Element size and scrolling,"<h1 class=""main__header-title"">Element size an..."
9,Window sizes and scrolling,"<h1 class=""main__header-title"">Window sizes an..."


In [39]:
print(df['Elements'][2])

<h1 class="main__header-title">Walking the DOM</h1>
<div class="content"><article class="formatted" itemscope="" itemtype="http://schema.org/TechArticle"><meta content="Walking the DOM" itemprop="name"/><div itemprop="author" itemscope="" itemtype="http://schema.org/Person"><meta content="iliakan@gmail.com" itemprop="email"/><meta content="Ilya Kantor" itemprop="name"/></div><div itemprop="articleBody"><p>The DOM allows us to do anything with elements and their contents, but first we need to reach the corresponding DOM object.</p>
<p>All operations on the DOM start with the <code>document</code> object. That’s the main “entry point” to DOM. From it we can access any node.</p>
<p>Here’s a picture of links that allow for travel between DOM nodes:</p>
<figure><div class="image" style="width:420px">
<div class="image__ratio" style="padding-top:92.38095238095238%"></div>
<object class="image__image" data="/article/dom-navigation/dom-links.svg" data-use-theme="" height="388" type="image/svg+