In [14]:
import requests
from bs4 import BeautifulSoup
import csv

# Scrape the table of contents at `url`: each <dt> supplies a heading, and every
# <a> inside the <dd> that follows it becomes one (Title, Inner Title, Link) row.
url = "https://www.postgresql.org/docs/17/sql.html"
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page into a CSV.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

data = []

for dt in soup.find_all("dt"):
    title = dt.text.strip()
    # Only <dt> elements that are followed by a sibling <dd> contribute rows.
    dd = dt.find_next_sibling("dd")
    if dd:
        for link in dd.find_all("a"):
            inner_title = link.text.strip()
            # href is stored exactly as written in the page (None if the
            # attribute is missing); it is not resolved to an absolute URL here.
            href = link.get("href")
            data.append([title, inner_title, href])

# newline="" lets the csv module control line endings itself.
with open("sql.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Inner Title", "Link"])
    writer.writerows(data)


In [15]:
import pandas as pd

# Read the scraped rows back from disk into a DataFrame and show it as the
# cell's output.
csv_path = "sql.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,Title,Inner Title,Link
0,4. SQL Syntax,4.1. Lexical Structure,sql-syntax-lexical.html
1,4. SQL Syntax,4.2. Value Expressions,sql-expressions.html
2,4. SQL Syntax,4.3. Calling Functions,sql-syntax-calling-funcs.html
3,5. Data Definition,5.1. Table Basics,ddl-basics.html
4,5. Data Definition,5.2. Default Values,ddl-default.html
...,...,...,...
122,14. Performance Tips,14.5. Non-Durable Settings,non-durability.html
123,15. Parallel Query,15.1. How Parallel Query Works,how-parallel-query-works.html
124,15. Parallel Query,15.2. When Can Parallel Query Be Used?,when-can-parallel-query-be-used.html
125,15. Parallel Query,15.3. Parallel Plans,parallel-plans.html


In [16]:
from urllib.parse import urljoin

# Resolve each scraped href against the documentation root. Unlike plain string
# concatenation, urljoin leaves an already-absolute URL unchanged, so re-running
# this cell does not prepend the base a second time.
DOCS_BASE_URL = 'https://www.postgresql.org/docs/17/'
# na_action='ignore' keeps missing links as NaN instead of passing them to urljoin.
df['Link'] = df['Link'].map(
    lambda href: urljoin(DOCS_BASE_URL, href),
    na_action='ignore',
)
df

Unnamed: 0,Title,Inner Title,Link
0,4. SQL Syntax,4.1. Lexical Structure,https://www.postgresql.org/docs/17/sql-syntax-...
1,4. SQL Syntax,4.2. Value Expressions,https://www.postgresql.org/docs/17/sql-express...
2,4. SQL Syntax,4.3. Calling Functions,https://www.postgresql.org/docs/17/sql-syntax-...
3,5. Data Definition,5.1. Table Basics,https://www.postgresql.org/docs/17/ddl-basics....
4,5. Data Definition,5.2. Default Values,https://www.postgresql.org/docs/17/ddl-default...
...,...,...,...
122,14. Performance Tips,14.5. Non-Durable Settings,https://www.postgresql.org/docs/17/non-durabil...
123,15. Parallel Query,15.1. How Parallel Query Works,https://www.postgresql.org/docs/17/how-paralle...
124,15. Parallel Query,15.2. When Can Parallel Query Be Used?,https://www.postgresql.org/docs/17/when-can-pa...
125,15. Parallel Query,15.3. Parallel Plans,https://www.postgresql.org/docs/17/parallel-pl...


In [17]:
def fetch_html_content(link, timeout=30):
    """Download a page and return its first ``<div class="sect1">`` as HTML text.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get``. Without one, a single
        stalled connection would block the caller indefinitely.

    Returns
    -------
    str
        The pretty-printed markup of the first ``div`` with class ``sect1``,
        or the literal string ``"No content found"`` when the response is an
        HTTP error or the page contains no such element.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (connection failure, timeout, ...).
    """
    page_response = requests.get(link, timeout=timeout)
    # Do not mine an HTTP error page for content; report it with the same
    # sentinel used for pages that lack the expected section.
    if not page_response.ok:
        return "No content found"
    page_soup = BeautifulSoup(page_response.text, "html.parser")
    content_div = page_soup.find("div", class_="sect1")
    return content_div.prettify() if content_div else "No content found"

# Fetch every linked page (one HTTP request per row) and store the extracted
# HTML alongside its link, then write the enriched table back to the same CSV.
page_contents = df["Link"].map(fetch_html_content)
df["Content"] = page_contents
df.to_csv("sql.csv", encoding="utf-8", index=False)

In [18]:
# Spot-check one scraped page: print the stored HTML for the row labelled 122.
print(df.loc[122, 'Content'])

<div class="sect1" id="NON-DURABILITY">
 <div class="titlepage">
  <div>
   <div>
    <h2 class="title" style="clear: both">
     14.5. Non-Durable Settings
     <a class="id_link" href="#NON-DURABILITY">
      #
     </a>
    </h2>
   </div>
  </div>
 </div>
 <a class="indexterm" id="id-1.5.13.8.2" name="id-1.5.13.8.2">
 </a>
 <p>
  Durability is a database feature that guarantees the recording of committed transactions even if the server crashes or loses power. However, durability adds significant database overhead, so if your site does not require such a guarantee,
  <span class="productname">
   PostgreSQL
  </span>
  can be configured to run much faster. The following are configuration changes you can make to improve performance in such cases. Except as noted below, durability is still guaranteed in case of a crash of the database software; only an abrupt operating system crash creates a risk of data loss or corruption when these settings are used.
 </p>
 <div class="itemizedlist"