In [3]:
import requests
from bs4 import BeautifulSoup
import csv

# Scrape the PyTorch "basics" intro page and collect, for every listed
# tutorial, its title and its link (the hrefs on this page are relative).
url = "https://pytorch.org/tutorials/beginner/basics/intro.html"
# Without a timeout requests waits indefinitely on a stalled connection;
# 30 s is a generous upper bound for a single static page.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as the index.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

data = []
for dt in soup.find_all("div", class_="line"):
    # Look the anchor up once, and only accept anchors that carry an href.
    anchor = dt.find("a", href=True)
    if anchor is None:
        # A matching <div> without a usable link would otherwise raise
        # AttributeError / KeyError and abort the whole scrape.
        continue
    title = anchor.text.strip()
    href = anchor["href"]
    data.append([title, href])

# newline="" lets the csv module control line endings itself.
with open("pytorch.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Link"])
    writer.writerows(data)


In [4]:
import pandas as pd

# Load the scraped tutorial index (columns: Title, Link) back from disk.
df = pd.read_csv(filepath_or_buffer="pytorch.csv")
# Bare last expression -> rich DataFrame display.
df

Unnamed: 0,Title,Link
0,Quickstart,quickstart_tutorial.html
1,Tensors,tensorqs_tutorial.html
2,Datasets and DataLoaders,data_tutorial.html
3,Transforms,transforms_tutorial.html
4,Build Model,buildmodel_tutorial.html
5,Automatic Differentiation,autogradqs_tutorial.html
6,Optimization Loop,optimization_tutorial.html
7,"Save, Load and Use Model",saveloadrun_tutorial.html


In [5]:
# Turn the relative tutorial hrefs into absolute URLs.
# Only rows that do not already start with the base URL are touched, so
# re-running this cell is a no-op instead of prepending the prefix twice.
base_url = 'https://pytorch.org/tutorials/beginner/basics/'
needs_prefix = ~df['Link'].str.startswith(base_url, na=False)
df.loc[needs_prefix, 'Link'] = base_url + df.loc[needs_prefix, 'Link']
df

Unnamed: 0,Title,Link
0,Quickstart,https://pytorch.org/tutorials/beginner/basics/...
1,Tensors,https://pytorch.org/tutorials/beginner/basics/...
2,Datasets and DataLoaders,https://pytorch.org/tutorials/beginner/basics/...
3,Transforms,https://pytorch.org/tutorials/beginner/basics/...
4,Build Model,https://pytorch.org/tutorials/beginner/basics/...
5,Automatic Differentiation,https://pytorch.org/tutorials/beginner/basics/...
6,Optimization Loop,https://pytorch.org/tutorials/beginner/basics/...
7,"Save, Load and Use Model",https://pytorch.org/tutorials/beginner/basics/...


In [6]:
from urllib.parse import urljoin

def fetch_html_content(link, timeout=30):
    """Download a page and return the HTML of its first ``<div class="section">``.

    Every ``<img>`` inside that div that has a non-empty ``src`` gets the
    attribute rewritten to an absolute URL, resolved against ``link``.

    Args:
        link: URL of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block the caller indefinitely.

    Returns:
        The prettified HTML of the section div as a string, or the literal
        string ``"No content found"`` when the page has no such div.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    page_response = requests.get(link, timeout=timeout)
    # Do not store an HTTP error page as if it were tutorial content.
    page_response.raise_for_status()
    page_soup = BeautifulSoup(page_response.text, "html.parser")
    content_div = page_soup.find("div", class_="section")

    if content_div is None:
        return "No content found"

    for img in content_div.find_all('img'):
        img_src = img.get("src")
        if img_src:
            # urljoin leaves already-absolute URLs untouched.
            img['src'] = urljoin(link, img_src)

    return content_div.prettify()

# Fetch each tutorial page (one HTTP request per row) and keep its HTML
# in a new "Content" column.
df["Content"] = df["Link"].apply(func=fetch_html_content)
# Overwrite the index file with the enriched table; the row index is not written.
df.to_csv(path_or_buf="pytorch.csv", encoding="utf-8", index=False)


In [7]:
# Display the frame to inspect the scraped "Content" column.
df

Unnamed: 0,Title,Link,Content
0,Quickstart,https://pytorch.org/tutorials/beginner/basics/...,"<div class=""section"" id=""quickstart"">\n <h1>\n..."
1,Tensors,https://pytorch.org/tutorials/beginner/basics/...,"<div class=""section"" id=""tensors"">\n <h1>\n T..."
2,Datasets and DataLoaders,https://pytorch.org/tutorials/beginner/basics/...,"<div class=""section"" id=""datasets-dataloaders""..."
3,Transforms,https://pytorch.org/tutorials/beginner/basics/...,"<div class=""section"" id=""transforms"">\n <h1>\n..."
4,Build Model,https://pytorch.org/tutorials/beginner/basics/...,"<div class=""section"" id=""build-the-neural-netw..."
5,Automatic Differentiation,https://pytorch.org/tutorials/beginner/basics/...,"<div class=""section"" id=""automatic-differentia..."
6,Optimization Loop,https://pytorch.org/tutorials/beginner/basics/...,"<div class=""section"" id=""optimizing-model-para..."
7,"Save, Load and Use Model",https://pytorch.org/tutorials/beginner/basics/...,"<div class=""section"" id=""save-and-load-the-mod..."


In [8]:
# Dump the stored HTML of the first row (label 0) as plain text.
print(df.loc[0, 'Content'])

<div class="section" id="quickstart">
 <h1>
  Quickstart
  <a class="headerlink" href="#quickstart" title="Permalink to this heading">
   ¶
  </a>
 </h1>
 <p class="date-info-last-verified" style="color: #6c6c6d; font-size: small;">
  Created On: Feb 09, 2021 | Last Updated: Jan 24, 2025 | Last Verified: Not Verified
 </p>
 <p>
  This section runs through the API for common tasks in machine learning. Refer to the links in each section to dive deeper.
 </p>
 <div class="section" id="working-with-data">
  <h2>
   Working with data
   <a class="headerlink" href="#working-with-data" title="Permalink to this heading">
    ¶
   </a>
  </h2>
  <p>
   PyTorch has two
   <a class="reference external" href="https://pytorch.org/docs/stable/data.html">
    primitives to work with data
   </a>
   :
   <code class="docutils literal notranslate">
    <span class="pre">
     torch.utils.data.DataLoader
    </span>
   </code>
   and
   <code class="docutils literal notranslate">
    <span class="pre">
