In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [7]:
# A class to represent a Webpage

# Browser-like request headers: some sites refuse requests that do not
# present a conventional User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A scraped web page: its title, visible body text, and the links it contains.

    Attributes:
        url: The address that was requested.
        body: Raw response bytes as returned by the server.
        title: Text of the <title> tag, or "No title found" when the tag is
            missing or does not hold a single text string.
        text: Text of <body> with script/style/img/input elements removed,
            fragments joined by newlines; "" when the page has no <body>.
        links: Every non-empty href found on an <a> tag, exactly as written
            in the page (relative hrefs are NOT resolved against `url`).
    """

    def __init__(self, url, timeout=30):
        """
        Fetch `url` and parse it into title, text and links.

        Args:
            url: Page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled server would block the kernel forever.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when `timeout` elapses.
        """
        self.url = url
        # NOTE(review): verify=False turns off TLS certificate validation, so
        # the response could be tampered with in transit. Kept as-is because it
        # may be a deliberate workaround for this host's certificate - confirm.
        response = requests.get(url, headers=headers, verify=False, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # `.string` is None when <title> is empty or wraps nested markup, so
        # guard against that as well as against a missing tag.
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Anchors without an href (or with an empty one) are skipped.
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the title and body text as one labelled block of text."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [8]:
# Scrape the court's alljud.php page and show every href found on it.
judgments_page_url = "https://www.federalshariatcourt.gov.pk/alljud.php"
web = Website(judgments_page_url)
web.links



['Judgments/S.P.2.L.1979 FSC JUDGEMENT.pdf',
 'Judgments/S.P.NOS.13.L.1979.pdf',
 'Judgments/CRIMINAL APPEAL NQ.4R. OF 1980..pdf',
 'Judgments/SHARIAT PETITION N0.6R OF 1980.pdf',
 'Judgments/CRIMINAL APP NO 9-R OF 1980.pdf',
 'Judgments/CRIMINAL APP No 12-I OF 1980.pdf',
 'Judgments/Sr.P.No.1.K.of.1981.pdf',
 'Judgments/Criminal Appeal No. 3-L OF 1981.pdf',
 'Judgments/Sr.P.No.3.P.of.1981.pdf',
 'Judgments/CRIMINAL APPEAL NO. 10 L OF 1981.pdf',
 'Judgments/Criminal Appeal No. 12-L OF 1981.pdf',
 'Judgments/S.P.NO. 15-L-1981.pdf',
 'Judgments/Criminal Appeal No. 16-L OF 1981.pdf',
 'Judgments/CR.A.NO.20-I-1981.pdf',
 'Judgments/CRIMINAL APP NO 20-I OF 1981.pdf',
 'Judgments/Criminal Appeal No. 23-L OF 1981.pdf',
 'Judgments/Criminal Appeal No. 27-L OF 1981.pdf',
 'Judgments/CRIMINAL APP No 72-I OF 1981.pdf',
 'Judgments/CRIMINAL APP NO 77-1 A 1981.pdf',
 'Judgments/CRIMINAL APP NO 79-I OF 1981.pdf',
 'Judgments/CRIMINAL APP NO 106-I OF 1981.pdf',
 'Judgments/CR.A.NO.114-I-81.pdf',
 'Ju

In [18]:
# Keep only the scraped hrefs that contain ".pdf" (substring match, case-sensitive).
pdfs = [href for href in web.links if ".pdf" in href]

In [19]:
# Turn each scraped PDF href into an absolute URL under the site root.
# urljoin returns an already-absolute URL unchanged, so re-running this cell
# does not stack the prefix a second time, and an href that starts with "/"
# does not end up with a doubled slash.
pdfs = [urljoin("https://federalshariatcourt.gov.pk/", link) for link in pdfs]

In [20]:
# Display the list of PDF URLs as the cell's output.
pdfs

['https://federalshariatcourt.gov.pk/Judgments/S.P.2.L.1979 FSC JUDGEMENT.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/S.P.NOS.13.L.1979.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/CRIMINAL APPEAL NQ.4R. OF 1980..pdf',
 'https://federalshariatcourt.gov.pk/Judgments/SHARIAT PETITION N0.6R OF 1980.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/CRIMINAL APP NO 9-R OF 1980.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/CRIMINAL APP No 12-I OF 1980.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/Sr.P.No.1.K.of.1981.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/Criminal Appeal No. 3-L OF 1981.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/Sr.P.No.3.P.of.1981.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/CRIMINAL APPEAL NO. 10 L OF 1981.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/Criminal Appeal No. 12-L OF 1981.pdf',
 'https://federalshariatcourt.gov.pk/Judgments/S.P.NO. 15-L-1981.pdf',
 'https://federalshariatcourt.gov.pk/Judgme

In [11]:
import json


In [12]:
# Serialise the PDF URL list (2-space indent) and save it as links.json
# in the current working directory.
links_json = json.dumps(pdfs, indent=2)
with open("links.json", "w") as out_file:
    out_file.write(links_json)