In [1]:
import requests
import csv
import fitz 
import pdfplumber
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [2]:
BASE_URL = "https://www.sharjah.ac.ae/"
visited_urls = set() 
data = []  
pdf_data = [] 

In [3]:
def get_all_links(url):
    """Extract all internal links and PDF links from a page."""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        links = set()
        pdf_links = set()

        for a_tag in soup.find_all("a", href=True):
            link = urljoin(url, a_tag["href"])  
            if BASE_URL in link and link not in visited_urls:  
                if link.endswith(".pdf"):
                    pdf_links.add(link)  
                else:
                    links.add(link)
        return links, pdf_links
    except requests.exceptions.RequestException:
        return set(), set()

In [4]:
def scrape_page(url):
    """Extract visible text from a webpage."""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        text = " ".join(soup.stripped_strings)  # Extract text
        return text
    except requests.exceptions.RequestException:
        return ""

In [5]:
def download_and_extract_pdf_text(pdf_url):
    # downlaod and process the pdf content
    try:
        response = requests.get(pdf_url, timeout=10)
        if response.status_code == 200:
            pdf_filename = "temp.pdf"
            with open(pdf_filename, "wb") as pdf_file:
                pdf_file.write(response.content)

            doc = fitz.open(pdf_filename)
            text = "\n".join([page.get_text("text") for page in doc])

            return text
        else:
            return ""
    except requests.exceptions.RequestException:
        return ""

In [6]:
# Start crawling from the home page till end of ...
to_visit = {BASE_URL}

while to_visit:
    url = to_visit.pop()
    if url in visited_urls:
        continue

    print(f"Scraping: {url}")
    visited_urls.add(url)

    text = scrape_page(url)
    if text:
        data.append([url, text])

    new_links, pdf_links = get_all_links(url)
    to_visit.update(new_links)

    # Process PDFs found on the page
    for pdf_url in pdf_links:
        if pdf_url not in visited_urls:
            print(f"Downloading and extracting PDF: {pdf_url}")
            visited_urls.add(pdf_url)
            pdf_text = download_and_extract_pdf_text(pdf_url)
            if pdf_text:
                pdf_data.append([pdf_url, pdf_text])


Scraping: https://www.sharjah.ac.ae/
Scraping: https://www.sharjah.ac.ae/en/Student-Life/Health-And-Wellness
Scraping: https://www.sharjah.ac.ae/en/Academics/Pharmacy
Scraping: https://www.sharjah.ac.ae/en/Student-Life
Scraping: https://www.sharjah.ac.ae/en/Academics/Public-Policy
Scraping: https://www.sharjah.ac.ae/en/Student-Life/Student-Council
Scraping: https://www.sharjah.ac.ae/en/Careers
Scraping: https://www.sharjah.ac.ae/Academics/Pharmacy##
Scraping: https://www.sharjah.ac.ae/en/+97165050701
Scraping: https://www.sharjah.ac.ae/Student-Life##
Scraping: https://www.sharjah.ac.ae/en/Student-Life/Dining-Hall
Scraping: https://www.sharjah.ac.ae/en/Academics/fa
Downloading and extracting PDF: https://www.sharjah.ac.ae/-/media/project/uos/sites/uos/colleges/fine-arts-and-design/highlights/cfad-annual-faculty-exhibition.pdf
Scraping: https://www.sharjah.ac.ae/en/Academics/Dentistry
Scraping: https://www.sharjah.ac.ae/en/Academics/Sciences
Scraping: https://www.sharjah.ac.ae/en/Academi

In [7]:
#now save webpage data  and pdf content in csv file
with open("website_data.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["URL", "Content"])
    writer.writerows(data)

with open("pdf_data.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["PDF URL", "Extracted Text"])
    writer.writerows(pdf_data)

print("Data saved to website_data.csv and pdf_data.csv")

Data saved to website_data.csv and pdf_data.csv


In [8]:
# import pandas as pd
# df = pd.read_csv("website_data.csv")
# print(df.head())