In [2]:
import requests  
from bs4 import BeautifulSoup  
import pandas as pd  
from urllib.parse import urljoin, urlparse  # Add urlparse
from PyPDF2 import PdfFileReader  
from io import BytesIO 
import os
  
def get_links(url, base_url):  
    if url.endswith('.pdf'):
        print(f"Skipping {url} as it is a PDF file")
        return []
    else:
        response = requests.get(url)  
        soup = BeautifulSoup(response.text, 'html.parser')  
        links = [urljoin(base_url, a.get('href')) for a in soup.find_all('a', href=True) if (base_url in urljoin(base_url, a.get('href')) and os.path.splitext(urljoin(base_url, a.get('href')))[1] not in ['.jpg', '.jpeg', '.png'])] 
        links = list(set(links))  # Remove duplicates
        return links  
  
  
def scrape_data(url):    
    response = requests.get(url)  
      
    file_type = os.path.splitext(url)[1]  
      
    if file_type == '.pdf':  
        reader = PdfFileReader(BytesIO(response.content))  
        text = ' '.join(page.extractText() for page in reader.pages)  
        title = reader.getDocumentInfo().title  
    else:  
        soup = BeautifulSoup(response.text, 'html.parser')    
        title = url   
        text = soup.text  
          
    return title, text  


def recursive_scrape(url, base_url, seen):  
    if url not in seen:  
        seen.add(url)  
        print(f"Scraping {url}")  
        links = get_links(url, base_url)  
        for link in links:  
            try:
                recursive_scrape(link, base_url, seen)  
            except:
                print(f"Failed to scrape {link}")
  
    
  



In [3]:
base_url = "https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes"  # replace with your base URL  
initial_url = base_url + "/"  # replace with your initial page  

seen = set()  

recursive_scrape(initial_url, base_url, seen)  

data = []  
for url in seen:  
    title, text = scrape_data(url)  
    data.append({"title": title, "text": text})  

df = pd.DataFrame(data)  
df.to_excel("scraped_data.xlsx", index=False)  

Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/genie-civil
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes#uo-main-content
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/genie-chimique
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/genie-mechanique
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/genie-mecanique/sequence-cours
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/genie-electrique
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/genie-electrique/sequence-cours
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/informatique
Scraping https://www.uottawa.ca/faculte-genie/etudes-premier-cycle

In [8]:

df["title"][3]

'https://www.uottawa.ca/faculte-genie/etudes-premier-cycle/programmes/genie-logiciel/sequence-cours'

In [9]:
df["text"][3]

"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSéquences de cours: Génie logiciel | Faculté de génie\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        Aller au contenu principal\n      \n\n\n\n\n\n\n\n\n\n\n\n\nTertiary navigation\n\n\nÉtudiants actuels\n\n\nProfesseurs et employés\n\n\nDiplômés\n\n\nDonner\n\n\n\n\n\n\n\n\nPrimary navigation\n\n\n\n\n\n\n\n\nclose\nFermer\n\n\n\n\nsearch\n\n\nmenu\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nÉtudes\n\n\n\n\n\n\n\n\n\n\nVie sur le campus\n\n\n\n\n\n\n\n\n\n\nRecherche et innovation\n\n\n\n\n\n\n\n\n\n\nNotre université\n\n\n\n\n\n\n\n\n\n\n\n\nchevron_left\n\n                                Menu principal\n                                \n\n\nÉtudes\n\n\n\n                                  Explorer\n                                \n\n\n\n\n\n\nPourquoi choisir l'Université d'Ottawa\n\n\n\n\nProgrammes et cours\n\n\n\n\nÉtudes de premier cycle\n\n\n\n\nÉtudes supérieures