In [62]:
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import re
import fitz

In [82]:
# Shared crawl state used by the functions defined in this cell
visited_urls = dict()    # URL -> True for a scraped page, 'PDF' for a recorded PDF link
urls_to_scrape = list()  # pending URLs; main() consumes them from the front
# Substrings that exclude a URL from the regular crawl
unvalid_urls = [
    'erasmus.yildiz.edu.tr/en',
    '/haberler',
    '/files',
]
# Remaining number of '/haberler' links that scrape_url() is still allowed to queue
max_duyuru_number = 10

def is_valid_url(url):
    """Return True if ``url`` is a crawlable page on erasmus.yildiz.edu.tr.

    The host is taken from the parsed URL rather than matched as a substring,
    so links that merely mention the domain (for example a social-media share
    link carrying the page address in its query string, or a look-alike host
    such as ``erasmus.yildiz.edu.tr.example.com``) are rejected.

    Args:
        url: Absolute URL to test.

    Returns:
        bool: True when the host is ``erasmus.yildiz.edu.tr`` and the URL
        contains none of the substrings listed in ``unvalid_urls``.
    """
    try:
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed hrefs (e.g. an unbalanced '[' in the host)
        return False

    if host != 'erasmus.yildiz.edu.tr':
        return False

    # Reject anything matching an excluded substring (English site, news, files, ...)
    return not any(word in url for word in unvalid_urls)

def normalize_url(url, parent_url):
    """Turn a (possibly relative) link into an absolute URL.

    Args:
        url: The href value found on the page.
        parent_url: URL of the page the link was found on; used as the base.

    Returns:
        The result of joining ``url`` onto ``parent_url``.
    """
    absolute_url = urljoin(base=parent_url, url=url)
    return absolute_url

def scrape_url(url, timeout=10):
    """Scrape one page, queue the links worth following and record PDF links.

    Reads and updates the module-level ``visited_urls``, ``urls_to_scrape``
    and ``max_duyuru_number``.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole crawl.

    Returns:
        None. On success ``visited_urls[url]`` is set to True; on any failure
        a message is printed and the URL is left unmarked.
    """
    global max_duyuru_number

    if url in visited_urls:
        return

    print(f"Scraping {url}")
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('a', href=True):
            normalized_url = normalize_url(link['href'], url)

            # A link needs no queuing if it is this page, was already handled,
            # or is already waiting in the queue. Checking the queue as well
            # keeps repeated navigation links from bloating it and from
            # consuming the '/haberler' budget more than once per URL.
            already_known = (
                normalized_url == url
                or normalized_url in visited_urls
                or normalized_url in urls_to_scrape
            )

            # Regular in-site page
            if is_valid_url(normalized_url):
                if not already_known:
                    urls_to_scrape.append(normalized_url)

            # '/haberler' pages are excluded by is_valid_url; follow only a
            # limited number of them, tracked by max_duyuru_number
            elif '/haberler' in normalized_url and max_duyuru_number > 0:
                if not already_known:
                    urls_to_scrape.append(normalized_url)
                    max_duyuru_number -= 1

            elif normalized_url.endswith('.pdf') and 'images' not in normalized_url and 'University' not in normalized_url and 'university' not in normalized_url:
                # PDFs are recorded but never queued for link extraction
                visited_urls[normalized_url] = 'PDF'

        # Mark the URL as visited only after its links were processed
        visited_urls[url] = True
    except Exception as e:
        # Best-effort crawl: report the failure and carry on with the next URL
        print(f"Failed to scrape {url} due to {e}")
        
def main(start_url):
    """Run the crawl: seed the queue with ``start_url`` and drain it.

    URLs are taken from the front of ``urls_to_scrape`` (first in, first out)
    and handed to ``scrape_url``, which may append further URLs to the queue.
    """
    urls_to_scrape.append(start_url)
    while len(urls_to_scrape) > 0:
        # list.pop(0) shifts the remaining items; acceptable for this queue size
        next_url = urls_to_scrape.pop(0)
        scrape_url(next_url)

In [26]:
# URL the crawl starts from
start_url = 'http://erasmus.yildiz.edu.tr'
# Running this cell starts the crawl: main() keeps calling scrape_url() until the queue is empty
main(start_url)

Scraping http://erasmus.yildiz.edu.tr
Scraping https://erasmus.yildiz.edu.tr/
Scraping https://erasmus.yildiz.edu.tr/sayfa/AB/Hakkımızda/625
Scraping https://erasmus.yildiz.edu.tr/sayfa/6
Scraping https://erasmus.yildiz.edu.tr/mansetler/10/Erasmus-Uygulaması-ve-Avrupa-Öğrenci-Kartı-Girişimi
Scraping https://erasmus.yildiz.edu.tr/mansetler/3/Erasmus-
Scraping https://erasmus.yildiz.edu.tr/sayfa/21/1
Scraping https://erasmus.yildiz.edu.tr/sayfa/22/9
Scraping https://erasmus.yildiz.edu.tr/sayfa/23/3
Scraping https://erasmus.yildiz.edu.tr/sayfa/33/5
Scraping https://erasmus.yildiz.edu.tr/sayfa/24/4
Scraping https://erasmus.yildiz.edu.tr/sayfa/32/12
Scraping https://erasmus.yildiz.edu.tr/sayfa/7/2024-2025-Başvuruları-için-Süreç-Takvimi/911
Scraping https://erasmus.yildiz.edu.tr/sayfa/7/Erasmus-Süreç-Takvimi-2023-2024/810
Scraping https://erasmus.yildiz.edu.tr/sayfa/7/Bölüm-Koordinatörleri-Departmental-Coordinators/737
Scraping https://erasmus.yildiz.edu.tr/sayfa/7/Fact-Sheet-of-Host-Univers

Scraping https://erasmus.yildiz.edu.tr/sayfa/FEN-EDEBİYAT-FAKÜLTESİ--GÜNCELLENİYOR-/Türk-Dili-ve-Edebiyatı---Turkish-language-and-literature/575
Scraping https://erasmus.yildiz.edu.tr/sayfa/FEN-EDEBİYAT-FAKÜLTESİ--GÜNCELLENİYOR-/KİMYA-Chemistry/576
Scraping https://erasmus.yildiz.edu.tr/sayfa/FEN-EDEBİYAT-FAKÜLTESİ--GÜNCELLENİYOR-/İSTATİSTİK-Statistics/577
Scraping https://erasmus.yildiz.edu.tr/sayfa/FEN-EDEBİYAT-FAKÜLTESİ--GÜNCELLENİYOR-/FİZİK-Physics/578
Scraping https://erasmus.yildiz.edu.tr/sayfa/KA-107-ICM-ANLAŞMALARI/Avrupa-dışı-ülkeler-ile-anlaşmalar/897
Scraping https://erasmus.yildiz.edu.tr/sayfa/Bilgiler--Güncelleniyor-/Faaliyet-Süreçleri/542
Scraping https://erasmus.yildiz.edu.tr/sayfa/Bilgiler--Güncelleniyor-/Bilgilendirme-Sunumları/549
Scraping https://erasmus.yildiz.edu.tr/sayfa/KA-131-%C3%96%C4%9Frenim-Hareketlili%C4%9Fi--Avrupa-Birli%C4%9Fi-%C3%9Clkeleri-/De%C4%9Fi%C5%9Fiklik--Feragat-ve-Vazge%C3%A7me/884
Scraping https://erasmus.yildiz.edu.tr/sayfa/7/OLS-%C3%87evrimi%C

### Removing unwanted URLs

In [39]:
def remove_unwanted_urls(urls):
    """Drop every URL whose path ends in two numeric segments (``/<digits>/<digits>``).

    An optional trailing slash is tolerated. The mapping is modified in place
    and the same object is returned.
    """
    numeric_tail = re.compile(r'/\d+/\d+/?$')
    # Iterate over a snapshot of the keys so entries can be deleted while looping
    for candidate in list(urls):
        if numeric_tail.search(candidate):
            del urls[candidate]
    return urls

# remove_unwanted_urls deletes the matching keys in place and returns the same dict
visited_urls = remove_unwanted_urls(visited_urls)

In [40]:
len(visited_urls)

310

In [70]:
# PyMuPDF
def analyze_pdf_content(url, timeout=30):
    """Download a PDF and measure how much extractable text it contains.

    Args:
        url: URL of the PDF to download.
        timeout: Seconds to wait for the server; prevents one stalled
            download from blocking the notebook indefinitely.

    Returns:
        The average number of extracted characters per page, ``0`` for a
        document without pages, or ``None`` if the download or the parsing
        failed (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Fail with a clear HTTP error instead of feeding an error page to the PDF parser
        response.raise_for_status()
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            page_count = doc.page_count
            if page_count == 0:
                return 0
            total_chars = sum(len(page.get_text()) for page in doc)
        return total_chars / page_count
    except Exception as e:  # Any failure to fetch or process the PDF is reported, not raised
        print(f"Error processing {url}: {e}")
        return None  # Indicate failure


def filter_pdfs(visited_urls):
    """Delete PDF entries that are unreadable or contain too little text.

    Every key whose value is ``'PDF'`` is analysed with ``analyze_pdf_content``.
    Entries for which the analysis failed, or whose average characters per page
    fall below the threshold, are removed from ``visited_urls`` in place.
    """
    min_chars_per_page = 500  # Threshold for average characters per page

    def _should_drop(pdf_url):
        density = analyze_pdf_content(pdf_url)
        return density is None or density < min_chars_per_page

    # First pass: analyse all PDFs (snapshot of the keys, in insertion order)
    pdf_urls = [key for key, status in visited_urls.items() if status == 'PDF']
    rejected = [pdf_url for pdf_url in pdf_urls if _should_drop(pdf_url)]

    # Second pass: remove the rejected entries and report each removal
    for pdf_url in rejected:
        del visited_urls[pdf_url]
        print(f"Removed {pdf_url} due to processing error or low text content.")



In [71]:
# Analyses every entry marked 'PDF' and deletes the low-text or unreadable ones from visited_urls in place
filter_pdfs(visited_urls)

Removed https://erasmus.yildiz.edu.tr/media/files/2024-2025 Erasmus+ ka131 öğrenci UA Portal kullanım kılavuzu.pdf due to processing error or low text content.
Removed https://erasmus.yildiz.edu.tr/media/files/2024-2025 erasmus+ ka131 kion başvuru kılavuz(2).pdf due to processing error or low text content.
Removed https://erasmus.yildiz.edu.tr/media/files/2022 dönemi erasmus personel hareketliliği kion kullanm kılavuzu(4).pdf due to processing error or low text content.
Removed https://erasmus.yildiz.edu.tr/media/files/ICM_2021_georgia(1).pdf due to processing error or low text content.
Removed https://erasmus.yildiz.edu.tr/media/files/ErasmusGelenOgrenciBasvuruDegerlendirme.pdf due to processing error or low text content.


In [67]:
len(visited_urls)

244

In [99]:
def remove_other_pdfs(urls):
    """Delete PDF entries whose URL contains 'Univer' or 'univer'.

    Args:
        urls: Mapping of URL -> status; entries with status ``'PDF'`` are
            candidates. The mapping is modified in place.

    Returns:
        None. Each matching URL is printed once when found and once when deleted.
    """
    urls_to_be_removed = []
    for url in urls:
        if urls[url] == 'PDF':
            if 'Univer' in url or 'univer' in url:
                print(url)
                urls_to_be_removed.append(url)
    for url in urls_to_be_removed:
        # Delete from the mapping that was passed in, not from a global
        del urls[url]
        print(f'deleted {url}')
        
# Drop the PDF entries whose URL contains 'Univer' or 'univer'
remove_other_pdfs(visited_urls)

https://erasmus.yildiz.edu.tr/media/files/Berlin Institute of Technology (Technische Universität Berlin).pdf
https://erasmus.yildiz.edu.tr/media/files/Technische Universität München.pdf
https://erasmus.yildiz.edu.tr/media/files/Technische Universität Bergakademie Freiberg.pdf
https://erasmus.yildiz.edu.tr/media/files/Université de Lorraine.pdf
https://erasmus.yildiz.edu.tr/media/files/universidade nova de lisboa.pdf
https://erasmus.yildiz.edu.tr/media/files/universite Tercihleri ve Yerlestirme2.pdf
deleted https://erasmus.yildiz.edu.tr/media/files/Berlin Institute of Technology (Technische Universität Berlin).pdf
deleted https://erasmus.yildiz.edu.tr/media/files/Technische Universität München.pdf
deleted https://erasmus.yildiz.edu.tr/media/files/Technische Universität Bergakademie Freiberg.pdf
deleted https://erasmus.yildiz.edu.tr/media/files/Université de Lorraine.pdf
deleted https://erasmus.yildiz.edu.tr/media/files/universidade nova de lisboa.pdf
deleted https://erasmus.yildiz.edu.t

In [100]:
len(visited_urls)

208

In [101]:
visited_urls

{'http://erasmus.yildiz.edu.tr': True,
 'https://erasmus.yildiz.edu.tr/': True,
 'https://erasmus.yildiz.edu.tr/sayfa/AB/Hakkımızda/625': True,
 'https://erasmus.yildiz.edu.tr/sayfa/6': True,
 'https://erasmus.yildiz.edu.tr/mansetler/10/Erasmus-Uygulaması-ve-Avrupa-Öğrenci-Kartı-Girişimi': True,
 'https://erasmus.yildiz.edu.tr/mansetler/3/Erasmus-': True,
 'https://erasmus.yildiz.edu.tr/sayfa/7/2024-2025-Başvuruları-için-Süreç-Takvimi/911': True,
 'https://erasmus.yildiz.edu.tr/sayfa/7/Erasmus-Süreç-Takvimi-2023-2024/810': True,
 'https://erasmus.yildiz.edu.tr/sayfa/7/Bölüm-Koordinatörleri-Departmental-Coordinators/737': True,
 'https://erasmus.yildiz.edu.tr/media/files/Fact Sheet HKA_2023(1).pdf': 'PDF',
 'https://erasmus.yildiz.edu.tr/media/files/2022_08_01_KIT_D_KARLSRU01_ERASMUS_Factsheet(1).pdf': 'PDF',
 'https://erasmus.yildiz.edu.tr/media/files/Hochschule Mainz(1).pdf': 'PDF',
 'https://erasmus.yildiz.edu.tr/media/files/Fact Sheet for Partners 2023_2024_DMUNSTER02.pdf': 'PDF',
 

In [102]:
# Count PDF entries; everything else is a regular page
pdfs = sum(1 for status in visited_urls.values() if status == 'PDF')
normal = len(visited_urls) - pdfs
print(normal, pdfs)

124 84


In [103]:
# List every URL that was recorded as a PDF
for pdf_url, status in visited_urls.items():
    if status == 'PDF':
        print(pdf_url)

https://erasmus.yildiz.edu.tr/media/files/Fact Sheet HKA_2023(1).pdf
https://erasmus.yildiz.edu.tr/media/files/2022_08_01_KIT_D_KARLSRU01_ERASMUS_Factsheet(1).pdf
https://erasmus.yildiz.edu.tr/media/files/Hochschule Mainz(1).pdf
https://erasmus.yildiz.edu.tr/media/files/Fact Sheet for Partners 2023_2024_DMUNSTER02.pdf
https://erasmus.yildiz.edu.tr/media/files/Cilt1.pdf
https://erasmus.yildiz.edu.tr/media/files/Key Information Sheet 2022_23 D MARBURG01.pdf
https://erasmus.yildiz.edu.tr/media/files/U_Bielefeld_Factsheet_2324.pdf
https://erasmus.yildiz.edu.tr/media/files/Fontys Schools.pdf
https://erasmus.yildiz.edu.tr/media/files/VILNIUS TECH Info-sheet 2023-2024 AUTUMN.pdf
https://erasmus.yildiz.edu.tr/media/files/UNIVERSIDAD CATÓLICA “SANTA TERESA DE JESÚS” DE AVILA.pdf
https://erasmus.yildiz.edu.tr/media/files/inalco – Institut national des langues et civilisations orientales.pdf
https://erasmus.yildiz.edu.tr/media/files/umea(1).pdf
https://erasmus.yildiz.edu.tr/media/files/Eastwest E

### Saving the URLs dictionary

In [104]:
import pickle
from pathlib import Path


# Save dictionary to a file, creating the output directory if it does not exist yet
output_path = Path('data/erasmus_page_urls.pickle')
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('wb') as handle:
    # HIGHEST_PROTOCOL may not be readable by older Python versions
    pickle.dump(visited_urls, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # Load dictionary from the file
# with open('data/erasmus_page_urls.pickle', 'rb') as handle:
#     loaded_dict = pickle.load(handle)

# print(loaded_dict)


In [57]:
def extract_text_from_url(url):
    """
    Return the last path segment of ``url`` that is not purely numeric.

    Handles both ``.../text`` and ``.../text/number`` endings. Empty segments
    (from leading, trailing or doubled slashes) are ignored. Returns None when
    the path has no such segment.
    """
    # Walk the path from the end so the first match is the last text segment
    for segment in reversed(urlparse(url).path.split('/')):
        if segment and not segment.isdigit():
            return segment
    return None

In [None]:
hi. Code: ```# Initialize data structures
fetched_urls = {}
urls_to_fetch = []
base_url = '/erasmus.yildiz.edu.tr'
unvalid_urls = ['erasmus.yildiz.edu.tr/en', '/haberler', '/files', '/etkinlik', '/duyuru']
max_haber_number = 3
max_duyuru_number = 3
max_etkinlik_number = 3

# ----------------------

def is_valid_url(url):
    """Check if the URL contains the specified domain and is not a file."""
    if base_url in url and sum([word in url for word in unvalid_urls]) == 0:
        return True
    return False

def normalize_url(url, parent_url):
    """Resolve relative URLs to absolute URLs based on the parent URL."""
    return urljoin(parent_url, url)

def fetch_url(url):
    """Fetch the given URL for links and manage the fetching queue."""
    global max_haber_number
    global max_duyuru_number
    global max_etkinlik_number
    
    if url in fetched_urls:
        return

    print(f"Fetching {url}")
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('a', href=True):
            normalized_url = normalize_url(link['href'], url)
            # Check if the URL is valid and not a PDF
            if is_valid_url(normalized_url):
                if normalized_url not in fetched_urls:
                    urls_to_fetch.append(normalized_url)
                    
            # Fetch specified number of Duyuru pages 
            elif 'haberler' in normalized_url and max_haber_number > 0:
                if normalized_url not in fetched_urls:
                    urls_to_fetch.append(normalized_url)
                    max_haber_number -= 1
                    
            # Fetch specified number of Duyuru pages 
            elif 'duyurular' in normalized_url and max_duyuru_number > 0:
                if normalized_url not in fetched_urls:
                    urls_to_fetch.append(normalized_url)
                    max_duyuru_number -= 1
                    
            # Fetch specified number of Duyuru pages 
            elif 'etkinlik' in normalized_url and max_etkinlik_number > 0:
                if normalized_url not in fetched_urls:
                    urls_to_fetch.append(normalized_url)
                    max_etkinlik_number -= 1
                    
            elif normalized_url.endswith('.pdf') and 'images' not in normalized_url and 'University' not in normalized_url and 'university' not in normalized_url:
                # Store PDF URLs differently if needed
                fetched_urls[normalized_url] = 'PDF'
                
        # Mark the URL as fetched
        fetched_urls[url] = True
    except Exception as e:
        print(f"Failed to fetch {url} due to {e}")
        
def main(start_url):
    """Initialize the scraping process with a starting URL."""
    urls_to_fetch.append(start_url)
    while urls_to_fetch:
        url = urls_to_fetch.pop(0) # FIFO queue, could be optimized
        fetch_url(url)``` Can you refactor this into a single function that takes start_url, base_url, a list of unvalid_urls, and max_haber_number, max_duyuru_number, and max_etkinlik_number (each with a default value of 3), and returns a dictionary called fetched_urls? Be careful, organized, and accurate.