In [1]:
from urllib.parse import urljoin, urlparse
from collections import deque
import requests
from bs4 import BeautifulSoup
import re
import fitz

In [5]:
class WebUrlFetcher:
    def __init__(self, start_url, base_url, unvalid_urls, unvalid_pdf_words, max_haber_number=3, max_duyuru_number=3, max_etkinlik_number=3, text_density=500):
        self.fetched_urls = {}
        self.urls_to_fetch = [start_url]
        self.base_url = base_url
        self.unvalid_urls = unvalid_urls
        self.unvalid_pdf_words = unvalid_pdf_words
        self.max_haber_number = max_haber_number
        self.max_duyuru_number = max_duyuru_number
        self.max_etkinlik_number = max_etkinlik_number
        self.text_density = text_density

    def is_valid_url(self, url):
        if self.base_url in url and not any(unvalid in url for unvalid in self.unvalid_urls):
            return True
        return False

    def normalize_url(self, url, parent_url):
        return urljoin(parent_url, url)

    def analyze_pdf_content(self, url):
        try:
            response = requests.get(url)
            with fitz.open(stream=response.content, filetype="pdf") as doc:
                total_chars = 0
                for page in doc:
                    text = page.get_text()
                    total_chars += len(text)
                avg_chars_per_page = total_chars / len(doc) if doc.page_count > 0 else 0
            return avg_chars_per_page
        except Exception as e:
            print(f"Error processing {url}: {e}")
            return 0  # Return 0 instead of None to allow comparisons

    def is_valid_pdf_url(self, url):
        avg_chars_per_page = self.analyze_pdf_content(url)
        return not any(word in url for word in self.unvalid_pdf_words) and (avg_chars_per_page is not None and avg_chars_per_page >= self.text_density)

    def fetch_url(self, url):
        if url in self.fetched_urls:
            return

        print(f"Fetching URL: {url}")
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a', href=True):
                normalized_url = self.normalize_url(link['href'], url)

                if self.is_valid_url(normalized_url) and normalized_url not in self.fetched_urls:
                    self.urls_to_fetch.append(normalized_url)

                elif 'haberler' in normalized_url and self.max_haber_number > 0:
                    self.urls_to_fetch.append(normalized_url)
                    self.max_haber_number -= 1

                elif 'duyurular' in normalized_url and self.max_duyuru_number > 0:
                    self.urls_to_fetch.append(normalized_url)
                    self.max_duyuru_number -= 1

                elif 'etkinlik' in normalized_url and self.max_etkinlik_number > 0:
                    self.urls_to_fetch.append(normalized_url)
                    self.max_etkinlik_number -= 1

                elif normalized_url.endswith('.pdf') and self.is_valid_pdf_url(normalized_url):
                    self.fetched_urls[normalized_url] = 'PDF'

            self.fetched_urls[url] = True
        except Exception as e:
            print(f"Failed to fetch {url} due to {e}")

    def fetch(self):
        while self.urls_to_fetch:
            url = self.urls_to_fetch.pop(0)
            self.fetch_url(url)

        return self.fetched_urls



In [6]:
# Initialize data structures
start_url = 'https://enm.yildiz.edu.tr/'
base_url = '/enm.yildiz.edu.tr'
unvalid_urls = ['enm.yildiz.edu.tr/en', '/haberler', '/files', '/etkinlik', '/duyuru', '.com']
unvalid_pdf_words=[]
max_haber_number = 3
max_duyuru_number = 3
max_etkinlik_number = 3

# ----------------------
# Note: Function calls and method calls are commented out to adhere to instructions.
# To use this class, instantiate it with the necessary parameters and call the `scrape` method.
# Example:
fetcher = WebUrlFetcher(start_url=start_url, base_url=base_url,
                     unvalid_urls=unvalid_urls, unvalid_pdf_words=unvalid_pdf_words)
fetched_urls = fetcher.fetch()

Fetching URL: https://enm.yildiz.edu.tr/
Fetching URL: https://enm.yildiz.edu.tr/#main-content
Fetching URL: https://enm.yildiz.edu.tr/hakkimizda
Fetching URL: https://enm.yildiz.edu.tr/hakkimizda/aday-ogrenciler
Error processing https://www.yildiz.edu.tr/media/files/Y%C4%B1ld%C4%B1z%20Teknik%20%C3%9Cniversitesi%20Tan%C4%B1t%C4%B1m%20Katalo%C4%9Fu.pdf: cannot open broken document
Error processing https://www.yildiz.edu.tr/media/files/makine_fakultesi(5).pdf: cannot open broken document
Fetching URL: https://enm.yildiz.edu.tr/hakkimizda/bolum-baskanligimiz
Fetching URL: https://enm.yildiz.edu.tr/hakkimizda/ders-program-ciktilari-matrisi
Fetching URL: https://enm.yildiz.edu.tr/hakkimizda/ders-plan-ve-icerikleri
Fetching URL: https://enm.yildiz.edu.tr/hakkimizda/egitim-amaclarimiz
Fetching URL: https://enm.yildiz.edu.tr/hakkimizda/misyon-ve-vizyonumuz
Fetching URL: https://enm.yildiz.edu.tr/hakkimizda/program-ciktilarimiz
Fetching URL: https://enm.yildiz.edu.tr/kisiler
Fetching URL: https

In [7]:
len(fetched_urls)

109

In [8]:
for i in fetched_urls:
    if fetched_urls[i] == 'PDF':
        print(i)

https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/images/files/2-PÇ Ders İlişki Matrisi (GENEL) 02_03_2022.pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/files/endustri-muhendisligi-bolumu-komisyonlari.pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/images/files/Erasmus_Surecleri.pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/images/files/O%CC%88g%CC%86renci%20Temsilcilig%CC%86i.pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/images/files/Ogrenci Konseyi Yonergesi(1).pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/files/inline-files/Staj%2BYonergesi_%2025092023.pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/files/2024-02/2023-2024-bahar-donemi-makine-fakultesi-basvuru-ilkeleri-ve-alinan-kararlar.pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/files/2024-02/makstaj_036-1.pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/files/2024-02/2023-2024-bahar-donem-staj-takvim-1.pdf
https://enm.yildiz.edu.tr/sites/enm.yildiz.edu.tr/files/inline-

### Saving urls dictionary 

In [9]:
import pickle


# Save dictionary to a file
with open('../data/end_page_urls.pickle', 'wb') as handle:
    pickle.dump(fetched_urls, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # Load dictionary from the file
# with open('data/erasmus_page_urls.pickle', 'rb') as handle:
#     loaded_dict = pickle.load(handle)

# print(loaded_dict)
