In [1]:
from urllib.parse import urljoin, urlparse
from collections import deque
import requests
from bs4 import BeautifulSoup
import re
import fitz

In [2]:
class WebUrlFetcher:
    def __init__(self, start_url, base_url, unvalid_urls, unvalid_pdf_words, max_haber_number=3, max_duyuru_number=3, max_etkinlik_number=3, text_density=500):
        self.fetched_urls = {}
        self.urls_to_fetch = [start_url]
        self.base_url = base_url
        self.unvalid_urls = unvalid_urls
        self.unvalid_pdf_words = unvalid_pdf_words
        self.max_haber_number = max_haber_number
        self.max_duyuru_number = max_duyuru_number
        self.max_etkinlik_number = max_etkinlik_number
        self.text_density = text_density

    def is_valid_url(self, url):
        if self.base_url in url and not any(unvalid in url for unvalid in self.unvalid_urls):
            return True
        return False

    def normalize_url(self, url, parent_url):
        return urljoin(parent_url, url)

    def analyze_pdf_content(self, url):
        try:
            response = requests.get(url)
            with fitz.open(stream=response.content, filetype="pdf") as doc:
                total_chars = 0
                for page in doc:
                    text = page.get_text()
                    total_chars += len(text)
                avg_chars_per_page = total_chars / len(doc) if doc.page_count > 0 else 0
            return avg_chars_per_page
        except Exception as e:
            print(f"Error processing {url}: {e}")
            return 0  # Return 0 instead of None to allow comparisons

    def is_valid_pdf_url(self, url):
        avg_chars_per_page = self.analyze_pdf_content(url)
        return not any(word in url for word in self.unvalid_pdf_words) and (avg_chars_per_page is not None and avg_chars_per_page >= self.text_density)

    def fetch_url(self, url):
        if url in self.fetched_urls:
            return

        print(f"Fetching URL: {url}")
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a', href=True):
                normalized_url = self.normalize_url(link['href'], url)

                if self.is_valid_url(normalized_url) and normalized_url not in self.fetched_urls:
                    self.urls_to_fetch.append(normalized_url)

                elif 'haberler' in normalized_url and self.max_haber_number > 0:
                    self.urls_to_fetch.append(normalized_url)
                    self.max_haber_number -= 1

                elif 'duyurular' in normalized_url and self.max_duyuru_number > 0:
                    self.urls_to_fetch.append(normalized_url)
                    self.max_duyuru_number -= 1

                elif 'etkinlik' in normalized_url and self.max_etkinlik_number > 0:
                    self.urls_to_fetch.append(normalized_url)
                    self.max_etkinlik_number -= 1

                elif normalized_url.endswith('.pdf') and self.is_valid_pdf_url(normalized_url):
                    self.fetched_urls[normalized_url] = 'PDF'

            self.fetched_urls[url] = True
        except Exception as e:
            print(f"Failed to fetch {url} due to {e}")

    def fetch(self):
        while self.urls_to_fetch:
            url = self.urls_to_fetch.pop(0)
            self.fetch_url(url)

        return self.fetched_urls



In [6]:
# Initialize data structures
start_url = 'https://www.yildiz.edu.tr/'
base_url = '/www.yildiz.edu.tr'
unvalid_urls = ['yildiz.edu.tr/en', 'page=', 'haberler', '/rehber', 'etkinlik', 'duyuru', '.com', '/files']
unvalid_pdf_words=['/files']
max_haber_number = 3
max_duyuru_number = 3
max_etkinlik_number = 3

# ----------------------
# Note: Function calls and method calls are commented out to adhere to instructions.
# To use this class, instantiate it with the necessary parameters and call the `scrape` method.
# Example:
fetcher = WebUrlFetcher(start_url=start_url, base_url=base_url,
                     unvalid_urls=unvalid_urls, unvalid_pdf_words=unvalid_pdf_words)
fetched_urls = fetcher.fetch()

Fetching URL: https://www.yildiz.edu.tr/
Fetching URL: https://www.yildiz.edu.tr/#main-content
Fetching URL: https://www.yildiz.edu.tr/universite
Fetching URL: https://www.yildiz.edu.tr/ar-ge
Fetching URL: https://www.yildiz.edu.tr/egitim
Fetching URL: https://www.yildiz.edu.tr/kampus
Fetching URL: https://www.yildiz.edu.tr/iletisim
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/tarihce
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/misyon-ve-vizyon
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/kampusler
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/kurumsal-kimlik
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/sayilarla-ytu
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/ytu-rektorleri
Fetching URL: https://www.yildiz.edu.tr/universite/yonetim
Fetching URL: https://www.yildiz.edu.tr/universite/yonetim/rektorun-mes

Fetching URL: https://www.yildiz.edu.tr/universite/akademik-kadro
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/tarihce#main-content
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/tarihce#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/misyon-ve-vizyon#main-content
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/misyon-ve-vizyon#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/kampusler#main-content
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/kampusler#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/kurumsal-kimlik#main-content
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/kurumsal-kimlik#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/sayilarla-ytu#main-content
Fetching URL: https://www.yildiz.edu.tr/universite/universitemiz/sayilarla-ytu#appsMenuLink
Fetching URL: https://ww

Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/akillim-kampus-ve-dijital-donusum
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/enerji-verimliligi#main-content
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/enerji-verimliligi#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/atik-yonetimi#main-content
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/atik-yonetimi#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/akilli-kampus-ve-dijital-donusum#main-content
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/akilli-kampus-ve-dijital-donusum#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/kampus/kampus-yasami#main-content
Fetching URL: https://www.yildiz.edu.tr/kampus/kampus-yasami#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/kampus/kampus-yasami/ogrenci-kulupl

Fetching URL: https://www.yildiz.edu.tr/kampus/saglikli-yasam#main-content
Fetching URL: https://www.yildiz.edu.tr/kampus/saglikli-yasam#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/iletisim/kampuslerimiz#main-content
Fetching URL: https://www.yildiz.edu.tr/iletisim/kampuslerimiz#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/universite/akademik-kadro#main-content
Fetching URL: https://www.yildiz.edu.tr/universite/akademik-kadro#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/sayfa/AKADEM%C4%B0K/AKADEM%C4%B0K-B%C4%B0R%C4%B0MLER/122
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/akillim-kampus-ve-dijital-donusum#main-content
Fetching URL: https://www.yildiz.edu.tr/kampus/surdurulebilirlik/akillim-kampus-ve-dijital-donusum#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/egitim/birlikleri-ve-oduller#main-content
Fetching URL: https://www.yildiz.edu.tr/egitim/birlikleri-ve-oduller#appsMenuLink
Fetching URL: https://www.yildiz.edu.tr/sayfa/AKADEM%C4

In [7]:
len(fetched_urls)

310

In [8]:
for i in fetched_urls:
    if fetched_urls[i] == 'PDF':
        print(i)

http://www.kalite.yildiz.edu.tr/login/sys/admin/subPages/img/Y%C3%96-049-YT%C3%9C%20%C3%96%C4%9Frenci%20Dekanl%C4%B1%C4%9F%C4%B1%20Y%C3%B6nergesi.pdf


### Saving urls dictionary 

In [10]:
import pickle


# Save dictionary to a file
with open('../data/ytu_page_urls.pickle', 'wb') as handle:
    pickle.dump(fetched_urls, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # Load dictionary from the file
# with open('data/ytu_page_urls.pickle', 'rb') as handle:
#     loaded_dict = pickle.load(handle)

# print(loaded_dict)
