In [4]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL of the thesis list
base_url = "https://research.tue.nl/en/studentTheses/"

# Query string sent with each listing request. 'page' is only a placeholder
# here; the page index is filled in per request.
params = dict(
  organisationIds='49ca0778-93f6-4b7f-95be-d4cbfb3be718',
  nofollow='true',
  type='/dk/atira/pure/studentthesis/studentthesistypes/studentthesis/master',
  page=0,
)

# Function to scrape a single page
def scrape_page(page_num):
  """Fetch one page of the thesis listing and return its entries.

  Args:
    page_num: Index of the listing page to request (sent as the 'page'
      query parameter).

  Returns:
    A list of dicts with the keys "Title" and "URL" (absolute link), one per
    thesis entry found on the page. The list is empty when the page cannot
    be fetched or contains no usable entries.
  """
  from urllib.parse import urljoin

  # Build a per-call query instead of mutating the shared module-level dict
  query = {**params, 'page': page_num}

  try:
    response = requests.get(base_url, params=query, timeout=30)
  except requests.exceptions.RequestException as error:
    print(f"Failed to fetch page {page_num} - Error: {error}")
    return []
  if not response.ok:
    print(f"Failed to fetch page {page_num} - Status Code: {response.status_code}")
    return []

  soup = BeautifulSoup(response.text, 'html.parser')
  theses = soup.find_all('div', class_='rendering_studentthesis_short')

  thesis_data = []
  for thesis in theses:
    heading = thesis.find('h3', class_='title')
    title_tag = heading.find('a') if heading else None
    if title_tag is None or not title_tag.get('href'):
      # Entry without a linked title: there is nothing usable to record
      continue
    thesis_data.append({
      "Title": title_tag.get_text(strip=True),
      # base_url already contains a path, so plain concatenation would
      # duplicate it; urljoin resolves relative and root-relative hrefs
      # correctly and leaves absolute links untouched
      "URL": urljoin(base_url, title_tag['href'])
    })

  return thesis_data

# Function to scrape all pages and save the results
def scrape_all_pages(num_pages=8, output_file='thesis_titles_and_links.json'):
  """Scrape the listing pages and save every thesis entry to a JSON file.

  Args:
    num_pages: Number of listing pages to fetch, starting at page 0.
      Defaults to 8, the value that used to be hard-coded here.
    output_file: Path of the JSON file to write. Defaults to the file name
      that used to be hard-coded here.

  Returns:
    None. Progress and the final count are printed.
  """
  all_theses = []
  for page_num in range(num_pages):
    print(f"Scraping page {page_num}...")
    all_theses.extend(scrape_page(page_num))

  # ensure_ascii=False keeps non-ASCII characters in titles readable in the file
  with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_theses, f, ensure_ascii=False, indent=4)

  print(f"Scraped {len(all_theses)} theses in total.")

# Run the scraper (prints progress per page and writes thesis_titles_and_links.json)
scrape_all_pages()


Scraping page 0...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraped 400 theses in total.


In [5]:
# import requests
# from bs4 import BeautifulSoup
# import json
# import time

# # Base URL of the thesis list and PDF files
# base_url = "https://research.tue.nl"

# # Function to scrape a single page for thesis titles and URLs
# def scrape_page(page_num):
#   thesis_page_url = f"{base_url}/en/studentTheses/"
#   params = {
#     'organisationIds': '49ca0778-93f6-4b7f-95be-d4cbfb3be718',
#     'nofollow': 'true',
#     'type': '/dk/atira/pure/studentthesis/studentthesistypes/studentthesis/master',
#     'page': page_num
#   }

#   response = requests.get(thesis_page_url, params=params)
#   soup = BeautifulSoup(response.text, 'html.parser')
#   theses = soup.find_all('div', class_='rendering_studentthesis_short')

#   thesis_data = []
#   for thesis in theses:
#     title_tag = thesis.find('h3', class_='title').find('a')
#     title = title_tag.get_text(strip=True)
#     link = title_tag['href']
#     full_link = base_url + link if not link.startswith('http') else link
#     thesis_data.append({
#       "Title": title,
#       "URL": full_link
#     })
  
#   return thesis_data

# # Function to scrape the PDF link from a thesis page
# def scrape_pdf_link(thesis_url):
#   response = requests.get(thesis_url)
#   soup = BeautifulSoup(response.text, 'html.parser')

#   # Look for the documents section
#   documents_section = soup.find('div', class_='documents')
#   if documents_section:
#     pdf_link_tag = documents_section.find('a', class_='link')
#     if pdf_link_tag and pdf_link_tag.get('href').endswith('.pdf'):
#       pdf_relative_url = pdf_link_tag['href']
#       pdf_full_url = base_url + pdf_relative_url  # Get the full PDF URL
#       return pdf_full_url
#   return None  # If no PDF link found

# # Function to scrape all pages and get thesis titles, URLs, and PDF links
# def scrape_all_theses():
#   all_theses = []
#   for page_num in range(8):  # 0 to 7 pages
#     print(f"Scraping page {page_num}...")
#     theses = scrape_page(page_num)
    
#     for thesis in theses:
#       print(f"Scraping PDF link for: {thesis['Title']}")
#       pdf_link = scrape_pdf_link(thesis['URL'])
#       thesis["PDF"] = pdf_link  # Add the PDF link to the JSON entry
#       time.sleep(1)  # Be nice to the server and add a delay

#     all_theses.extend(theses)
  
#   # Save all theses data to a JSON file
#   with open('thesis_titles_links_and_pdfs.json', 'w', encoding='utf-8') as f:
#     json.dump(all_theses, f, ensure_ascii=False, indent=4)

#   print(f"Scraped {len(all_theses)} theses in total.")

# # Run the scraper
# scrape_all_theses()


Scraping page 0...
Scraping PDF link for: /JoiCost/ : berekeningen aan stalen raamwerken
Scraping PDF link for: "...but some are more equal than others": development of an instrument for software performance prioritization
Scraping PDF link for: .NL
Scraping PDF link for: (Ab)using Bitcoin for anti-censorship tool
Scraping PDF link for: (Bio)chemische eigenschappen van fosfaatgemethyleerd DNA
Scraping PDF link for: (Bio)chemische eigenschappen van parallelle DNA-duplices gestabiliseerd door polylysine : structuur, stabiliteit, enzymatische herkenbaarheid
Scraping PDF link for: (C)PO in de gemeentelijke organisatie: het Kavelpaspoort als kritische succesfactor
Scraping PDF link for: (Co)polymerisatie aan het oppervlak van gemodificeerd titaandioxide
Scraping PDF link for: (Co)polymerisatie van methylmethacrylaat aan het oppervlak van titaandioxide
Scraping PDF link for: (Data) refinement calculus met stack voorbeelden
Scraping PDF link for: (De-)aggregatie in geroerde vaten
Scraping PDF

In [2]:
import requests
from bs4 import BeautifulSoup
import json
import time

# Base URL of the thesis list and PDF files; the listing URL is built from it
# and relative links found on the pages are turned into absolute ones with it
base_url = "https://research.tue.nl"

# Function to scrape a single page for thesis titles and URLs
def scrape_page(page_num):
    """Fetch one page of the thesis listing and return its entries.

    Args:
        page_num: Index of the listing page to request (sent as the 'page'
            query parameter).

    Returns:
        A list of dicts with the keys "Title" and "URL" (absolute link), one
        per thesis entry found on the page. The list is empty when the page
        cannot be fetched or contains no usable entries.
    """
    from urllib.parse import urljoin

    thesis_page_url = f"{base_url}/en/studentTheses/"
    params = {
        'organisationIds': '49ca0778-93f6-4b7f-95be-d4cbfb3be718',
        'nofollow': 'true',
        'type': '/dk/atira/pure/studentthesis/studentthesistypes/studentthesis/master',
        'page': page_num
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run
        response = requests.get(thesis_page_url, params=params, timeout=30)
    except requests.exceptions.RequestException as error:
        print(f"Failed to fetch page {page_num} - Error: {error}")
        return []
    if not response.ok:
        print(f"Failed to fetch page {page_num} - Status Code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    theses = soup.find_all('div', class_='rendering_studentthesis_short')

    thesis_data = []
    for thesis in theses:
        heading = thesis.find('h3', class_='title')
        title_tag = heading.find('a') if heading else None
        if title_tag is None or not title_tag.get('href'):
            # Entry without a linked title: there is nothing usable to record
            continue
        thesis_data.append({
            "Title": title_tag.get_text(strip=True),
            # urljoin resolves relative and root-relative hrefs against the
            # listing URL and leaves absolute links untouched
            "URL": urljoin(thesis_page_url, title_tag['href'])
        })

    return thesis_data

# Function to scrape the detailed thesis page
def scrape_thesis_details(thesis_url):
    """Scrape the PDF link and the supervisor names from a thesis detail page.

    Args:
        thesis_url: Absolute URL of the thesis detail page.

    Returns:
        A tuple ``(pdf_link, supervisors)``. ``pdf_link`` is the absolute URL
        of the first link in the documents section if that link ends in
        '.pdf', otherwise None. ``supervisors`` is a list of names (possibly
        empty). ``(None, [])`` is returned when the page cannot be fetched.
    """
    from urllib.parse import urljoin

    try:
        # A timeout keeps one stalled connection from hanging the whole run
        response = requests.get(thesis_url, timeout=30)
    except requests.exceptions.RequestException as error:
        # One unreachable page should not abort a scrape of hundreds of pages
        print(f"Failed to fetch details from '{thesis_url}' - Error: {error}")
        return None, []
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the PDF link
    pdf_link = None
    documents_section = soup.find('div', class_='documents')
    if documents_section:
        pdf_link_tag = documents_section.find('a', class_='link')
        # .get() yields None for an anchor without href, so guard before endswith
        href = pdf_link_tag.get('href') if pdf_link_tag else None
        if href and href.endswith('.pdf'):
            # Resolves relative hrefs and keeps already-absolute ones intact
            pdf_link = urljoin(thesis_url, href)

    # Get supervisors: the <td> next to the <th> whose text mentions 'Supervisor'
    supervisors = []
    properties_table = soup.find('table', class_='properties')
    supervisor_th = None
    if properties_table:
        supervisor_th = properties_table.find('th', string=lambda text: text and 'Supervisor' in text)
    if supervisor_th:
        supervisors_td = supervisor_th.parent.find('td')
        if supervisors_td:
            supervisors = [
                tag.get_text(strip=True)
                for tag in supervisors_td.find_all('a', class_='link person')
            ]

    return pdf_link, supervisors


# Function to scrape all pages and get thesis titles, URLs, PDF links, and supervisors
def scrape_all_theses(num_pages=8,
                      output_file='thesis_titles_links_and_pdfs_and_supervisors.json',
                      delay=1):
    """Scrape all listing pages plus each thesis detail page and save as JSON.

    Every entry returned by ``scrape_page`` is extended with a "PDF" link and
    a "Supervisors" list obtained from ``scrape_thesis_details``.

    Args:
        num_pages: Number of listing pages to fetch, starting at page 0.
            Defaults to 8, the value that used to be hard-coded here.
        output_file: Path of the JSON file to write.
        delay: Seconds to sleep after each detail-page request.

    Returns:
        None. Progress and the final count are printed.
    """
    all_theses = []
    for page_num in range(num_pages):
        print(f"Scraping page {page_num}...")
        theses = scrape_page(page_num)

        for thesis in theses:
            print(f"Scraping details for: {thesis['Title']}")
            pdf_link, supervisors = scrape_thesis_details(thesis['URL'])
            thesis["PDF"] = pdf_link  # None when the page offers no PDF
            thesis["Supervisors"] = supervisors
            time.sleep(delay)  # Be nice to the server and add a delay

        all_theses.extend(theses)

    # Save all theses data to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_theses, f, ensure_ascii=False, indent=4)

    print(f"Scraped {len(all_theses)} theses in total.")

# Run the scraper (visits every thesis detail page with a pause between
# requests, then writes thesis_titles_links_and_pdfs_and_supervisors.json)
scrape_all_theses()


Scraping page 0...
Scraping details for: /JoiCost/ : berekeningen aan stalen raamwerken
Scraping details for: "...but some are more equal than others": development of an instrument for software performance prioritization
Scraping details for: .NL
Scraping details for: (Ab)using Bitcoin for anti-censorship tool
Scraping details for: (Bio)chemische eigenschappen van fosfaatgemethyleerd DNA
Scraping details for: (Bio)chemische eigenschappen van parallelle DNA-duplices gestabiliseerd door polylysine : structuur, stabiliteit, enzymatische herkenbaarheid
Scraping details for: (C)PO in de gemeentelijke organisatie: het Kavelpaspoort als kritische succesfactor
Scraping details for: (Co)polymerisatie aan het oppervlak van gemodificeerd titaandioxide
Scraping details for: (Co)polymerisatie van methylmethacrylaat aan het oppervlak van titaandioxide
Scraping details for: (Data) refinement calculus met stack voorbeelden
Scraping details for: (De-)aggregatie in geroerde vaten
Scraping details for: (

# Downloading PDFs

In [17]:
import requests
import json
import os
import re
import sys
from pathlib import Path

# File locations used by the download step
PDFS_DIR = "pdfs"  # directory that receives the downloaded PDFs
INPUT_JSON_FILE = "thesis_titles_links_and_pdfs_and_supervisors.json"  # thesis list to read
OUTPUT_JSON_FILE = "thesis_titles_links_and_pdfs_and_supervisors_updated.json"  # same list plus a 'file' entry

# Reserved Windows filenames
RESERVED_FILENAMES = {
    "CON", "PRN", "AUX", "NUL",
    "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
    "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9"
}

def sanitize_filename(filename):
    """
    Turn an arbitrary string into a name that is safe to use as a filename.

    Every character outside the whitelist (ASCII alphanumerics, space, hyphen,
    underscore, parentheses and period) is replaced with an underscore, runs of
    underscores are collapsed, and leading/trailing underscores and spaces are
    removed. The result is limited to 255 characters, never ends with a space
    or dot, is never empty and is never a reserved Windows device name.

    Args:
        filename: The raw name, e.g. a thesis title.

    Returns:
        The sanitized name, or 'untitled' when nothing usable remains.
    """
    # Replace invalid characters with underscore
    sanitized = re.sub(r'[^A-Za-z0-9 \-_().]', '_', filename)
    # Replace multiple underscores with a single one
    sanitized = re.sub(r'_+', '_', sanitized)
    # Strip leading/trailing underscores and whitespace
    sanitized = sanitized.strip(' _')
    # Truncate BEFORE removing trailing spaces/dots: cutting a long name can
    # expose a space or dot at the new end
    sanitized = sanitized[:255].rstrip(' .')
    # A name made only of stripped characters would otherwise become '',
    # which yields a file called just '.pdf' once the extension is added
    if not sanitized:
        sanitized = 'untitled'
    # Check for reserved filenames (case-insensitive)
    if sanitized.upper() in RESERVED_FILENAMES:
        sanitized = f"_{sanitized}_"
    return sanitized

def make_unique_filename(directory, filename):
    """
    Return a variant of ``filename`` that does not yet exist in ``directory``.

    The name is returned unchanged when it is free. Otherwise ``_1``, ``_2``, ...
    is inserted in front of the extension until an unused name is found.
    """
    if not os.path.exists(os.path.join(directory, filename)):
        return filename

    stem, suffix = os.path.splitext(filename)
    attempt = 1
    while True:
        candidate = f"{stem}_{attempt}{suffix}"
        if not os.path.exists(os.path.join(directory, candidate)):
            return candidate
        attempt += 1

def download_pdf(thesis, pdf_url, file_path):
    """
    Download a PDF from ``pdf_url`` and save it to ``file_path``.

    The response body is streamed to disk in chunks instead of being held in
    memory as a whole. If the transfer or the write fails part-way, the
    incomplete file is removed so it cannot be mistaken for a finished download.

    Args:
        thesis: Thesis entry; only its 'Title' is read, for the log messages.
        pdf_url: URL of the PDF to download.
        file_path: Destination path of the PDF.

    Returns:
        True if the download succeeded, False otherwise (non-200 status,
        request error or file-system error).
    """
    # .get() so that an entry without a title cannot break the log messages
    title = thesis.get('Title', 'untitled')
    file_opened = False
    succeeded = False
    try:
        with requests.get(pdf_url, timeout=30, stream=True) as response:
            if response.status_code != 200:
                print(f"Download failed for '{title}' - Status Code: {response.status_code}")
                return False
            with open(file_path, 'wb') as f:
                file_opened = True
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        succeeded = True
        print(f"Download success for '{title}'")
        return True
    # RequestException derives from OSError, so it has to be handled first
    except requests.exceptions.RequestException as e:
        print(f"Download failed for '{title}' - Error: {e}")
        return False
    except OSError as e:
        print(f"File write failed for '{title}' - Error: {e}")
        return False
    finally:
        # Do not leave a truncated PDF behind after a failed transfer
        if file_opened and not succeeded and os.path.exists(file_path):
            try:
                os.remove(file_path)
            except OSError as e:
                print(f"Could not remove incomplete file '{file_path}' - Error: {e}")

def main():
    """
    Download the PDF of every thesis listed in INPUT_JSON_FILE.

    Each entry gets a 'file' value: the relative path of the saved PDF,
    "Download failed", or "No PDF found". The updated list is written to
    OUTPUT_JSON_FILE. If the input file is missing or is not valid JSON, a
    message is printed and the function exits via ``sys.exit(1)``.
    """
    # Load thesis data with PDF links
    try:
        with open(INPUT_JSON_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Input JSON file '{INPUT_JSON_FILE}' not found.")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        sys.exit(1)

    # Ensure the 'pdfs' directory exists
    Path(PDFS_DIR).mkdir(parents=True, exist_ok=True)

    extension = '.pdf'
    # Common filesystems cap a file name at 255 characters. Reserve room for
    # the extension and for the "_<counter>" suffix make_unique_filename may add.
    max_stem_length = 255 - len(extension) - 10

    # Process each thesis entry
    for thesis in data:
        pdf_url = thesis.get('PDF')
        has_pdf = bool(
            pdf_url
            and isinstance(pdf_url, str)
            and pdf_url.strip().lower() != 'no pdf found'
        )
        if not has_pdf:
            thesis['file'] = "No PDF found"
            continue

        # `or` also covers a title that is present but null or empty
        original_title = thesis.get('Title') or 'untitled'
        stem = sanitize_filename(original_title)[:max_stem_length].rstrip(' .') or 'untitled'
        filename = f"{stem}{extension}"
        # Ensure the filename is unique within the directory
        unique_filename = make_unique_filename(PDFS_DIR, filename)
        file_path = os.path.join(PDFS_DIR, unique_filename)
        relative_path = f"./{PDFS_DIR}/{unique_filename}"

        # Download the PDF and record the outcome in the JSON data
        if download_pdf(thesis, pdf_url, file_path):
            thesis['file'] = relative_path
        else:
            thesis['file'] = "Download failed"

    # Save the updated JSON data to a new file
    try:
        with open(OUTPUT_JSON_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Updated JSON data saved to '{OUTPUT_JSON_FILE}'.")
    except IOError as e:
        print(f"Error writing to JSON file: {e}")

# Entry-point guard: main() runs only when this code is executed directly,
# not when it is imported as a module
if __name__ == "__main__":
    main()


Download success for '/JoiCost/ : berekeningen aan stalen raamwerken'
Download success for '"...but some are more equal than others": development of an instrument for software performance prioritization'
Download success for '.NL'
Download success for '(Ab)using Bitcoin for anti-censorship tool'
Download success for '(C)PO in de gemeentelijke organisatie: het Kavelpaspoort als kritische succesfactor'
Download success for '(De)centrale inkoop èn synergie? bij NBM-Amstelland, Infrastructuur en Milieu: een verslag van een afstudeeronderzoek naar de mogelijkheden tot het realiseren van synergie middels coördinatie binnen een gedecentraliseerde inkooporganisatie'
Download success for '(Discrete) strategies for the binary multiplying channel'
Download success for '(Door)breekbaar'
Download success for '(ei)Land : een zoektocht naar stedenbouw en architectuur : van masterplan tot woningplattegrond'
Download success for '(Her)gebruikt bouwen: demontage en hergebruik van geprefabriceerde betone

In [None]:
import json
import os

import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """
    Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The combined page text as one string ('' if no page yields text).
    """
    # The context manager closes the document, and with it the underlying
    # file handle, even when text extraction fails part-way through
    with fitz.open(pdf_path) as pdf_document:
        return ''.join(page.get_text() for page in pdf_document)

# Extract text from all downloaded PDFs.
# The thesis list is loaded here so the cell does not depend on a `data`
# variable left over in the kernel. The PDF path is taken from the 'file'
# entry recorded by the download step instead of being re-derived from the
# title, because the download step names files with sanitize_filename().
with open('thesis_titles_links_and_pdfs_and_supervisors_updated.json', 'r', encoding='utf-8') as f:
    downloaded_theses = json.load(f)

texts = {}
for thesis in downloaded_theses:
    pdf_path = thesis.get('file')
    # 'file' holds a status text instead of a path when no PDF was saved;
    # isfile() filters those out
    if pdf_path and os.path.isfile(pdf_path):
        texts[thesis['Title']] = extract_text_from_pdf(pdf_path)


# Visualise

In [None]:
import json
import networkx as nx
from pyvis.network import Network

from collections import Counter

# Number of most frequent first supervisors shown in the network
TOP_SUPERVISOR_COUNT = 20

# Load the thesis data
with open('thesis_titles_links_and_pdfs_and_supervisors.json', 'r', encoding='utf-8') as f:
    thesis_data = json.load(f)

# Keep only theses that list a supervisor, reduced to the first one listed.
# Copies are used so thesis_data itself stays untouched and re-running the
# cell gives the same result.
filtered_thesis_data = [
    {**thesis, 'Supervisors': [thesis['Supervisors'][0]]}
    for thesis in thesis_data
    if thesis.get('Supervisors')
]

# Count of theses per first supervisor
supervisor_counts = Counter(thesis['Supervisors'][0] for thesis in filtered_thesis_data)

# Get the top supervisors (a set is kept alongside for fast membership tests)
top_supervisors = [supervisor for supervisor, count in supervisor_counts.most_common(TOP_SUPERVISOR_COUNT)]
top_supervisor_set = set(top_supervisors)

# Filter theses supervised by the top supervisors
top_thesis_data = [
    thesis for thesis in filtered_thesis_data
    if thesis['Supervisors'][0] in top_supervisor_set
]

# Create a NetworkX graph
G = nx.Graph()

# Add one node per thesis, one per supervisor, and an edge between them
for thesis in top_thesis_data:
    thesis_title = thesis['Title']
    supervisor = thesis['Supervisors'][0]

    # Add the thesis node with a 'type' attribute
    G.add_node(thesis_title, type='thesis', title=thesis_title)

    # Add supervisor node if it doesn't exist
    if not G.has_node(supervisor):
        G.add_node(supervisor, type='supervisor', title=supervisor)

    # Add an edge between the thesis and the supervisor
    G.add_edge(thesis_title, supervisor)

# Create a PyVis network
net = Network(notebook=False, height='750px', width='100%', bgcolor='#ffffff', font_color='black')

# Customize the physics of the network for better layout
net.barnes_hut(gravity=-2000, central_gravity=0.3, spring_length=95, spring_strength=0.01, damping=0.09)

# Add nodes and edges from the NetworkX graph to the PyVis network.
# The attribute dict is not called `data`, so this loop does not overwrite a
# notebook-level `data` variable used by other cells.
for node, node_attrs in G.nodes(data=True):
    is_thesis = node_attrs.get('type', 'thesis') == 'thesis'
    net.add_node(
        node,
        label=node,
        title=node,
        color='lightblue' if is_thesis else 'orange',
        shape='dot',
        size=5 if is_thesis else 15,
    )

for source, target in G.edges():
    net.add_edge(source, target)

# Generate the network visualization and save it as an HTML file
# NOTE(review): some pyvis releases default show() to notebook mode, which may
# conflict with notebook=False above - confirm against the installed version
net.show('top_supervisors_network.html')


In [None]:
import json

# Load the thesis entries to export
with open('thesis_titles_links_and_pdfs_and_supervisors_updated.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Write one markdown section per thesis
with open('readme.md', 'w', encoding='utf-8') as md_file:
    for item in data:
        # `or` / truthiness checks also cover keys that are present but null
        # (a thesis without a PDF is stored with "PDF": null), which would
        # otherwise be written as a broken "[None](None)" link
        title = item.get('Title') or 'No Title'
        url = item.get('URL')
        pdf = item.get('PDF')
        supervisors = item.get('Supervisors') or []

        md_file.write(f"# {title}\n\n")
        if url:
            md_file.write(f"- **URL**: [{url}]({url})\n")
        else:
            md_file.write("- **URL**: No URL\n")
        if pdf:
            md_file.write(f"- **PDF**: [{pdf}]({pdf})\n")
        else:
            md_file.write("- **PDF**: No PDF\n")
        if supervisors:
            supervisors_list = ', '.join(supervisors)
            md_file.write(f"- **Supervisors**: {supervisors_list}\n\n")
        else:
            md_file.write("- **Supervisors**: None\n\n")
