In [2]:
import nltk
import numpy as np
import random
import string
import bs4 as bs
import urllib.request
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import urllib.parse


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def _fetch_soup(url, timeout):
    """Download *url* and return its body parsed by BeautifulSoup (lxml)."""
    # The context manager closes the HTTP response (and its socket) even when
    # reading fails part-way; the bare urlopen(...).read() form never closed it.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        html = response.read()
    return bs.BeautifulSoup(html, 'lxml')


def _scrape_linked_text(faculty_link, timeout):
    """Return the stripped text of all <p> tags, then all <li> tags, of a page."""
    soup = _fetch_soup(faculty_link, timeout)
    paragraphs = " ".join(tag.text.strip() for tag in soup.find_all('p'))
    list_items = " ".join(tag.text.strip() for tag in soup.find_all('li'))
    return paragraphs + " " + list_items


def scrape_faculty_pages(faculty_urls, num_faculty=3, *, timeout=30):
    """
    Scrape faculty listing pages and the page linked from each faculty card.

    For every listing URL, the <div> elements whose class attribute is
    "col-lg-3 col-md-4 col-sm-6 col-12" are visited in document order. Cards
    without an <a href=...> are skipped and do not count towards the limit.

    Args:
        faculty_urls: Iterable of listing-page URLs.
        num_faculty: Maximum number of linked cards to record per listing URL.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys 'url' (the listing URL), 'div_text'
        (the card's text) and 'linked_text' (text of the <p> and <li> tags of
        the linked page, or "" if that page could not be scraped).

    Errors are reported with print() and never raised: a failing linked page
    still yields an entry with empty 'linked_text', while a failing listing
    page ends processing of that URL and the loop moves on to the next one.
    """
    faculty_data = []
    for url in faculty_urls:
        # Best-effort scraper: any failure on one listing page is reported and
        # must not abort the remaining URLs, hence the broad except.
        try:
            listing = _fetch_soup(url, timeout)
            divs = listing.find_all('div', class_="col-lg-3 col-md-4 col-sm-6 col-12")

            # Limit to `num_faculty` faculty members
            count = 0
            for div in divs:
                if count >= num_faculty:
                    break
                a_tag = div.find('a', href=True)
                if not a_tag:
                    continue

                # hrefs may be relative; resolve them against the listing URL.
                faculty_link = urllib.parse.urljoin(url, a_tag['href'])
                try:
                    additional_text = _scrape_linked_text(faculty_link, timeout)
                except Exception as e:
                    print(f"Error scraping faculty page {faculty_link}: {e}")
                    additional_text = ""

                faculty_data.append({
                    'url': url,
                    'div_text': div.get_text(strip=True),
                    'linked_text': additional_text
                })
                count += 1
        except Exception as e:
            print(f"Error scraping faculty URL {url}: {e}")
    return faculty_data

# Listing pages handed to the scraper below.
faculty_urls = [
    "https://lhr.nu.edu.pk/fsm/faculty/",
    "https://lhr.nu.edu.pk/fsc/faculty/",
    "https://lhr.nu.edu.pk/ee/faculty/",
    "https://lhr.nu.edu.pk/cv/faculty/",
    "https://lhr.nu.edu.pk/ss/faculty/"
]

# Run the scrape, recording at most three linked entries per listing page.
print("Scraping faculty pages...")
faculty_content = scrape_faculty_pages(faculty_urls, num_faculty=3)
print("Faculty content scraping complete.")

# Dump each scraped record followed by an 80-character rule for readability.
for faculty in faculty_content:
    print(
        f"URL: {faculty['url']}",
        f"Faculty Text: {faculty['div_text']}",
        f"Linked Text: {faculty['linked_text']}",
        "=" * 80,
        sep="\n",
    )


Scraping faculty pages...
Faculty content scraping complete.
URL: https://lhr.nu.edu.pk/fsm/faculty/
Faculty Text: Dr. Hamid HassanProfessor & HODHEC Approved PhD Supervisorhamid.hassan@nu.edu.pk
Linked Text: Professor
                         & HOD 
                        
                          HEC Approved PhD Supervisor hamid.hassan@nu.edu.pk (042) 111-128-128 Ext:252 Dr. Hamid Hassan is Professor & Director Lahore Campus, at National University of Computer & Emerging Sciences (FAST-NU). Dr Hassan started his professional career in the corporate sector and worked for a leading bank after completing his MBA. He did his MSc in Management and PhD in Quantitative Finance and Management from distinguished Graduate School of Social Systems & Management at the University of Tsukuba, Japan. He studied and taught at the University of Tsukuba, and worked as Post-Doc Research Fellow at the Foreign Researcher Collaboration Program of the University. During his stay in Japan, he conducted r

: 