In [71]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [72]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urljoin


BASE_URL = "https://www.daiict.ac.in"
HEADERS = {"User-Agent": "Mozilla/5.0"}

FACULTY_PAGES = {
    "regular": "https://www.daiict.ac.in/faculty",
    "adjunct": "https://www.daiict.ac.in/adjunct-faculty",
}



In [73]:
def extract_section_by_heading(soup, heading_text):
    heading = soup.find(
        "h2",
        class_="rit-titl",
        string=lambda x: x and heading_text.lower() in x.lower()
    )

    if not heading:
        return ""

    content_div = heading.find_parent("div").find_next_sibling("div")
    if not content_div:
        return ""

    ul = content_div.find("ul")
    if ul:
        return [li.get_text(strip=True) for li in ul.find_all("li")]

    return content_div.get_text(" ", strip=True)


In [74]:
all_faculty_data = []

for faculty_type, list_url in FACULTY_PAGES.items():
    print(f"\nScraping {faculty_type} faculty...")

    response = requests.get(list_url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(response.text, "html.parser")

    faculty_cards = soup.select("div.facultyDetails")

    for card in tqdm(faculty_cards):
        name_tag = card.select_one("h3 a")
        photo_tag = card.select_one(".facultyPhoto img")
        edu_tag = card.select_one(".facultyEducation")
        email_tag = card.select_one(".facultyemail")
        area_tag = card.select_one(".areaSpecialization")

        profile_url = urljoin(BASE_URL, name_tag["href"]) if name_tag else ""

        faculty_record = {
            "faculty_type": faculty_type,
            "name": name_tag.get_text(strip=True) if name_tag else "",
            "profile_url": profile_url,
            "photo_url": urljoin(BASE_URL, photo_tag["src"]) if photo_tag else "",
            "education": edu_tag.get_text(strip=True) if edu_tag else "",
            "email": email_tag.get_text(strip=True) if email_tag else "",
            "area_specialization_short": area_tag.get_text(strip=True) if area_tag else ""
        }

        if profile_url:
            res = requests.get(profile_url, headers=HEADERS, timeout=15)
            prof_soup = BeautifulSoup(res.text, "html.parser")

            bio_div = prof_soup.select_one("div.about")
            faculty_record["biography"] = (
                bio_div.get_text(" ", strip=True) if bio_div else ""
            )

            faculty_record["specialisation"] = extract_section_by_heading(prof_soup, "Specialization")
            faculty_record["teaching"] = extract_section_by_heading(prof_soup, "Teaching")
            faculty_record["research"] = extract_section_by_heading(prof_soup, "Research")
            faculty_record["publications"] = extract_section_by_heading(prof_soup, "Publications")

        all_faculty_data.append(faculty_record)



Scraping regular faculty...


100%|██████████| 67/67 [00:20<00:00,  3.20it/s]



Scraping adjunct faculty...


100%|██████████| 26/26 [00:07<00:00,  3.62it/s]


In [75]:
def scrape_distinguished_faculty():
    url = "https://www.daiict.ac.in/distinguished-professor"
    res = requests.get(url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(res.text, "html.parser")

    records = []

    cards = soup.select("div.facultyInformation li div.facultyDetails")

    for card in cards:
        area = card.select_one("div.areaSpecialization p")

        records.append({
            "faculty_type": "distinguished",
            "name": "Not Available",
            "profile_url": "",
            "photo_url": "",
            "education": "",
            "email": "",
            "area_specialization_short": area.get_text(" ", strip=True) if area else "",
            "biography": "",
            "specialisation": "",
            "teaching": "",
            "research": "",
            "publications": ""
        })

    return records


In [76]:
distinguished_data = scrape_distinguished_faculty()
all_faculty_data.extend(distinguished_data)

print("Distinguished faculty added:", len(distinguished_data))


Distinguished faculty added: 1


In [86]:
def scrape_international_adjunct_faculty():
    url = "https://www.daiict.ac.in/adjunct-faculty-international"
    res = requests.get(url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(res.text, "html.parser")

    records = []

    # Each international adjunct faculty is a Drupal article
    articles = soup.select("article")

    print("International adjunct articles found:", len(articles))

    for art in articles:
        img = art.select_one("img")
        area = art.select_one(".field--name-field-area-specialization")

        records.append({
            "faculty_type": "international_adjunct",
            "name": img.get("alt", "Not Available").strip() if img else "Not Available",
            "profile_url": "",
            "photo_url": urljoin(BASE_URL, img["src"]) if img and img.get("src") else "",
            "education": "",
            "email": "",
            "area_specialization_short": area.get_text(" ", strip=True) if area else "",
            "biography": "",
            "specialisation": "",
            "teaching": "",
            "research": "",
            "publications": ""
        })

    return records


In [87]:
international_adjunct_data = scrape_international_adjunct_faculty()
print("Length:", len(international_adjunct_data))
international_adjunct_data[:1]


International adjunct articles found: 0
Length: 0


[]

In [83]:
international_adjunct_data = scrape_international_adjunct_faculty()

print("Type:", type(international_adjunct_data))
print("Length:", len(international_adjunct_data))
print("Sample:", international_adjunct_data[:1])


International adjunct cards found: 11
Type: <class 'list'>
Length: 11
Sample: [{'faculty_type': 'international_adjunct', 'name': 'Anil Maheshwari', 'profile_url': '', 'photo_url': 'https://www.daiict.ac.in/sites/default/files/faculty_image/Anil_Maheshwari.jpg', 'education': '', 'email': '', 'area_specialization_short': 'Design, Analysis and Implementation of Algorithms for Problems arising in Computational Geometry, Graph Theory, Discrete Mathematics, and Data Science.', 'biography': '', 'specialisation': '', 'teaching': '', 'research': '', 'publications': ''}]


In [79]:
def scrape_professor_of_practice():
    url = "https://www.daiict.ac.in/professor-practice"
    res = requests.get(url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(res.text, "html.parser")

    records = []

    cards = soup.select("div.facultyDetails")
    print("Professor of Practice cards found:", len(cards))  # debug

    for card in cards:
        img = card.select_one(".facultyPhoto img")
        area = card.select_one(".areaSpecialization p")

        records.append({
            "faculty_type": "professor_of_practice",
            "name": img["alt"].strip() if img and img.get("alt") else "Not Available",
            "profile_url": "",
            "photo_url": urljoin(BASE_URL, img["src"]) if img else "",
            "education": "",
            "email": "",
            "area_specialization_short": area.get_text(" ", strip=True) if area else "",
            "biography": "",
            "specialisation": "",
            "teaching": "",
            "research": "",
            "publications": ""
        })

    return records


In [80]:
prof_practice_data = scrape_professor_of_practice()
all_faculty_data.extend(prof_practice_data)

print("Professor of Practice added:", len(prof_practice_data))


Professor of Practice cards found: 4
Professor of Practice added: 4


In [81]:
import json

with open("faculty_raw.json", "w", encoding="utf-8") as f:
    json.dump(all_faculty_data, f, indent=2, ensure_ascii=False)

print("✅ faculty_raw.json updated")
print("Total records saved:", len(all_faculty_data))


✅ faculty_raw.json updated
Total records saved: 98


In [82]:
from collections import Counter

with open("faculty_raw.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Total records:", len(data))
print(Counter(d.get("faculty_type", "unknown") for d in data))


Total records: 98
Counter({'regular': 67, 'adjunct': 26, 'professor_of_practice': 4, 'distinguished': 1})
