In [1]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import requests
from bs4 import BeautifulSoup
import json

# Listing pages on daiict.ac.in to scrape. Each key is the category label that
# gets attached to every record scraped from the corresponding page.
FACULTY_URLS = dict(
    regular="https://www.daiict.ac.in/faculty",
    adjunct="https://www.daiict.ac.in/adjunct-faculty",
    international_adjunct="https://www.daiict.ac.in/adjunct-faculty-international",
    distinguished="https://www.daiict.ac.in/distinguished-professor",
    professor_of_practice="https://www.daiict.ac.in/professor-practice",
)

# HTTP headers sent with every page request.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
}

# File name for the combined JSON dump of all scraped records.
OUTPUT_FILE = "faculty_raw.json"


def _text_or_none(tag, separator=""):
    """Return the whitespace-stripped text of ``tag``, or ``None`` if ``tag`` is missing.

    ``separator`` is placed between the text fragments of nested elements.
    """
    return tag.get_text(separator, strip=True) if tag else None


def _parse_faculty_card(card, faculty_type):
    """Build one record dict from a single ``div.facultyDetails`` element.

    Any field whose element is absent from the card is stored as ``None``.
    """
    # The name block appears under several class names; take the first that exists.
    personal_div = (
        card.find("div", class_="personalDetails")
        or card.find("div", class_="personalDetail")
        or card.find("div", class_="personalDetailsGer")
    )
    name = _text_or_none(personal_div.find("h3")) if personal_div else None

    education = _text_or_none(card.find("div", class_="facultyEducation"), " ")

    # Contact fields live in <span> elements inside an optional contact block.
    phone = address = email = None
    contact_div = card.find("div", class_="contactDetails")
    if contact_div:
        phone = _text_or_none(contact_div.find("span", class_="facultyNumber"))
        address = _text_or_none(contact_div.find("span", class_="facultyAddress"), " ")
        email = _text_or_none(contact_div.find("span", class_="facultyemail"))

    specialization = _text_or_none(card.find("div", class_="areaSpecialization"), " ")

    return {
        "faculty_type": faculty_type,
        "name": name,
        "education": education,
        "phone": phone,
        "address": address,
        "email": email,
        "specialization": specialization,
    }


def _parse_faculty_html(html, faculty_type):
    """Extract faculty records from the HTML of one listing page.

    Kept separate from the HTTP request so the parsing can be run on saved
    or hand-written HTML without network access.
    """
    soup = BeautifulSoup(html, "html.parser")

    # All faculty cards are looked up inside the first div.facultyInformation.
    outer_div = soup.find("div", class_="facultyInformation")
    if not outer_div:
        print(f"[WARN] No facultyInformation container for {faculty_type}")
        return []

    return [
        _parse_faculty_card(card, faculty_type)
        for card in outer_div.find_all("div", class_="facultyDetails")
    ]


def scrape_faculty_page(url: str, faculty_type: str) -> list:
    """Download one faculty listing page and return its records.

    Args:
        url: Address of the listing page to fetch.
        faculty_type: Category label stored in each record's "faculty_type" field.

    Returns:
        A list of dicts with the keys "faculty_type", "name", "education",
        "phone", "address", "email" and "specialization". Fields not found on
        a card are ``None``. The list is empty (and a warning is printed) when
        the page has no ``div.facultyInformation`` container.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        Other ``requests`` exceptions (e.g. timeout after 15 seconds) propagate
        unchanged.
    """
    response = requests.get(url, headers=HEADERS, timeout=15)
    response.raise_for_status()
    return _parse_faculty_html(response.text, faculty_type)


def main(output_file=None):
    """Scrape every page in FACULTY_URLS and dump the combined records as JSON.

    Defining this function does not run the scrape; it has to be called
    explicitly (``main()``) before the output file exists.

    Args:
        output_file: Path of the JSON file to write. Defaults to the
            module-level OUTPUT_FILE constant when omitted.

    Any exception raised while scraping a page propagates, in which case no
    file is written.
    """
    # Resolve the default at call time so the OUTPUT_FILE constant is the single
    # source of truth for the file name instead of a repeated string literal.
    output_path = OUTPUT_FILE if output_file is None else output_file

    all_faculty_data = []

    for faculty_type, url in FACULTY_URLS.items():
        print(f"\nScraping {faculty_type} faculty...")
        records = scrape_faculty_page(url, faculty_type)
        print(f"Records scraped: {len(records)}")
        all_faculty_data.extend(records)

    print("\nDEBUG: Total records in memory:", len(all_faculty_data))
    print("DEBUG: Writing JSON file...")

    # ensure_ascii=False keeps non-ASCII characters readable in the file;
    # the file itself is written as UTF-8.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_faculty_data, f, indent=2, ensure_ascii=False)

    print(f"✅ {output_path} CREATED successfully")



In [4]:
from collections import Counter
import json

# Read the scraped records back from disk to check what was actually written.
with open("faculty_raw.json", "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Tally records per category; a record without a "faculty_type" key is
# counted under "unknown".
type_counts = Counter(
    entry.get("faculty_type", "unknown") for entry in data
)

print("Total records:", len(data))
print("Records by faculty type:")
print(type_counts)


Total records: 109
Records by faculty type:
Counter({'regular': 67, 'adjunct': 26, 'international_adjunct': 11, 'professor_of_practice': 4, 'distinguished': 1})
