# 🩺 Pakistan Doctor Scraper using Selenium

This script uses **Selenium** to scrape detailed information about doctors across **100+ cities** in Pakistan from **Google Maps**, aiming to collect **50,000+ records**.

---

## 🚀 Features
- Covers major cities from all provinces: Punjab, Sindh, KP, Balochistan, AJK, and GB.
- Searches for 40+ medical specializations (e.g., Cardiologists, Gynecologists, Pediatricians).
- Automatically scrolls and navigates Google Maps results.
- Stores data in `pakistan_doctors_50000.csv` with:
  - **Doctor Name**
  - **Specialization**
  - **Hospital/Clinic**
  - **City**
  - **Rating**

---

## 🧰 Requirements
```bash
pip install selenium webdriver-manager


In [4]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# **🌍 More Cities for Better Coverage**
cities = [
    # Punjab
    "Lahore", "Faisalabad", "Rawalpindi", "Multan", "Gujranwala", "Sialkot", "Bahawalpur", "Sargodha",
    "Sahiwal", "Sheikhupura", "Jhang", "Rahim Yar Khan", "Okara", "Kasur", "Chiniot", "Gujrat", "Mandi Bahauddin",
    "Dera Ghazi Khan", "Vehari", "Hafizabad", "Attock", "Khanewal", "Bahawalnagar", "Muzaffargarh", "Narowal",
    "Jhelum", "Bhakkar", "Lodhran", "Pakpattan", "Toba Tek Singh", "Chakwal", "Khushab", "Mianwali", "Kamoke",
    "Pattoki", "Kot Addu", "Shujabad", "Fort Abbas", "Depalpur", "Kot Radha Kishan", "Kamalia", "Ahmedpur East",
    "Samundri", "Kabirwala", "Burewala",

    # Sindh
    "Karachi", "Hyderabad", "Sukkur", "Larkana", "Nawabshah", "Mirpur Khas", "Jacobabad", "Shikarpur", "Badin",
    "Khairpur", "Dadu", "Ghotki", "Umerkot", "Sanghar", "Tando Allahyar", "Tando Adam", "Tando Muhammad Khan",
    "Thatta", "Kashmore", "Kandhkot", "Matiari", "Hala", "Shahdadkot", "Sehwan", "Jamshoro", "Sujawal",
    "Kotri", "Tharparkar", "Mehar",

    # Khyber Pakhtunkhwa
    "Peshawar", "Mardan", "Mingora", "Abbottabad", "Kohat", "Dera Ismail Khan", "Swat", "Mansehra", "Charsadda",
    "Bannu", "Nowshera", "Swabi", "Hangu", "Tank", "Lakki Marwat", "Chitral", "Dir", "Battagram", "Shangla",
    "Haripur", "Toru", "Tangi",

    # Balochistan
    "Quetta", "Gwadar", "Turbat", "Chaman", "Khuzdar", "Hub", "Dera Murad Jamali", "Zhob", "Pishin", "Kalat",
    "Sibi", "Mastung", "Loralai", "Nushki", "Dalbandin", "Ziarat", "Kharan", "Qila Saifullah", "Qila Abdullah",

    # Azad Jammu and Kashmir (AJK)
    "Muzaffarabad", "Mirpur", "Kotli", "Bagh", "Rawalakot", "Poonch", "Neelum", "Haveli",

    # Gilgit-Baltistan
    "Gilgit", "Skardu", "Hunza", "Diamer", "Ghizer", "Ghanche", "Shigar", "Kharmang", "Nagar", "Astore"
]


# **🔍 Expanded Search Queries to Cover More Doctors**
search_queries = [
    "Best Doctors in ", "Top Hospitals in ", "Specialist Doctors in ", "Clinics in ",  
    "Medical Specialists in ", "General Physicians in ", "Surgeons in ", "Pediatricians in ",  
    "Gynecologists in ", "Dermatologists in ", "ENT Specialists in ", "Cardiologists in ",  
    "Neurologists in ", "Nephrologists in ", "Endocrinologists in ", "Gastroenterologists in ",  
    "Oncologists in ", "Hematologists in ", "Pulmonologists in ", "Hepatologists in ",  
    "Rheumatologists in ", "Orthopedic Surgeons in ", "Urologists in ", "Ophthalmologists in ",  
    "Psychiatrists in ", "Psychologists in ", "Allergists in ", "Immunologists in ",  
    "Infectious Disease Specialists in ", "Pain Management Specialists in ",  
    "Rehabilitation Specialists in ", "Plastic Surgeons in ", "Geriatricians in ",  
    "Sports Medicine Specialists in ", "Emergency Medicine Specialists in ",  
    "Sleep Medicine Specialists in ", "Palliative Care Specialists in ",  
    "Medical Geneticists in ", "Neonatologists in "
]

# **🚀 Chrome Options**
chrome_options = Options()
chrome_options.add_argument("--headless")  # **Run in Background**
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--log-level=3")

service = Service(ChromeDriverManager().install())  # Automatically install ChromeDriver
driver = webdriver.Chrome(service=service, options=chrome_options)


# **📂 CSV File to Store Data**
filename = "pakistan_doctors_50000.csv"
with open(filename, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Doctor Name", "Specialization", "Hospital/Clinic", "City", "Rating"])

    total_records = 0
    target_records = 50000  # **Target 50,000+ Records**

    for city in cities:
        if total_records >= target_records:
            break  # **Stop once 50,000 records are reached**

        for query in search_queries:
            if total_records >= target_records:
                break
            
            print(f"\n🔍 Searching: {query} {city}")

            # **🔍 Open Google Maps & Search**
            driver.get("https://www.google.com/maps")
            time.sleep(3)

            search_box = driver.find_element(By.ID, "searchboxinput")
            search_box.clear()
            search_box.send_keys(query + city)
            search_box.send_keys(Keys.RETURN)
            time.sleep(5)

            city_records = 0
            max_records_per_city = 500  # **Increase Limit to 500 Per City**

            # **🔄 Scrolling Mechanism to Load More Results**
            for _ in range(10):  # **Scroll 10 times**
                driver.execute_script("document.querySelector('div[role=\"feed\"]').scrollBy(0, 1000);")
                time.sleep(2)

            retry_attempts = 3  # **Retry if no results**
            while city_records < max_records_per_city and total_records < target_records:
                results = driver.find_elements(By.CLASS_NAME, "Nv2PK")

                if not results and retry_attempts > 0:
                    print("⚠️ No results found, retrying...")
                    time.sleep(3)
                    retry_attempts -= 1
                    continue

                for result in results:
                    try:
                        name = result.find_element(By.CLASS_NAME, "qBF1Pd").text.strip()
                        if not name.startswith("Dr"):
                            continue  # **Ensure Only Doctor Listings**

                        address = result.find_element(By.CLASS_NAME, "W4Efsd").text.strip()
                        details = result.text.split("\n")

                        # **🔬 Extract Specialization**
                        specialization = "Unknown"
                        if len(details) > 1:
                            specialization = details[1]

                        # **🏥 Extract Hospital/Clinic Name**
                        hospital = "Not Available"
                        if "Clinic" in address or "Hospital" in address or "Medical Center" in address:
                            hospital = address
                        elif len(details) > 2:
                            hospital = details[2]

                        # **⭐ Extract Rating**
                        rating = "Not Available"
                        try:
                            rating = result.find_element(By.CLASS_NAME, "MW4etd").text.strip()
                        except:
                            pass  # **If No Rating, Skip**

                        # **📌 Save Data to CSV**
                        writer.writerow([name, specialization, hospital, city, rating])
                        print(f"✅ {name} - {specialization} - {hospital} - {city} - ⭐ {rating}")

                        city_records += 1
                        total_records += 1
                        if city_records >= max_records_per_city or total_records >= target_records:
                            break

                    except Exception as e:
                        print(f"❌ Error: {e}")
                        continue

                try:
                    # **➡️ Click Next Page for More Results**
                    next_button = driver.find_element(By.CSS_SELECTOR, '[aria-label="Next"]')
                    next_button.click()
                    time.sleep(5)
                except:
                    print(f"🔚 No more pages for {city}")
                    break

print(f"\n✅ Total {total_records} Records Successfully Saved!\n📂 File: {filename}")
driver.quit()



🔍 Searching: Best Doctors in  Lahore
✅ Dr. Mujahid Israr | Best Gastroenterologist in Lahore - 4.7(75) - Gastroenterologist ·  · National Hospital & Medical Center, 132/2 Street 123 - Lahore - ⭐ 4.7
✅ Dr. Khurshid Alam | Best ENT Specialist Lahore - 4.2(96) - Otolaryngologist ·  · 152-G, Doctor's Hospital, 1 Canal Rd - Lahore - ⭐ 4.2
✅ Dr. Hafiz Abdul Momin | Expert Urologist in Lahore - Penile Implant & Kidney Stone Specialist | MBBS, FCPS (Urology) - 4.8(472) - Urologist · Clinx Pharmacy, Chugtai Medical Center, Near41-A Shalimar Link Road, opposite Sahlimar Hospital - Lahore - ⭐ 4.8
✅ Dr. Fahmina Ashfaq - Sugar, Medical & Blood Pressure Specialist Lady Doctor - 4.9(113) - Doctor ·  · Block, Omar hospital and cardiac centre, D2 - Lahore - ⭐ 4.9
✅ Dr. Samia Khan | Best General Physician in Lahore - 5.0(20) - General practitioner ·  · Chughtai Medical Center, 154 CAA - Lahore - ⭐ 5.0
✅ Dr. Maria Farooq | Gynecologist | Infertility Specialist |Laparoscopic Surgeon|cosmetic & aesthe

JavascriptException: Message: javascript error: Cannot read properties of null (reading 'scrollBy')
  (Session info: chrome=134.0.6998.177)
Stacktrace:
	GetHandleVerifier [0x010FC7F3+24435]
	(No symbol) [0x01082074]
	(No symbol) [0x00F506E3]
	(No symbol) [0x00F570D7]
	(No symbol) [0x00F595C5]
	(No symbol) [0x00FE0225]
	(No symbol) [0x00FBD7BC]
	(No symbol) [0x00FDF20A]
	(No symbol) [0x00FBD5B6]
	(No symbol) [0x00F8C54F]
	(No symbol) [0x00F8D894]
	GetHandleVerifier [0x014070A3+3213347]
	GetHandleVerifier [0x0141B0C9+3295305]
	GetHandleVerifier [0x0141558C+3271948]
	GetHandleVerifier [0x01197360+658144]
	(No symbol) [0x0108B27D]
	(No symbol) [0x01088208]
	(No symbol) [0x010883A9]
	(No symbol) [0x0107AAC0]
	BaseThreadInitThunk [0x75EE5D49+25]
	RtlInitializeExceptionChain [0x77AACE3B+107]
	RtlGetAppContainerNamedObjectPath [0x77AACDC1+561]


# ⚙️ How It Works
Launches Chrome in headless mode.

Iterates over cities and search queries.

Performs search, scrolls results, and scrapes relevant details.

Saves only listings starting with "Dr".

Automatically stops after collecting 50,000 records.

### 🧼 Doctor Dataset Cleaning & Categorization

This notebook performs data cleaning and intelligent classification for a dataset of doctors in Pakistan. The main objectives of this code are:

- 📥 **Load the raw doctors dataset** from a CSV file.
- 🧹 **Clean the hospital/clinic addresses** by removing unwanted characters and standardizing the format.
- 🔍 **Extract doctor specializations** using a combination of:
  - Known medical titles and specialties (e.g., *Cardiologist*, *Dermatologist*)
  - Disease-related keywords (e.g., *heart*, *skin*, *cancer*)
  - Contextual clues from the doctor's name, address, or city fields
- 🧠 **Apply intelligent pattern-matching** using regular expressions (`regex`) to assign the most likely medical category where it is missing or marked as "Unspecified".
- 📊 **Convert ratings to numeric format** and ensure consistency in all fields.
- 💾 **Export a cleaned and categorized version** of the dataset to `cleaned_doctors.csv`.

By the end of this process, the dataset becomes cleaner, more structured, and better suited for analysis or use in healthcare-related applications.


In [1]:
import pandas as pd
import re

# Load dataset
df = pd.read_csv("doctors.csv") 

# Function to clean address/details
def clean_address(hospital_clinic_field):
    if pd.isna(hospital_clinic_field) or hospital_clinic_field.strip() == "":
        return "Location not listed"
    hospital_clinic_field = re.sub(r"[^a-zA-Z0-9,+.\-/\s]", "", hospital_clinic_field)
    hospital_clinic_field = re.sub(r"\s+", " ", hospital_clinic_field).strip()
    return hospital_clinic_field

# Expanded list of specializations and diseases to match using regex
specializations = [
    "Dermatologist", "Dental Surgeon", "Orthopedic Surgeon", "General Physician", 
    "Cardiologist", "Neurologist", "Psychiatrist", "Pediatrician", "Gynecologist", 
    "Plastic Surgeon", "Rheumatologist", "Pulmonologist", "Nephrologist", 
    "Hematologist", "Oncologist", "Ophthalmologist", "ENT Specialist", 
    "Endocrinologist", "Surgeon", "Gastroenterologist", "Allergist", "Infectious Disease Specialist",
    "Urologist", "Gastrointestinal Specialist", "Hepatologist", "Immunologist", "Rehabilitation Specialist", 
    "Pain Management Specialist", "Sports Medicine Specialist", "Emergency Medicine Specialist", 
    "Sleep Medicine Specialist", "Palliative Care Specialist", "Neonatologist", "Obstetrician", 
    "Family Medicine Specialist", "Family Physician", "Clinical Psychologist", "Dermatopathologist", 
    "Rheumatology Specialist", "Toxicologist", "Medical Geneticist", "Endoscopic Specialist", 
    "Pathologist", "Microbiologist", "Geriatric Specialist", "Addiction Specialist", "Thoracic Surgeon", 
    "Vascular Surgeon", "Pain Specialist", "Mental Health Specialist", "Plastic Surgeon", 
    "Geriatrics", "Acupuncture Specialist", "Psychotherapist", "Physical Therapist", "Naturopathic Doctor"
]

# List of common disease-related terms to capture
disease_terms = {
    "eye": "Ophthalmologist",
    "heart": "Cardiologist",
    "diabetes": "Diabetologist",
    "cancer": "Oncologist",
    "skin": "Dermatologist",
    "lung": "Pulmonologist",
    "kidney": "Nephrologist",
    "liver": "Hepatologist",
    "mental health": "Psychiatrist",
    "brain": "Neurologist",
    "obesity": "Endocrinologist",
    "blood": "Hematologist",
    "allergy": "Allergist",
    "infection": "Infectious Disease Specialist",
    "arthritis": "Rheumatologist",
    "pregnancy": "Gynecologist",
    "spinal": "Orthopedic Surgeon",
    "gastro": "Gastroenterologist",
    "sleep": "Sleep Medicine Specialist",
    "rehab": "Rehabilitation Specialist",
    "neurology": "Neurologist",
    "dentistry": "Dental Surgeon",
    "pediatrics": "Pediatrician",
    "obstetrics": "Obstetrician",
    "geriatric": "Geriatrician",
    "toxicology": "Toxicologist",
    "psychology": "Psychologist",
    "physiotherapy": "Physical Therapist",
}

# Define surgeon types to capture full specialization (Orthopedic, Dental, etc.)
surgeon_types = [
    "orthopedic", "dental", "neuro", "plastic", "cardiac", "vascular", "thoracic", "spinal"
]

# Normalize and search for specializations and diseases in the Name, Address, or City
def extract_specializations(name, address, city, current_category):
    name_lower = name.lower()
    address_lower = address.lower()
    city_lower = city.lower()

    # Check for diseases using disease_terms dictionary
    for term, category in disease_terms.items():
        if re.search(rf"\b{term}\b", name_lower) or re.search(rf"\b{term}\b", address_lower) or re.search(rf"\b{term}\b", city_lower):
            return category if current_category in ["Unspecified", "Unknown", None] else current_category

    # Search for skin-related terms (starting or ending with "skin")
    skin_terms = ["skin", "dermatologist", "skin specialist", "skin care", "skin doctor"]
    for term in skin_terms:
        if re.search(rf"\b{term}\b", name_lower) or re.search(rf"\b{term}\b", address_lower) or re.search(rf"\b{term}\b", city_lower):
            return "Dermatologist" if current_category in ["Unspecified", "Unknown", None] else current_category

    # Search for child-related terms (pediatric and similar terms)
    child_terms = ["pediatrician", "child specialist", "pediatric care", "pediatrics", "child doctor"]
    for term in child_terms:
        if re.search(rf"\b{term}\b", name_lower) or re.search(rf"\b{term}\b", address_lower) or re.search(rf"\b{term}\b", city_lower):
            return "Pediatrician" if current_category in ["Unspecified", "Unknown", None] else current_category

    # Search for surgeon types (Orthopedic, Dental, etc.) and assign full specialization
    for surgeon in surgeon_types:
        if re.search(rf"\b{surgeon}\b", name_lower) or re.search(rf"\b{surgeon}\b", address_lower) or re.search(rf"\b{surgeon}\b", city_lower):
            # Return the full specialization (e.g., "Orthopedic", "Dental")
            return surgeon.title() + " Surgeon" if current_category in ["Unspecified", "Unknown", None] else current_category

    # Search for other general specializations and diseases
    for specialization in specializations:
        if re.search(rf"\b{specialization.lower()}\b", name_lower) or re.search(rf"\b{specialization.lower()}\b", address_lower) or re.search(rf"\b{specialization.lower()}\b", city_lower):
            if current_category in ["Unspecified", "Unknown", None]:
                return specialization.title()

    return current_category

# Clean columns
df["Name"] = df["Name"].str.replace(r"Dr\.?\s+", "Dr ", regex=True).str.strip()
df["Category"] = df["Category"].fillna("Unspecified").replace("Unknown", "Unspecified")
df["Address/Details"] = df["Address/Details"].apply(clean_address)
df["City"] = df["City"].fillna("Unspecified").str.strip()
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")

# Update 'Category' with specializations (only if 'Category' is 'Unspecified' or 'Unknown')
df["Category"] = df.apply(lambda row: extract_specializations(row["Name"], row["Address/Details"], row["City"], row["Category"]), axis=1)

# Final cleaned format
cleaned_df = df[["Name", "Category", "Address/Details", "City", "Rating"]]

# Save cleaned data
cleaned_df.to_csv("cleaned_doctors.csv", index=False)

print("✅ Doctors data cleaned, specializations and disease categories extracted from Name, Address, and City into 'Category' column and saved in 'cleaned_doctors_dataa.csv'")


✅ Doctors data cleaned, specializations and disease categories extracted from Name, Address, and City into 'Category' column and saved in 'cleaned_doctors_dataa.csv'


In [2]:
# Check how many ratings are unknown (NaN)
unknown_ratings_count = df["Rating"].isna().sum()

# Print the result
print(f"Number of unknown ratings: {unknown_ratings_count}")


Number of unknown ratings: 0


In [3]:
# Impute missing ratings with the mean rating
mean_rating = df["Rating"].mean()
df["Rating"] = df["Rating"].fillna(mean_rating)

# Round the 'Rating' column to 2 decimal places
df["Rating"] = df["Rating"].round(2)

# Save the cleaned data
df.to_csv("cleaned_doctors.csv", index=False)

print("✅ Ratings rounded to 2 decimal places and data saved successfully.")

✅ Ratings rounded to 2 decimal places and data saved successfully.
