# Bangalore Doctors Web Scraping with Google Maps Links

This notebook scrapes doctor information from Practo for Bangalore only and adds a Google Maps search link for each doctor, built from the doctor's name, clinic location, and city.

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import urllib.parse

In [None]:
def generate_google_maps_link(doctor_name, location, city):
    """
    Generate a Google Maps search link for a doctor based on their name and location.

    The search text is "<doctor_name> doctor <location> <city>". Any part that
    is None, blank, or the "N/A" placeholder is left out so that it does not
    pollute the search, and runs of whitespace (including newlines) inside a
    part are collapsed to single spaces.

    Args:
        doctor_name: Doctor's display name.
        location: Clinic locality / address text.
        city: City name.

    Returns:
        A "https://www.google.com/maps/search/<query>" URL with the query
        encoded by urllib.parse.quote_plus.
    """
    def _clean(value):
        # None and the "N/A" placeholder carry no search information.
        if value is None:
            return ""
        text = " ".join(str(value).split())
        return "" if text.upper() == "N/A" else text

    # The literal keyword "doctor" is always kept, as in the original query.
    parts = [_clean(doctor_name), "doctor", _clean(location), _clean(city)]
    search_query = " ".join(part for part in parts if part)

    # URL encode the search query (spaces become '+', reserved chars are escaped)
    encoded_query = urllib.parse.quote_plus(search_query)
    return f"https://www.google.com/maps/search/{encoded_query}"

In [None]:
# Only Bangalore is scraped in this notebook
city = 'Bangalore'

# Specialities to search for, one search per entry
Speciality = [
    'Cardiologist',
    'Chiropractor',
    'Dentist',
    'Dermatologist',
    'Dietitian/Nutritionist',
    'Gastroenterologist',
    'bariatric surgeon',
    'Gynecologist',
    'Infertility Specialist',
    'Neurologist',
    'Neurosurgeon',
    'Ophthalmologist',
    'Orthopedist',
    'Pediatrician',
    'Physiotherapist',
    'Psychiatrist',
    'Pulmonologist',
    'Rheumatologist',
    'Urologist',
]

# Results table, seeded with a single blank row: every column (the Google
# Maps link included) starts with one empty-string value.
df = pd.DataFrame({
    column: ['']
    for column in (
        'Name',
        'Speciality',
        'Degree',
        'Year_of_experience',
        'Location',
        'City',
        'dp_score',
        'npv',
        'consultation_fee',
        'google_maps_link',
    )
})

print(f"Starting web scraping for {city} doctors...")
print(f"Total specialities to process: {len(Speciality)}")

In [None]:
# Set up Chrome options for better performance
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in background
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")


def _first_text(soup, tag, css_class, default="N/A"):
    """Return the stripped text of the first `tag` with `css_class`, or `default` if absent."""
    node = soup.find(tag, class_=css_class)
    return default if node is None else node.text.strip()


def _parse_doctor_profile(profile_soup):
    """Extract the scraped fields from a parsed doctor profile page.

    Returns a dict keyed by the dataframe column names that come from the
    profile page itself. Any field whose element is missing is "N/A".
    """
    # Experience is the last <h2> inside the profile details container.
    details = profile_soup.find('div', class_='c-profile__details')
    headings = details.find_all('h2') if details is not None else []
    experience = headings[-1].text.strip() if headings else "N/A"

    # Prefer the struck-through fee; fall back to the regular fee element.
    fee = _first_text(profile_soup, 'span', 'u-strike', default=None)
    if fee is None:
        fee = _first_text(
            profile_soup,
            'div',
            'u-f-right u-large-font u-bold u-valign--middle u-lheight-normal',
        )

    return {
        'Name': _first_text(profile_soup, 'h1', 'c-profile__title u-bold u-d-inlineblock'),
        'Degree': _first_text(profile_soup, 'p', 'c-profile__details'),
        'Year_of_experience': experience,
        'Location': _first_text(profile_soup, 'h4', 'c-profile--clinic__location'),
        'dp_score': _first_text(profile_soup, 'span', 'u-green-text u-bold u-large-font'),
        'npv': _first_text(profile_soup, 'span', 'u-smallest-font u-grey_3-text'),
        'consultation_fee': fee,
    }


# Main scraping loop - focused only on Bangalore
for j in Speciality:
    print(f"\nProcessing {j} specialists in {city}...")

    # Bound before the try so the finally clause never sees an unbound or
    # stale driver when webdriver.Chrome() itself fails.
    driver = None
    # Rows for this speciality; appended to df once instead of per doctor.
    rows = []

    try:
        driver = webdriver.Chrome(options=chrome_options)
        # Percent-encode the speciality: some entries contain spaces or '/'.
        word = urllib.parse.quote(j, safe='')
        url = (
            "https://www.practo.com/search/doctors?results_type=doctor"
            f"&q=%5B%7B%22word%22%3A%22{word}%22%2C%22autocompleted%22%3Atrue"
            "%2C%22category%22%3A%22subspeciality%22%7D%5D"
            f"&city={urllib.parse.quote(city, safe='')}"
        )
        driver.get(url)
        time.sleep(3)

        # Scroll one screen at a time until the scroll target passes the
        # bottom of the document, pausing so more results can load.
        scroll_pause_time = 2
        screen_height = driver.execute_script("return window.screen.height;")
        scroll_index = 1

        while True:
            driver.execute_script(f"window.scrollTo(0, {screen_height}*{scroll_index});")
            scroll_index += 1
            time.sleep(scroll_pause_time)

            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            if screen_height * scroll_index > scroll_height:
                break

        # Parse the listing once; the driver is reused below for profile pages.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        postings = soup.find_all('div', class_='u-border-general--bottom')

        for post in postings:
            # Skip containers that do not hold a doctor card with a profile
            # link instead of reporting them as errors.
            card = post.find('div', class_='listing-doctor-card')
            anchor = card.find('a') if card is not None else None
            link = anchor.get('href') if anchor is not None else None
            if not link:
                continue

            try:
                # urljoin copes with both relative and absolute hrefs.
                link_full = urllib.parse.urljoin('https://www.practo.com', link)
                driver.get(link_full)
                time.sleep(2)
                soup2 = BeautifulSoup(driver.page_source, 'lxml')

                profile = _parse_doctor_profile(soup2)
                rows.append({
                    'Name': profile['Name'],
                    'Speciality': j,
                    'Degree': profile['Degree'],
                    'Year_of_experience': profile['Year_of_experience'],
                    'Location': profile['Location'],
                    'City': city,
                    'dp_score': profile['dp_score'],
                    'npv': profile['npv'],
                    'consultation_fee': profile['consultation_fee'],
                    'google_maps_link': generate_google_maps_link(
                        profile['Name'], profile['Location'], city
                    ),
                })
            except Exception as e:
                # Best effort: one failing profile must not stop the run.
                print(f"Error processing doctor: {str(e)}")
                continue

        print(f"Found {len(rows)} {j} specialists in {city}")

    except Exception as e:
        print(f"Error processing {j}: {str(e)}")

    finally:
        # Keep whatever was collected, even if this speciality was cut short.
        if rows:
            df = pd.concat(
                [df, pd.DataFrame(rows, columns=df.columns)],
                ignore_index=True,
            )
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Error closing browser for {j}: {str(e)}")

    # Small delay between specialities
    time.sleep(2)

# df still holds its blank placeholder first row here, hence the "- 1".
print(f"\nCompleted scraping! Total doctors found: {len(df) - 1}")

In [None]:
# Remove the initial empty row.
# The row is identified by content (every column is an empty string) rather
# than by position, so re-running this cell cannot delete a real record.
is_placeholder = df.eq('').all(axis=1)
df = df[~is_placeholder].reset_index(drop=True)

# Display summary
print(f"Final dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

In [None]:
# Preview up to five doctors next to their generated Google Maps links
print("Sample Google Maps links generated:")
preview = df.head(5)
for doctor, maps_link in zip(preview['Name'], preview['google_maps_link']):
    print(f"{doctor} - {maps_link}")

In [None]:
# Save the enhanced dataset
import os

output_file = '../DATA/bangalore_doctors_with_maps.csv'
# Create the target folder first: to_csv raises if the directory is missing,
# which would otherwise fail only after the whole scrape has finished.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_csv(output_file, index=False)
print(f"\nDataset saved to: {output_file}")
print(f"Total records: {len(df)}")
print(f"Columns included: {', '.join(df.columns)}")

In [None]:
# Data validation and statistics
print("Dataset Statistics:")
print(f"Total doctors: {len(df)}")
print("\nSpeciality distribution:")
print(df['Speciality'].value_counts())
print("\nMissing values per column:")
# The scraping loop stores the string "N/A" for fields it could not find, so
# isnull() alone would always report zero. Count the "N/A" placeholder and
# empty strings as missing too.
missing_mask = df.isnull() | df.isin(["N/A", ""])
print(missing_mask.sum())