In [None]:
# CODE STEP 1 - to collect database of basic doctor information on all psychologists on doctolib offering first-time consultations

In [None]:
# Web scraping doctolib for basic doctor information. The next available first-time psychology appointment is
# not collected here; that is done in the second Jupyter notebook, "Selenium doctolib Part II".

In [None]:
import os

# Resolve the output folder relative to the current working directory
base_path = os.getcwd()

# Name of the folder that receives the CSV files written by this notebook
folder_name = 'Doctolib Scraping Output'

# Full path of the output folder (base path + folder name)
full_path = os.path.join(base_path, folder_name)

# Actually create the folder; building the path string alone does not create it,
# and writing a CSV into a missing directory fails. No-op when it already exists.
os.makedirs(full_path, exist_ok=True)

In [2]:
# !!! Please run this cell first, before running any other cell !!!
# Defining helper functions that imitate human scrolling behaviour
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
import random
import os
import time
import re
import pandas as pd
import itertools

# scrolling down function
def scrolling_down():
    """Scroll the page down in 250px steps with a short pause after each step.

    Uses the notebook-level ``driver``. The number of steps is drawn at random
    (23 or 24) and every step is followed by a 0.5-1.0 second sleep.
    """
    step_count = random.randint(23,24)
    for _step in range(step_count):
        driver.execute_script("window.scrollBy(0, +250);")  # one 250px step down
        pause = random.uniform(0.5, 1.0)
        time.sleep(pause)
# scrolling_down() # to call function

# scrolling up function followed by scrolling down
def scrolling_up():
    """Scroll up a random number of 250px steps, then back down by the same count.

    Uses the notebook-level ``driver``. The step count (5 to 10) is drawn once
    and shared by both directions; each step is followed by a 0.5-1.0 second sleep.
    """
    step_count = random.randint(5,10)
    # First pass moves up, second pass moves back down
    for scroll_script in ("window.scrollBy(0, -250);", "window.scrollBy(0, +250);"):
        for _step in range(step_count):
            driver.execute_script(scroll_script)
            time.sleep(random.uniform(0.5, 1.0))
# scrolling_up() # to call function


In [None]:
# Extracting doctor names and addresses over several pages (all information contained in the JSON file)
# -----------------------------------------------------------------------------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
import random
import os
import time
import re
import pandas as pd
import itertools
import json

## Browser setup: ONE Chrome session configured to avoid detection (to avoid being banned by doctolib)
# Proxy server in "host:port" form. The value below is only a placeholder;
# replace it with a real proxy address to route the browser through it.
PROXY = "IpOfTheProxy:PORT"

#url = 'https://www.doctolib.fr/psychologue/france?page=400&ref_visit_motive_id=592' # URL post-filters starting at a specific page
url = 'https://www.doctolib.fr/psychologue/france?&ref_visit_motive_id=592' # URL post-filters (psychologists in France, offering first-time psychology appointments)

# All options are collected on a single ChromeOptions instance and a single
# browser is started from it, so the proxy and window settings apply to the
# session that actually does the scraping and no extra browser is left open.
options = webdriver.ChromeOptions()

# Add the proxy as argument, but only once the placeholder has been replaced
# (passing the placeholder to Chrome would point it at a non-existent proxy)
if PROXY and PROXY != "IpOfTheProxy:PORT":
    options.add_argument("--proxy-server=%s" % PROXY)

# Disable browser extensions
options.add_argument("--disable-extensions")

# Disable GPU acceleration
options.add_argument("--disable-gpu")

# Disable infobars (notification popups)
options.add_argument("--disable-infobars")

# Disable the 'chrome is being controlled by automated test software' infobar
options.add_argument("--disable-automation-infobar")

# Disable image loading to speed up page loading (optional)
options.add_argument("--blink-settings=imagesEnabled=false")

# Disable the AutomationControlled flag (another measure to avoid being banned by doctolib
# by removing the recognition of a web page as being automatically controlled)
options.add_argument("--disable-blink-features=AutomationControlled")

# Exclude the collection of enable-automation switches
options.add_experimental_option("excludeSwitches", ["enable-automation"])

# Turn off useAutomationExtension
options.add_experimental_option("useAutomationExtension", False)

# Start the single browser session and maximise its window
driver = webdriver.Chrome(options=options)
driver.maximize_window()

# Changing the property of the navigator value for webdriver to undefined
# NOTE(review): this runs against the start page, before driver.get(); confirm the
# override is still in effect after navigation to the search results.
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

driver.get(url)

# Pause 5-10 seconds so the landing page has time to render
time.sleep(random.uniform(5,10))

# Decline cookies: clicking the "disagree" button removes the consent banner
cookiequestion = driver.find_element(By.XPATH, "//*[@id='didomi-notice-disagree-button']")
cookiequestion.click()

scrolling_down()

# Block (up to 100 seconds) until the browser reports the document as fully loaded
print("Before waiting for the page to load")
WebDriverWait(driver, 100).until(
    lambda browser: browser.execute_script("return document.readyState") == "complete"
)
print("After waiting for the page to load")

# Another 5-10 second pause before reading the page content
time.sleep(random.uniform(5,10))

# All JSON-LD <script> tags of the page; the third one is the one read below
script_elements = driver.find_elements(By.XPATH, '//script[@type="application/ld+json"]')

# Raw text of that script tag
json_data = script_elements[2].get_attribute("innerHTML")

#print(json_data) # uncomment for sanity checking

# Decode the JSON text into Python objects
parsed_data = json.loads(json_data)

#print(parsed_data) # uncomment for sanity checking

# Accumulates one dictionary per doctor across all scraped result pages
doctors_list = []

# Flatten every entry of the first results page into one row
for doctor_info in parsed_data:
    # dict.get(key, default) only falls back when the key is MISSING; the payload
    # can carry keys whose value is null, so `or` is used to also cover None and
    # keep the nested .get(...) / .split(...) calls from raising.
    address_info = doctor_info.get('address') or {}
    profile_url = doctor_info.get('url') or ''

    doctors_list.append({
        'Name': doctor_info.get('name', ''),
        'Address': address_info.get('streetAddress', ''),
        'Zipcode': address_info.get('postalCode', ''),
        'City': address_info.get('addressLocality', ''),
        'Speciality': doctor_info.get('medicalSpecialty', ''),
        'Practice ID': profile_url.split('/')[-1]  # Assuming practice ID is the last part of the URL
    })

# Display the list of doctors
print(doctors_list)

# Lists of pages for human behaviour (to avoid detection as a bot).
# The drawn page numbers are compared against the index of the pagination loop,
# which runs over range(1, 200). They must therefore be drawn from 1-199:
# numbers outside that range can never equal the loop index, which would
# silently disable both the long waits and the scroll-ups.
FIRST_LOOP_PAGE = 1
LAST_LOOP_PAGE = 199  # keep in sync with the range(...) of the pagination loop

# 74 random pages at which to take a long (10-15 min) break; repeats are possible
list_of_waiting_pages = [random.randint(FIRST_LOOP_PAGE, LAST_LOOP_PAGE) for _ in range(74)]
# print(list_of_waiting_pages) # uncomment to check

# 101 random pages at which to scroll up and back down; repeats are possible
list_of_scrollingup_pages = [random.randint(FIRST_LOOP_PAGE, LAST_LOOP_PAGE) for _ in range(101)]
# print(list_of_scrollingup_pages) # uncomment to check

# XPath of the next page button ('Suivant'); identical on every page, so built once
xpath_expression = "//button[contains(@class, 'next-link') and contains(@class, 'dl-button') and contains(@class, 'dl-button-tertiary-primary') and contains(@class, 'Tappable-inactive') and .//span[@class='dl-button-label' and text()='Suivant']]"

for i in range(1, 200): # up to 199 further result pages

    time.sleep(random.uniform(180,300)) # Wait 3-5 mins

    if i in list_of_waiting_pages:
        time.sleep(random.uniform(600, 900)) # Wait 10-15 mins if one of the randomly selected waiting pages

    # Wait for the next page button ('Suivant') to be visible on the page.
    # When it never shows up (e.g. the last results page has been reached), stop
    # paginating instead of crashing, so the rows collected so far still reach
    # the DataFrame / CSV step that follows the loop.
    try:
        next_button = WebDriverWait(driver, 40).until(EC.visibility_of_element_located((By.XPATH, xpath_expression)))
    except TimeoutException:
        print("No 'Suivant' button found on iteration " + str(i) + " - stopping pagination")
        break

    next_button.click()

    # Waiting until the browser reports the page as loaded; the print acts as a progress tracker
    WebDriverWait(driver, 80).until(lambda driver: driver.execute_script("return document.readyState") == "complete")
    print("After waiting for page " + str(i) + " to load")

    time.sleep(random.uniform(5,10)) # Wait 5-10 seconds

    scrolling_down()

    # Locate the script tags containing the JSON data
    script_elements = driver.find_elements("xpath",'//script[@type="application/ld+json"]')

    # The doctor list is read from the third script tag; stop cleanly when the page
    # does not have one rather than losing the run to an IndexError
    if len(script_elements) < 3:
        print("Expected JSON data not found on iteration " + str(i) + " - stopping pagination")
        break

    # Extract the text content of the script tag
    json_data = script_elements[2].get_attribute("innerHTML")

    #print(json_data) # Uncomment for sanity checking

    # Parse the JSON data
    parsed_data = json.loads(json_data)

    #print(parsed_data) # Uncomment for sanity checking

    # Extract relevant information from search results and append to the list
    for doctor_info in parsed_data:
        # dict.get(key, default) only falls back when the key is MISSING; `or` also
        # covers keys that are present with a null value, which would otherwise
        # make the nested .get(...) / .split(...) calls raise.
        address_info = doctor_info.get('address') or {}
        profile_url = doctor_info.get('url') or ''

        # Append data to the list in a dictionary format (list of dictionaries)
        doctors_list.append({
            'Name': doctor_info.get('name', ''),
            'Address': address_info.get('streetAddress', ''),
            'Zipcode': address_info.get('postalCode', ''),
            'City': address_info.get('addressLocality', ''),
            'Speciality': doctor_info.get('medicalSpecialty', ''),
            'Practice ID': profile_url.split('/')[-1]  # Assuming practice ID is the last part of the URL
        })

    if i in list_of_scrollingup_pages:
        scrolling_up()

try:
    # Convert to pandas DataFrame
    df = pd.DataFrame(doctors_list)

    # Display the DataFrame
    print(df)

    # Creating a file name for outputs
    file_name = 'Doctolib_jsontry.csv'

    # Make sure the output folder exists before writing into it
    os.makedirs(full_path, exist_ok=True)

    # Create the full path by appending the FILE name to the output folder
    # (joining the folder name a second time would produce a path with no .csv file name)
    excel_file_path = os.path.join(full_path, file_name)

    # Saving the dataframe of doctor information as a csv.
    # latin-1 cannot represent every character; if this raises an encoding error,
    # doctors_list is still in memory and can be saved after character replacement.
    df.to_csv(excel_file_path, index=False, encoding='latin-1')

    time.sleep(random.uniform(30,35)) # Wait 30-35 seconds
finally:
    # Close the WebDriver even when saving fails, so no browser is left running
    driver.quit()


Before waiting for the page to load
After waiting for the page to load
[{'Name': 'Marine Besnard Fessler', 'Address': '106 Rue Gambetta', 'Zipcode': '44000', 'City': 'Nantes', 'Speciality': 'Psychologue', 'Practice ID': 'marine-besnard-nantes'}, {'Name': 'Edith Arnouil deu', 'Address': '3f Chemin de Papoula', 'Zipcode': '33640', 'City': 'Portets', 'Speciality': 'Psychologue', 'Practice ID': 'edith-arnouil-deu'}, {'Name': 'Clara Costenoble', 'Address': '48 Rue Jean-Pierre Timbaud', 'Zipcode': '75011', 'City': 'Paris', 'Speciality': 'Psychologue', 'Practice ID': 'clara-costenoble'}, {'Name': 'Laurie-Anne Claveri', 'Address': '9 Avenue du Pont Juvénal', 'Zipcode': '34000', 'City': 'Montpellier', 'Speciality': 'Psychologue', 'Practice ID': 'laurie-anne-claveri'}, {'Name': 'Karine LAGIER', 'Address': '5 Rue de Silly', 'Zipcode': '92100', 'City': 'Boulogne-Billancourt', 'Speciality': 'Psychologue', 'Practice ID': 'karine-lagier-boulogne-billancourt'}, {'Name': 'Marie-Thérèse GALANT', 'Addres

In [5]:
# Run this cell if the scraping cell is interrupted midway.
# Saves the partial dataframe as a csv, first replacing the special characters that the latin-1 encoding does not recognise (they would otherwise cause an encoding error)

#print(df.iloc[75]) # indexing with pandas dataframe for debugging

# Build the DataFrame from the rows collected so far (one dictionary per doctor)
df = pd.DataFrame(data=doctors_list)

# Characters outside latin-1 and the text each one is replaced with
_LATIN1_SUBSTITUTIONS = str.maketrans({
    u'\u200b': '',   # Zero width space
    u'\u0153': 'oe', # Latin character: œ
    u'\u0101': 'a',  # Latin Small Letter A with Macron
    u'\u2019': '',   # Right single quotation mark: ’
})

def replace_and_encode(value):
    """Replace the known non-latin-1 characters in ``value``.

    Parameters
    ----------
    value : object
        A single DataFrame cell. Strings are cleaned; anything else
        (e.g. None or NaN) is returned untouched instead of raising.

    Returns
    -------
    object
        The cleaned string, or ``value`` itself when it is not a string.
    """
    # Non-string cells have no characters to replace
    if not isinstance(value, str):
        return value

    return value.translate(_LATIN1_SUBSTITUTIONS)

# Apply the replacement function to every element of the DataFrame, column by column
df_encoded = df.apply(lambda column: column.map(replace_and_encode))

# Creating a file name for outputs
file_name = 'Doctolib_jsontry_partial.csv'

# Make sure the output folder exists before writing into it
os.makedirs(full_path, exist_ok=True)

# Create the full path by appending the FILE name to the output folder
# (joining the folder name a second time would produce a path with no .csv file name)
excel_file_path = os.path.join(full_path, file_name)

df_encoded.to_csv(excel_file_path, index=False, encoding='latin-1') # Convert the pandas dataframe to csv

In [None]:
# Warning!
# Note: when collecting partial files, duplicates must be removed afterwards, because doctolib rotates the search results shown on each page,
# e.g., using df.drop_duplicates(subset=['Practice ID'])

In [None]:
# ----------------------------------------------------------------- END -------------------------------------------------------------------------------

In [20]:
# Sanity checking - extracting a single json file
# -----------------------------------------------------------------------------------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
import random
import os
import time
import re
import pandas as pd
import itertools
import json

## Browser setup: ONE Chrome session configured to avoid detection (to avoid being banned by doctolib)
# Proxy server in "host:port" form. The value below is only a placeholder;
# replace it with a real proxy address to route the browser through it.
PROXY = "IpOfTheProxy:PORT"

url = 'https://www.doctolib.fr/psychologue/france?ref_visit_motive_id=592' # URL post-filters (psychologists in France, offering first-time psychology appointments)

# All options are collected on a single ChromeOptions instance and a single
# browser is started from it, so the proxy and window settings apply to the
# session that actually loads the page and no extra browser is left open.
options = webdriver.ChromeOptions()

# Add the proxy as argument, but only once the placeholder has been replaced
# (passing the placeholder to Chrome would point it at a non-existent proxy)
if PROXY and PROXY != "IpOfTheProxy:PORT":
    options.add_argument("--proxy-server=%s" % PROXY)

# Disable the AutomationControlled flag (another measure to avoid being banned by doctolib
# by removing the recognition of a web page as being automatically controlled)
options.add_argument("--disable-blink-features=AutomationControlled")

# Exclude the collection of enable-automation switches
options.add_experimental_option("excludeSwitches", ["enable-automation"])

# Turn off useAutomationExtension
options.add_experimental_option("useAutomationExtension", False)

# Start the single browser session and maximise its window
driver = webdriver.Chrome(options=options)
driver.maximize_window()

# Changing the property of the navigator value for webdriver to undefined
# NOTE(review): this runs against the start page, before driver.get(); confirm the
# override is still in effect after navigation to the search results.
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

driver.get(url)

try:
    time.sleep(random.uniform(5,10)) # wait 5-10 seconds

    # Clicking disagree to cookies to remove banner
    cookiequestion = driver.find_element("xpath","//*[@id='didomi-notice-disagree-button']")
    cookiequestion.click()

    scrolling_down()

    # Wait (up to 100 seconds) until the browser reports the document as fully loaded
    print("Before waiting for the page to load")
    WebDriverWait(driver, 100).until(lambda driver: driver.execute_script("return document.readyState") == "complete")
    print("After waiting for the page to load")

    time.sleep(random.uniform(5,10)) # wait 5-10 seconds

    # Locate the script tags containing the JSON data
    script_elements = driver.find_elements("xpath",'//script[@type="application/ld+json"]')

    # Extract the text content of the third script tag
    json_data = script_elements[2].get_attribute("innerHTML")

    #print(json_data)

    # Parse the JSON data
    parsed_data = json.loads(json_data)

    # Display the parsed data
    print(parsed_data)

    scrolling_up()

    time.sleep(random.uniform(5,10)) # Wait 5-10 seconds
finally:
    # Close the WebDriver even when one of the steps above fails,
    # so a failed sanity check does not leave a browser running
    driver.quit()


Before waiting for the page to load
After waiting for the page to load
[{'@context': 'http://schema.org/', '@type': 'Physician', 'name': 'Anne Marie TCHAKRIAN', 'medicalSpecialty': 'Psychologue', 'legalName': None, 'url': '/psychologue/aix-en-provence/tchakrian-anne-marie', 'address': {'@type': 'PostalAddress', 'name': '', 'streetAddress': '120 Avenue Joseph Villevieille', 'postalCode': '13100', 'addressLocality': 'Aix-en-Provence'}, 'paymentAccepted': 'Cash, Check'}, {'@context': 'http://schema.org/', '@type': 'Physician', 'name': 'Charles-Émile Caplain', 'medicalSpecialty': 'Psychologue', 'legalName': None, 'url': '/psychologue/versailles/charles-emile-caplain', 'address': {'@type': 'PostalAddress', 'name': '', 'streetAddress': 'Rue Yves le Coz', 'postalCode': '78000', 'addressLocality': 'Versailles'}, 'paymentAccepted': 'Credit card'}, {'@context': 'http://schema.org/', '@type': 'Physician', 'name': 'Laurie Ann BRETINIERE', 'medicalSpecialty': 'Psychologue', 'legalName': None, 'url'