In [1]:
import os
import re
import time
from datetime import datetime
from itertools import chain
from urllib.parse import quote

import googlemaps
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

# Google Maps API key used by get_coordinates(). It is read from the
# GOOGLE_MAPS_API_KEY environment variable so that a real key never has to be
# saved inside the notebook; the original placeholder string is kept as the
# fallback when the variable is not set.
GMAP_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', 'GOOGLE_MAPS_API_KEY')


In [2]:
def default_waiter(driver, class_name, timeout=15):
    """Block until an element with the given class name can be found.

    Args:
        driver: Selenium WebDriver whose current page is polled.
        class_name: value passed to ``By.CLASS_NAME`` for the lookup.
        timeout: maximum number of seconds to wait (defaults to 15, the value
            that used to be hard-coded).

    Returns:
        Whatever ``WebDriverWait.until`` returns, i.e. the element located by
        the lookup. Existing callers ignore the return value.

    Raises:
        TimeoutException: raised by ``WebDriverWait.until`` when the element
            does not appear in time; callers in this notebook catch it.
    """
    return WebDriverWait(driver, timeout).until(
        lambda d: d.find_element(By.CLASS_NAME, class_name)
    )

In [3]:
def get_address(driver):
    """Return the address shown on the current service page as one line.

    Waits for the 'service__information' block, then reads the text of the
    element looked up by class name
    'service__information-content.test--service__address' and removes every
    newline character from it. Returns '' when the wait times out or the
    element cannot be found.
    """
    address_class = 'service__information-content.test--service__address'
    try:
        default_waiter(driver, 'service__information')
        address_element = driver.find_element(By.CLASS_NAME, address_class)
        return address_element.text.replace('\n', '')
    except (NoSuchElementException, TimeoutException):
        # Best effort: a page without an address simply yields an empty string.
        return ''
    
        

In [4]:
def get_coordinates(address, client=None):
    """Geocode an address and return ``(lat, lon, city)``.

    Args:
        address: free-text address. An empty string or ``None`` short-circuits
            to ``(None, None, None)`` without contacting the geocoder.
        client: optional object exposing ``geocode(address)``. When omitted a
            ``googlemaps.Client`` is created from ``GMAP_KEY``; passing one in
            lets callers reuse a single client across many lookups.

    Returns:
        A 3-tuple. ``lat`` and ``lon`` are the midpoint of the first result's
        ``geometry.viewport`` (mean of the north-east and south-west corners).
        ``city`` is the ``long_name`` of the address component whose ``types``
        include ``'postal_town'``, or ``None`` if there is no such component.
        All three are ``None`` when the geocoder returns no results.
    """
    lat = None
    lon = None
    city = None
    if not address:
        return lat, lon, city

    if client is None:
        client = googlemaps.Client(key=GMAP_KEY)
    geocode_result = client.geocode(address)
    if not geocode_result:
        # No match for this address: previously this raised IndexError on [0].
        return lat, lon, city

    best_match = geocode_result[0]
    viewport = best_match['geometry']['viewport']
    northeast = viewport['northeast']
    southwest = viewport['southwest']
    lat = np.mean([northeast['lat'], southwest['lat']])
    lon = np.mean([northeast['lng'], southwest['lng']])

    for component in best_match['address_components']:
        # Membership test instead of equality with ['postal_town'], so a
        # component tagged with additional types is still recognised.
        if 'postal_town' in component.get('types', []):
            city = component['long_name']

    return lat, lon, city
    
    
    
    
    

In [5]:
def get_contact(driver):
    """Extract phone number, website link and e-mail from the service page.

    Waits for the 'service__information' block and inspects every element
    with class 'service__information-content':

    * text that is all digits once spaces are removed -> phone number
      (returned without spaces);
    * otherwise an ``href`` attribute starting with http:// or https://
      -> website link;
    * otherwise text that is entirely an e-mail address -> e-mail
      (lower-cased).

    When several elements match the same category the last one wins.

    Returns:
        ``(phone_num, link_rel, email)``; each item is '' when not found,
        including when the wait times out or the lookup fails.
    """
    phone_num = ''
    link_rel = ''
    email = ''
    # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted '|'.
    mail_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    url_regex = r'^https?://'

    try:
        default_waiter(driver, 'service__information')
        contact_data = driver.find_elements(By.CLASS_NAME, 'service__information-content')
    except (NoSuchElementException, TimeoutException):
        # Return early: falling through used to hit an unbound contact_data
        # and raise NameError instead of returning the empty defaults.
        return phone_num, link_rel, email

    for c in contact_data:
        text = c.text
        digits = text.replace(' ', '')
        if digits.isdigit():
            phone_num = digits
            continue

        # Only fetch the attribute when the element is not a phone number.
        href = c.get_attribute('href')
        if href is not None and re.match(url_regex, str(href)) is not None:
            link_rel = href
        elif re.fullmatch(mail_regex, text) is not None:
            email = text.lower()

    return phone_num, link_rel, email
            
        
        
        
    

In [6]:
def get_opening_hours(driver):
    """Return the opening hours of the current service page as a DataFrame.

    The frame has an 'id' column (last path segment of ``driver.current_url``)
    plus one column per 'opening-hours__label' element, named after the
    label's text and holding the text of the ``div`` that follows it.

    Returns:
        A one-row ``pd.DataFrame``. If the hours toggle cannot be found or the
        wait times out, an empty ``pd.DataFrame`` (no columns) is returned.
    """
    open_hours_dict = {}
    try:
        # Wrap the id in a list like the hour values: a dict holding only a
        # scalar makes pd.DataFrame raise ValueError, which happened whenever
        # the toggle was clicked but no day labels were present.
        open_hours_dict['id'] = [driver.current_url.split('/')[-1]]
        default_waiter(driver, 'opening-hours__toggle')
        opening_hours_button = driver.find_element(
            By.XPATH, "//button[@title='Show Open Hours' and @type='button']"
        )
        opening_hours_button.click()
        time.sleep(2)  # pause after the click before reading the day rows

        days = driver.find_elements(By.CLASS_NAME, 'opening-hours__label')
        for day in days:
            hours = day.find_element(By.XPATH, 'following-sibling::div').text
            open_hours_dict[day.text] = [hours]
    except (NoSuchElementException, TimeoutException):
        # Best effort: discard anything gathered so far and return no rows.
        open_hours_dict = {}

    return pd.DataFrame(open_hours_dict)

In [7]:
def get_description(driver):
    """Return the service description text from the current page.

    Waits for the 'service__content-wrap' element, then returns its text with
    newlines turned into spaces and surrounding whitespace stripped. Returns
    '' when the wait times out or the element cannot be found.
    """
    wrapper_class = 'service__content-wrap'
    try:
        default_waiter(driver, wrapper_class)
        raw_text = driver.find_element(By.CLASS_NAME, wrapper_class).text
    except (NoSuchElementException, TimeoutException):
        # Best effort: no description block means an empty description.
        return ''
    return raw_text.replace('\n', ' ').strip()

In [8]:
def get_charity_company_nums(driver):
    """Return ``(charity_number, company_number)`` from the current page.

    Waits for the 'service__information' block, then scans every element with
    class 'leading-none'. For an element whose text starts with
    'Charity Number' or 'Company Number', the text of the ``div`` that follows
    it is taken as the corresponding value. Both values are '' when the wait
    times out or a lookup fails; a value not present on the page stays ''.
    """
    found = {'Charity Number': '', 'Company Number': ''}
    try:
        default_waiter(driver, 'service__information')
        for label in driver.find_elements(By.CLASS_NAME, 'leading-none'):
            label_text = label.text
            for prefix in ('Charity Number', 'Company Number'):
                if re.match(prefix, label_text) is not None:
                    found[prefix] = label.find_element(By.XPATH, 'following-sibling::div').text
                    break
    except (NoSuchElementException, TimeoutException):
        # Any failure resets both values, not just the one being read.
        found = {'Charity Number': '', 'Company Number': ''}
    return found['Charity Number'], found['Company Number']
        
    

In [9]:
def scroll_page(driver, initial_wait=2, scroll_pause_time=1):
    """Scroll the page down one screen height at a time until past its end.

    Args:
        driver: object exposing ``execute_script``.
        initial_wait: seconds to sleep before the first scroll, giving the
            page time to open (default 2, the previously hard-coded value).
        scroll_pause_time: seconds to sleep after each scroll step (default 1,
            the previously hard-coded value).

    The page's scroll height is re-read after every step because it can change
    once the page has been scrolled. Scrolling stops as soon as the next
    target offset would exceed the current scroll height.
    """
    time.sleep(initial_wait)
    screen_height = driver.execute_script("return window.screen.height;")
    if not screen_height or screen_height <= 0:
        # Without a positive step the stop condition below can never become
        # true, so bail out instead of looping forever.
        return

    i = 1
    while True:
        # Scroll to i screen heights from the top of the page.
        driver.execute_script(
            "window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i)
        )
        i += 1
        time.sleep(scroll_pause_time)
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        if screen_height * i > scroll_height:
            break

In [12]:
def get_services_list(place, concern, national=False,
                      chromedriver_path='/home/slawomir/chrome/chromedriver'):
    """Search hubofhope.co.uk and return the listed services.

    Args:
        place: text typed into the location field of the home page.
        concern: exact text of the concern button to select (e.g. 'Self-Harm').
        national: when True, click the 'national' locality toggle on the
            results page before collecting the services.
        chromedriver_path: path to the chromedriver executable; defaults to
            the path that used to be hard-coded.

    Returns:
        A list of ``(link, name)`` tuples, one per service card found after
        scrolling through the results page.

    Raises:
        TimeoutException: if an expected page element does not show up in
            time, including the concern button.
        NoSuchElementException: if a directly looked-up element is missing.

    The browser is always closed before returning or raising.
    """
    services_list = []
    concern_xpath = "//button[text()='{}' and @type='button']".format(concern)

    # NOTE(review): newer Selenium releases configure the driver path through a
    # Service object rather than `executable_path` - confirm against the
    # installed Selenium version.
    driver = webdriver.Chrome(executable_path=chromedriver_path)
    try:
        driver.get('https://hubofhope.co.uk/')
        default_waiter(driver, 'home__location-form')
        location = driver.find_element(By.XPATH, '//*[@id="serviceName"]')
        location.send_keys(place)
        search_button = driver.find_element(
            By.XPATH, '//*[@id="ember14"]/div/div[4]/div/div[2]/form/div/button'
        )
        search_button.click()

        default_waiter(driver, 'concerns__container.container--huge.container--centered')
        wait = WebDriverWait(driver, timeout=10, ignored_exceptions=[ElementNotInteractableException])
        # Click the element handed back by the wait, i.e. the one that was
        # actually verified as clickable, instead of a reference looked up
        # before waiting.
        concern_button = wait.until(EC.element_to_be_clickable((By.XPATH, concern_xpath)))
        concern_button.click()

        default_waiter(driver, 'concerns__tags-container')
        result = driver.find_element(By.XPATH, "//button[text()='Show your results' and @type='button']")
        result.click()
        default_waiter(driver, 'page-services')
        print(driver.current_url)

        if national:
            time.sleep(2)
            result_national = driver.find_element(
                By.CSS_SELECTOR,
                '#ember48 > div.page-services > div > div.services__sidebar-container > div > div.services-sidebar__locality-container > div > button.locality__toggle.locality__national'
            )
            result_national.click()
            time.sleep(2)

        # Scroll through the whole results page before collecting the cards.
        scroll_page(driver)
        time.sleep(2)
        services = driver.find_elements(By.CLASS_NAME, '__f8bee')

        for service in services:
            dbt_service_link = service.find_element(
                By.CLASS_NAME, 'ember-view.service-card__link'
            ).get_attribute('href')
            dbt_service_name = service.find_element(By.CLASS_NAME, 'service-card__title').text
            services_list.append((dbt_service_link, dbt_service_name))
    finally:
        # Only plain strings are returned, so the browser can be released here;
        # previously every call left a Chrome instance running.
        driver.quit()

    return services_list
    

    
    
    

In [None]:
# Collect the (link, name) pairs of the 'Self-Harm' services found for 'United Kingdom'.
services = get_services_list(
    'United Kingdom',
    'Self-Harm',
    national=False,
)

In [16]:
from tqdm import tqdm
def get_services(services, chromedriver_path='/path/to/chromedriver',
                 output_path='/path/to/dbt_services_all_national.csv'):
    """Scrape every service page and return one merged DataFrame.

    Args:
        services: iterable of sequences whose item 0 is the service URL and
            item 1 the service name (the output of ``get_services_list``).
        chromedriver_path: path to the chromedriver executable; defaults to
            the previously hard-coded value.
        output_path: CSV file the merged table is written to; defaults to the
            previously hard-coded value.

    Returns:
        ``pd.DataFrame`` with columns id, name, address, coor, city, phone,
        website, email, description, charity_number, company_number, plus the
        opening-hours columns left-joined on 'id'. The same frame is written
        to ``output_path``.
    """
    columns = ['id', 'name', 'address', 'coor', 'city', 'phone', 'website', 'email',
               'description', 'charity_number', 'company_number']
    services_details = []
    opening_hours_dfs = []

    for service in tqdm(services):
        service_name = service[1]
        service_url = service[0]
        service_id = service_url.split('/')[-1]

        driver = webdriver.Chrome(executable_path=chromedriver_path)
        try:
            driver.get(service_url)
            address = get_address(driver)
            # Geocode once per service; this used to be three separate calls
            # (three API requests) for the same address.
            lat, lon, city = get_coordinates(address)
            coor = (lat, lon)
            phone_num, link_rel, email = get_contact(driver)
            desc = get_description(driver)
            charity_number, company_number = get_charity_company_nums(driver)
            open_hours_df = get_opening_hours(driver)
        finally:
            # Release the browser even if scraping this page fails; before,
            # one Chrome instance per service was left running.
            driver.quit()

        services_details.append((service_id, service_name, address, coor, city, phone_num,
                                 link_rel, email, desc, charity_number, company_number))
        opening_hours_dfs.append(open_hours_df)

    df_services = pd.DataFrame(services_details, columns=columns)

    # Frames without an 'id' column carry no rows to join; dropping them keeps
    # pd.concat from failing on an empty list and the merge from failing on a
    # missing key when no page exposed opening hours.
    usable_hours = [frame for frame in opening_hours_dfs if 'id' in frame.columns]
    if usable_hours:
        opening_hours_all_df = pd.concat(usable_hours)
    else:
        opening_hours_all_df = pd.DataFrame(columns=['id'])

    df_all = pd.merge(df_services, opening_hours_all_df, how='left', on='id')
    df_all.to_csv(output_path)
    return df_all
    
        
        
        
        
        
        
        

In [None]:
# Scrape every collected service page and keep the merged result table.
df = get_services(services)