# Download all tender results files
[HTML elements](https://blog.hubspot.com/website/html-elements)

In [1]:
# from fake_useragent import UserAgent
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
from random import choice
from datetime import datetime
import time
import random
import urllib3
import warnings
import bs4
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os
import shutil
import logging
import glob
import itertools
import re
from typing import List

# os.chdir(r'G:\\REA\\Working files\\land-bidding\\Table extraction')
# Load the pool of user-agent strings (one per line) that open_browser()
# samples from with choice(ua_list).
with open('user_agent.txt') as f:
    # Leaving the with-block closes the file, so no explicit close() is needed.
    # Blank lines are skipped so that an empty user agent can never be chosen.
    ua_list = [ua.strip() for ua in f if ua.strip()]

# Route every record (DEBUG and above) seen by the root logger into
# scraping.log. The file is opened in 'w' mode, so each run starts it afresh.
formatter = logging.Formatter(fmt="%(asctime)s %(name)s:%(levelname)s:%(message)s")
handler = logging.FileHandler('scraping.log', mode='w', encoding='utf-8')
handler.setFormatter(formatter)

root_logger = logging.getLogger()
root_logger.addHandler(handler)
root_logger.setLevel(logging.DEBUG)

# Remember the directory the notebook was started from.
original_wd = os.getcwd()

# Silence urllib3's InsecureRequestWarning first, then every other warning.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)
warnings.filterwarnings(action="ignore")


def random_time_delay(min_seconds: float = 15, max_seconds: float = 30):
    """Pause for a random duration between page interactions.

    The sleep length is drawn uniformly from ``[min_seconds, max_seconds]``.
    Calling the function without arguments keeps the original 15-30 second
    pause; the bounds are parameters so callers can shorten the wait.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))


class WebScraper:
    """Find and download URA tender-result PDFs for a list of land parcels.

    For every parcel the scraper searches the URA map site, opens each search
    result, looks for a link whose href contains 'Tender-Results' and lets
    Chrome download it into ``save_path``. The downloaded file is then moved
    to ``save_path/temp`` and named ``<parcel name>_<id>.pdf`` (``scrape``
    passes the award dates as ids).

    Attributes are plain instance fields:
        save_path: Chrome's download directory.
        logger: optional ``logging``-style logger; messages are always printed.
        land_parcel: parcel currently being processed (set by ``get_url``).
        driver: the Selenium Chrome driver created by ``open_browser``.
    """

    # Characters that are replaced with '+' when a name becomes a file name.
    # The backslash is escaped so that it is part of the character class.
    _ILLEGAL_PUNC = re.compile(r'[/\\:*?"<>|]')
    # Element clicked after loading the map page
    # ("view government land sales site").
    _GLS_LAYER_XPATH = '//*[@id="us-c-ip"]/div[1]/div[1]/div[4]/div[3]/div[6]/div[2]'
    # Anchors counted as search results.
    _RESULT_XPATH = '//a[@data-parentid="0" and @data-type="service"]'
    # Element whose text is parsed as the award date ('%d %B %Y').
    _AWARD_DATE_XPATH = '//*[@id="us-c-ip"]/div[3]/div[1]/div[2]/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]'

    def __init__(self, save_path: str = None, logger=None):
        """Store the configuration and launch the browser.

        Args:
            save_path: Directory Chrome downloads into.
            logger: Optional logger exposing ``info``/``warning``/``error``.
        """
        self.save_path = save_path
        # Set before the browser is opened so log() works from any method.
        self.logger = logger
        self.land_parcel = None
        self.driver = self.open_browser()

    def open_browser(self):
        """Create the Chrome driver with a random user agent and download prefs.

        Returns:
            The ``webdriver.Chrome`` instance.
        """
        chrome_options = uc.ChromeOptions()
        for switch in (
            "--window-size=1920,1080",
            "--disable-extensions",
            '--disable-notifications',
            "--mute-audio",
            "--start-maximized",
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--no-sandbox',
            '--ignore-certificate-errors',
            f"user-agent={choice(ua_list)}",
        ):
            chrome_options.add_argument(switch)
        chrome_options.add_experimental_option(
            'prefs',
            {"download.default_directory": self.save_path,
             "download.prompt_for_download": False,
             "download.directory_upgrade": True,
             "plugins.always_open_pdf_externally": True,
             },
        )
        # NOTE(review): the driver path is passed positionally; newer Selenium
        # releases expect a Service object instead - confirm against the
        # installed version before upgrading.
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
        return driver

    def log(self, msg: str, level='info'):
        """Print ``msg`` and forward it to the logger, if one was given.

        Args:
            msg: Text to report.
            level: 'warning' or 'error'; any other value is logged as info.

        Returns:
            ``msg`` unchanged.
        """
        print(msg)
        if self.logger:
            method_name = level if level in ('warning', 'error') else 'info'
            try:
                getattr(self.logger, method_name)(msg)
            except Exception:
                # Logging is best-effort: a failing logger must not stop the run.
                pass
        return msg

    def _log_error(self, msg: str):
        """Report ``msg`` as an error and record the active traceback."""
        self.log(msg, 'error')
        if self.logger:
            try:
                # Keeps the cause of the failure in the log file.
                self.logger.debug('Traceback for: %s', msg, exc_info=True)
            except Exception:
                pass

    @classmethod
    def _safe_file_name(cls, name, fallback=''):
        """Return ``name`` with illegal file-name characters replaced by '+'.

        ``fallback`` is used when ``name`` is not a string (for example None).
        """
        try:
            return cls._ILLEGAL_PUNC.sub('+', name)
        except TypeError:
            return cls._ILLEGAL_PUNC.sub('+', str(fallback))

    @staticmethod
    def _unique_file_name(file_name, id_, existing):
        """Pick a PDF name that does not collide with ``existing`` names.

        ``<file_name>_<id_>.pdf`` is used when free; otherwise the first free
        ``<file_name>_<id_>_<k>.pdf`` with k = 0, 1, 2, ...
        """
        candidate = f"{file_name}_{id_}.pdf"
        if candidate not in existing:
            return candidate
        taken = set(existing)
        k = 0
        while f"{file_name}_{id_}_{k}.pdf" in taken:
            k += 1
        return f"{file_name}_{id_}_{k}.pdf"

    @staticmethod
    def _is_pdf(file_name: str) -> bool:
        """True for names ending in '.pdf' (so not for '*.pdf.crdownload')."""
        return file_name.lower().endswith('.pdf')

    def _list_pdfs(self):
        """Return the sorted names of the PDF files directly in ``save_path``."""
        return sorted(f for f in os.listdir(self.save_path) if self._is_pdf(f))

    def _move_stray_pdfs(self):
        """Move PDFs already in ``save_path`` into ``save_path/redundant``.

        ``save_path`` only serves as a mid-point, so it must be empty of PDFs
        before a download for the new file to be identifiable.
        """
        stray = self._list_pdfs()
        if not stray:
            return
        redundant_path = os.path.join(self.save_path, 'redundant')
        os.makedirs(redundant_path, exist_ok=True)
        for file in stray:
            shutil.move(os.path.join(self.save_path, file),
                        os.path.join(redundant_path, file))

    def search(self, land_parcel: str):
        """Open the URA map page and type ``land_parcel`` into the search box.

        Failures are logged and swallowed; the caller then simply finds no
        search results.
        """
        # Use the argument, not self.land_parcel, so that search() also works
        # when it is called before get_url() has set the attribute.
        self.log(f'{land_parcel}: Searching...')
        try:
            # go to home page of ura gov site
            self.driver.get('https://www.ura.gov.sg/maps/')
            random_time_delay()

            # click on "view government land sales site"
            self.driver.find_element(by=By.XPATH, value=self._GLS_LAYER_XPATH).click()
            random_time_delay()

            # search land parcel
            self.driver.find_element(by=By.CLASS_NAME, value='us-s-txt').send_keys(land_parcel)
        except Exception:
            self._log_error(f'{land_parcel}: Error occurred when searching')

    def extract_url(self):
        """Return the first href on the page containing 'Tender-Results'.

        Returns:
            The href string, or None when no such link exists.
        """
        soup = bs4.BeautifulSoup(self.driver.page_source, 'html.parser')
        # href=True skips anchors without an href attribute.
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if 'Tender-Results' in href:
                random_time_delay()
                return href
        return None

    def _collect_current_result(self, url_list, award_dates, name_list):
        """Read URL, parcel name and award date of the currently open result.

        The three lists are extended together, and only when a URL was found,
        so that index i always refers to the same result in every list.
        """
        random_time_delay()
        url = self.extract_url()
        if not url:
            self.log("Invalid URL", "warning")
            return
        self.log("Done")
        random_time_delay()

        # get the exact land parcel name and the dates of award
        try:
            land_parcel_name = self.driver.find_element(by=By.CLASS_NAME, value="us-ip-poi-a-title").text
            random_time_delay()
        except Exception:
            land_parcel_name = self.land_parcel

        try:
            award_date = self.driver.find_element(by=By.XPATH, value=self._AWARD_DATE_XPATH).text
            award_date = datetime.strptime(award_date, '%d %B %Y').strftime("%Y%m%d")
        except Exception:
            award_date = 'Unknown'

        url_list.append(url)
        award_dates.append(award_date)
        name_list.append(land_parcel_name)

    def get_url(self, land_parcel: str) -> [List[str], List[str], List[str]]:
        """Search a parcel and collect the tender-result link of every result.

        Args:
            land_parcel: Name typed into the site's search box.

        Returns:
            ``[url_list, awardDates, nameList]``; the lists are index-aligned
            and only contain results for which a URL was found. When an error
            interrupts the search, whatever was collected so far is returned
            (possibly three empty lists) instead of None.
        """
        self.land_parcel = land_parcel
        url_list, award_dates, name_list = [], [], []

        try:
            self.search(land_parcel)
            random_time_delay()

            # get the number of searched results
            results = self.driver.find_elements(by=By.XPATH, value=self._RESULT_XPATH)
            n_results = len(results)

            if n_results == 0:
                self.log(f"{self.land_parcel}: No URA sales site result", "warning")

            elif n_results == 1:
                self.log(f"{self.land_parcel}: 1 URA sales site searched")
                # Click on result
                random_time_delay()
                self.driver.find_element(by=By.CLASS_NAME, value='us-sr-result').click()

                self.log(f"{self.land_parcel}: Extracting URL...")
                self._collect_current_result(url_list, award_dates, name_list)

            else:
                self.log(f"{self.land_parcel}: {n_results} URA sales sites searched")
                for id_ in range(n_results):
                    self.log(f"{self.land_parcel}: Extracting #{id_ + 1} URL...")

                    # click one by one
                    random_time_delay()
                    self.driver.find_element(by=By.XPATH,
                                             value=f'//a[@data-type="service" and @data-id="{id_}"]').click()

                    self._collect_current_result(url_list, award_dates, name_list)

                    # get back to the search results (not needed after the last one)
                    if id_ < n_results - 1:
                        random_time_delay()
                        self.search(land_parcel)

            self.log(f"{self.land_parcel}: {len(url_list)} valid URLs retrieved")

        except Exception:
            self._log_error(f"{self.land_parcel}: Error occurred when getting URL list")

        return [url_list, award_dates, name_list]

    def download(self, url_list: List[str], id_list: List[str] = None, name_list: List[str] = None):
        """Download every URL and file the PDF under ``save_path/temp``.

        Args:
            url_list: PDF URLs to open; None is treated as an empty list.
            id_list: Per-URL id used in the file name; defaults to 0..n-1.
            name_list: Per-URL name used in the file name; defaults to the
                current ``land_parcel``.
        """
        url_list = url_list or []
        destination = os.path.join(self.save_path, 'temp')
        os.makedirs(destination, exist_ok=True)

        if not id_list:
            id_list = list(range(len(url_list)))
        if not name_list:
            name_list = [self.land_parcel] * len(url_list)

        for i, url in enumerate(url_list):
            _id_ = id_list[i]
            _exactName_ = name_list[i]

            self._move_stray_pdfs()
            random_time_delay()
            try:
                self.driver.get(url)
                random_time_delay()

                pdf = self._list_pdfs()
                if not pdf:
                    self.log(f"{self.land_parcel}_{i}: No PDF downloaded", "warning")
                    continue
                if len(pdf) > 1:
                    self.log(f"{self.land_parcel}_{i}: Multiple PDF downloaded or redundant files, chose the first one", "warning")

                source_path = os.path.join(self.save_path, pdf[0])
                file_name = self._safe_file_name(_exactName_, self.land_parcel)
                # make sure there's no duplicated file name
                full_file_name = self._unique_file_name(file_name, _id_, os.listdir(destination))

                shutil.move(source_path, os.path.join(destination, full_file_name))
                self.log(f"{self.land_parcel}_{i}: Tender details saved in <{full_file_name}>")
            except Exception:
                self._log_error(f"{self.land_parcel}_{i}: Error occurred when downloading")

        self.log(f'{self.land_parcel}: Process ended')

    def scrape(self, landParcels: List[str]):
        """Run ``get_url`` + ``download`` for every parcel, then quit the browser.

        A failure on one parcel is logged and the loop moves on to the next;
        the browser is closed even when the run is interrupted.
        """
        try:
            for land_parcel in tqdm(landParcels):
                try:
                    url_list, id_list, name_list = self.get_url(land_parcel)
                    self.download(url_list, id_list, name_list)
                except Exception:
                    self._log_error(f'{land_parcel}: Error occurred when passing URL to download function')
            self.log("All done")
        finally:
            self.driver.quit()


if __name__ == "__main__":
    # Load the GLS table and keep only the rows whose source is 'ura'.
    gls = pd.read_csv(r'G:\REA\Working files\land-bidding\land_sales_full_data\ready for uploading\gls_no_detail.csv')
    is_ura = gls.source == 'ura'
    ura_gls = gls[is_ura].reset_index(drop=True)

    # Put spaces around every '/' in the parcel names, then drop duplicates.
    spaced_names = ura_gls.land_parcel.reset_index(drop=True).apply(lambda x: x.replace('/', ' / '))
    landParcels = list(spaced_names.unique())

    # Scrape from position 287 onwards, downloading into save_path.
    save_path = r'G:\REA\Working files\land-bidding\Table extraction\tenderer_details_ura'
    scraper = WebScraper(save_path=save_path, logger=root_logger)
    remaining_parcels = landParcels[287:]
    scraper.scrape(remaining_parcels)
    # scraper.scrape(['Chestnut Avenue'])



Current google-chrome version is 107.0.5304
Get LATEST chromedriver version for 107.0.5304 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/107.0.5304.62/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\13051\.wdm\drivers\chromedriver\win32\107.0.5304.62]


  0%|          | 0/41 [00:00<?, ?it/s]

Bedok South Avenue 1: Searching...
Bedok South Avenue 1: 1 URA sales site searched
Bedok South Avenue 1: Extracting URL...
Invalid URL
Bedok South Avenue 1: 0 valid URLs retrieved
Bedok South Avenue 1: Process ended
Commonwealth Avenue West / Faber Heights: Searching...
Commonwealth Avenue West / Faber Heights: 1 URA sales site searched
Commonwealth Avenue West / Faber Heights: Extracting URL...
Invalid URL
Commonwealth Avenue West / Faber Heights: 0 valid URLs retrieved
Commonwealth Avenue West / Faber Heights: Process ended
Hillview Avenue / Bukit Batok Town Park: Searching...
Hillview Avenue / Bukit Batok Town Park: 1 URA sales site searched
Hillview Avenue / Bukit Batok Town Park: Extracting URL...
Invalid URL
Hillview Avenue / Bukit Batok Town Park: 0 valid URLs retrieved
Hillview Avenue / Bukit Batok Town Park: Process ended
Bencoolen Street / Albert Street: Searching...
Bencoolen Street / Albert Street: 2 URA sales sites searched
Bencoolen Street / Albert Street: Extracting #1 U

In [2]:
# Position of this parcel in landParcels; the result matches the 287 offset
# used to slice the list passed to scraper.scrape().
landParcels.index('Bedok South Avenue 1')

287

In [15]:
# Scratch check of the duplicate-name rule: when "<name>_<id>.pdf" is already
# present, try "<name>_<id>_0.pdf", "_1.pdf", ... until a free name is found.
filelist = ['abc_20000001_0.pdf', '21431243', 'asffw.txt', 'abc_20000001_1.pdf', 'abc_20000001.pdf']
file_name = 'abc'
_id_ = '20000001'
full_file_name = f"{file_name}_{_id_}.pdf"
occurrence = filelist.count(full_file_name)
if occurrence > 0:
    for i in itertools.count():
        full_file_name = f"{file_name}_{_id_}_{i}.pdf"
        if full_file_name not in filelist:
            break
print(full_file_name)

abc_20000001_2.pdf


In [1]:
# from fake_useragent import UserAgent
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
from random import choice
import time
import random
import urllib3
import warnings
import bs4
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os
import shutil
import glob
import itertools
import re
from typing import List


# os.chdir(r'G:\\REA\\Working files\\land-bidding\\Table extraction')
# Load the pool of user-agent strings (one per line) that open_browser()
# samples from with choice(ua_list).
with open('user_agent.txt') as f:
    # Leaving the with-block closes the file, so no explicit close() is needed.
    # Blank lines are skipped so that an empty user agent can never be chosen.
    ua_list = [ua.strip() for ua in f if ua.strip()]

# Keep the starting working directory for later reference.
original_wd = os.getcwd()

# Suppress urllib3's InsecureRequestWarning, then all remaining warnings.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)
warnings.filterwarnings(action="ignore")

def random_time_delay(min_seconds: float = 15, max_seconds: float = 30):
    """Pause for a random duration between page interactions.

    The sleep length is drawn uniformly from ``[min_seconds, max_seconds]``.
    Calling the function without arguments keeps the original 15-30 second
    pause; the bounds are parameters so callers can shorten the wait.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))


class web_scraper:
    """Earlier, print-based scraper for URA tender-result PDFs.

    ``get_url`` searches the URA map site for a parcel and collects the
    'Tender-Results' links of its results; ``download`` opens each link so
    Chrome saves the PDF into ``save_path`` and then moves the file to
    ``save_path/temp`` as ``<land parcel>_<index>.pdf``.
    """

    # Characters that are replaced with '+' when a parcel name becomes a file
    # name (parcel names such as 'A / B' would otherwise break the path).
    _ILLEGAL_PUNC = re.compile(r'[/\\:*?"<>|]')

    def __init__(self, save_path: str=None):
        """Remember the download directory and launch the browser."""
        self.save_path = save_path
        # Parcel currently being processed; set by get_url().
        self.land_parcel = None
        self.driver = self.open_browser()

    def open_browser(self):
        """Create the Chrome driver with a random user agent and download prefs."""
        chrome_options = uc.ChromeOptions()
        for switch in (
            "--window-size=1920,1080",
            "--disable-extensions",
            '--disable-notifications',
            "--mute-audio",
            "--start-maximized",
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--no-sandbox',
            '--ignore-certificate-errors',
            f"user-agent={choice(ua_list)}",
        ):
            chrome_options.add_argument(switch)
        chrome_options.add_experimental_option(
            'prefs',
            {"download.default_directory": self.save_path,
             "download.prompt_for_download": False,
             "download.directory_upgrade": True,
             "plugins.always_open_pdf_externally": True,
             },
        )
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
        return driver

    @classmethod
    def _safe_file_name(cls, name) -> str:
        """Return ``name`` as text with illegal file-name characters as '+'."""
        return cls._ILLEGAL_PUNC.sub('+', str(name))

    @staticmethod
    def _keep_if_valid(url_list, url):
        """Append ``url`` to ``url_list`` when it is non-empty and report it."""
        if url:
            url_list.append(url)
            print("Done")
        else:
            print("Invalid URL")

    def search(self, land_parcel: str):
        """Open the URA map page and type ``land_parcel`` into the search box.

        Failures are printed and swallowed; the caller then finds no results.
        """
        # Imported here because this cell's import list does not include By.
        from selenium.webdriver.common.by import By

        # Use the argument so search() also works before get_url() was called.
        print(f'{land_parcel}: Searching...')
        try:
            # go to home page of ura gov site
            self.driver.get('https://www.ura.gov.sg/maps/')
            random_time_delay()

            # click on "view government land sales site"
            # (find_element(by=..., value=...) replaces the find_element_by_*
            # helpers, which current Selenium releases no longer provide)
            self.driver.find_element(
                by=By.XPATH,
                value='//*[@id="us-c-ip"]/div[1]/div[1]/div[4]/div[3]/div[6]/div[2]').click()
            random_time_delay()

            # search land parcel
            self.driver.find_element(by=By.CLASS_NAME, value='us-s-txt').send_keys(land_parcel)

        except Exception as exc:
            print(f'{land_parcel}: Error occurred when searching')
            print(f'    cause: {exc!r}')

    def extract_url(self):
        """Return the first href on the page containing 'Tender-Results', or None."""
        soup = bs4.BeautifulSoup(self.driver.page_source, 'html.parser')
        # href=True skips anchors without an href attribute.
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if 'Tender-Results' in href:
                random_time_delay()
                return href
        return None

    def get_url(self, land_parcel: str)-> List[str]:
        """Search a parcel and collect the tender-result link of every result.

        Returns:
            The list of URLs found. It is empty (never None) when nothing was
            found or an error interrupted the search, so the result can
            always be handed to ``download``.
        """
        from selenium.webdriver.common.by import By

        self.land_parcel = land_parcel
        url_list = []

        try:
            self.search(land_parcel)
            random_time_delay()

            # get the number of searched results
            results = self.driver.find_elements(by=By.XPATH, value='//a[@data-type="service"]')
            n_results = len(results)

            if n_results == 0:
                print(f"{self.land_parcel}: No searched result")

            elif n_results == 1:
                print(f"{self.land_parcel}: 1 result searched")
                # Click on result
                random_time_delay()
                self.driver.find_element(by=By.CLASS_NAME, value='us-sr-result').click()

                # Get url for Tender Result pdf
                print(f"{self.land_parcel}: Extracting URL...")
                random_time_delay()
                self._keep_if_valid(url_list, self.extract_url())

            else:
                print(f"{self.land_parcel}: {n_results} results searched")
                for id_ in range(n_results):
                    print(f"{self.land_parcel}: Extracting #{id_+1} URL...")

                    # click one by one
                    random_time_delay()
                    self.driver.find_element(
                        by=By.XPATH,
                        value=f'//a[@data-type="service" and @data-id="{id_}"]').click()

                    # Get url for Tender Result pdf
                    random_time_delay()
                    self._keep_if_valid(url_list, self.extract_url())

                    # get back to the search results (not needed after the last one)
                    if id_ < n_results - 1:
                        random_time_delay()
                        self.search(land_parcel)

            print(f"{self.land_parcel}: {len(url_list)} valid URLs retrieved")

        except Exception as exc:
            print(f"{self.land_parcel}: Error occurred when getting URL list")
            print(f'    cause: {exc!r}')

        return url_list

    def download(self, url_list: List[str]):
        """Open every URL and move the downloaded PDF into ``save_path/temp``.

        Args:
            url_list: PDF URLs to open; None is treated as an empty list.
        """
        url_list = url_list or []
        destination = os.path.join(self.save_path, 'temp')
        os.makedirs(destination, exist_ok=True)

        for i, url in enumerate(url_list):
            random_time_delay()
            try:
                self.driver.get(url)

                random_time_delay()
                # endswith() ignores partial downloads such as '*.pdf.crdownload'.
                pdf = sorted(file for file in os.listdir(self.save_path)
                             if file.lower().endswith('.pdf'))
                if not pdf:
                    print(f"{self.land_parcel}_{i}: No PDF downloaded")
                    continue

                if len(pdf) > 1:
                    print(f"{self.land_parcel}_{i}: Multiple PDF downloaded, chose the first one")

                saved_name = f"{self._safe_file_name(self.land_parcel)}_{i}.pdf"
                source_path = os.path.join(self.save_path, pdf[0])
                desti_path = os.path.join(destination, saved_name)
                shutil.move(source_path, desti_path)
                print(f"{self.land_parcel}_{i}: Tender details saved in <{saved_name}>")

            except Exception as exc:
                print(f"{self.land_parcel}_{i}: Error occurred when downloading")
                print(f'    cause: {exc!r}')

        print(f'{self.land_parcel}: Process ended', '\n')


# scraper = web_scraper(save_path='G:\\REA\\Working files\\land-bidding\\Table extraction\\tenderer_details_ura')
# land_parcel = 'Lentor Hills Road (Parcel B)'
# scraper.download(['https://www.ura.gov.sg/-/media/Corporate/Land-Sales/Tender-Results/2022/pr22-32a.pdf'])

In [2]:
# Directory handed to the scraper as Chrome's download location.
save_path = r'G:\REA\Working files\land-bidding\Table extraction\tenderer_details_ura'
# Instantiating web_scraper opens a browser via open_browser().
scraper = web_scraper(save_path=save_path)



Current google-chrome version is 107.0.5304
Get LATEST chromedriver version for 107.0.5304 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/107.0.5304.62/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\13051\.wdm\drivers\chromedriver\win32\107.0.5304.62]


In [3]:
# Parcel name to search for in the following cell.
land_parcel = 'Clementi Avenue 1'

In [4]:
# Collect the parcel's tender-result URLs and download each PDF.
# NOTE(review): the captured output below reports 'Lentor Central', not the
# parcel assigned above - presumably stale output from an earlier run; confirm.
scraper.download(scraper.get_url(land_parcel))

Lentor Central: Searching...
Lentor Central: 3 results searched
Lentor Central: Extracting 1 URL...
Done
Lentor Central: Searching...
Lentor Central: Extracting 2 URL...
Done
Lentor Central: Searching...
Lentor Central: Extracting 3 URL...
Invalid URL
Lentor Central: Searching...
Lentor Central: 2 valid URLs retrieved
Lentor Central_0: Tender details saved in <Lentor Central_0.pdf>
Lentor Central_1: Tender details saved in <Lentor Central_1.pdf>
Lentor Central: Process ended 



In [242]:
# One-off manual move of a downloaded PDF from the download directory into
# its 'temp' sub-folder under a parcel-based name.
save = scraper.save_path
source = os.path.join(save, 'pr22-32a.pdf')
destination = os.path.join(save, 'temp')
desti = os.path.join(destination, 'lentor hills p b_0.pdf')
# The last expression is displayed by the notebook (the output below is a path).
shutil.move(source, desti)

'G:\\REA\\Working files\\land-bidding\\Table extraction\\tenderer_details_ura\\temp\\lentor hills p b_0.pdf'

In [46]:
# Exploratory version of the result handling, run against a live `driver`:
# count the search results, open a single result and look for the
# 'Tender-Results' link on the page.
# NOTE(review): the bare `except: pass` at the end hides every failure, and
# `url` is only assigned when a matching link is found - the final `url`
# expression can therefore raise NameError or show a value from an earlier run.
try:
    # driver.find_element_by_class_name('us-sr-title')
    results = driver.find_elements_by_xpath('//a[@data-type="service"]')
    random_time_delay()

    if len(results) == 0:
        print(f"No result for {land_parcel}")
    elif len(results) == 1:
        print(results[0].text)
        # Click on result
        driver.find_element_by_class_name('us-sr-result').click()
        random_time_delay()

        # Get url for Tender Result pdf
        source = driver.page_source
        soup = bs4.BeautifulSoup(source, 'html.parser')
        for i in soup.find_all('a'):
            try:
                # i['href'] raises for anchors without an href; those are skipped.
                if('Tender-Results' in i['href']):
                    url = i['href']
                    random_time_delay()
                    break
            except:
                pass
    else:
        print(f"Multiple results for {land_parcel}")
        # NOTE(review): `id` shadows the builtin, and the list being indexed
        # only contains anchors with data-id="0" - presumably each result has
        # its own data-id, so indexing past 0 may fail; confirm.
        for id in range(len(results)):
            driver.find_elements_by_xpath('//a[@data-type="service" and @data-id="0"]')[id].click()



except:
    pass

# Display the URL found above.
url

Lentor Hills Road (Parcel B)


'https://www.ura.gov.sg/-/media/Corporate/Land-Sales/Tender-Results/2022/pr22-32a.pdf'

In [71]:
# Click the result anchor whose data-id is 1 ({1} renders as "1").
driver.find_element_by_xpath(f'//a[@data-type="service" and @data-id="{1}"]').click()

In [76]:
# Show the XPath string the f-string above produces.
f'//a[@data-type="service" and @data-id="{1}"]'

'//a[@data-type="service" and @data-id="1"]'

In [55]:
# Shut down the exploratory driver session.
driver.quit()

### Download PDF

In [63]:
# Open a tender-result PDF URL directly.
# NOTE(review): presumably this triggers a download because of the
# "plugins.always_open_pdf_externally" preference - confirm.
driver.get('https://www.ura.gov.sg/-/media/Corporate/Land-Sales/Tender-Results/2022/pr22-32a.pdf')

In [218]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

download_dir = r"G:\REA\Working files\land-bidding\Table extraction\tenderer_details_ura"

# Build the Chrome options: fixed command-line switches followed by a
# user agent picked at random from ua_list.
chrome_options = uc.ChromeOptions()
for _switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    '--disable-notifications',
    "--mute-audio",
    "--start-maximized",
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--ignore-certificate-errors',
    f"user-agent={choice(ua_list)}",
):
    chrome_options.add_argument(_switch)

# Download preferences; download_dir is the default download directory.
_download_prefs = {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
}
chrome_options.add_experimental_option('prefs', _download_prefs)

# Start the browser and request the PDF.
browser = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
browser.get('https://www.ura.gov.sg/-/media/Corporate/Land-Sales/Tender-Results/2022/pr22-32a.pdf')



Current google-chrome version is 107.0.5304
Get LATEST chromedriver version for 107.0.5304 google-chrome
Driver [C:\Users\13051\.wdm\drivers\chromedriver\win32\107.0.5304.62\chromedriver.exe] found in cache


In [219]:
save_path = r"G:\REA\Working files\land-bidding\Table extraction\tenderer_details_ura"

# Same option set as the scraper's open_browser(), built from a tuple of
# switches plus a randomly chosen user agent.
chrome_options = uc.ChromeOptions()
for _switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    '--disable-notifications',
    "--mute-audio",
    "--start-maximized",
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--ignore-certificate-errors',
    f"user-agent={choice(ua_list)}",
):
    chrome_options.add_argument(_switch)

# Download preferences; save_path is the default download directory.
_download_prefs = {
    "download.default_directory": save_path,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
}
chrome_options.add_experimental_option('prefs', _download_prefs)

# Start the driver and request the PDF.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
driver.get('https://www.ura.gov.sg/-/media/Corporate/Land-Sales/Tender-Results/2022/pr22-32a.pdf')



Current google-chrome version is 107.0.5304
Get LATEST chromedriver version for 107.0.5304 google-chrome
Driver [C:\Users\13051\.wdm\drivers\chromedriver\win32\107.0.5304.62\chromedriver.exe] found in cache


# Testing

In [3]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


download_dir = r"G:\REA\Working files\land-bidding\Table extraction\tenderer_details_ura"

# Build the Chrome options for the test session: fixed switches followed by
# a user agent picked at random from ua_list.
chrome_options = uc.ChromeOptions()
for _switch in (
    "--window-size=1920,1080",
    "--disable-extensions",
    '--disable-notifications',
    "--mute-audio",
    "--start-maximized",
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--ignore-certificate-errors',
    f"user-agent={choice(ua_list)}",
):
    chrome_options.add_argument(_switch)

# Download preferences; download_dir is the default download directory.
_download_prefs = {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
}
chrome_options.add_experimental_option('prefs', _download_prefs)

# Start the browser; no page is loaded in this cell.
browser = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)



Current google-chrome version is 107.0.5304
Get LATEST chromedriver version for 107.0.5304 google-chrome
Driver [C:\Users\13051\.wdm\drivers\chromedriver\win32\107.0.5304.62\chromedriver.exe] found in cache


In [4]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
from random import choice
import time
import random
import urllib3
import warnings
import bs4
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os
import shutil
import glob
import itertools
import re
from typing import List

In [5]:
# Load the URA map page in the test browser.
browser.get('https://www.ura.gov.sg/maps/')

In [6]:
# Click the element the scraper's search() describes as
# "view government land sales site".
browser.find_element_by_xpath('//*[@id="us-c-ip"]/div[1]/div[1]/div[4]/div[3]/div[6]/div[2]').click()

In [7]:
# Type the parcel name into the element with class 'us-s-txt'.
browser.find_element_by_class_name('us-s-txt').send_keys('Chestnut Avenue')

In [7]:
# Collect the result anchors with data-parentid="0" and data-type="service".
results = browser.find_elements(by=By.XPATH, value='//a[@data-parentid="0" and @data-type="service"]')

In [23]:
# Click the anchor matching the same filter with data-id="0".
browser.find_element(by=By.XPATH, value=f'//a[@data-parentid="0" and@data-type="service" and @data-id="0"]').click()

In [24]:
# Read the text of the 'us-ip-poi-a-title' element; WebScraper.get_url uses
# the same element as the exact parcel name.
browser.find_element(by=By.CLASS_NAME, value="us-ip-poi-a-title").text

'Chestnut Avenue'

In [15]:
# Read the text at this XPath into `date`; WebScraper.get_url parses the
# same element as the award date.
date = browser.find_element(by=By.XPATH, value='//*[@id="us-c-ip"]/div[3]/div[1]/div[2]/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]').text

In [8]:
# Build a WebScraper with its defaults (save_path=None, logger=None);
# the constructor opens a browser.
scraper1 = WebScraper()



Current google-chrome version is 107.0.5304
Get LATEST chromedriver version for 107.0.5304 google-chrome
Driver [C:\Users\13051\.wdm\drivers\chromedriver\win32\107.0.5304.62\chromedriver.exe] found in cache


In [10]:
# NOTE(review): this calls `scraper`, not the `scraper1` created in the
# cell above - confirm which object was meant to be tested.
res = scraper.get_url('Chestnut Avenue')

Chestnut Avenue: Searching...
Chestnut Avenue: Error occurred when searching
Chestnut Avenue: Error occurred when getting URL list


In [24]:
from datetime import datetime
# Check the award-date conversion used in WebScraper.get_url:
# text in '%d %B %Y' form becomes a 'YYYYMMDD' string.
datetime.strptime('1 March 2000', '%d %B %Y').strftime("%Y%m%d")

'20000301'

In [27]:
# Check of the `if not <list>` default used in download(): a non-empty list
# is kept as it is, only an empty one is replaced by list(range(3)).
l = [1, 1, 1]
l = l if l else list(range(3))
l

[1, 1, 1]