In [1]:
import datetime
import logging
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By


In [2]:
def create_logger():
    """Configure the root logger with one file handler and one console handler.

    The root logger level is set to DEBUG; both handlers are set to INFO and
    share the format ``<asctime> - <levelname>:  <message>``. The log file is
    written to ``logs/<YYYYmmdd-HHMMSS>.log`` relative to the current working
    directory and is opened in ``'w'`` mode.

    The ``logs`` directory is created when missing. Calling the function again
    (for example by re-running the notebook cell) replaces the handlers that a
    previous call installed instead of stacking new ones, so each message is
    emitted once per destination.

    Returns:
        logging.Logger: the root logger.
    """
    log_dir = 'logs'
    file_handler_name = "create_logger_file"
    console_handler_name = "create_logger_console"

    logFormatter = logging.Formatter("%(asctime)s - %(levelname)s:  %(message)s")
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Remove only the handlers this function added on an earlier call; handlers
    # registered by other code on the root logger are left untouched.
    own_names = (file_handler_name, console_handler_name)
    for stale in [h for h in logger.handlers if h.get_name() in own_names]:
        logger.removeHandler(stale)
        stale.close()

    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # FileHandler does not create missing parent directories.
    os.makedirs(log_dir, exist_ok=True)

    # Explicit UTF-8 so scraped text logs the same way regardless of the
    # platform's default encoding.
    fileHandler = logging.FileHandler("{0}/{1}.log".format(log_dir, now), mode='w', encoding='utf-8')
    fileHandler.set_name(file_handler_name)
    fileHandler.setFormatter(logFormatter)
    fileHandler.setLevel(logging.INFO)
    logger.addHandler(fileHandler)

    consoleHandler = logging.StreamHandler()
    consoleHandler.set_name(console_handler_name)
    consoleHandler.setFormatter(logFormatter)
    consoleHandler.setLevel(logging.INFO)
    logger.addHandler(consoleHandler)
    return logger


In [3]:
# Set up logging once for the notebook. The scraping functions defined below
# read `logger` as a module-level global rather than receiving it as an argument.
logger = create_logger()

In [4]:
# Chrome session setup. Headless mode is NOT enabled: the "--headless" flag
# below is commented out, so only the incognito option is applied.
chrome_options = webdriver.chrome.options.Options()
chrome_options.add_argument("--incognito")
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--start-maximized")

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run on another machine; the original hard-coded path stays as
# the default so existing setups keep working unchanged.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:/Users/gaven.yeh/Downloads/chromedriver_win32/chromedriver.exe",
)

# Shared browser session: the scraping functions below use `driver` as a
# module-level global.
driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

def scrape_data():
    """Run the full scrape and return the collected candidate records.

    Creates an empty list, lets ``populate_master_candidate_dicts`` fill it in
    place, and returns it. The module-level ``driver`` is always shut down
    afterwards, including when the crawl raises, so a failed run does not leave
    a browser session behind. ``quit()`` is used instead of ``close()`` so the
    whole driver session is ended, not just the current window.

    Returns:
        list: the list populated by ``populate_master_candidate_dicts``.
    """
    master_candidate_dicts = []

    try:
        populate_master_candidate_dicts(master_candidate_dicts)
    finally:
        # The driver is not usable after this point; re-create it before
        # calling scrape_data() again.
        driver.quit()

    return master_candidate_dicts

def populate_master_candidate_dicts(master_candidate_dicts):
    """Visit the dashboard home page and crawl every state linked from it.

    Loads ``https://dashboard.spr.gov.my`` with the module-level ``driver``,
    collects the text and ``href`` of each ``<a>`` under
    ``div.stats_details``, logs them, and calls ``get_constituency_links``
    once per state.

    Args:
        master_candidate_dicts: List passed through to
            ``get_constituency_links``, which is expected to fill it in place.
    """
    driver.get('https://dashboard.spr.gov.my') # home page
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
    # Selenium releases no longer provide.
    states = driver.find_elements(By.XPATH, '//div[@class="stats_details"]//a')
    # Copy name and link into plain dicts before the loop below navigates the
    # shared driver to other pages.
    state_dicts = [{
        "state_name": state.text,
        "state_link": state.get_attribute('href')
    } for state in states]
    logger.info("state_dicts: %s", state_dicts)

    for state in state_dicts:
        get_constituency_links(state, master_candidate_dicts)
        

def get_constituency_links(state, master_candidate_dicts):
    """Open one state's page and scrape every constituency linked from it.

    Args:
        state: Dict with ``"state_name"`` and ``"state_link"`` keys, as built
            by ``populate_master_candidate_dicts``.
        master_candidate_dicts: List passed through to
            ``get_candidate_results``, which extends it in place.

    Uses the module-level ``driver`` and ``logger``.
    """
    driver.get(state['state_link'])
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
    # Selenium releases no longer provide.
    constituencies = driver.find_elements(By.XPATH, '//div[@class="state_container_right"]//a')
    # Names and hrefs are copied into plain dicts first because every
    # get_candidate_results call below navigates the shared driver to another
    # page (presumably invalidating the elements located here).
    constituency_dicts = [{
        "constituency_name": constituency.text,
        "constituency_link": constituency.get_attribute('href')
    } for constituency in constituencies]
    logger.info("constituency_dicts: %s", constituency_dicts)

    for constituency in constituency_dicts:
        get_candidate_results(
            state['state_name'],
            constituency['constituency_name'],
            constituency['constituency_link'],
            master_candidate_dicts,
        )


def get_candidate_results(state_name, constituency_name, constituency_link, master_candidate_dicts):
    """Scrape one constituency page and append its candidates to the master list.

    Args:
        state_name: Value stored under the ``"state"`` key of every record.
        constituency_name: Value stored under the ``"constituency"`` key.
        constituency_link: URL of the constituency page to load.
        master_candidate_dicts: List extended in place with one dict per
            candidate; keys are ``state``, ``constituency``, ``result``,
            ``candidate_name``, ``coalition`` and ``no_of_votes`` (all values
            are the elements' text, i.e. strings).

    Uses the module-level ``driver`` and ``logger``. A candidate entry with
    fewer descendant elements than expected is skipped with a warning instead
    of aborting the whole scrape with an ``IndexError``.
    """
    driver.get(constituency_link)
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
    # Selenium releases no longer provide.
    candidate_details = driver.find_elements(By.XPATH, '//li[contains(@class, "calculation_candidate")]/h4')

    candidate_dicts = []
    for candidate_detail in candidate_details:
        # One descendant query per candidate instead of one per field.
        fields = candidate_detail.find_elements(By.XPATH, './/*')
        # Positions 0, 2, 3 and 5 are read below, so at least six descendants
        # are required. NOTE(review): these positions are tied to the page's
        # current markup -- re-check them if the site layout changes.
        if len(fields) < 6:
            logger.warning(
                "Skipping candidate entry with %d descendant elements (expected at least 6) in %s / %s",
                len(fields), state_name, constituency_name,
            )
            continue
        candidate_dicts.append({
            "state": state_name,
            "constituency": constituency_name,
            "result": fields[0].text,
            "candidate_name": fields[2].text,
            "coalition": fields[3].text,
            "no_of_votes": fields[5].text
        })
    logger.info("candidate_dicts: %s", candidate_dicts)
    master_candidate_dicts.extend(candidate_dicts)


# Run the scrape (walks every state and constituency page via the shared
# driver) and collect one record per candidate.
candidate_dicts = scrape_data()

df = pd.DataFrame(candidate_dicts)

# Make sure the target folder exists so the scraped data is not lost to a
# missing-directory error at the final write.
os.makedirs('output', exist_ok=True)
df.to_csv('output/MY_GE15.csv', index=False)

# Preview the first rows. This must be the last expression of the cell to be
# rendered; in the middle of the cell its result was discarded.
df.head()

2022-12-15 18:05:09,156 - INFO:  state_dicts: [{'state_name': 'JOHOR', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/01'}, {'state_name': 'KEDAH', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/02'}, {'state_name': 'KELANTAN', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/03'}, {'state_name': 'MELAKA', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/04'}, {'state_name': 'NEGERI SEMBILAN', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/05'}, {'state_name': 'PAHANG', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/06'}, {'state_name': 'PULAU PINANG', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/07'}, {'state_name': 'PERAK', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/08'}, {'state_name': 'PERLIS', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/09'}, {'state_name': 'SELANGOR', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/10'}, {'state_name': 'TERENGGANU', 'state_link': 'htt

In [5]:
# Failed attempt at concurrency -- kept for reference only. Every line in this cell is commented out, so nothing here runs.

# import multiprocessing.pool

# # setup chrome to run headless
# chrome_options = webdriver.chrome.options.Options()
# chrome_options.add_argument("--incognito")
# # chrome_options.add_argument("--headless")
# # chrome_options.add_argument("--start-maximized")

# driver = webdriver.Chrome(r"C:/Users/gaven.yeh/Downloads/chromedriver_win32/chromedriver.exe", options=chrome_options)

# def scrape_data():
#     master_candidate_dicts = []

#     populate_master_candidate_dicts(master_candidate_dicts)
    
#     return master_candidate_dicts

# def populate_master_candidate_dicts(master_candidate_dicts):
#     driver.get('https://dashboard.spr.gov.my') # home page
#     states = driver.find_elements_by_xpath('//div[@class="stats_details"]//a')
#     state_dicts = [{
#         "state_name": state.text,
#         "state_link": state.get_attribute('href')    
#     } for state in states]
#     logger.info("state_dicts: {}".format(state_dicts))
#     driver.close()
#     # declare global pool
#     global pool
#     # assign the global pool
#     pool = multiprocessing.pool.ThreadPool(len(state_dicts)//2)

#     args = [(state, master_candidate_dicts) for state in state_dicts]

#     # issue top level tasks to pool and wait
#     pool.starmap(get_constituency_links, args)
#     # close the pool
#     pool.close()
        

# def get_constituency_links(state, master_candidate_dicts): 
#     state_link = state['state_link']
#     state_driver = webdriver.Chrome(r"C:/Users/gaven.yeh/Downloads/chromedriver_win32/chromedriver.exe", options=chrome_options)
#     state_driver.get(state_link)
#     constituencies = state_driver.find_elements_by_xpath('//div[@class="state_container_right"]//a')
#     constituency_dicts = [{
#         "constituency_name": constituency.text,
#         "constituency_link": constituency.get_attribute('href')    
#     } for constituency in constituencies]
#     logger.info("constituency_dicts: {}".format(constituency_dicts))

#     for constituency in constituency_dicts:
#         get_candidate_results(state['state_name'], constituency['constituency_name'], constituency['constituency_link'], master_candidate_dicts, state_driver)
#     state_driver.close()


# def get_candidate_results(state_name, constituency_name, constituency_link, master_candidate_dicts, state_driver):
#     state_driver.get(constituency_link)
#     candidate_details = state_driver.find_elements_by_xpath('//li[contains(@class, "calculation_candidate")]/h4')
#     candidate_dicts = [{
#             "state": state_name,
#             "constituency": constituency_name,
#             "result": candidate_detail.find_elements_by_xpath('.//*')[0].text,
#             "candidate_name": candidate_detail.find_elements_by_xpath('.//*')[2].text,
#             "coalition": candidate_detail.find_elements_by_xpath('.//*')[3].text,
#             "no_of_votes": candidate_detail.find_elements_by_xpath('.//*')[5].text
#         } for candidate_detail in candidate_details]
#     logger.info("candidate_dicts: {}".format(candidate_dicts))
#     master_candidate_dicts.extend(candidate_dicts)

# def set_up_threads(state_dicts, master_candidate_dicts):
#     with ThreadPoolExecutor(max_workers=5) as executor:
#         return executor.map(get_constituency_links,    
#                             state_dicts, master_candidate_dicts,
#                             timeout = 1800)

# candidate_dicts = scrape_data()

# df = pd.DataFrame(candidate_dicts)

# df.head()

# df.to_csv('output/MY_GE15.csv', index=False)