In [1]:
import logging
import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

In [2]:
def create_logger():
    logFormatter = logging.Formatter("%(asctime)s - %(levelname)s:  %(message)s")
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    fileHandler = logging.FileHandler("{0}/{1}.log".format('logs', now), mode='w')
    fileHandler.setFormatter(logFormatter)
    fileHandler.setLevel(logging.INFO)
    logger.addHandler(fileHandler)

    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    consoleHandler.setLevel(logging.INFO)
    logger.addHandler(consoleHandler)
    return logger


In [3]:
logger = create_logger()

In [4]:
# setup chrome to run headless
chrome_options = webdriver.chrome.options.Options()
chrome_options.add_argument("--incognito")
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--start-maximized")

driver = webdriver.Chrome(r"C:/Users/gaven.yeh/Downloads/chromedriver_win32/chromedriver.exe", options=chrome_options)

def scrape_data():
    candidate_dicts = []

    state_dicts = get_state_links()
    for state in state_dicts:
        constituency_dicts = get_constituency_links(state['state_link']) 
        for constituency in constituency_dicts:
            candidate_dicts.extend(get_candidate_results(state['state_name'], constituency['constituency_name'], constituency['constituency_link']))
    
    driver.close()

    return candidate_dicts

def get_state_links():
    driver.get('https://dashboard.spr.gov.my') # home page
    states = driver.find_elements_by_xpath('//div[@class="stats_details"]//a')
    state_dicts = [{
        "state_name": state.text,
        "state_link": state.get_attribute('href')    
    } for state in states]
    logger.info("state_dicts: {}".format(state_dicts))
    return state_dicts


def get_constituency_links(state_link): 
    driver.get(state_link)
    constituencies = driver.find_elements_by_xpath('//div[@class="state_container_right"]//a')
    constituency_dicts = [{
        "constituency_name": constituency.text,
        "constituency_link": constituency.get_attribute('href')    
    } for constituency in constituencies]
    logger.info("constituency_dicts: {}".format(constituency_dicts))
    return constituency_dicts


def get_candidate_results(state_name, constituency_name, constituency_link):
    driver.get(constituency_link)
    candidate_details = driver.find_elements_by_xpath('//li[contains(@class, "calculation_candidate")]/h4')
    candidate_dicts = [{
            "state": state_name,
            "constituency": constituency_name,
            "result": candidate_detail.find_elements_by_xpath('.//*')[0].text,
            "candidate_name": candidate_detail.find_elements_by_xpath('.//*')[2].text,
            "coalition": candidate_detail.find_elements_by_xpath('.//*')[3].text,
            "no_of_votes": candidate_detail.find_elements_by_xpath('.//*')[5].text
        } for candidate_detail in candidate_details]
    logger.info("candidate_dicts: {}".format(candidate_dicts))
    return candidate_dicts

candidate_dicts = scrape_data()

df = pd.DataFrame(candidate_dicts)

df.head()

df.to_csv('output/MY_GE15.csv', index=False)

2022-11-29 23:51:06,016 - INFO:  state_dicts: [{'state_name': 'JOHOR', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/01'}, {'state_name': 'KEDAH', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/02'}, {'state_name': 'KELANTAN', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/03'}, {'state_name': 'MELAKA', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/04'}, {'state_name': 'NEGERI SEMBILAN', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/05'}, {'state_name': 'PAHANG', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/06'}, {'state_name': 'PULAU PINANG', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/07'}, {'state_name': 'PERAK', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/08'}, {'state_name': 'PERLIS', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/09'}, {'state_name': 'SELANGOR', 'state_link': 'https://dashboard.spr.gov.my/#!/parliament/10'}, {'state_name': 'TERENGGANU', 'state_link': 'htt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"{'state': 'JOHOR', 'constituency': 'P.140: SEG...","{'state': 'JOHOR', 'constituency': 'P.140: SEG...","{'state': 'JOHOR', 'constituency': 'P.140: SEG...","{'state': 'JOHOR', 'constituency': 'P.140: SEG...",,,,,,
1,"{'state': 'JOHOR', 'constituency': 'P.141: SEK...","{'state': 'JOHOR', 'constituency': 'P.141: SEK...","{'state': 'JOHOR', 'constituency': 'P.141: SEK...","{'state': 'JOHOR', 'constituency': 'P.141: SEK...","{'state': 'JOHOR', 'constituency': 'P.141: SEK...",,,,,
2,"{'state': 'JOHOR', 'constituency': 'P.142: LAB...","{'state': 'JOHOR', 'constituency': 'P.142: LAB...","{'state': 'JOHOR', 'constituency': 'P.142: LAB...",,,,,,,
3,"{'state': 'JOHOR', 'constituency': 'P.143: PAG...","{'state': 'JOHOR', 'constituency': 'P.143: PAG...","{'state': 'JOHOR', 'constituency': 'P.143: PAG...",,,,,,,
4,"{'state': 'JOHOR', 'constituency': 'P.144: LED...","{'state': 'JOHOR', 'constituency': 'P.144: LED...","{'state': 'JOHOR', 'constituency': 'P.144: LED...","{'state': 'JOHOR', 'constituency': 'P.144: LED...","{'state': 'JOHOR', 'constituency': 'P.144: LED...","{'state': 'JOHOR', 'constituency': 'P.144: LED...",,,,


OSError: Cannot save file into a non-existent directory: '\output'