# Data Extraction Demo

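This notebook uses the `selenium` package to drive a Chrome browser and download weather data as a CSV file from the formatted-downloads page of `climate.northwestknowledge.net`.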

In [1]:
import pandas as pd
import numpy as np
import os
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException, NoAlertPresentException, TimeoutException

## Initializing webpage

Sources:
1. https://www.techbeamers.com/selenium-webdriver-waits-python/
2. https://stackoverflow.com/questions/43801836/how-do-i-fix-the-typeerror-raised-when-trying-to-find-an-element-using-selenium
3. https://stackoverflow.com/questions/14826888/python-os-path-join-on-a-list
4. https://stackoverflow.com/questions/52049929/how-to-change-the-download-location-using-python-and-selenium-webdriver

In [2]:
# Build the download directory: a `data/` folder that sits next to the
# current working directory, i.e. <parent of cwd>/data/.
# os.path.dirname keeps the root / drive prefix of the path intact, so the
# result is absolute on every platform (splitting on os.sep and re-joining
# the pieces under os.sep produces a drive-relative path on Windows).
# The trailing '' component makes the path end with a separator.
download_path = os.path.join(os.path.dirname(os.getcwd()), 'data', '')

# Chrome preferences that change the default download directory to
# `download_path` and turn the download prompt off
prefs = dict([
    ("download.default_directory", download_path),
    ("download.prompt_for_download", False),
    ("download.directory_upgrade", True),
])

# hand the preferences to Chrome through its experimental 'prefs' option
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', prefs)

In [3]:
# Launch Chrome with the options configured above. First try without an
# explicit driver path; if that raises WebDriverException, let
# webdriver_manager install a chromedriver and pass its location explicitly.
try:
    d = webdriver.Chrome(options=options)
except WebDriverException:
    chromedriver_path = ChromeDriverManager().install()
    d = webdriver.Chrome(chromedriver_path, options=options)

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103


 


[WDM] - Driver [/Users/jacobjohn/.wdm/drivers/chromedriver/mac64/83.0.4103.39/chromedriver] found in cache


In [4]:
# open the formatted-downloads form in the browser window
downloads_page = 'https://climate.northwestknowledge.net/NWTOOLBOX/formattedDownloads.php'
d.get(downloads_page)

In [5]:
def wait_until_clickable(xpath, wait_time=15, driver=None):
    """
    Wait until element found by xpath is clickable

    A timeout is reported on stdout instead of being raised, so the calling
    cell keeps running (best-effort wait).

    Args
    ----
    xpath : str
        path to element
    wait_time : int
        maximum wait time in seconds, default 15
    driver : WebDriver, optional
        driver to wait on; when omitted the notebook-level driver `d` is used

    Returns
    -------
    WebElement or None
        the element once it is clickable, or None when the wait gave up
    """
    if driver is None:
        # resolved at call time so the notebook's current `d` is picked up
        driver = d
    try:
        # until() hands back whatever the condition returned
        return WebDriverWait(driver, wait_time).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
    except (NoAlertPresentException, TimeoutException) as py_ex:
        # NoAlertPresentException stays in the tuple only so the set of
        # swallowed exceptions is unchanged; the message now describes what
        # was actually being waited for.
        print(f"Element not clickable within {wait_time} seconds: {xpath}")
        print(py_ex)
        return None

## Selecting Location & Product

In [6]:
# Button carrying data-dismiss='modal' (presumably closes the dialog shown
# when the page opens); wait up to 40 seconds for it to become clickable
set_location_button = "//button[@class='btn btn-large btn-default'][@data-dismiss='modal']"
wait_until_clickable(set_location_button, wait_time=40)
# find_element(By.XPATH, ...) is used because the find_element_by_xpath
# shortcut was removed in Selenium 4.3; this form works on 3.x and 4.x
d.find_element(By.XPATH, set_location_button).click()

In [7]:
# # (disabled) alternative to the address lookup in the next cell: fill lat and long directly
# lat_input = "//input[@id='pointLat']"
# wait_until_clickable(lat_input)
# d.find_element_by_xpath(lat_input).clear()
# d.find_element_by_xpath(lat_input).send_keys(str(43.7324))

# long_input = "//input[@id='pointLong']"
# wait_until_clickable(long_input)
# d.find_element_by_xpath(long_input).clear()
# d.find_element_by_xpath(long_input).send_keys(str(117.7324))

In [8]:
# fill in geolocation
geo_location = "//input[@id='address']"
wait_until_clickable(geo_location, wait_time=10)
d.find_element_by_xpath(geo_location).clear()
d.find_element_by_xpath(geo_location).send_keys("karnal, haryana") # district, state : str

# click on set location
set_location_button = "//input[@value='SET LOCATION'][@class='btn btn-large btn-primary pull-right']"
wait_until_clickable(set_location_button, wait_time=5)
d.find_element_by_xpath(set_location_button).click()

In [9]:
# select product: click the 'metdata' option of the product dropdown
# (find_element(By.XPATH, ...) instead of find_element_by_xpath, which was
# removed in Selenium 4.3)
d.find_element(By.XPATH, "//select[@id='product']/option[@value='metdata']").click()

## Selecting Columns

Source:
1. https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python

In [10]:
# Variables to request; entry i (1-based) is chosen in dropdown `varCSV<i>`
columns = [
    'Min Temperature',
    'Max Temperature',
    'Precipitation',
    'Min Rel. Humidity',
    'Max Rel. Humidity',
]

# value picked in the `numCol` dropdown: one more than the number of
# variables listed above
n_cols = 1 + len(columns)

# NOTE(review): 8 is presumably the largest value the form offers - confirm
assert 8 >= n_cols

In [11]:
# change number of columns: click the `numCol` option whose text is n_cols
# (find_element(By.XPATH, ...) instead of find_element_by_xpath, which was
# removed in Selenium 4.3)
d.find_element(By.XPATH, f"//select[@name='numCol']/option[text()='{n_cols}']").click()

In [12]:
# Select desired columns: dropdown `varCSV<i>` gets the i-th entry of `columns`
for i, column in enumerate(columns, start=1):
    print(i, column)
    # find_element raises NoSuchElementException naming the locator when the
    # option is missing, instead of an IndexError from indexing an empty list;
    # it also replaces find_elements_by_xpath, removed in Selenium 4.3
    d.find_element(By.XPATH, f"//tr/td/select[@id='varCSV{i}']/option[text()='{column}']").click()

1 Min Temperature
2 Max Temperature
3 Precipitation
4 Min Rel. Humidity
5 Max Rel. Humidity


## Downloading CSV

Source:
1. https://stackoverflow.com/questions/22646031/selenium-wait-until-element-is-not-visible

In [13]:
# download csv: click the form's submit button
# (find_element(By.XPATH, ...) replaces find_elements_by_xpath(...)[0];
# the *_by_xpath helpers were removed in Selenium 4.3)
d.find_element(By.XPATH, "//button[@class='btn btn-large btn-primary pull-right'][@id='form-button']").click()

In [14]:
# XPaths of the progress dialog: its close button and the striped bar itself
progress_bar_close = "//button[@class='btn btn-default btn-close']"
progress_bar = "//div[@class='progress progress-striped active']"

# step 1: wait for the close button to become clickable
wait_until_clickable(progress_bar_close)
print("Progress bar appeared")

# step 2: wait (at most 300 seconds) until the bar is no longer visible
bar_locator = (By.XPATH, progress_bar)
bar_is_gone = EC.invisibility_of_element_located(bar_locator)
WebDriverWait(d, 300).until(bar_is_gone)
print("Download has started")

Progress bar appeared
Download has started
