In [1]:
import datetime
import glob
import os
import time
import warnings

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.select import Select
from webdriver_manager.firefox import GeckoDriverManager


In [2]:
def grain_data_ingesting(
    download_dir=r"C:\Users\Asus\Desktop\Explore\Internship",
    max_markets=10,
    wait_seconds=5,
):
    """Scrape market details from the AMIS grain-market page and download reports.

    A headless Firefox session opens the grain-market page. For each entry of
    the ``cbSearchMarket`` dropdown (up to ``max_markets``) it:

    1. picks the sale start date (today minus 8 days) in the calendar control,
    2. selects the market and opens the "view market" popup,
    3. reads rows 2-4, column 2 of the popup's second table
       (stored as name, code and type),
    4. closes the popup, runs the search and clicks ``btnPrint``.

    Firefox is configured to save ``application/octet-stream`` responses to
    ``download_dir`` without prompting, so the ``btnPrint`` click is presumably
    what produces the downloaded report file.

    Parameters
    ----------
    download_dir : str
        Folder Firefox saves downloads into.
    max_markets : int
        Upper bound on how many dropdown options are processed. The loop also
        stops at the number of options actually present in the dropdown.
    wait_seconds : int or float
        Fixed pause after opening the popup and after triggering the download.

    Returns
    -------
    tuple of (list, list, list)
        ``market_name``, ``market_code``, ``market_type`` -- parallel lists
        with one entry per processed dropdown option, in dropdown order.

    Raises
    ------
    selenium.common.exceptions.WebDriverException
        Any Selenium error (e.g. element not found) propagates to the caller;
        the browser is shut down before it does.
    """
    # The start date is 8 days back (the original comment calls it "a week prior").
    current_date = datetime.datetime.now()
    start_date = (current_date - datetime.timedelta(days=8)).date()

    # The calendar links embed the date as un-padded "year,month,day".
    string_date = "{},{},{}".format(start_date.year, start_date.month, start_date.day)

    # The calendar opens on the current month; if the start date falls in the
    # previous month it has to be paged back once before the day can be clicked.
    needs_previous_month = start_date.month != current_date.month

    gecko = Service(executable_path=GeckoDriverManager().install())

    op = FirefoxOptions()
    op.add_argument('-headless')
    op.set_preference("browser.download.folderList", 2)
    op.set_preference("browser.download.manager.showWhenStarting", False)
    op.set_preference("browser.download.dir", download_dir)
    op.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
    driver = webdriver.Firefox(service=gecko, options=op)

    market_name = []
    market_code = []
    market_type = []

    try:
        driver.get('http://webapps.daff.gov.za/amis/Link.amis?method=GrainMarket')

        # Never ask for a dropdown option that does not exist.
        market_size = len(Select(driver.find_element(By.ID, "cbSearchMarket")).options)
        last_index = min(max_markets, market_size)

        # XPath positions are 1-based, hence range(1, last_index + 1).
        for option_index in range(1, last_index + 1):
            driver.find_element(By.XPATH, "//img[@alt='Click here to select sale start date.']").click()
            if needs_previous_month:
                driver.find_element(By.XPATH, "//a[@href='javascript:changeCalendarControlMonth(-1);']").click()
            driver.find_element(By.XPATH, "//a[@href='javascript:setCalendarControlDate({})']".format(string_date)).click()
            driver.find_element(By.XPATH, "//select[@id='cbSearchMarket']/option[{}]".format(option_index)).click()

            # Open the market-details popup and give it time to load.
            driver.find_element(By.NAME, "btnViewMarket").click()
            time.sleep(wait_seconds)

            # The details live in an <object> nested inside the popup div, so
            # the driver has to switch into that frame to see them.
            outer_div = driver.find_element(By.ID, "popUpDivPDF")
            driver.switch_to.frame(outer_div.find_element(By.TAG_NAME, 'object'))

            table = driver.find_element(By.CSS_SELECTOR, "table:first-child + table")

            # The leading "." scopes each XPath to `table`; a bare "//" would
            # search the whole document and ignore the table located above.
            mrkt_name = table.find_element(By.XPATH, ".//tr[2]/td[2]").text
            mrkt_code = table.find_element(By.XPATH, ".//tr[3]/td[2]").text
            mrkt_type = table.find_element(By.XPATH, ".//tr[4]/td[2]").text

            # Leave the frame and close the popup before touching the main page.
            driver.switch_to.default_content()
            driver.find_element(By.XPATH, "//img[@title='Click here to close window.']").click()

            market_name.append(mrkt_name)
            market_code.append(mrkt_code)
            market_type.append(mrkt_type)

            driver.find_element(By.NAME, "btnDBSearch").click()
            driver.find_element(By.NAME, "btnPrint").click()
            time.sleep(wait_seconds)
    finally:
        # quit() ends the whole session (browser + geckodriver) and runs even
        # when a step above raises, so no headless Firefox is left behind.
        driver.quit()

    return market_name, market_code, market_type

def grain_pre_processing(
    market_name,
    market_code,
    market_type,
    path=r'C:\Users\Asus\Desktop\Explore\Internship',
    output_csv="grain_merged.csv",
):
    """Merge the downloaded ``.xls`` reports into one CSV tagged with market info.

    Every ``*.xls`` file in ``path`` is read with the second row as header.
    Empty sheets are skipped. For the rest, the ``'Unnamed: 12'`` column is
    dropped, the text columns are stripped of surrounding whitespace, and the
    market name/code/type at the file's position are added as columns. The
    frames are concatenated and written to ``output_csv`` without the index.

    Files are matched to the market lists purely by position. They are ordered
    by modification time (oldest first) on the assumption that this reproduces
    the order in which the reports were downloaded -- ``glob`` itself returns
    files in arbitrary order. The folder should therefore contain only the
    reports belonging to the supplied lists.

    Parameters
    ----------
    market_name, market_code, market_type : sequence
        Parallel sequences; element ``i`` describes the ``i``-th file.
    path : str
        Folder containing the downloaded ``.xls`` files.
    output_csv : str
        Destination of the merged CSV.

    Returns
    -------
    None
        If there is nothing to merge (no files, or all files empty) a
        ``UserWarning`` is issued and no CSV is written.

    Raises
    ------
    IndexError
        If a non-empty file has a position beyond the end of the market lists.
    KeyError
        If an expected column is missing from a report.
    """
    # File name breaks ties so the order is fully deterministic.
    all_files = sorted(
        glob.glob(os.path.join(path, '*.xls')),
        key=lambda file_path: (os.path.getmtime(file_path), file_path),
    )

    # Columns that are whitespace-stripped via the .str accessor.
    text_columns = [
        'Volatility', 'Value', 'Bid', 'Offer', 'Market to Market',
        'First', 'Last', 'High', 'Low', 'Conts',
    ]

    dataframes = []
    # enumerate keeps `i` advancing for skipped (empty) files too, so later
    # files stay aligned with their market entries.
    for i, file in enumerate(all_files):
        df = pd.read_excel(file, header=1)
        if df.empty:
            continue
        df = df.drop(columns='Unnamed: 12')
        for column in text_columns:
            df[column] = df[column].str.strip()

        df['market_name'] = market_name[i]
        df['market_code'] = market_code[i]
        df['market_type'] = market_type[i]

        dataframes.append(df)

    if not dataframes:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        warnings.warn(
            "No non-empty .xls files found in {!r}; nothing was written.".format(path)
        )
        return None

    merged_df = pd.concat(dataframes, ignore_index=True)
    merged_df.to_csv(output_csv, index=False)


In [4]:
# Run the scraper and keep the three parallel lists it returns (name, code,
# type per market); they are passed to grain_pre_processing.
market_name, market_code, market_type = grain_data_ingesting()

In [5]:
# Merge the downloaded reports into one CSV, tagged with the scraped market info.
grain_pre_processing(
    market_name=market_name,
    market_code=market_code,
    market_type=market_type,
)