In [20]:

import queue
import threading
import time

import pandas
from lxml import html
from selenium.webdriver import Chrome, Edge
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager


def GetChromeBrowser(isHeadless=True):
    opt = Options()
    if isHeadless:
        opt.add_argument("--headless")
    opt.add_argument("--mute-audio")
    opt.add_argument("--disable-notifications")
    capabilities = DesiredCapabilities.CHROME
    capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
    try:
        return Chrome(service=Service(ChromeDriverManager().install()), options=opt)
    except Exception as err:
        print(err)


def GetEdgeBrowser(isHeadless=True):
    options = EdgeOptions()
    if isHeadless:
        options.add_argument("--headless")
    options.add_argument("--mute-audio")
    options.add_argument("--disable-notifications")

    try:
        return Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)
    except Exception as err:
        print(err)


def GrabData(url):
    driver = GetEdgeBrowser(False)
    driver.get(url)

    states_input_xpath = '//select[@id="j_idt31_input"]/option'
    rto_input_xpath = '//select[@id="selectedRto_input"]/option'
    y_axis_xpath = '//select[@id="yaxisVar_input"]/option'
    x_axis_xpath = '//select[@id="xaxisVar_input"]/option'
    year_xpath = '//select[@id="selectedYear_input"]/option'

    try:
        states_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located((By.XPATH, states_input_xpath)))
    except:
        pass

    page_source = driver.page_source
    doc = html.fromstring(page_source)

    states_options = doc.xpath(states_input_xpath)
    states_dic = {''.join(_.xpath('@value')).lower(): ''.join(_.xpath('text()')).lower() for _ in states_options}
    states_codes_list = [''.join(_.xpath('@value')).lower() for _ in states_options]
    states_list = [''.join(_.xpath('text()')).lower() for _ in states_options]

    rto_options = doc.xpath(rto_input_xpath)
    rto_list = [''.join(_.xpath('text()')).lower() for _ in rto_options]

    yaxis_options = doc.xpath(y_axis_xpath)
    yaxis_list = [''.join(_.xpath('text()')).lower() for _ in yaxis_options]

    xaxis_options = doc.xpath(x_axis_xpath)
    xaxis_list = [''.join(_.xpath('text()')).lower() for _ in xaxis_options]

    year_options = doc.xpath(year_xpath)
    year_list = [''.join(_.xpath('text()')).lower() for _ in year_options]

    return states_dic, states_codes_list, states_list, rto_list, yaxis_list, xaxis_list, year_list


url = 'https://vahan.parivahan.gov.in/vahan4dashboard/vahan/view/reportview.xhtml'
states_dic, states_codes_list, states_list, rto_list, yaxis_list, xaxis_list, year_list = GrabData(url)
print(states_dic)



# def qToDf(q):
#     l = []
#     while not q.empty():
#         l.append(q.get())
#     [q.put(_) for _ in l]
#     df = pandas.DataFrame(l)
#     return df
#
#
# df = qToDf(resultQ)
# df.to_csv("sample scrapped.csv", index=False)



In [None]:

from selenium.webdriver import ActionChains

resultQ = queue.Queue()
errorQ = queue.Queue()


def ChangeSelectOption(driver, xpath, index):
    try:
        states_element = WebDriverWait(driver, 15).until(
            EC.visibility_of_all_elements_located((By.XPATH, '//select[@id="j_idt31_input"]')))
        action_chains = ActionChains(driver)
        # Move the cursor to the element
        action_chains.move_to_element(states_element).click().perform()
    except Exception as e:
        print('selector error')
        print(e)
    print(xpath, index)
    try:
        wait = WebDriverWait(driver, 10)
        wait.until(
            lambda driver: driver.find_element(By.XPATH, xpath))
        select_element = driver.find_element(By.XPATH, xpath)
        select_element.click()
        select = Select(select_element)
        select.select_by_index(index)
    except Exception as e:
        print(e)
        pass


def Scrapper(driver, url, data=None):
    driver.get(url)

    states_select_xpath = '//select[@id="j_idt31_input"]'
    rto_select_xpath = '//select[@id="selectedRto_input"]'
    y_axis_select_xpath = '//select[@id="yaxisVar_input"]'
    x_axis_select_xpath = '//select[@id="xaxisVar_input"]'
    year_select_xpath = '//select[@id="selectedYear_input"]'

    refresh_xpath = "//button[@id='j_idt61']"

    try:
        states_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located((By.XPATH, states_select_xpath)))
    except:
        pass
    try:
        #states index
        ChangeSelectOption(driver, states_select_xpath, data['state'])

        #rto index
        ChangeSelectOption(driver, rto_select_xpath, data['rto'])

        #y index
        ChangeSelectOption(driver, y_axis_select_xpath, data['y'])

        #x index
        ChangeSelectOption(driver, x_axis_select_xpath, data['x'])

        #year index
        ChangeSelectOption(driver, year_select_xpath, data['year'])
    except Exception as e:
        print('1')
        print(e)

    refresh_button = driver.find_element(By.XPATH, refresh_xpath)
    refresh_button.click()

    aria_busy_element_xpath = "//div[@id='combTablePnl']"
    try:
        wait = WebDriverWait(driver, 10)
        wait.until(
            lambda driver: driver.find_element(By.XPATH, aria_busy_element_xpath).get_attribute("aria-busy") == "false")
        page_source = driver.page_source
        doc = html.fromstring(page_source)
        table_xpath = "//table[@id='my_table']"
        table_element = doc.xpath(table_xpath)

        df = pandas.read_html(html.tostring(table_element[0]))[0]
        df.to_csv('temp.csv')
    except Exception as e:
        print(e)
        pass


jobs = queue.Queue()


def WorkerLoop(progress_bar: tqdm):
    # driver = GetChromeBrowser(False)
    driver = GetEdgeBrowser(False)
    while not jobs.empty():
        data = jobs.get()
        url = 'https://vahan.parivahan.gov.in/vahan4dashboard/vahan/view/reportview.xhtml'
        try:
            Scrapper(driver, url, data)
        except:
            errorQ.put({'url': url})
            pass
        time.sleep(1)
        progress_bar.update()


job = {}
job['state'] = state_index = states_codes_list.index('ap')
job['rto'] = rto_index = 2
job['y'] = y_index = yaxis_list.index('fuel')
job['x'] = x_index = xaxis_list.index('month wise')
job['year'] = year_index = 5

jobs.put(job)

ignore_year = ['select year', 'till today']
ignore_tag_1 = 'all vahan4 running office'

threads = []
pbar = tqdm(total=2)

for worker in range(1):
    thread = threading.Thread(target=WorkerLoop, args=(pbar,))
    thread.start()
    threads.append(thread)

for t in threads:
    t.join()


 50%|█████     | 1/2 [04:15<04:15, 255.17s/it]
