In [None]:
# using requests lib to get html of each car detail page
import pandas as pd
import requests as rq
import time
import random

def getHTMLtext(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The page text on success, or the sentinel string
        ``"get page html Error"`` when the request fails (connection error,
        timeout, malformed URL, or a 4xx/5xx status).
    """
    header = {"user-agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299'}
    try:
        response = rq.get(url, headers=header, timeout=30)
        response.raise_for_status()  # 4xx/5xx status -> HTTPError
    except rq.RequestException:
        # Swallow request failures only; a bare ``except`` would also hide
        # KeyboardInterrupt and genuine programming errors.
        return "get page html Error"
    # The site is read as UTF-8 regardless of what the response headers claim.
    response.encoding = 'utf-8'
    return response.text

def generate_request(dataframe):
    """Lazily fetch the detail page of every row in ``dataframe``.

    Yields, in row order, whatever ``getHTMLtext`` returns for the site root
    joined with each value of the ``'deep_url'`` column.
    """
    site_root = 'https://jpxkc.cbex.com'
    relative_paths = dataframe['deep_url']
    yield from (getHTMLtext(site_root + path) for path in relative_paths)

def main(dataframe):
    """Download every detail page and pair it with its listing title.

    Args:
        dataframe: Table with a ``'title'`` and a ``'deep_url'`` column.

    Returns:
        A list of ``[title, html]`` pairs in row order, where ``html`` is
        whatever ``generate_request`` yielded for that row.
    """
    lst_html = []
    # Pair titles and pages positionally. Looking the title up with a running
    # counter (``dataframe['title'][count]``) is a label lookup and fails as
    # soon as the frame's index is not exactly 0..n-1.
    for title, html in zip(dataframe['title'], generate_request(dataframe)):
        lst_html.append([title, html])
        # Random 1-3 s pause between requests to avoid hammering the server.
        time.sleep(1 + random.random() * 2)
    return lst_html

'''
INITIATION
'''
# Load the listing table and download every detail page.
# NOTE(review): read_pickle can execute arbitrary code stored in the file;
# only load pickles that were produced locally by a trusted run.
# The frame must provide the 'deep_url' and 'title' columns read above.
dtfm_car = pd.read_pickle('car_info_download_comparLst.pkl')
lst_html = main(dtfm_car)  # list of [title, html] pairs

# Single-page smoke test, kept for reference:
# homepage = 'https://jpxkc.cbex.com'
# d_url = dataframe['deep_url'][0]
# htmltext = getHTMLtext(homepage + d_url)

In [None]:
# Final version of extracting information from the HTML
# (an attempt to make this step asynchronous failed)
import pandas as pd
from lxml import etree
import re

def extract_table(html: str):
    """Return the first table in ``html`` whose column ``0`` mentions '标的名称'.

    Args:
        html: Markup of one detail page.

    Returns:
        The matching ``DataFrame``, or ``None`` when no table qualifies.

    Raises:
        ValueError: Propagated from ``pandas.read_html`` when the markup
            contains no table at all.
    """
    from io import StringIO

    # Newer pandas deprecates passing literal markup to read_html; a file-like
    # wrapper is accepted by old and new versions alike.
    tables = pd.read_html(io=StringIO(html))
    for dtfm in tables:
        # Tables parsed with a header row have named columns; asking them for
        # column 0 would raise KeyError, so they are skipped.
        if 0 not in dtfm.columns:
            continue
        if '标的名称' in dtfm.to_string(columns=[0]):
            return dtfm
    return None

def extract_accname_via_etree(html: str, extr_len=100):
    '''
    Extract the account name that follows the marker '户名' on a detail page.

    Only the elements with id ``bd_detail_tab_ct4`` and ``bd_detail_tab_ct5``
    are searched; per the original author the account name never appears
    anywhere else.

    Args:
        html: Markup of one detail page.
        extr_len: Upper bound of the slice taken after the marker; the result
            is cut at the first full-width comma inside that window.

    Returns:
        The extracted text, or 'Not Found "Account Name" Signal' when neither
        container holds the marker (or the container/markup is missing).
    '''
    not_found = 'Not Found "Account Name" Signal'
    h_etree = etree.HTML(html)
    if h_etree is None:  # lxml returns None for empty markup
        return not_found
    for i in range(4, 6):
        elem = h_etree.xpath(f'//*[@id="bd_detail_tab_ct{i}"]')
        if not elem:
            # This page lacks the container; indexing elem[0] would raise.
            continue
        e2str = etree.tostring(elem[0], method='text', encoding='utf-8').decode('utf-8')
        idx = e2str.find('户名')
        if idx != -1:
            # +3 skips the two marker characters plus the single character
            # after them (presumably a colon -- confirm against real pages).
            return e2str[idx + 3 : idx + extr_len].split('，')[0]
    return not_found

def main(lst_html:list):
    """Run both extractors over every page and report progress.

    Args:
        lst_html: HTML strings, one per detail page.

    Returns:
        ``(tables, account_names)`` -- two lists aligned with ``lst_html``,
        holding the results of ``extract_table`` and
        ``extract_accname_via_etree`` respectively.
    """
    lst1 = []
    lst2 = []
    total = len(lst_html)
    for count, html in enumerate(lst_html, start=1):
        lst1.append(extract_table(html))
        lst2.append(extract_accname_via_etree(html))
        # Real percentage of pages done; the former constant 1/275 always
        # rendered as 0%.
        print(f'\r{count / total * 100:>2.0f}%',end='')
    return lst1, lst2

if __name__=='__main__':
    # The scraping cell stores [title, html] pairs in lst_html, whereas main()
    # expects bare HTML strings, so keep only the second element of each pair.
    # (lst_text was previously used here without being defined in any cell.)
    lst_text = [pair[1] for pair in lst_html]
    lst_infoTable, lst_accname = main(lst_text)

In [None]:
# Try to look up the 'deposit values' while requesting the HTML with pyppeteer.
# This is also an attempt at an asynchronous program, but it shows no obvious speed-up.
# This functional module has been saved in D:\haruk\Documents\PythonCodes\test_async_get_info_from_html.py (code has been provisioned)
import nest_asyncio
nest_asyncio.apply()
import asyncio
from pyppeteer import launch
import random
import pandas as pd

def screen_size():
    """Return ``(width, height)`` of the screen as reported by Tk.

    A temporary Tk root is created for the query and destroyed afterwards.
    ``quit()`` only stops the main loop and would leave that root window
    alive, so ``destroy()`` is used instead.
    """
    import tkinter
    tk = tkinter.Tk()
    try:
        width = tk.winfo_screenwidth()
        height = tk.winfo_screenheight()
    finally:
        tk.destroy()
    return width, height

async def page_control(url, browser):
    """Open ``url`` in a new tab and read the text of one fixed element.

    Args:
        url: Address of the detail page.
        browser: Browser object providing ``newPage()``.

    Returns:
        The ``textContent`` of the first node matching the hard-coded XPath,
        or the string ``'TimeoutError'`` when navigating or waiting for that
        node times out.
    """
    target_xpath = '/html/body/div[6]/div[2]/div[1]/div[2]/div[6]/ul/li[1]/span[3]'
    page = await browser.newPage()
    try:
        # await page.setViewport(viewport={"width": width, "height": height})
        await page.setJavaScriptEnabled(enabled=True)
        await page.setUserAgent(
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299')
        # Start waiting before navigating so the wait spans the page load.
        wait_task = asyncio.ensure_future(page.waitForXPath(target_xpath))
        try:
            await page.goto(url)
            await wait_task  # resolves once the element appears (30 s default)
        except (TimeoutError, asyncio.TimeoutError):
            # Before Python 3.11 asyncio.TimeoutError is not the builtin
            # TimeoutError, so the builtin alone never matched the timeout
            # raised here (pyppeteer's timeout derives from asyncio's).
            result = 'TimeoutError'
        else:
            target = await page.xpath(target_xpath)
            result = await (await target[0].getProperty('textContent')).jsonValue()
        finally:
            # If goto() failed first, the wait would otherwise be left dangling.
            if not wait_task.done():
                wait_task.cancel()
    finally:
        # Always release the tab, including on unexpected errors.
        await page.close()
    return result

async def page_close(browser, preserve=1):
    """Close every open page except the first ``preserve`` ones.

    Args:
        browser: Browser object providing ``pages()``.
        preserve: Number of leading pages (in ``browser.pages()`` order) to
            keep open; values below zero are treated as zero.

    The earlier implementation never updated its counter and therefore kept
    closing the same page forever once more than ``preserve`` pages existed.
    """
    prs_pages = await browser.pages()
    for _page in prs_pages[max(preserve, 0):]:
        await _page.close()

def create_lst_task_url(dataframe, lst_tsk_len:int):
    """Split the detail-page URLs and their titles into fixed-size batches.

    Args:
        dataframe: Table with a ``'deep_url'`` and a ``'title'`` column.
        lst_tsk_len: Maximum number of entries per batch (must be >= 1).

    Returns:
        ``(lst_task_url, lst_title)`` -- two parallel lists of lists. Every
        URL is prefixed with the site root. The last batch holds the
        remainder; no empty batch is appended when the row count is an exact
        multiple of ``lst_tsk_len``.

    Raises:
        ValueError: If ``lst_tsk_len`` is smaller than 1.
    """
    if lst_tsk_len < 1:
        raise ValueError('lst_tsk_len must be a positive integer')
    homepage = 'https://jpxkc.cbex.com'
    # Materialise as plain lists so slicing is positional; indexing the Series
    # with integers is label-based and breaks on a non-default index.
    urls = [homepage + durl for durl in dataframe['deep_url']]
    titles = list(dataframe['title'])
    starts = range(0, len(urls), lst_tsk_len)
    lst_task_url = [urls[s:s + lst_tsk_len] for s in starts]
    lst_title = [titles[s:s + lst_tsk_len] for s in starts]
    return lst_task_url, lst_title

async def main(lst_task_url, max_batches=2):
    """Scrape the batches of URLs, one batch at a time.

    Args:
        lst_task_url: 2-D list of URLs; the length of each inner list is the
            number of pages opened concurrently.
        max_batches: How many leading batches to process. Defaults to 2, the
            limit that used to be hard-coded; pass ``None`` to process all.

    Returns:
        A list with one entry per processed batch, each entry being the list
        of ``page_control`` results for that batch, in URL order.
    """
    # NOTE(review): userDataDir is an absolute, machine-specific path.
    browser = await launch({'headless': False, 'userDataDir': r'D:/haruk/Documents/PythonCodes/Temp', 'args': ['--no-sandbox'], 'dumpio': True})
    lst_result = []
    for batch in lst_task_url[:max_batches]:
        # Create the coroutines right before awaiting them. Building every
        # batch up front leaves the later ones un-awaited if an earlier batch
        # raises, which triggers "coroutine 'page_control' was never awaited".
        tasks = [page_control(url=subitem, browser=browser) for subitem in batch]
        result = await asyncio.gather(*tasks)
        lst_result.append(result)
        # await page_close(browser) # seems browser.pages() cannot connect to the page that has already been assigned a handle name
    # await browser.close() # Please manually close the chrome
    return lst_result

'''
INITIATION
'''
# Split the URLs of dtfm_car (loaded in the first cell) into batches of 8;
# main() opens the pages of one batch concurrently.
lst_task_url, lst_title = create_lst_task_url(dataframe=dtfm_car, lst_tsk_len=8)

# Drive the coroutine to completion on the current event loop.
# NOTE(review): presumably nest_asyncio.apply() at the top of this cell is what
# allows run_until_complete inside the already-running notebook loop -- confirm.
loop = asyncio.get_event_loop()
lst_depoVal = loop.run_until_complete(main(lst_task_url))

# Leftover experiments, kept for reference:
# width, height = screen_size()
# p_url = 'https://jpxkc.cbex.com/jpxkc/prj/detail/157387.html'

# await asyncio.wait([
    #     page.click('a.my-link'),
    #     page.waitForNavigation(),
    # ])

### The first successful run (functionally correct) produced the following warning:

This may be because the IPython interactive kernel already has an asyncio event loop running (?)

> D:\Program Files\Python\Python39\lib\site-packages\pyppeteer\util.py:29:  
> 
> RuntimeWarning: coroutine 'page_control' was never awaited  
>   `gc.collect()`  
> RuntimeWarning: Enable tracemalloc to get the object allocation traceback  


In [17]:
# This cell and the following one handle string processing and data concatenation
import re

def split_string_1(dtfm: pd.DataFrame):
    """Split the '拟提供的文件' column into three separate file columns.

    Each cell is split on '、'. The piece before the first '、' is dropped and
    the next three pieces become ``file_1``..``file_3`` after whitespace, the
    digits 1/2/3 and '；' have been removed from them. Entries with fewer
    pieces are padded with ``None`` instead of raising IndexError.

    Args:
        dtfm: Frame containing a string column named '拟提供的文件'.

    Returns:
        A new frame: ``dtfm`` with ``file_1``, ``file_2`` and ``file_3``
        appended as columns.
    """
    rows = []
    for text in dtfm['拟提供的文件']:
        pieces = [re.sub(r'[\s123；]', repl='', string=piece) for piece in text.split('、')]
        rows.append((pieces[1:4] + [None] * 3)[:3])
    # Reuse the source index: concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would scatter the new columns onto the wrong rows
    # whenever dtfm is filtered or re-indexed.
    dtfm_app = pd.DataFrame(rows, columns=['file_1', 'file_2', 'file_3'], index=dtfm.index)
    dtfm = pd.concat(objs=[dtfm, dtfm_app], axis=1)
    return dtfm

# Select the column to parse by its position (12th column).
# NOTE(review): `dtfm` is not defined in any visible cell and 11 is a positional
# magic number -- confirm which column is meant and prefer selecting it by name.
srs_a = dtfm[dtfm.columns[11]]


def _split_name_value(segment: str):
    """Split 'name：value' at the full-width colon; without one, name is None."""
    parts = segment.split('：')
    if len(parts) < 2:
        return None, parts[0]
    return parts[0], parts[1]


def split_string_2(srs:pd.Series):
    """Break numbered text ('1、…2、…' up to '6、') into names and values.

    For every string in ``srs`` the first occurrence of each marker '1、'..'6、'
    is located. The text between consecutive markers, starting at the marker's
    '、' and stopping before the next marker's digit, forms one segment; the
    final segment runs to the end of the string. Each segment is split at '：'
    into a name and a value, and a segment without '：' yields name ``None``
    and the whole segment as value. A string without any marker is treated as
    a single segment instead of raising IndexError.

    Args:
        srs: Iterable of strings.

    Returns:
        ``(values, names)`` -- two lists with one inner list per input string.
    """
    lst_r1 = []
    lst_r2 = []
    for item in srs:
        # Offset of the '、' of each marker that is present (digit offset + 1).
        lst_idx = []
        for i in range(1, 7):
            found = re.search(f'{i}、', item)
            if found is not None:
                lst_idx.append(found.start() + 1)
        if not lst_idx:
            lst_idx = [0]  # no markers: the whole string is one segment
        segments = [item[start: stop - 1] for start, stop in zip(lst_idx, lst_idx[1:])]
        segments.append(item[lst_idx[-1]:])
        pairs = [_split_name_value(segment) for segment in segments]
        lst_r1.append([value for _, value in pairs])
        lst_r2.append([name for name, _ in pairs])
    return lst_r1, lst_r2

# split_string_2 returns (values, names): one inner list per entry of srs_a.
# (No function named `split_string` is defined in this notebook.)
lst_values, lst_names = split_string_2(srs_a)

In [62]:
import numpy as np
import pandas as pd

# NOTE: this cell mutates lst_values in place, so running it twice appends the
# extracted numbers twice. Re-run the cell that creates lst_values first.

# 1) Append every run of digits found in the last value of each row.
#    Non-digits become '_' separators; empty pieces between separators are kept.
for row in lst_values:
    digits_only = re.sub(r'[\D]',repl="_", string=row[-1]).strip('_')
    row.extend(digits_only.split('_'))

# 2) Strip list punctuation and (full-width / non-breaking) whitespace.
for row in lst_values:
    for pos in range(len(row)):
        row[pos] = re.sub(r'[、\s\xa0\u3000]', repl="",string=row[pos])

# 3) Pad with -1 to a common width: at least 9 as before, but also wide enough
#    for the longest row so the array below is never ragged.
width = max([9] + [len(row) for row in lst_values])
for row in lst_values:
    row.extend([-1] * (width - len(row)))

ary = np.array(lst_values).reshape(len(lst_values), width)
dct = {}
for i in range(width):
    # Use the name from `indexs` when it exists and is non-empty; otherwise
    # fall back to a generic column name.
    # NOTE(review): `indexs` is not defined in any visible cell.
    try:
        name = indexs[i] or f'col_{i}'
    except Exception:
        name = f'col_{i}'
    dct[name] = ary[:,i]
dtfm_sv = pd.DataFrame(dct)
