### Login to ETL

In [42]:
import requests
import json
import os
from datetime import datetime
import configparser

# An empty CUDA_VISIBLE_DEVICES hides all CUDA devices from this process.
# NOTE(review): presumably so the matching models used later run on CPU — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Load credentials and connection settings from the shared config file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of a
# KeyError on config['ETL'] further down.
if not config.read('../config.ini'):
    raise FileNotFoundError("Could not read '../config.ini'")

# Log in to the ETL API to obtain a JWT for the authenticated calls below.
login_data = {'account': config['ETL']['account'], 'password': config['ETL']['password']}
res = requests.post('http://140.115.54.44:8001/api/auth/login', json=login_data, timeout=30)
# Surface HTTP errors (bad credentials, server down) before touching the body.
res.raise_for_status()
token = res.json()['token']
# NOTE(review): echoing the token stores a usable credential in the saved
# notebook output — clear this cell's output before sharing or committing.
token

'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2NvdW50Ijoid2lkbS5kZXZvc21AZ21haWwuY29tIiwiaWF0IjoxNjczODgyMTQ0LCJleHAiOjUyNzM4Nzg1NDR9.aeS8Pr8iFdq7roXbGLInPSs9BrwAidY7xFcFqLE6qnE'

### Generate SingleList Extractor (MDR)

In [9]:
# Sample pages used as the URL list when the extractor is created below.
sample_urls = [
    'https://www.com.tw/cross/check_016222_NO_1_111_0_3.html',
    'https://www.com.tw/cross/check_004442_NO_1_111_0_3.html',
    'https://www.com.tw/cross/check_001582_NO_0_111_0_3.html',
]
# Alternative kept from the original cell: an empty URL list.
# sample_urls = []

In [10]:
# Every authenticated ETL request carries the JWT obtained by the login cell.
headers = {'Authorization': 'Bearer ' + token}

# Request payload for creating a new extractor from the sample URLs.
body = {
    # Timestamp suffix (down to microseconds) gives each run its own name;
    # strftime() already returns a str, so no extra str() is needed.
    'name': 'DEvOSM_' + datetime.now().strftime('%y%m%d%H%M%S%f'),
    'dataSource': 'puppeteer',
    'pageType': 'DEvOSM',
    'source': {
        'params': [{'name': 'autoGenerate', 'type': 'stringList', 'range': [1, 1], 'stringListSource': 'empty', 'stringList': sample_urls}],
        'pattern': '${autoGenerate}'
    },
    'urls': sample_urls,
    'updateTime': 0,
    'waitTime': 15,
    'option': {
        'dcadeMerge': False
    },
    # NOTE(review): this key is capitalised unlike its siblings — confirm it
    # matches what the API expects.
    'UseCache': False,
}

# Bound only the connect phase: an unreachable host fails fast, while the
# server may take as long as it needs to answer (read timeout left unlimited).
res = requests.post(
    'http://140.115.54.44:8001/api/extractors/create',
    json=body,
    headers=headers,
    timeout=(10, None),
)
# Raise on 4xx/5xx instead of failing later with a KeyError on an error payload.
res.raise_for_status()
serial_number = res.json()['serialNumber']
serial_number

'u53k97exvlcywj94p'

In [43]:
# Pages to run through the re-extract endpoint (replaces the earlier sample_urls).
sample_urls = [
    'https://www.books.com.tw/web/sys_bbotm/books/010101/?o=1&v=1&page=6',
    'https://www.books.com.tw/web/sys_bbotm/books/010101/?o=1&v=1&page=7',
]

# NOTE(review): the target extractor ID is hard-coded and is not the serial
# number shown in the create cell's saved output — confirm it is the intended one.
target_serial_number = 'u53k97ex3lcywlbtf'

headers = {'Authorization': 'Bearer ' + token}
body = {
    'dataSource': 'puppeteer',
    'urls': sample_urls,
    'updateTime': 0,
}

# Bound only the connect phase; the read timeout is left unlimited so a slow
# extraction is not cut off.
res = requests.post(
    'http://140.115.54.44:8001/api/extractors/re-extract-by-devosm/' + target_serial_number,
    json=body,
    headers=headers,
    timeout=(10, None),
)
# Raise on 4xx/5xx instead of failing later with a KeyError on an error payload.
res.raise_for_status()
serial_number = res.json()['serialNumber']
res, serial_number

(<Response [200]>, 'u53k9cdiflcyy7mkq')

### Connect to Database for Extractor Data

In [4]:
from urllib.parse import quote_plus

from pymongo import MongoClient

mongo_cfg = config['MongoDB']

# Credentials embedded in a MongoDB URI must be percent-escaped; otherwise a
# password containing '@', ':', '/' or '%' yields a malformed connection string.
client = MongoClient(
    f"mongodb://{quote_plus(mongo_cfg['account'])}:{quote_plus(mongo_cfg['password'])}"
    f"@{mongo_cfg['ip']}:{mongo_cfg['port']}/"
)
# Database and collection names also come from the [MongoDB] config section.
database = client[mongo_cfg['database']]
collection = database[mongo_cfg['collection']]

### Combine Multiple Extractors

In [5]:
# Serial numbers of the stored extractors to combine; the combine cell uses
# the first entry as the initial master and merges the rest into it in order.
sample_serial_numbers = [
    'u53k9144jel6xdnnm8',
    'u53k9144i0l6xdn5a1',
    'u53k9144itl6usu36e',
]
# Same list object, under the name the combine cell reads.
serial_numbers = sample_serial_numbers

In [6]:
import sys
 
# setting path
sys.path.append('../')

from util.matching import *
import time

start = time.time()

# Each run writes into its own timestamped output folder.
folder_path = '../schema_matching_data/multipage/test/Web_' + datetime.now().strftime('%y%m%d%H%M%S')


def _load_sets_data(extractor_serial):
    """Return the 'setsData' field of the stored document for one extractor.

    Args:
        extractor_serial: value to match against the document's 'serialNumber'.

    Raises:
        LookupError: if `collection` holds no document with that serial number
            (find_one() returns None in that case).
    """
    document = collection.find_one({'serialNumber': extractor_serial})
    if document is None:
        raise LookupError(f"No extractor document found for serialNumber {extractor_serial!r}")
    return document['setsData']


if not serial_numbers:
    raise ValueError("serial_numbers is empty; nothing to combine")

# The first extractor is the initial master; every later one is merged into it.
master = _load_sets_data(serial_numbers[0])

# start matching data
if len(serial_numbers) > 1:
    # Use a dedicated loop variable so the `serial_number` produced by the
    # extractor cells is not overwritten by this loop.
    for slave_serial in serial_numbers[1:]:
        print('*'*10, slave_serial, '*'*10)

        slave = _load_sets_data(slave_serial)
        # set matching
        print('--set matching--')
        set_result, master_index, slave_index = sets_matching(master, slave)
        # col matching: the merged result becomes the master for the next round
        print('--col matching--')
        master = col_matching_forDB(set_result, master, slave, master_index, slave_index, model_select=2)
else:
    # With a single serial number there is nothing to merge; the lone
    # extractor's sets are still filtered and written out below.
    print("Unable to combine!!!")

# Keep only sets with at least 3 entries (builds a new list in one pass rather
# than removing from the list while walking a copy of it).
sets_data = [set_data for set_data in master if len(set_data) >= 3]

# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs(folder_path, exist_ok=True)

# One text file per set, one line per column entry.
for set_index, set_data in enumerate(sets_data):
    # Explicit UTF-8 so non-ASCII page text is written the same on every platform.
    with open(f"{folder_path}/set_{set_index}.txt", 'w', encoding='utf-8') as fs:
        for col_data in set_data:
            fs.write(str(col_data) + '\n')

end = time.time()
print("{:.2f} s".format(end - start))

********** u53k9144i0l6xdn5a1 **********
--set matching--
- master -
Set_1 data less than 3. Skip this set!
Set_2 data less than 3. Skip this set!
Set_3 data less than 3. Skip this set!
Set_4 data less than 3. Skip this set!
Set_5 data less than 3. Skip this set!
Set_10 data less than 3. Skip this set!
Set_11 data less than 3. Skip this set!
Set_13 data less than 3. Skip this set!
- slave -
Set_1 data less than 3. Skip this set!
Set_2 data less than 3. Skip this set!
Set_3 data less than 3. Skip this set!
Set_4 data less than 3. Skip this set!
Set_5 data less than 3. Skip this set!
Set_6 data less than 3. Skip this set!
--col matching--
{1: [1], 2: [2], 3: [3], 4: [4], 5: [0], 6: [0]}
same col
[0, 1, 2, 3, 4, 5]
done
same col
[0, 1, 2]
done
same col
[0, 1]
done
diff col 245   253
********** u53k9144itl6usu36e **********
--set matching--
- master -
Set_1 data less than 3. Skip this set!
Set_2 data less than 3. Skip this set!
Set_3 data less than 3. Skip this set!
Set_4 data less than 3.

In [7]:
# Disabled draft of the "non-set matching" step. The whole cell is one bare
# triple-quoted string, so none of the code inside it runs; the notebook only
# echoes the string back as the cell output.
#
# What the draft text describes, step by step:
#   1. take the set data from col_result['setsData'];
#   2. open ./nonsetdata/test/0000.html and parse it with BeautifulSoup (lxml);
#   3. for every value in the nested set data, blank out matching text nodes;
#   4. extract every tag whose stripped text is empty;
#   5. write the remaining HTML to save/to/new/html/0000.html.
# The two trailing comment lines inside the string say, in English:
#   "run DCADE on the newly saved HTML" and "DCADE has to be invoked via its jar".
#
# NOTE(review): before re-enabling, `col_result`, `codecs` and `BeautifulSoup`
# are not defined or imported in any cell visible here, and the bare `except:`
# would hide every error raised while replacing text.
"""
#nonset matching   
# remove set data from html
# read set data
data = col_result['setsData'] # set data
#open orig html

fs=codecs.open("./nonsetdata/test/0000.html", 'r')
soup = BeautifulSoup(fs, 'lxml')
#delete tag
for dim1 in data:
    for dim2 in dim1:
        for dim3 in dim2:
            print(dim3)
            try:
                for replace_ in soup.findAll(text=dim3):
                    #print(replace_)
                    replace_.replace_with(replace_.replace(dim3,""))
                    #(replace_.parent).decompose()
            except:
                continue
for x in soup.find_all():
    if len(x.get_text(strip=True)) == 0:
        print(x.extract())
#save to new html 
with open("save/to/new/html/0000.html", "w") as file:
    file.write(str(soup))
#將新存好的html 去跑DCADE 
#DCADE 要去呼叫jar
"""   

'\n#nonset matching   \n# remove set data from html\n# read set data\ndata = col_result[\'setsData\'] # set data\n#open orig html\n\nfs=codecs.open("./nonsetdata/test/0000.html", \'r\')\nsoup = BeautifulSoup(fs, \'lxml\')\n#delete tag\nfor dim1 in data:\n    for dim2 in dim1:\n        for dim3 in dim2:\n            print(dim3)\n            try:\n                for replace_ in soup.findAll(text=dim3):\n                    #print(replace_)\n                    replace_.replace_with(replace_.replace(dim3,""))\n                    #(replace_.parent).decompose()\n            except:\n                continue\nfor x in soup.find_all():\n    if len(x.get_text(strip=True)) == 0:\n        print(x.extract())\n#save to new html \nwith open("save/to/new/html/0000.html", "w") as file:\n    file.write(str(soup))\n#將新存好的html 去跑DCADE \n#DCADE 要去呼叫jar\n'