In [1]:
import io
import json
import re
import threading
import traceback

import bs4
import numpy as np
import pandas as pd
import requests


# Not referenced by any cell visible in this part of the notebook.
# NOTE(review): presumably a column-alignment tolerance — confirm against its users.
MARGIN_COL = 5
# Not referenced by any cell visible in this part of the notebook.
# NOTE(review): looks like a voting threshold expressed as a fraction — confirm.
VOTE_RATE = 0.3
# Endpoint that request_word() POSTs its text to.
# NOTE(review): hard-coded IP over plain HTTP; consider moving this to configuration.
URL_CHAIN = 'http://35.188.227.39:8080/enhancer/chain/scorpiosvchain'
# Key looked up by request_word() in every item of the service reply; the
# '@value' of the first entry stored under it is taken as the label.
LABEL_CHAIN = 'http://fise.iks-project.eu/ontology/entity-label'
# Not referenced by any cell visible in this part of the notebook.
TYPE_CHAIN = 'http://fise.iks-project.eu/ontology/entity-type'

In [2]:
def strip(s):
    """Lower-case ``s`` and drop every character that is not an ASCII letter or digit."""
    lowered = s.lower()
    return re.sub('[^a-zA-Z0-9]', '', lowered)

def get_start_of_word(s):
    """Return the start offset of every token found in ``s``.

    A token is a maximal run of ASCII letters, digits, or one of the
    characters comma, dot, slash and plus. Any other character (spaces
    included) separates tokens.

    Args:
        s: Text to scan.

    Returns:
        list[int]: Offsets into ``s`` in ascending order; empty when ``s``
        contains no token.
    """
    # Raw string literal: the previous non-raw pattern relied on the invalid
    # escapes "\." and "\+", which newer Python versions warn about. Inside a
    # character class neither "." nor "+" needs escaping, so the set matched
    # is unchanged.
    return [match.start() for match in re.finditer(r'[a-zA-Z0-9,./+]+', s)]

def split_row(row, pos):
    """Cut ``row`` into consecutive slices that begin at the offsets in ``pos``.

    Args:
        row: Any sliceable sequence (for instance one line of text).
        pos: Start offsets, one per output piece; expected in ascending order.

    Returns:
        list: One slice per offset: ``row[pos[i]:pos[i + 1]]`` for every offset
        but the last, and ``row[pos[-1]:]`` for the last one. Content located
        before ``pos[0]`` is not part of the result. An empty ``pos`` yields an
        empty list.
    """
    # Without offsets there is nothing to cut; indexing pos[-1] below would
    # raise IndexError. len() is used so array-like offsets work too.
    if len(pos) == 0:
        return []

    pieces = [row[start:stop] for start, stop in zip(pos, pos[1:])]
    pieces.append(row[pos[-1]:])
    return pieces

def is_null_row(row):
    """Tell whether the pandas Series ``row`` holds no non-null entry (true for an empty row)."""
    non_null_count = row.notnull().sum()
    return non_null_count == 0

In [3]:
def request_word(word, timeout=30):
    """Ask the enhancer chain for entity labels matching ``word``.

    ``str(word)`` is UTF-8 encoded and POSTed to ``URL_CHAIN``. The JSON reply
    is iterated; for every item that contains the ``LABEL_CHAIN`` key, the
    ``'@value'`` of the first entry stored under that key is collected. Labels
    that are equal after ``strip()`` normalisation are reported once, keeping
    the first spelling encountered.

    Args:
        word: Value to look up; converted with ``str()``.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        list: Labels in reply order. Empty when nothing matched, or when the
        request or the parsing of its reply failed (the traceback is printed
        in that case).
    """
    try:
        # NOTE(review): the body is plain text but is declared as
        # 'application/pdf' — confirm this is what the service expects.
        response = requests.post(
            URL_CHAIN,
            data=str(word).encode('utf-8'),
            headers={'Content-Type': 'application/pdf'},
            timeout=timeout,  # without it a stalled service blocks the calling thread forever
        )

        labels = []
        seen = set()  # normalised labels already reported
        for obj in response.json():
            if LABEL_CHAIN in obj:
                label = obj[LABEL_CHAIN][0]['@value']
                key = strip(label)
                if key not in seen:
                    seen.add(key)
                    labels.append(label)
        return labels
    except Exception:
        # Best effort: report the failure but still hand back a list, so that
        # callers doing len()/indexing on the result do not crash on None.
        traceback.print_exc()
        return []
    
    
def request_header(word, res):
    """Look up ``word`` and append its first label, if any, to ``res``.

    The function returns nothing; the outcome is delivered by mutating ``res``
    (it is used as a thread target by ``request_row``).

    Args:
        word: Cell value forwarded to ``request_word``.
        res: List that receives at most one label.
    """
    labels = request_word(word)
    # Covers both an empty result and a None result, so a failed lookup does
    # not raise TypeError on len()/indexing.
    if not labels:
        return
    print(labels[0])
    res.append(labels[0])

def request_row(table, index, res, min_fill_ratio=0.4, max_width=50):
    """Collect the first entity label for every non-null cell of one table row.

    Each non-null cell of row ``index`` is looked up through ``request_header``
    in its own thread; the function waits for all of them and stores the
    labels under ``res[index]``. Sparse rows are skipped: when the table has at
    most ``max_width`` columns and the share of non-null cells is at most
    ``min_fill_ratio``, ``res[index]`` is set to an empty list without any
    lookup.

    Args:
        table: pandas DataFrame holding the row.
        index: Positional row index (used with ``.iloc``) and key written in ``res``.
        res: Mutable mapping or pre-sized list receiving the list of labels.
        min_fill_ratio: Fill ratio at or below which a row is considered sparse.
        max_width: Largest table width for which the sparsity rule applies.
    """
    cells = table.iloc[index].dropna().tolist()
    width = table.shape[1]
    print(len(cells), width)

    # A table without columns has nothing to look up; testing it first also
    # keeps the ratio below from dividing by zero.
    if width == 0 or (len(cells) / width <= min_fill_ratio and width <= max_width):
        res[index] = []
        return

    # One private bucket per cell: labels end up in cell order no matter which
    # thread finishes first.
    buckets = [[] for _ in cells]
    threads = [
        threading.Thread(target=request_header, args=(cell, bucket))
        for cell, bucket in zip(cells, buckets)
    ]

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    res[index] = [label for bucket in buckets for label in bucket]

In [4]:
def separate_tables(raw_tables, table_margin=2):
    """Split each input table into row segments delimited by runs of all-null rows.

    Every table is scanned top to bottom. A segment starts at the first row
    that is not all-null (per ``is_null_row``) and keeps growing across gaps of
    all-null rows that are shorter than ``table_margin``. A gap of at least
    ``table_margin`` all-null rows closes the segment.

    Args:
        raw_tables: Iterable of objects exposing ``.shape`` and ``.iloc``
            (pandas DataFrames in the visible callers).
        table_margin: Used twice: as the gap length (in rows) that closes a
            segment, and as the minimum ``iend - istart`` span required for a
            segment to be kept.

    Returns:
        dict: ``'table_<n>'`` -> copy of ``table.iloc[istart:irow]``. ``<n>`` is
        a single counter shared by all input tables.

    Notes:
        * When a segment is closed by a long gap, ``iend`` is moved to ``irow``
          (past the gap), so both the size check and the stored slice include
          those trailing all-null rows.
        * ``width`` is unpacked but not used.
    """
    count = 0
    tables = {}
    for table in raw_tables:
        height, width = table.shape

        irow = 0
        istart = 0
        iend = 0

        while irow < height:
            # Skip the all-null rows preceding the next segment.
            while irow < height and is_null_row(table.iloc[irow]):
                irow += 1
            istart = irow

            while irow < height:
                # Consume a run of non-null rows; iend is one past its last row.
                while irow < height and (not is_null_row(table.iloc[irow])):
                    irow += 1
                iend = irow

                # Walk over the all-null gap that follows; irow - iend is its length.
                while irow < height and is_null_row(table.iloc[irow]):
                    irow += 1

                # Short gap: stay in the same segment. irow is stepped once
                # more before the scan resumes; at the end of the table this
                # can leave irow at height + 1, which the slice below is
                # expected to tolerate (NOTE(review): relies on .iloc slice
                # clamping — confirm).
                if irow - iend < table_margin:
                    irow += 1
                else: 
                    iend = irow 
                    break

            # Keep the segment only if its span reaches table_margin rows.
            if iend - istart >= table_margin:
                tables['table_' + str(count)] = table.iloc[istart:irow].copy()
                count += 1

    return tables

# HTML

In [5]:
def format_html_table(raw_tables):
    """Clean parsed HTML tables in place; nothing is returned.

    For every table in ``raw_tables``:

    1. each non-empty string cell whose first character has a code point
       below 32 or above 126 is replaced by ``None``;
    2. each row is scanned left to right and a cell equal to the most recent
       distinct value seen in that row is replaced by ``NaN``.
    """
    def _blank_if_unprintable(cell):
        # Only non-empty strings are inspected; any other value passes through.
        if isinstance(cell, str) and len(cell) > 0:
            first_code = ord(cell[0])
            if first_code < 32 or first_code > 126:
                return None
        return cell

    for frame in raw_tables:
        n_rows, n_cols = frame.shape

        for name in frame.columns:
            frame[name] = frame[name].apply(_blank_if_unprintable)

        for r in range(n_rows):
            previous = frame.iloc[r, 0]
            for c in range(1, n_cols):
                if frame.iloc[r, c] == previous:
                    # Repeat of the value to its left: blank it, keep comparing
                    # against the same reference value.
                    frame.iloc[r, c] = np.nan
                    continue
                previous = frame.iloc[r, c]

#         for ih in range(height):
#             same_value = True
#             first_value = table.iloc[ih, 0]
#             for iw in range(1, width):
#                 if not pd.isnull(table.iloc[ih, iw]) and table.iloc[ih, iw] != first_value:
#                     same_value = False
#                     break

#             if same_value:
#                 for iw in range(1, width):
#                     table.iloc[ih, iw] = np.nan

In [7]:
# Parse the HTML export; the file only needs to stay open while BeautifulSoup reads it.
with open('../data/p5materials/html/c24.html') as f:
    soup = bs4.BeautifulSoup(f, features='lxml')

# Drop tags that carry no table content.
for t in soup(['script', 'style', 'meta']):
    t.extract()

pretty_soup_str = soup.prettify()
# Remove the whitespace prettify() puts in front of <span> tags.
# Raw string: "\s" in a plain literal is an invalid escape sequence.
pretty_soup_str = re.sub(r'\s+<span', '<span', pretty_soup_str)

# Passing literal HTML to read_html is deprecated; a file-like wrapper is the
# supported way to parse markup held in a string.
raw_tables = pd.read_html(io.StringIO(pretty_soup_str))
format_html_table(raw_tables)  # mutates raw_tables in place
tables = separate_tables(raw_tables, table_margin=1)

TypeError: 'str' object is not callable

In [9]:
# Re-parse the HTML export and print its plain-text content.
# The context manager closes the file handle, which a bare open() call
# passed straight to BeautifulSoup never did.
with open('../data/p5materials/html/c24.html') as f:
    soup = bs4.BeautifulSoup(f, features='lxml')
for t in soup(['script', 'style', 'meta']):
    t.extract()
pretty_soup_str = soup.text

print(pretty_soup_str)

DIETZE_CPP_POSITION_LIST_BSS_USG_(HOUSTON)







DIETZE PRODUCTS LLC 

APRIL 08TH, 2019


ATTN: CLEAN DESK 

CPP POSITION LIST BSS USG (HOUSTON)


ETA    VESSEL                     DWT  CUB  OPEN   PORT             FLEET            COMMENTS    


04/08  SEABREEZE                   53   56  04/08  USG              THENAMARIS       PPT         

       CRIMSON PEARL               51   52  04/08  USG              DAMICO           PPT         

       NORDIC AMY                  37   40  04/08  USG              UPT              PPT         

       ATLANTIC ROSE               49   51  04/08  USG              DIAMONDS         PPT         

       PLATYTERA                   47   52  04/08  USG              CLEARLAKE        PPT         

       ANCE                        52   56  04/08  USG              VITOL            PPT         

       RIDGEBURY KATHERINE Z       50   52  04/08  USG              NORIENT          PPT         

       MAERSK MARMARA              51   52  04/08  USG     

In [None]:
# Rule-engine endpoint queried by overlap().
# NOTE(review): hard-coded IP over plain HTTP; consider moving this to configuration.
OVERLAP_URL = 'http://35.186.166.22:8082/portal/servlet/service/Poheader.poi'

def overlap(word_pos, header_item_pos, timeout=10):
    """Decide whether a word span overlaps a header-item span.

    The two spans are sent to the rule engine at ``OVERLAP_URL``; the answer is
    ``True`` when the ``'hw1o'`` field of its JSON reply equals ``'OVERLAP'``.
    If the request, the decoding of the reply or the field lookup fails, the
    traceback is printed and the decision is computed locally as a
    closed-interval intersection test.

    Args:
        word_pos: ``(start, end)`` pair of the word (sent as ``wX1``/``wX2``).
        header_item_pos: ``(start, end)`` pair of the header item (sent as
            ``hX1``/``hX2``).
        timeout: Seconds to wait for the service before using the local rule.

    Returns:
        bool: ``True`` when the spans overlap.
    """
    try:
        payload = json.dumps({
            'user_name': 'carrotrule_xyz.com',
            'project_name': 'DataExtractionPO-2',
            'Rule_Engine': 'RuleData-2',
            'RawJson': {
                'wX1': word_pos[0],
                'wX2': word_pos[1],
                'hX1': header_item_pos[0],
                'hX2': header_item_pos[1],
            }
        })

        response = requests.post(OVERLAP_URL, data=payload, timeout=timeout)
        # A leftover `return None` used to sit in front of this line, so the
        # service verdict was never returned.
        return response.json()['hw1o'] == 'OVERLAP'
    except Exception:
        traceback.print_exc()
        # Local rule: the spans intersect when each one starts no later than
        # the other one ends.
        return word_pos[1] >= header_item_pos[0] and word_pos[0] <= header_item_pos[1]
    
# Ad-hoc check: under the local fallback rule of overlap(), (4, 6) against
# (7, 8) is not an overlap because 6 < 7.
r = overlap((4, 6), (7, 8))
print(r)

In [19]:
# Ad-hoc lookup of a slash-separated token; the output recorded for this cell
# is an empty list.
request_word('SPORE/MSIA/INDO/THAI')

[]