In [11]:
import pandas as pd
import glob
import xml.etree.ElementTree as ET
from google.cloud import vision
import io
import time
import os

from tqdm import tqdm
# Expose the service-account key file through GOOGLE_APPLICATION_CREDENTIALS.
# The path is relative to the current working directory.
# NOTE(review): presumably read by vision.ImageAnnotatorClient() for auth -- confirm.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join('.', 'gcp/googlecreds.json')

# Module-level accumulator: get_text_single_bbox appends one string per call,
# and later cells read it (length check and export to final_data.txt).
text_list = []


def _child_text(parent, tag, position):
    """Return the text of ``parent``'s child named ``tag``.

    Falls back to the child at index ``position`` when no child carries that
    tag, which keeps files that only worked positionally readable.
    """
    node = parent.find(tag)
    # Compare against None explicitly: an Element without children is falsy.
    if node is None:
        node = parent[position]
    return node.text


def xml_to_csv(path):
    """Collect every 'keyvalue' object from the annotation XML files in a folder.

    Args:
        path: Directory that is searched (non-recursively) for ``*.xml`` files.

    Returns:
        pandas.DataFrame with one row per ``<object>`` whose class name is
        'keyvalue' and the columns
        ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'].
        The frame is empty (columns only) when nothing matches.

    Raises:
        ValueError: if a size or box coordinate is not an integer literal.
    """
    xml_list = []
    # glob returns files in arbitrary OS order; sort for reproducible row order.
    for xml_file in sorted(glob.glob(path + '/*.xml')):
        root = ET.parse(xml_file).getroot()
        size = root.find('size')
        for member in root.findall('object'):
            # Children are looked up by tag name so that a different child
            # order inside <object>/<size>/<bndbox> does not silently drop or
            # misread an annotation.
            class_name = _child_text(member, 'name', 0)
            if class_name != 'keyvalue':
                continue
            box = member.find('bndbox')
            if box is None:
                box = member[4]
            xml_list.append((
                root.find('filename').text,
                int(_child_text(size, 'width', 0)),
                int(_child_text(size, 'height', 1)),
                class_name,
                int(_child_text(box, 'xmin', 0)),
                int(_child_text(box, 'ymin', 1)),
                int(_child_text(box, 'xmax', 2)),
                int(_child_text(box, 'ymax', 3)),
            ))
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    return pd.DataFrame(xml_list, columns=column_name)


def getTextGroupData(imagePath, client=None):
    """Run Vision text detection on an image file and return its per-item boxes.

    Args:
        imagePath: Path of the image file; its raw bytes are sent to the API.
        client: Optional object exposing ``text_detection(image=...)`` to reuse
            across calls. A new ``vision.ImageAnnotatorClient`` is created when
            omitted, which matches the previous behaviour.

    Returns:
        list of ``(description, vertices)`` tuples, one for every entry of
        ``response.text_annotations`` except the first.
        NOTE(review): the first annotation is presumably the whole-image text
        block -- confirm against the Vision API reference.

    Raises:
        Exception: when the response carries a non-empty ``error.message``.
    """
    if client is None:
        client = vision.ImageAnnotatorClient()
    with io.open(imagePath, 'rb') as image_file:
        content = image_file.read()

    response = client.text_detection(image=vision.Image(content=content))

    # Fail fast: inspect the API error before touching the annotations.
    if response.error.message:
        raise Exception(
            '{}\n'.format(response.error.message))

    # Skip the first annotation by advancing an iterator rather than slicing,
    # so only plain iteration over the repeated field is required.
    annotations = iter(response.text_annotations)
    next(annotations, None)
    return [(item.description, item.bounding_poly.vertices) for item in annotations]



def get_text_single_bbox(vertexList, bbox):
    """Concatenate the words whose boxes overlap ``bbox`` enough.

    Args:
        vertexList: Iterable of ``(description, vertices)`` pairs. ``vertices[0]``
            and ``vertices[2]`` are read as two opposite corners through their
            ``.x`` / ``.y`` attributes.
        bbox: Indexable ``[xmin, ymin, xmax, ymax]``.

    Returns:
        The accepted descriptions in input order, each followed by one space
        ('' when nothing is accepted). The same string is also appended to the
        module-level ``text_list``.
    """
    pieces = []

    for entry in vertexList:
        corners = entry[1]
        left = corners[0].x
        top = corners[0].y
        right = corners[2].x
        bottom = corners[2].y

        # A word is rejected when, on either axis, its intersection with the
        # bbox is shorter than one third of the word's own extent.
        if max(left, bbox[0]) + (right - left) / 3 > min(right, bbox[2]):
            continue
        if max(top, bbox[1]) + (bottom - top) / 3 > min(bottom, bbox[3]):
            continue
        pieces.append(entry[0] + ' ')

    text = ''.join(pieces)
    text_list.append(text)
    return text


In [16]:
def main(base_path='C:/Users/KartikayGupta/NER/new_data/ci/'):
    """OCR every annotated image and pair each 'keyvalue' box with its text.

    Steps:
      1. Parse the annotation XML files under ``base_path`` with ``xml_to_csv``.
      2. Write the table to 'AN+BL_annotations.xlsx' and read it back.
      3. For each distinct image that exists on disk, call ``getTextGroupData``
         once, then ``get_text_single_bbox`` once per annotated box.

    Args:
        base_path: Folder holding both the XML files and the images. Defaults
            to the location the notebook was originally run against.

    Returns:
        pandas.DataFrame with columns ['filename', 'text', 'label'], one row
        per processed box. Earlier versions returned nothing, so existing
        callers are unaffected.
    """
    xml_df = xml_to_csv(base_path)
    xml_df.to_excel('AN+BL_annotations.xlsx')
    xml_df = pd.read_excel('AN+BL_annotations.xlsx', engine='openpyxl')

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame cell by cell through .loc reallocates on every new row.
    records = []
    for file in tqdm(xml_df['filename'].unique()):
        start = time.time()
        image_path = os.path.join(base_path, file)
        if not os.path.exists(image_path):
            # Annotation without its image on disk: nothing to OCR.
            continue
        vertexList = getTextGroupData(image_path)
        file_rows = xml_df[xml_df['filename'] == file]
        for _, row in file_rows.iterrows():
            bbox = [row['xmin'], row['ymin'], row['xmax'], row['ymax']]
            records.append({
                'filename': file,
                'text': get_text_single_bbox(vertexList, bbox),
                'label': row['class'],
            })
        print(file + ' --> done')
        print(time.time()-start)

    final_df = pd.DataFrame(records, columns=['filename', 'text', 'label'])
    # Export left disabled, as in the original run:
    #final_df.to_excel('AN+BL-not-seperated-keyvalues.xlsx')
    return final_df

# Run the pipeline only when this code executes as the top-level module
# (which includes a notebook cell), not when it is imported.
if __name__ == '__main__':
    main()


  4%|▍         | 1/25 [00:02<01:01,  2.56s/it]

100678791-699489342-CIPLUPDATE-1.png --> done
2.558120012283325


  8%|▊         | 2/25 [00:04<00:50,  2.21s/it]

101352736-1017484529-XLS-CIPLforNBMZ136913-1.png --> done
1.9627797603607178


 12%|█▏        | 3/25 [00:06<00:47,  2.18s/it]

101352736-1090844567-XLS-CIPLforNBMZ136912-1.png --> done
2.141561508178711


 16%|█▌        | 4/25 [00:12<01:16,  3.65s/it]

103181614-1116994062-COMMERCIALINVOICE6PAGE-1.png --> done
5.91527795791626


 20%|██        | 5/25 [00:21<01:52,  5.61s/it]

105958527-819539355-CIPL_BLKLLAX2106184-1.png --> done
9.081838369369507


 24%|██▍       | 6/25 [00:25<01:32,  4.85s/it]

110920249-110935712-110920249-110920525-160504documents-XLS-1.png --> done
3.3778281211853027


 28%|██▊       | 7/25 [00:26<01:08,  3.83s/it]

119373-66252353.png --> done
1.728647232055664


 32%|███▏      | 8/25 [00:28<00:54,  3.20s/it]

119873-654498088.png --> done
1.8350229263305664


 36%|███▌      | 9/25 [00:29<00:41,  2.60s/it]

12539877-834052839-XLS-ShipDocsBLSGSHA004197PO43080-1.png --> done
1.3021163940429688


 40%|████      | 10/25 [00:31<00:36,  2.44s/it]

129828644-546505445-IHR2022000000049-CI-1.png --> done
2.088822603225708


 44%|████▍     | 11/25 [00:34<00:35,  2.51s/it]

204278075-1174210101-PL34273-1.png --> done
2.647385358810425


 48%|████▊     | 12/25 [00:36<00:31,  2.44s/it]

232064186-1504445230-33700shippingdocuments-1.png --> done
2.3019726276397705


 52%|█████▏    | 13/25 [00:39<00:29,  2.46s/it]

232064186-1504445230-33700shippingdocuments-2.png --> done
2.5034213066101074


 56%|█████▌    | 14/25 [00:41<00:25,  2.30s/it]

262371656-969391988-31236480Invoce13603-21-22-23-1.png --> done
1.9098012447357178


 60%|██████    | 15/25 [00:43<00:23,  2.35s/it]

290967770-652764262-KX-E5G4-1671-A09USA211529-PDF-5.png --> done
2.468593120574951


 64%|██████▍   | 16/25 [00:46<00:22,  2.47s/it]

290967770-652764262-KX-E5G4-1671-A09USA211529-PDF-7.png --> done
2.762699604034424


 68%|██████▊   | 17/25 [00:50<00:22,  2.78s/it]

331238858-1507103723-NaturaCustoms21-01-2022-1.png --> done
3.48807692527771


 72%|███████▏  | 18/25 [00:54<00:21,  3.14s/it]

331238858-197011187-Badge4UCustoms049-1-2022-1.png --> done
3.984544515609741


 76%|███████▌  | 19/25 [00:56<00:17,  2.93s/it]

35432876-654341905-PO34643Tivany-1.png --> done
2.440603256225586


 80%|████████  | 20/25 [01:00<00:15,  3.18s/it]

36907366-1114205724-XLSX-E1212E1217-PetstopDiscountWarehouse-3007-1.png --> done
3.736102342605591


 84%|████████▍ | 21/25 [01:04<00:14,  3.63s/it]

429089010-1402734547-21400970-1.png --> done
4.6904356479644775


 88%|████████▊ | 22/25 [01:08<00:10,  3.56s/it]

429347477-4886367-CommercialInvoice_409690408820414530413440409700-1.png --> done
3.3730735778808594


 92%|█████████▏| 23/25 [01:10<00:06,  3.10s/it]

438387920-438387759-CopyofInvoice-packinglist-86882-1.png --> done
2.034600019454956


 96%|█████████▌| 24/25 [01:12<00:02,  2.87s/it]

56776715-1732358103-comminv21012021-1.png --> done
2.3341317176818848


100%|██████████| 25/25 [01:18<00:00,  3.15s/it]

7375694-882004595-PAINANI530411MAR22-5.png --> done
5.976384401321411





In [17]:
# Sanity check: number of strings accumulated in text_list
# (get_text_single_bbox appends one per call).
print(len(text_list))

1113


In [18]:
# Write every collected bbox text to final_data.txt, one entry per line (UTF-8).
with open('final_data.txt', 'w', encoding="utf-8") as out_file:
    out_file.writelines(entry + '\n' for entry in text_list)