In [None]:
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdf2image import convert_from_path  # For scanned PDFs
import easyocr
import cv2

import pandas as pd
from io import BytesIO
import csv
from flair.tokenization import SegtokTokenizer
from flair.tokenization import SegtokSentenceSplitter
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle
import re
import random

In [None]:
def pdf2img(pdf, name, pagenums=None):
    """
    Render a PDF page to PNG image(s) on disk so OCR can be run on them.

    Images are written to ``C:/Data/Output/OCR/images/`` as ``<name>_<n>.png``
    with ``n`` counting from 1.  As a side effect the module-level global
    ``total_pages`` is set to *number of images written + 1*, i.e. the
    exclusive upper bound that ``img_ocr`` uses for its page loop.

    :param pdf: Pdf location
    :param name: Pdf name, used as the prefix of every image file name
    :param pagenums: page number to be converted; passed to
                     ``convert_from_path`` as both ``first_page`` and
                     ``last_page``
    :return: None (the pages are saved as PNG images)
    """
    global total_pages

    rendered_pages = convert_from_path(
        pdf,
        500,
        poppler_path=r"C:\poppler-0.68.0\bin",
        timeout=10000,
        first_page=pagenums,
        last_page=pagenums,
    )

    # Rendered image k (1-based) of this call is stored as <name>_k.png.
    written = 0
    for written, rendered in enumerate(rendered_pages, start=1):
        target = f'{name}_{written}.png'
        rendered.save(r'C:/Data/Output/OCR/images/' + target)

    # One past the last image index, matching range(1, total_pages) in img_ocr.
    total_pages = written + 1
    
def img_ocr(location, filename):  # For Image/Scanned PDF to text
    """
    Run OCR over the PNG page images written by ``pdf2img`` and return the text.

    Pages ``1 .. total_pages - 1`` are read from
    ``<location>/<filename>_<page>.png`` (``total_pages`` is the module-level
    global set by ``pdf2img``).  For every page the recognised text blocks are
    drawn onto the image, appended to ``C:/Data/Output/OCR/<filename>_OCR.txt``
    and shown in an OpenCV window named 'PDF Output'.

    :param location: Directory containing the PNG images
    :param filename: Image name prefix (the part before ``_<page>.png``)
    :return: Text extracted from the images, one recognised block per line
             (empty string when there are no pages to process)
    """
    page_numbers = range(1, total_pages)
    if not page_numbers:
        # Nothing to read: avoid loading the OCR model at all.
        return ''

    # Loading the recognition model is the expensive part, so build the reader
    # once and reuse it for every page instead of re-creating it per page.
    reader = easyocr.Reader(['en'], recog_network='custom_example')

    collected = []
    for page in page_numbers:
        loc = f'{location}/{filename}_{page}.png'
        image = cv2.imread(loc)
        result = reader.readtext(loc, height_ths=0.2,
                                 ycenter_ths=0.3, width_ths=0.5, paragraph=True, decoder='wordbeamsearch', y_ths=0.2,
                                 x_ths=50)

        cv2.startWindowThread()
        page_lines = []
        for (bbox, text) in result:
            # bbox holds the four corners (tl, tr, br, bl); only the top-left
            # and bottom-right ones are needed to draw the rectangle.
            (tl, _tr, br, _bl) = bbox
            tl = (int(tl[0]), int(tl[1]))
            br = (int(br[0]), int(br[1]))
            # Draw the box surrounding the text along with the OCR'd text itself
            cv2.rectangle(image, tl, br, (0, 0, 255), 4)
            cv2.putText(image, text, (tl[0], tl[1] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 2.5, (0, 90, 200), 8)
            page_lines.append(str(text) + '\n')

        # The context manager closes the file even if a write fails.
        with open(f"C:/Data/Output/OCR/{filename}_OCR.txt", 'a') as file:
            file.writelines(page_lines)
        collected.extend(page_lines)

        # show the output image
        cv2.namedWindow('PDF Output', cv2.WINDOW_NORMAL)
        cv2.imshow("PDF Output", image)
        cv2.waitKey(20)

    return ''.join(collected)

In [None]:

def pdf2token(pdf, loc, titles):
    """
    Extract the text of a PDF, split it into sentences and write them to a CSV.

    Every page is run through pdfminer.  When a page yields fewer than 10
    bytes of text it is treated as a scanned page: it is rendered to an image
    with ``pdf2img`` and read with ``img_ocr`` instead.  The collected text is
    split into sentences with flair's ``SegtokSentenceSplitter`` and each
    sentence is written, tokenized and lower-cased, as a one-column row of the
    CSV file at ``loc``.

    :param pdf: Path of the PDF file to read
    :param loc: Path of the CSV file to (over)write with one sentence per row
    :param titles: Document name, used as the file name prefix for OCR images
    :return: None
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams(char_margin=30, line_margin=2, boxes_flow=1)
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    chunks = []
    with open(pdf, 'rb') as fp:
        pages = PDFPage.get_pages(fp, set(), maxpages=0, caching=True, check_extractable=True)
        for pagenumber, page in enumerate(pages):
            # Process pages one by one into interpreter
            interpreter.process_page(page)
            raw = retstr.getvalue()
            # Always empty the shared buffer before the next page, whatever
            # happens below, so one page's bytes never leak into the next.
            retstr.seek(0)
            retstr.truncate(0)

            if len(raw) < 10:
                # Page is OCR only.  enumerate() counts from 0 while the
                # first_page/last_page arguments pdf2img forwards to pdf2image
                # count from 1, hence the + 1.
                try:
                    pdf2img(pdf, titles, pagenums=pagenumber + 1)  # Convert page to image
                    chunks.append(img_ocr('C:/Data/Output/OCR/images', titles))  # Get OCR from converted image
                except Exception as err:
                    # Best effort: a page that cannot be OCR'd is skipped, but
                    # the failure is reported instead of silently swallowed.
                    print(f'OCR failed for page {pagenumber + 1} of {pdf}: {err}')
            else:
                # Non-ASCII bytes are dropped on purpose ('ignore').
                chunks.append(raw.decode('ascii', 'ignore'))

    # Remove form feed characters emitted at page boundaries
    data = ''.join(chunks).replace('\x0c', ' ')

    splitter = SegtokSentenceSplitter()  # Flair sentence splitter + word tokenizer
    sent_text = splitter.split(data)
    with open(loc, 'w', newline='', encoding="utf-8") as out:
        writer = csv.writer(out)
        for sent in sent_text:
            writer.writerow([sent.to_tokenized_string().lower()])
        

def to_plain_sent(tokens):
    """
    Convert tokenized (cleaned) words back to a sentence.

    :param tokens: iterable of token objects exposing a ``text`` attribute
    :return: the token texts joined by single spaces, without trailing
             whitespace
    """
    joined = " ".join(token.text for token in tokens)
    return joined.rstrip()

#####################
# NEW ATTRIBUTE TAKER
#####################

def txt2ent(train):
    """
    Read the attribute definition file and return the entity lookup triples.

    File layout, as parsed here:

    * a header line starts with ``|`` and reads ``|<attribute name>|<context>``;
      it sets the attribute (entity) name and the context flag for all value
      lines that follow, until the next header;
    * every other line holds one or more example values separated by ``|``.

    Everything is lower-cased.  Spaces in the attribute name are replaced with
    an upper-case ``I`` (which cannot clash with the lower-cased text) so the
    name is a single token.  Each header is also appended to
    ``./Outputs/EntityDic.txt`` as ``<encoded name>,<split header fields>``.

    :param train: Path of the attribute definition file
    :return: list of ``(value, entity name, context flag)`` tuples, where the
             value has been tokenized and re-joined with single spaces
    :raises ValueError: if a value line appears before the first header line
    """
    entpair = []
    dic = {}
    context = None
    new_attr = None
    tokeniser = SegtokTokenizer()  # Flair sentence tokenizer

    with open(train, 'r', encoding="utf-8") as f:
        for linenum, line in enumerate(f):
            if line[0] == '|':
                attr = line.lower().strip('\n').split('|')
                # Take context request from header line; default to 'no' when
                # the field is missing or empty.
                requested = attr[2].strip(' ') if len(attr) > 2 else ''
                context = requested or 'no'
                new_attr = attr[1].replace(' ', 'I')  # Replace entity name whitespace with 'I'
                print(f'LineNO: {linenum + 1}  Attribute type: {new_attr} Context: {context}')
                dic[new_attr] = attr
                continue

            # Split multiple entity values, trim them, drop duplicates (keeping
            # the first occurrence) and drop empty ones: an empty value would
            # later behave as a pattern that "matches" every sentence.
            raw_values = (v.strip(' ') for v in line.lower().strip(' \n').split('|'))
            values = [v for v in dict.fromkeys(raw_values) if v]
            if not values:
                continue  # blank line
            if new_attr is None:
                raise ValueError(
                    f'{train}: line {linenum + 1} holds values but no "|name|context" header precedes it'
                )

            for value in values:
                plain_val = to_plain_sent(tokeniser.tokenize(value))
                entpair.append((plain_val, new_attr, context))

    # Keep a record mapping each encoded entity name to its original header
    with open('./Outputs/EntityDic.txt', 'a', encoding="utf-8") as f:
        for k, v in dic.items():
            f.write(f'{k},{v}\n')

    print(entpair)
    return entpair

#####################
# OLD ATTRIBUTE TAKER
#####################

def txttrain(train):
    """
    Read the old-format attribute file and return ``(value, entity name)`` pairs.

    Each line reads ``<attribute name>,<value>``; the value itself may contain
    commas.  The line is lower-cased, spaces in the attribute name are replaced
    with ``I``, the value parts are re-joined with ``', '``, stripped of
    surrounding newline/space/quote/parenthesis characters and tokenized.

    Lines without a comma (including blank lines) carry no value and are
    skipped instead of raising ``IndexError``.

    :param train: Path of the attribute file
    :return: list of ``(tokenized value, entity name)`` tuples
    """
    tokeniser = SegtokTokenizer()  # Flair sentence tokenizer, built once
    entpair = []
    with open(train, 'r', encoding="utf-8") as f:
        for line in f:
            text = line.rstrip('\n')
            fields = text.lower().split(',')
            if len(fields) < 2:
                if text.strip():
                    print(f'Skipping line without a value: {text}')
                continue

            name = fields[0].replace(' ', 'I')
            # Commas inside the value were split apart above; put them back.
            value = ', '.join(fields[1:]).strip('\n "()')
            plain_value = to_plain_sent(tokeniser.tokenize(value))

            print('[ ' + plain_value + ' , ' + name + ' ]')
            entpair.append((plain_value, name))
    return entpair
            

def matcher(string, pattern, context, entt):
    '''
    Locate the first exact occurrence of an entity value inside a sentence.

    Both ``string`` and ``pattern`` are first stripped of surrounding
    ``[``, ``]`` and space characters; the returned indices refer to the
    stripped sentence.

    When ``context`` is ``'yes'`` the match only counts if the sentence also
    contains the entity name itself (``entt`` with its ``I`` separators turned
    back into spaces).

    An exact substring search is all that is needed here: a longest common
    block whose size equals ``len(pattern)`` exists exactly when ``pattern``
    is a substring, and the earliest such block is the first occurrence.

    :param string: sentence text
    :param pattern: entity value to look for
    :param context: ``'yes'`` to additionally require the entity name in the
                    sentence; any other value disables that requirement
    :param entt: entity name, with ``I`` in place of spaces
    :return: ``(match_list, string)`` where ``match_list`` is ``[(start, end)]``
             for a hit or ``[]`` otherwise, and ``string`` is the stripped
             sentence with the matched span replaced by ``X`` characters
             (unchanged when there is no hit)
    '''
    pattern = pattern.strip("[] ")
    string = string.strip("[] ")

    # An empty pattern is not "present" anywhere; without this guard it would
    # be reported as a zero-width match at index 0 of every sentence.
    if not pattern:
        return [], string

    start = string.find(pattern)
    if start == -1:
        return [], string

    needs_context = context == 'yes'
    if needs_context:
        # Entity name with whitespace instead of the 'I' char separator
        entity_name = entt.replace('I', ' ')
        if entity_name not in string:
            return [], string

    end = start + len(pattern)
    string = string.replace(pattern, "X" * len(pattern), 1)
    if needs_context:
        print(f'context match: {string} //||// {pattern}')
    else:
        print(f'{string} //||// {pattern}')
    return [(start, end)], string
    

def _merge_overlapping_matches(match_list):
    """Merge overlapping or touching (start, end, name) spans; the merged span keeps the first name."""
    merged = []
    for start, end, name in sorted(match_list, key=lambda m: m[0]):
        if merged and start <= merged[-1][1]:
            prev_start, prev_end, prev_name = merged[-1]
            merged[-1] = (prev_start, max(prev_end, end), prev_name)
        else:
            merged.append((start, end, name))
    return merged


def sen2tag(s, match_list):
    """
    Tag a sentence in BIO format given the entity spans found in it.

    The sentence is stripped of surrounding ``[``, ``]`` and space characters,
    so the span indices must refer to the stripped text (as returned by
    ``matcher``).  Overlapping or touching spans are merged into one span that
    keeps the entity name of the earliest one.  Text outside the spans is
    tagged ``O``; the first token of a span is tagged ``B-<name>`` and the
    remaining ones ``I-<name>``.

    Text between two spans (or before the first span) is only emitted when the
    gap is longer than one character.

    :param s: sentence text
    :param match_list: list of ``(start, end, entity name)`` tuples
    :return: list of ``'<token> <tag>'`` strings in sentence order
    """
    tokeniser = SegtokTokenizer()  # Flair sentence tokenizer
    s = s.strip("[] ")

    # Merge subsets of matches so no part of the sentence is tagged twice.
    # Every span survives the merge, including a trailing one that follows a
    # merged pair, and chains of overlapping spans collapse into a single span.
    final_list = _merge_overlapping_matches(match_list)
    print(f'NEW MATCH LIST => {final_list}')

    word_list = []
    first = 0  # end index of the previously handled span
    # Index of the last *merged* span: the sentence tail is emitted after it.
    last_index = len(final_list) - 1

    for itr, (e_start, e_end, e_name) in enumerate(final_list):
        if e_start - first > 1:  # At least one word is present that is not an entity
            print(f'>> {e_start} {e_end} {e_name} First: {first} - {e_start}')
            # Tokens are produced and consumed inside this branch, so a gap is
            # never emitted a second time for a later span.
            for word in tokeniser.tokenize(s[first:e_start]):
                word_list.append(f'{word.text} O')

        print(f'>> {e_start} {e_end} {e_name} First: {e_start} - {e_end}')
        for num, word in enumerate(tokeniser.tokenize(s[e_start:e_end])):
            # First word begins the entity, the rest are inside it
            prefix = 'B' if num == 0 else 'I'
            word_list.append(f'{word.text} {prefix}-{e_name}')

        if itr == last_index:
            print(f'>> {e_start} {e_end} {e_name} First: {e_end} - end')
            end_s = s[e_end:]
            if end_s:
                for word in tokeniser.tokenize(end_s):
                    word_list.append(f'{word.text} O')

        first = e_end
    return word_list

# Mark sentences that have no entities
def mark_sentence(s):
    '''
    Tag every token of a sentence as ``O`` (outside any entity, BIO scheme).

    The sentence is stripped of surrounding ``[``, ``]`` and space characters
    before it is tokenized.

    :param s: sentence text
    :return: list of ``'<token> O'`` strings in sentence order
    '''
    tokeniser = SegtokTokenizer()  # Flair sentence tokenizer
    tokens = tokeniser.tokenize(s.strip("[] "))
    return [f'{token.text} O' for token in tokens]

def clean(text):
    '''
    Helper that puts a space on both sides of punctuation for better
    tokenization and collapses all runs of whitespace to a single space.

    Leading/trailing newlines and spaces are removed first, and newline,
    carriage return and the two-character sequence backspace + "r" are turned
    into spaces.

    NOTE(review): the backtick is listed among the punctuation but is never
    padded, and apostrophes / degree signs are left in the text untouched;
    this mirrors the long-standing behaviour -- confirm whether it is intended.
    '''
    punctuation = ["!", "#", "$", "%", "(", ")", "/", "*", ":", ";", "<", "=", ">", "?", "@", "[",
                   "\\", "]", "_", "`", "{", "}", "~", ",", "."]
    # Every listed character except the backtick gets surrounded by spaces.
    padded_chars = set(punctuation) - {"`"}

    flattened = text.strip('\n ')
    for unwanted in ('\n', '\r', '\br'):
        flattened = flattened.replace(unwanted, ' ')

    pieces = []
    for char in flattened:
        if char in padded_chars:
            pieces.append(" " + char + ' ')
        else:
            pieces.append(char)

    return " ".join("".join(pieces).split())

def _sentence_text(sentence):
    """Return the plain text of a sentence given either as a CSV row (list/tuple of cells) or as a string."""
    if isinstance(sentence, (list, tuple)):
        # str() of a row would give "['...']" and leak quote characters and
        # repr escapes into the text, so join the cells instead.
        return " ".join(str(cell) for cell in sentence)
    return str(sentence)


def create_data(df, text, filepath):
    '''
    Append BIO-tagged training sentences to the training file.

    Every sentence shorter than 512 characters is searched for each entity
    value; sentences with at least one hit are tagged with ``sen2tag`` and
    appended to ``C:/Data/training/train_test.txt`` as one ``<token> <tag>``
    pair per line, followed by a blank line.  Sentences without any hit are
    not written.

    :param df: iterable of ``(value, entity name, context flag)`` tuples
    :param text: iterable of sentences; each is a string or a CSV row
    :param filepath: currently unused -- the output path is fixed (see above)
    :return: None
    '''
    appended = r'C:/Data/training/train_test.txt'
    with open(appended, 'a', newline='', encoding="utf-8") as f:
        for sentence in text:
            sentence_text = _sentence_text(sentence)
            # The length limit does not depend on the entity, so check it once
            # before doing any matching.
            if len(sentence_text) >= 512:
                continue

            match_list = []
            for value, name, context in df:
                found, _masked = matcher(sentence_text, value, context, name)
                if found:
                    print(f'{value} << SENTENCE LENGTH ======= LEN LESS THAN 512 >>>>>>>>>>> {len(sentence_text)}')
                    match_list.append((found[0][0], found[0][1], name))

            if not match_list:
                continue

            match_list.sort(key=lambda m: m[0])
            print(f'Detected matches => {match_list}')
            tagged = sen2tag(sentence_text, match_list)
            f.writelines(f'{entry}\n' for entry in tagged)
            f.write('\n')

In [None]:
def main():
    """
    Build the NER training data from the listed PDF documents.

    The entity definitions are loaded once from ``C:/Data/newtrain1.txt``.
    Then, for each document name, ``C:/Data/raw/reworked/<name>.pdf`` is
    converted to sentences (``C:/Data/training/<name>.csv``) and the sentences
    containing known entity values are tagged and appended to the training
    file by ``create_data``.  Documents whose files cannot be found are
    reported and skipped.
    """
    names = ['PVPL', 'PMDP', 'TPCODL', 'DMRC', 'TSECL2','MSEDCL','BHEL','RGGVY','ONGC',
             'ADANI' ,'AGRA' ,'BSES', 'DUGJY' ,'KEONJ' ,'Reliance' ,'TATA' ,'TSLT', 'APDCL', 'KVX', 'PVVNL',
            'ADANILT','APGENCO','BESCOM','BESCOMCC','BESCOMHT','BMRCL','Cables 02','CIDCO',
            'CSL','DGVCL','IPDS','JBVNL','NESCO','NRDAHT', 'NRDALT','PGVCL','PSPCL','PVCPOWER',
            'TNEB','TNEB2','TNEBLV','TNEBXLPE','TSECL',
            '44AD', 'AIIMS', 'APDCL-AIIB', 'AVVNL', 'BEL', 'BWE', 'CABLE-AIIMS', 'Cables_Spec',
            'CESU MVCC', 'DDUGJY Control Cable', 'DHBVN', 'DHBVN CSC-65 LT UA', 'DHBVN CSC-67 AB',
            'DHBVN' ,'DHBVN_CSC', 'EHV' ,'GED' ,'GETCO LT', 'GETCO' ,'GETCO2', 'GMRC', 'HTABMGVCL',
            'HTCable', 'HT_GEB', 'IOCL MEERUT' ,'IOCL' ,'IOCL_HV', 'IOCL_PARADIP',
            'ITD_JNPT' ,'JACOB', 'Jajpur' ,'JSBAY', 'KMRCL 33KV' ,'KMRCL', 'KPTCL', 'KSEB' ,'LVAB_CABLE',
            'LV_ALU' ,'MECON' ,'NMDC', 'NRDA', 'RANCHI', 'RRCAT', 'RSP', 'SRF', 'T S_CABLE-AIIMS',
            'UPMRC', 'XLPE_CABLE', 'DHBVN_111']
    # The output file is opened in append mode, so a name listed twice (e.g.
    # 'DHBVN') would add the same sentences twice.  Drop repeats, keep order.
    names = list(dict.fromkeys(names))

    data = txt2ent('C:/Data/newtrain1.txt')  # New attribute input function call

    for doc in tqdm(names, ncols=150,
                    desc=f" Processing Files. . . . "):
        try:
            print(f'File currently being worked on: {doc}')
            pdf = r'C:/Data/raw/reworked/' + doc + '.pdf'
            # Path to save the sentence CSV file.
            sentoken = r'C:/Data/training/' + doc + '.csv'
            final = r'C:/Data/train.txt'

            # Create the sentence file, then read it back row by row.
            pdf2token(pdf, sentoken, doc)
            with open(sentoken, 'r', encoding="utf-8") as read_obj:
                sentences = list(csv.reader(read_obj))

            create_data(data, sentences, final)
        except FileNotFoundError:
            print(f'Skipping file {doc}.pdf. Please check if file exists or is named correctly.')
            continue
        
def test():
    """Manual smoke check: parse the sample attribute file with ``txt2ent``."""
    sample_file = 'C:/Data/newtrain.txt'
    txt2ent(sample_file)

# Script entry point: run the full training-data build when executed directly.
if __name__ == '__main__':
    main()
    # Manual check of the attribute parser only (disabled):
    #test()
    # Disabled sketch for reading ./Outputs/EntityDic.txt back into a dict.
    # NOTE(review): `key_mappings.fromkeys(key)` builds and discards a new dict,
    # and each value written by txt2ent is a list repr that itself contains
    # commas, so `line.split(',')` would not unpack into two parts as written.
    # key_mappings = {}
    # with open('./Outputs/EntityDic.txt', 'r') as f:
    #     for line in f:
    #         key, value = line.split(',')
    #         key_mappings.fromkeys(key)
    #         key_mappings[key] = value
