In [10]:
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
import nltk
from nltk import sent_tokenize, word_tokenize
import pandas as pd
from io import BytesIO
import csv

from flair.tokenization import SegtokTokenizer
from flair.tokenization import SegtokSentenceSplitter
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle
import re

In [14]:
def pdf2token(pdf,loc):
    """Extract the text of a PDF and store it as one tokenized sentence per CSV row.

    Every page of ``pdf`` is rendered into an in-memory buffer with pdfminer,
    the resulting text is split into sentences with flair's
    ``SegtokSentenceSplitter`` and each sentence is written, lower-cased and in
    its tokenized string form, as a single-column row of ``loc``.

    Args:
        pdf: Path of the PDF file to read.
        loc: Path of the CSV file to write (opened in ``'w'`` mode).

    Returns:
        None. The result is the CSV file written at ``loc``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams(char_margin=30, line_margin=2, boxes_flow=1)
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager guarantees the PDF handle is released even if a page
    # fails to parse.
    with open(pdf, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    # Convert the interpreter output into a string.
    # NOTE(review): the converter emits UTF-8 but the bytes are decoded as
    # ASCII with errors ignored, so every non-ASCII character is silently
    # dropped -- confirm this is the intended clean-up.
    data = retstr.getvalue().decode("ascii", "ignore")
    print(f' >>>>>>NORMALSTART<<<<<< {data} >>>>>>>>END<<<<<<<')

    splitter = SegtokSentenceSplitter() # Flair sentence splitter + word tokenizer
    sent_text = splitter.split(data)
    print(f' >>>>>>START<<<<<< {sent_text} >>>>>>>>END<<<<<<<')

    # Close the CSV explicitly: the caller re-opens this file right after the
    # call, so the rows must be flushed to disk before returning.
    with open(loc, 'w', newline='', encoding="utf-8") as out:
        writer = csv.writer(out)
        for sent in sent_text:
            writer.writerow([sent.to_tokenized_string().lower()])
        

def to_plain_sent(tokens):
    """Rebuild a plain sentence from tokens.

    The ``text`` attribute of every token is joined with single spaces and any
    trailing whitespace is removed from the result.
    """
    words = (token.text for token in tokens)
    joined = " ".join(words)
    return joined.rstrip()

def txttrain(train):
    """Read an annotation file and return ``(entity_text, label)`` pairs.

    Each line is lower-cased and split on commas. The first field is the
    label; every remaining field is re-joined with ``', '`` to form the entity
    text (so commas inside the text survive, with a space added after each).
    The entity text is stripped of surrounding newlines, spaces, double quotes
    and parentheses, then run through flair's ``SegtokTokenizer`` and joined
    back with single spaces so that it is spaced the same way as the tokenized
    sentences it is later searched in.

    Lines that are blank or contain no comma are skipped instead of raising
    ``IndexError``.

    Args:
        train: Path of the UTF-8 annotation file.

    Returns:
        A list of ``(entity_text, label)`` tuples in file order.
    """
    tokeniser = SegtokTokenizer() # Flair word tokenizer, built once for all lines
    entpair = []
    with open(train, 'r', encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip('\n').lower().split(',')
            if len(fields) < 2:
                # Blank line or line without a "label,text" separator.
                continue
            # Spaces in the label are replaced by 'I'.
            # NOTE(review): presumably because the output format is
            # space-separated and a label must be one token -- confirm.
            label = fields[0].replace(' ', 'I')
            raw_text = ', '.join(fields[1:]).strip('\n "()')
            entity = to_plain_sent(tokeniser.tokenize(raw_text))
            print('[ '+entity+' , '+label+' ]')
            entpair.append((entity, label))
    return entpair
            

    
def matcher(string, pattern):
    '''
    Locate the first occurrence of ``pattern`` inside ``string``.

    Both arguments are first stripped of leading/trailing ``[``, ``]`` and
    ``'`` characters, which removes the list-repr wrapping when a
    ``str(list)`` value is passed in.

    Returns a tuple ``(match_list, string)``:
      * ``match_list`` is ``[(start, end)]`` for the first occurrence (offsets
        are relative to the stripped string, ``end`` exclusive), or ``[]``
        when the pattern is absent or empty.
      * ``string`` is the stripped string with that first occurrence replaced
        by the same number of ``X`` characters (unchanged when nothing matched).

    ``str.find`` is used instead of ``SequenceMatcher.find_longest_match``:
    requiring the longest common block to be the whole pattern is the same as
    an exact substring search, and both report the left-most occurrence, but
    ``str.find`` avoids the quadratic cost on long sentences.
    '''
    pattern = pattern.strip("[]'")
    string = string.strip("[]'")
    match_list = []
    if not pattern:
        # An empty pattern would otherwise "match" at (0, 0).
        return match_list, string

    start = string.find(pattern)
    if start != -1:
        match_list.append((start, start + len(pattern)))
        string = string.replace(pattern, "X" * len(pattern), 1)
        print(f'{string} //||// {pattern}')
    return match_list, string
    
        
    

def mark_sentence(s, match_list):
    '''
    Tag the tokens of a sentence with BIO labels for the given entity spans.

    s          -- the sentence text; leading/trailing "[", "]" and "'"
                  characters are stripped before use.
    match_list -- iterable of (start, end, e_type) tuples; start/end are used
                  as slice offsets into the stripped sentence and e_type is
                  appended to the "B-"/"I-" prefixes.

    Returns a list of strings of the form "<token> <tag>", where tag is "O",
    "B-<e_type>" or "I-<e_type>". An empty match_list yields an empty list.

    NOTE(review): the spans are consumed through a running offset (`first`),
    so they appear to be expected in ascending, non-overlapping order --
    confirm against the caller.
    '''
    tokeniser = SegtokTokenizer() # flair tokenizer used for both the entity text and the sentence

    s = s.strip("[]'")
    
    word_dict = []   # despite the name, a list of "<token> <tag>" strings
    first = 0        # offset in s where the next slice to tag begins
    itr = 0          # index of the current span, used to detect the last one
    endstr = None    # text after the last span; only set on the final iteration
    
    for multiple in match_list:
        beg = False  # becomes True once the B- tag of this span has been emitted
        start = multiple[0]
        end = multiple[1]
        e_type = multiple[2]
        temp_str = s[start:end]
        tmp_list = tokeniser.tokenize(temp_str) # tokens of the entity text itself
        # Both branches take the same slice; the last span additionally
        # captures the remainder of the sentence.
        if itr!=(len(match_list)-1):
            substring = s[first:end]
        else:
            substring = s[first:end]
            endstr = s[end:]
            
        tokens = tokeniser.tokenize(substring)
        for word in tokens:
            # x == 0 means this word starts with the first entity token.
            # NOTE(review): tmp_list[0] raises IndexError when the span
            # tokenizes to nothing; the comparison is by text prefix, not by
            # offset, so an earlier word with the same prefix is tagged B-.
            x = word.text.find(tmp_list[0].text)
            if x!=0:
                word_dict.append(word.text+' '+'O')
            if x==0 and beg is False:    
                word_dict.append(word.text+' '+'B-' + e_type)
                beg = True
                continue
                
            # After the B- tag, any word starting with one of the remaining
            # entity tokens has the entry at the end of the list replaced by
            # an I- tag.
            # NOTE(review): when x == 0 here (first token repeated) nothing
            # was appended for this word, so pop() removes the previous
            # word's entry -- verify this is acceptable.
            if len(tmp_list) > 1 and beg is True:
                for w in tmp_list[1:]:
                    t = word.text.find(w.text)
                    if t==0:
                        word_dict.pop()
                        word_dict.append(word.text+' '+'I-' + e_type)
                        
        # Everything after the last span carries no entity and is tagged 'O'.
        if endstr is not None:
            tokens = tokeniser.tokenize(endstr)
            for word in tokens:
                word_dict.append(word.text+' '+'O')
        first=end
        itr=itr+1
    return word_dict

def clean(text):
    '''
    Helper that pads punctuation with spaces for better tokenization.

    Steps:
      1. Strip leading/trailing newlines and spaces.
      2. Replace "\\n", "\\r" and the two-character sequence backspace + "r"
         with a space.
      3. Surround every character listed in ``filters`` with spaces.
      4. Collapse all runs of whitespace into a single space.

    Returns the cleaned string.

    The padding is done in one ``str.translate`` pass; the previous version
    re-ran ``str.replace`` over the whole text once per character, which gave
    the same result after whitespace collapsing but at quadratic cost.
    '''
    # The backtick is deliberately absent: the previous implementation listed
    # it but never padded it, and apostrophes / degree signs were likewise
    # left untouched. That output is preserved here.
    filters = ["!", "#", "$", "%", "(", ")", "/", "*", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "{", "}", "~", ",", "."]
    text = text.strip('\n ')
    # NOTE(review): '\br' is a backspace followed by "r"; it looks like a
    # different marker (e.g. an HTML line break) may have been intended.
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\br', ' ')
    text = text.translate({ord(ch): f" {ch} " for ch in filters})
    return " ".join(text.split())

def create_data(df, text, filepath):
    '''
    Append BIO-tagged training data for every sentence that contains an entity.

    df       -- iterable of (entity_text, label) pairs.
    text     -- iterable of sentences; each item is converted with ``str()``
                before matching (``matcher`` and ``mark_sentence`` strip the
                surrounding "[", "]" and "'" that a list repr adds).
    filepath -- output file, opened in append mode.

    For each sentence every entity text is searched with ``matcher``. When at
    least one entity is found, the sentence is tagged by ``mark_sentence`` and
    written one "<token> <tag>" line per token, followed by a blank line.
    Sentences without any match are not written.
    '''
    with open(filepath , 'a', newline='', encoding="utf-8") as f:
        for sentence in text:
            raw = str(sentence)
            match_list = []
            for pattern, label in df:
                spans, _ = matcher(raw, pattern)
                match_list.extend((start, end, label) for start, end in spans)
            if not match_list:
                continue
            # mark_sentence walks the sentence left to right using the end of
            # the previous span as its next starting offset, so the spans must
            # be ordered by position rather than by their order in df.
            match_list.sort(key=lambda span: span[0])
            for tagged in mark_sentence(raw, match_list):
                f.write(str(tagged)+'\n')
            f.write('\n')

In [67]:
def main():
    """Build the NER training file from every document listed in ``names``.

    For each document name: the PDF is converted to a CSV of tokenized
    sentences, the matching annotation ``.txt`` is loaded as entity pairs, the
    CSV rows are read back and the tagged result is appended to the shared
    training file.
    """
    names = [#'MSEDCL','BHEL','RGGVY','ONGC','ADANI' ,'AGRA' ,'BSES', 'DUGJY' ,'KEONJ' ,'Reliance' ,'TATA' ,'TSLT', 'APDCL', 'KVX', 'PVVNL',
            'ADANILT','APGENCO','BESCOM','BESCOMCC','BESCOMHT','BMRCL','Cables 02','CIDCO','CSL','DGVCL','IPDS','JBVNL','NESCO','NRDAHT',
            'NRDALT','PGVCL','PSPCL','PVCPOWER','TNEB','TNEB2','TNEBLV','TNEBXLPE','TSECL',
            ]
    # Shared output file; every document appends to it.
    final = r'C:/Users/33669/PycharmProjects/OCR/trainer/ner_data/train.txt'

    progress = tqdm(names,ncols=150,
                desc=f" Processing Files. . . . ")
    for doc in progress:
        print(f'File currently being worked on: {doc}')
        # Per-document inputs and the intermediate sentence CSV.
        pdf = r'C:/Users/33669/PycharmProjects/OCR/trainer/ner_data/raw/new/'+doc +'.pdf'
        filepath = r'C:/Users/33669/PycharmProjects/OCR/trainer/ner_data/raw/new/'+doc +'.txt'
        sentoken = r'C:/Users/33669/PycharmProjects/OCR/trainer/ner_data/training/'+doc +'.csv'

        pdf2token(pdf,sentoken)
        data = txttrain(filepath)

        # Each CSV row is kept as a list, exactly as csv.reader yields it.
        with open(sentoken, 'r', encoding="utf-8") as read_obj:
            sentences = list(csv.reader(read_obj))
        create_data(data, sentences, final)


if __name__ == '__main__':
    main()


 Processing Files. . . . :   0%|                                                                                               | 0/23 [00:00<?, ?it/s]

File currently being worked on: ADANILT


 Processing Files. . . . :   4%|███▊                                                                                   | 1/23 [00:00<00:08,  2.55it/s]

 >>>>>>NORMALSTART<<<<<< SUB SECTION: LT AC CABLE SPECIFICATION 

1.0.0  SCOPE OF SUPPLY & SERVICES 

  The scope of supply shall cover Design, engineering, manufacture, assembly and testing at 
works, packing/dispatch, transportation to site with transit insurance, delivery at site of cables 
listed. Cable quantities indicated are tentative and liable to change at the time of placement of 
order. 

2.0.0     The equipment to be furnished under this specification shall be in accordance with the 

applicable section of the latest version of the following Indian Standards, IEC publications and 
any other standards of latest edition including amendments, except where modified and /or 
supplemented by this specification.  

3.0.0         CODES & STANDARDS 

  IS      Methods for random sampling. 

  IS 5831    PVC insulation and sheath of electric cables. 

  IS 7098   Cross-linked polyethylene insulated PVC sheathed cables for    
    working voltages up to and including 33 kV. (Part-II) 

 Processing Files. . . . :   9%|███████▌                                                                               | 2/23 [00:01<00:13,  1.56it/s]

"4.08.00 cable identification cable identification shall be provided by embossing on the outer sheath the following : a ) manufacturer 's name or trade mark b ) manufacturer 's name or trade mark c ) XXXXXXXXXXXXX d ) year of manufacture e ) type of insulation , e.g. xlpe , hrpvc & ie2 etc ." //||// voltage grade
"4.08.00 cable identification cable identification shall be provided by embossing on the outer sheath the following : a ) manufacturer 's name or trade mark b ) manufacturer 's name or trade mark c ) voltage grade d ) XXXXXXXXXXXXXXXXXXX e ) type of insulation , e.g. xlpe , hrpvc & ie2 etc ." //||// year of manufacture
File currently being worked on: BESCOM





KeyboardInterrupt: 