# In [None]:
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
import nltk
from nltk import sent_tokenize, word_tokenize
import pandas as pd
from io import BytesIO
import csv

from flair.tokenization import SegtokTokenizer
from flair.tokenization import SegtokSentenceSplitter
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle
import re
import random

# In [None]:
def pdf2token(pdf, loc):
    """Extract the text of a PDF and write it to a CSV, one sentence per row.

    Every page of ``pdf`` is run through pdfminer's ``TextConverter``, the
    resulting text is split into sentences with Flair's
    ``SegtokSentenceSplitter`` and each sentence is written to ``loc`` as a
    single-column row containing its lower-cased ``to_tokenized_string()``.

    Args:
        pdf: Path of the PDF file to read.
        loc: Path of the CSV file to write (opened in ``'w'`` mode, so an
            existing file is overwritten).

    Raises:
        FileNotFoundError: If ``pdf`` cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams(char_margin=30, line_margin=2, boxes_flow=1)
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager guarantees the PDF handle is released even when a
    # page fails to parse.
    with open(pdf, 'rb') as fp:
        pages = PDFPage.get_pages(fp, set(), maxpages=0, caching=True,
                                  check_extractable=True)
        for page in pages:
            interpreter.process_page(page)

    # The converter produced UTF-8 bytes; decoding them as ASCII with
    # "ignore" drops every non-ASCII character from the extracted text.
    data = retstr.getvalue().decode("ascii", "ignore")

    splitter = SegtokSentenceSplitter()  # Flair sentence splitter + word tokenizer
    sent_text = splitter.split(data)

    # Closing the CSV here ensures all rows are flushed to disk before the
    # caller re-opens the file for reading.
    with open(loc, 'w', newline='', encoding="utf-8") as out:
        writer = csv.writer(out)
        for sent in sent_text:
            writer.writerow([sent.to_tokenized_string().lower()])
        

def to_plain_sent(tokens):
    """Rebuild a sentence from tokens.

    The ``text`` attribute of every token is joined with single spaces and
    trailing whitespace is removed from the result.
    """
    joined = " ".join(token.text for token in tokens)
    return joined.rstrip()

def txttrain(train):
    """Load ``(definition text, entity label)`` pairs from an annotation file.

    Each usable line is lower-cased and split on commas. The first field is
    the label (its spaces are replaced with ``'I'``); the remaining fields are
    re-joined with ``', '`` to form the definition, which is stripped of
    surrounding newlines, spaces, double quotes and parentheses, then passed
    through Flair's ``SegtokTokenizer`` and re-joined with ``to_plain_sent``.

    Lines that contain no comma (blank lines included) have no definition and
    are skipped rather than raising ``IndexError``.

    Args:
        train: Path of the UTF-8 annotation file.

    Returns:
        A list of ``(definition, label)`` tuples in file order.
    """
    rows = []
    with open(train, 'r', encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip('\n').lower().split(',')
            if len(fields) < 2:
                # No comma on this line: nothing to pair the label with.
                continue
            label = fields[0].replace(' ', 'I')
            # The definition itself may contain commas; glue the pieces back.
            definition = ', '.join(fields[1:]).strip('\n "()')
            rows.append((label, definition))

    if not rows:
        return []

    tokeniser = SegtokTokenizer()  # Flair tokenizer, built once for all rows
    entpair = []
    for label, definition in rows:
        plain = to_plain_sent(tokeniser.tokenize(definition))
        print('[ ' + plain + ' , ' + label + ' ]')
        entpair.append((plain, label))
    return entpair
            

    
def matcher(string, pattern):
    '''
    Locate the first exact occurrence of ``pattern`` inside ``string``.

    Both arguments are first stripped of leading/trailing ``[``, ``]`` and
    space characters; all offsets refer to the stripped string.

    Returns a ``(match_list, string)`` tuple:
      * ``match_list`` is ``[(start, end)]`` for the first occurrence, or
        ``[]`` when the pattern is empty or absent;
      * ``string`` is the stripped string in which the matched span has been
        overwritten with the same number of ``X`` characters (so offsets of
        the remaining text are unchanged), or the plain stripped string when
        nothing matched.
    '''
    pattern = pattern.strip("[] ")
    string = string.strip("[] ")

    # An empty pattern trivially "occurs" at offset 0 of every string and
    # would be reported as a zero-length entity; treat it as no match.
    if not pattern:
        return [], string

    # A full-length longest common block is exactly a substring hit, and the
    # earliest such block is what str.find returns.
    start = string.find(pattern)
    if start == -1:
        return [], string

    end = start + len(pattern)
    string = string[:start] + "X" * len(pattern) + string[end:]
    print(f'{string} //||// {pattern}')
    return [(start, end)], string
    
        
def sen2tag(s, match_list):
    """Convert a sentence and its entity spans into BIO-tagged token lines.

    ``s`` is stripped of leading/trailing ``[``, ``]`` and spaces, so the
    offsets in ``match_list`` must refer to the stripped sentence (this is
    what ``matcher`` returns). Text outside the spans is tokenized and tagged
    ``O``; inside a span the first token is tagged ``B-<name>`` and the rest
    ``I-<name>``.

    Spans are processed in positional order regardless of the order they were
    supplied in, and a span that starts before the previous one ended is
    skipped so no text is emitted twice.

    Args:
        s: The sentence text.
        match_list: Iterable of ``(start, end, name)`` tuples.

    Returns:
        A list of ``"<token> <tag>"`` strings in sentence order; an empty
        list when ``match_list`` is empty.
    """
    s = s.strip("[] ")
    if not match_list:
        return []

    # Assumes tokenize() yields objects exposing ``.text``.
    tokeniser = SegtokTokenizer()  # Flair tokenizer

    word_list = []
    first = 0  # offset of the first character not yet emitted

    # Earliest span first; for equal starts prefer the longer span.
    ordered = sorted(match_list, key=lambda m: (m[0], -m[1]))
    for multiple in ordered:
        e_start, e_end, e_name = multiple[0], multiple[1], multiple[2]

        print(f'>> {e_start} {e_end} {e_name} First: {first} - {e_start}')
        if e_start < first:
            # Overlaps text that has already been tagged.
            continue

        # Untagged text between the previous span and this one. Checking for
        # non-whitespace content (instead of a length threshold) keeps a lone
        # separator such as "/" while still ignoring pure spacing.
        gap = s[first:e_start]
        if gap.strip():
            for word in tokeniser.tokenize(gap):
                word_list.append(f'{word.text} O')

        for num, word in enumerate(tokeniser.tokenize(s[e_start:e_end])):
            prefix = 'B' if num == 0 else 'I'  # Beginning vs. Inside of entity
            word_list.append(f'{word.text} {prefix}-{e_name}')

        first = e_end

    # Whatever follows the last span is outside any entity.
    tail = s[first:]
    if tail.strip():
        for word in tokeniser.tokenize(tail):
            word_list.append(f'{word.text} O')
    return word_list

# Mark sentences that have no entities
def mark_sentence(s):
    '''
    Tag every token of an entity-free sentence as ``O``.

    The sentence is stripped of leading/trailing ``[``, ``]`` and spaces,
    tokenized with Flair's ``SegtokTokenizer`` and returned as a list of
    ``"<token> O"`` strings.
    '''
    stripped = s.strip("[] ")
    tokens = SegtokTokenizer().tokenize(stripped)  # Flair tokenizer
    return [f'{token.text} O' for token in tokens]

def clean(text):
    '''
    Helper that surrounds punctuation with spaces for better tokenization.

    Leading/trailing newlines and spaces are removed, embedded ``\\n``,
    ``\\r`` and backspace+``r`` sequences become spaces, every character in
    the punctuation set below is padded with one space on each side, and all
    runs of whitespace are finally collapsed to a single space.
    '''
    # NOTE(review): the previous implementation also tried to replace "'" and
    # "`" with spaces and to drop "°", but applied that to the loop variable
    # only, so the text was never changed; its sole effect was that "`" was
    # never padded. That observable behaviour is kept here ("`" is therefore
    # not in the set) - confirm whether those characters should be removed.
    padded = ("!", "#", "$", "%", "(", ")", "/", "*", ":", ";", "<", "=", ">",
              "?", "@", "[", "\\", "]", "_", "{", "}", "~", ",", ".")
    text = text.strip('\n ')
    # NOTE(review): '\br' is a backspace followed by "r"; possibly a typo.
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\br', ' ')
    # One pass over the text instead of one full replace per occurrence.
    text = text.translate({ord(ch): f" {ch} " for ch in padded})
    return " ".join(text.split())

def create_data(df, text, filepath):
    '''
    Append BIO-tagged training data for the given sentences to ``filepath``.

    For every sentence each ``(pattern, label)`` pair of ``df`` is searched
    with ``matcher``. Sentences with at least one hit are tagged with
    ``sen2tag``; of the remaining sentences roughly 5% (random sample) are
    written with every token tagged ``O`` via ``mark_sentence``. Sentences of
    512 characters or more, and empty sentences, are skipped. Each sentence
    is written as one ``"<token> <tag>"`` line per token followed by a blank
    line.

    Args:
        df: Iterable of ``(pattern, label)`` pairs.
        text: Iterable of sentences; each item is either a string or a CSV
            row (list/tuple of fields).
        filepath: Output file, opened in append mode.
    '''
    with open(filepath, 'a', newline='', encoding="utf-8") as f:
        for sentence in text:
            # CSV rows arrive as lists; use their content, not the list repr
            # (which would add quotes and escape sequences to the text).
            if isinstance(sentence, (list, tuple)):
                raw = " ".join(str(field) for field in sentence)
            else:
                raw = str(sentence)

            if not raw.strip() or len(raw) >= 512:
                continue

            # matcher() overwrites each hit with X's of equal length, so
            # feeding the masked text to the next pattern prevents two
            # entities from claiming the same characters while keeping all
            # offsets valid for the original sentence.
            masked = raw
            match_list = []
            for i, y in df:
                a, masked = matcher(masked, i)
                if a:
                    print(f'SENTENCE LENGTH ======= SHOULD BE LESS THAN 512 >>>>>>>>>>> {len(raw)}')
                    match_list.append((a[0][0], a[0][1], y))

            if match_list:
                print(f'Detected matches => {match_list}')
                d = sen2tag(raw, match_list)
            elif random.randint(0, 100) < 5:
                d = mark_sentence(raw)    # Keep a small sample of entity-free sentences
            else:
                continue

            f.writelines(f'{line}\n' for line in d)
            f.write('\n')

# In [None]:
def main():
    """Build the training file from every document in the hard-coded list.

    For each document name the PDF is converted to a sentence CSV
    (``pdf2token``), the matching annotation ``.txt`` is loaded
    (``txttrain``), and the tagged sentences are appended to
    ``C:/Data/train.txt`` (``create_data``). A document whose files cannot be
    found is reported and skipped.
    """
    names = ['test','MSEDCL','BHEL','RGGVY','ONGC','ADANI' ,'AGRA' ,'BSES', 'DUGJY' ,'KEONJ' ,'Reliance' ,'TATA' ,'TSLT', 'APDCL', 'KVX', 'PVVNL',
            'ADANILT','APGENCO','BESCOM','BESCOMCC','BESCOMHT','BMRCL','Cables 02','CIDCO','CSL','DGVCL','IPDS','JBVNL','NESCO','NRDAHT',
            'NRDALT','PGVCL','PSPCL','PVCPOWER','TNEB','TNEB2','TNEBLV','TNEBXLPE','TSECL',
            '44AD', 'AIIMS', 'APDCL-AIIB', 'AVVNL', 'BEL', 'BWE', 'CABLE-AIIMS', 'Cables_Spec', 'CESU MVCC', 'DDUGJY Control Cable', 'DHBVN', 'DHBVN CSC-65 LT UA', 'DHBVN CSC-67 AB', 'DHBVN' ,'DHBVN_CSC', 'EHV' ,'GED' ,'GETCO LT', 'GETCO' ,'GETCO2',
            'GMRC', 'HTABMGVCL', 'HTCable', 'HT_GEB', 'IOCL MEERUT' ,'IOCL' ,'IOCL_HV', 'IOCL_PARADIP', 'ITD_JNPT' ,'JACOB', 'Jajpur' ,'JSBAY', 'KMRCL 33KV' ,'KMRCL', 'KPTCL', 'KSEB' ,'LVAB_CABLE', 'LV_ALU' ,'MECON' ,'NMDC',
            'NRDA' ,'RANCHI' ,'RRCAT' ,'RSP', 'SRF' ,'T S_CABLE-AIIMS' ,'UPMRC' ,'XLPE_CABLE','DHBVN_111']

    # The output file is opened in append mode, so a name listed twice
    # ('DHBVN' is) would have its sentences written twice. Drop repeats while
    # keeping the original order.
    names = list(dict.fromkeys(names))

    for doc in tqdm(names, ncols=150, desc=" Processing Files. . . . "):
        print(f'File currently being worked on: {doc}')
        filename = doc
        pdf = r'C:/Data/raw/reworked/'+filename +'.pdf'        # source document
        filepath = r'C:/Data/raw/reworked/'+filename +'.txt'   # entity annotations
        sentoken = r'C:/Data/training/'+filename +'.csv'       # tokenized sentences
        final = r'C:/Data/train.txt'                           # accumulated training data

        try:
            pdf2token(pdf, sentoken)
            data = txttrain(filepath)

            with open(sentoken, 'r', encoding="utf-8") as read_obj:
                sentences = list(csv.reader(read_obj))
            create_data(data, sentences, final)
        except FileNotFoundError as err:
            # Name the path that was actually missing: it may be the PDF, the
            # annotation file or an output directory.
            print(f'Skipping file {doc}: {err.filename} was not found. Please check if file exists or is named correctly.')
            continue

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
