In [1]:
import math
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import editdistance as ed

In [2]:
def read_text(path):
    """Read a text file and return its lines.

    Parameters
    ----------
    path : str
        Path of the file to read (opened in text mode with the platform
        default encoding).

    Returns
    -------
    list of str
        Every line of the file, each keeping its trailing newline.
    """
    # The context manager closes the handle even if reading raises.
    with open(path, 'r') as file:
        return file.readlines()

In [3]:
def cal_measure(lines):
    """Compute a similarity threshold from consecutive tab-separated lines.

    Each line is split on runs of tabs. For every pair of adjacent lines,
    the fields are compared position by position with the edit distance;
    each distance is divided by the larger field count of the two lines and
    added to a running total.

    Parameters
    ----------
    lines : list of str
        Text lines, e.g. as returned by ``read_text``.

    Returns
    -------
    tuple (float, list of int)
        ``lim`` -- square root of the running total divided by
        ``len(lines)``; ``distances`` -- every raw edit distance, in the
        order computed. Empty input yields ``(0.0, [])``.
    """
    # Guard: the average below divides by len(lines).
    if len(lines) == 0:
        return 0.0, []

    total = 0.0  # named `total` to avoid shadowing the builtin `sum`
    distances = []
    for i in range(len(lines) - 1):
        line1 = re.split(r'\t+', lines[i])
        line2 = re.split(r'\t+', lines[i + 1])
        # re.split always returns at least one field, so this is >= 1.
        field_count = max(len(line1), len(line2))
        for word1, word2 in zip(line1, line2):
            dist = ed.eval(word1, word2)
            distances.append(dist)
            total += dist / field_count
    lim = math.sqrt(total / len(lines))
    return lim, distances

In [4]:
def ExtractTable(lines,lim):
    """Collect the lines that look similar to the line just before them.

    Every line is split on runs of tabs and compared with its predecessor.
    When the normalised edit distance falls below ``lim`` the later line
    (as a list of its fields) is added to the result.

    Parameters
    ----------
    lines : list of str
        Text lines to scan.
    lim : float
        Threshold, e.g. the first value returned by ``cal_measure``.

    Returns
    -------
    list of list of str
        Field lists of the accepted lines, in input order.
    """
    rows = []
    for upper, lower in zip(lines, lines[1:]):
        upper_fields = re.split(r'\t+', upper)
        lower_fields = re.split(r'\t+', lower)
        width = max(len(upper_fields), len(lower_fields))
        # NOTE(review): the score is overwritten for every field pair, so
        # only the LAST compared pair decides whether the line is kept --
        # confirm this is intended rather than an aggregate over all pairs.
        for left, right in zip(upper_fields, lower_fields):
            score = ed.eval(left, right) / width
        if score < lim:
            rows.append(lower_fields)
    return rows

In [5]:
def createCSV(table,out_filename):
    """Write ``table`` to ``out_filename`` as CSV without header or index.

    Parameters
    ----------
    table : list of list
        Rows to write; each inner list becomes one CSV row.
    out_filename : str
        Destination path of the CSV file.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.
    """
    pd.DataFrame(table).to_csv(out_filename, header=False, index=False)
    return True

In [6]:
def plotDistances(distances):
    """Plot the given distances as a line chart and show the figure.

    Parameters
    ----------
    distances : sequence of numbers
        Values to plot, e.g. the second value returned by ``cal_measure``.

    Returns
    -------
    bool
        Always ``True``.
    """
    plt.plot(distances)
    plt.ylabel('some numbers')
    plt.show()
    return True

In [7]:
def ExtractPan(text):
    """Return the first PAN-like token found in ``text``.

    The pattern looks for whitespace followed by three capital letters, one
    letter from ``PHFATBCLJG``, one more letter, three digits and any
    trailing capitals/digits (presumably an Indian PAN; the pattern is
    looser than the official format).

    Parameters
    ----------
    text : list of str
        Lines to search, in order.

    Returns
    -------
    str or None
        The first match with surrounding whitespace removed, or ``None``
        when no line matches.
    """
    PANpattern = re.compile(r"[\s]+[A-Z]{3}[PHFATBCLJG]{1}[a-zA-Z]{1}[0-9]{3}[A-Z0-9]*")
    for line in text:
        PANres = PANpattern.search(line)
        if PANres:
            # The pattern consumes the leading whitespace; strip it so the
            # result is the bare token, as ExtractPhone/ExtractHeader do.
            return PANres.group().strip()
    return None

In [8]:
def ExtractGSTIN(text):
    """Return the first GSTIN-like token found in ``text``.

    A token is two digits, five letters, four digits, then any run of
    letters, digits and further alphanumerics.

    Parameters
    ----------
    text : list of str
        Lines to search, in order.

    Returns
    -------
    str or None
        The matched text of the first line that matches, else ``None``.
    """
    gstin_re = re.compile(r"[0-9]{2}[a-zA-Z]{5}[0-9]{4}[a-zA-Z]*[0-9]*[a-zA-Z0-9]*")
    # Lazy generator: searching stops at the first line that matches.
    hits = (gstin_re.search(line) for line in text)
    return next((hit.group() for hit in hits if hit), None)

In [9]:
def ExtractEmail(text):
    """Return the first e-mail address found in ``text``.

    The local part must start with a word character and may contain dots,
    plus signs and hyphens. The domain may contain digits, hyphens and
    several dot-separated labels, and the final label is at least two
    letters long. The previous pattern cut such addresses short (for
    example ``john.doe@mail.co.in`` became ``doe@mail.co``).

    Parameters
    ----------
    text : list of str
        Lines to search, in order.

    Returns
    -------
    str or None
        The first address found, or ``None`` when no line contains one.
    """
    Emailpattern = re.compile(
        r"\w[\w.+-]*@[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*\.[A-Za-z]{2,}"
    )
    for line in text:
        Emailres = Emailpattern.search(line)
        if Emailres:
            return Emailres.group()
    return None

In [10]:
def ExtractPhone(text):
    """Return the first phone number (or number pair) found in ``text``.

    The pattern requires leading whitespace, then a ten-digit number that
    starts with 7, 8 or 9, optionally prefixed by ``+91``, ``0`` and/or
    ``91``. Further numbers separated by spaces, ``/`` or ``,`` are
    included in the same match.

    Parameters
    ----------
    text : list of str
        Lines to search, in order.

    Returns
    -------
    str or None
        The matched text with surrounding whitespace stripped, or ``None``
        when no line matches.
    """
    phone_re = re.compile(r"[\s]+((\+91[\-\s]?)?[0]?(91)?[789]\d{9})[\s]*[/]*[,]*[\s]*((\+91[\-\s]?)?[0]?(91)?[789]\d{9})*")
    for line in text:
        found = phone_re.search(line)
        if found is None:
            continue
        return found.group().strip()
    return None

In [11]:
def ExtractDate(text):
    """Return the first date-like token found in ``text``.

    A token is two digits, a separator run, optional letters and digits,
    another separator run and optional trailing digits (for example
    ``12-Jan-2018`` or ``12.05.2018``).

    Parameters
    ----------
    text : list of str
        Lines to search, in order.

    Returns
    -------
    str or None
        The matched text of the first line that matches, else ``None``.
    """
    # NOTE(review): inside ``[.|-]`` the ``|`` is a literal pipe, not an
    # alternation, and ``/`` is not accepted, so ``12/05/2018`` is never
    # matched -- confirm whether slash-separated dates should be supported.
    date_re = re.compile(r"\d{2}[.|-]+[a-zA-Z]*[0-9]*[.|-]+[0-9]*")
    matches = filter(None, map(date_re.search, text))
    first = next(matches, None)
    return first.group() if first is not None else None

In [12]:
def ExtractHeader(text):
    """Return the first upper-case heading found in ``text``.

    The pattern matches up to three capitalised groups (the last may hold
    digits), optionally separated by dots and whitespace, that are followed
    by a newline.

    Parameters
    ----------
    text : list of str
        Lines to search, in order; lines are expected to keep their
        trailing newline because the pattern requires one.

    Returns
    -------
    str or None
        The matched text with surrounding whitespace stripped, or ``None``
        when no line matches.
    """
    header_re = re.compile(r"[A-Z]+[.]*[\s]*[A-Z]+[.]*[\s]*[A-Z0-9]*[\n]")
    for line in text:
        found = header_re.search(line)
        if found is not None:
            heading = found.group()
            return heading.strip()
    return None

In [13]:
def ExtractData(text):
    """Run every metadata extractor over ``text`` and gather the results.

    Parameters
    ----------
    text : list of str
        Lines of the document to search.

    Returns
    -------
    dict
        Keys ``'PAN'``, ``'GSTIN'``, ``'Email'``, ``'PhoneNum'``, ``'Date'``
        and ``'Header'``, each holding the value returned by the matching
        extractor.
    """
    return {
        'PAN': ExtractPan(text),
        'GSTIN': ExtractGSTIN(text),
        'Email': ExtractEmail(text),
        'PhoneNum': ExtractPhone(text),
        'Date': ExtractDate(text),
        'Header': ExtractHeader(text),
    }
    

In [14]:
def main():
    """Prompt for an OCR text file, export its table and print metadata.

    Steps:
    1. Ask for the input path and derive the CSV path next to it (same
       directory, same base name, ``.csv`` extension).
    2. Read the lines and compute the similarity threshold.
    3. Extract the table rows and write them to the CSV file.
    4. Extract the metadata fields and print them.
    """
    in_file = os.path.abspath(input('Please Enter path of OCRed Text'))
    # Build the output path with os.path.join so it works with any path
    # separator, not just the Windows backslash.
    stem = os.path.splitext(os.path.basename(in_file))[0]
    out_file = os.path.join(os.path.dirname(in_file), stem + ".csv")

    text = read_text(in_file)  # returns lines from a txt file

    measure, distances = cal_measure(text)
    tableContent = ExtractTable(text, measure)

    createCSV(tableContent, out_file)
    metaData = ExtractData(text)
    print(metaData)
    # plotDistances(distances)


In [16]:
# Run the pipeline: prompts for the input path, writes the CSV and prints
# the extracted metadata dictionary.
main()

Please Enter path of OCRed TextC:\Users\Kashmira Lokhande\Desktop\dosaidli.txt
{'PAN': None, 'GSTIN': '27AENPN0622R2ZQ', 'Email': None, 'PhoneNum': '9822072728', 'Date': None, 'Header': 'NAADBRAMHA'}
