In [1]:
import os

import cv2
import numpy as np
import xlsxwriter
from PIL import Image
from pytesseract import pytesseract

In [2]:
# get the number of stacked horizontal lines

def getNumLinesHorz(image):
    img= cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
    img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    counter=0
    for i in range(1,img.shape[0]):
        for j in range(1,img.shape[1]):
            if img[i][j]==255 and img[i-1][j]==0 and  img[i][j-1]==0:
                counter=counter+1

    return counter



In [3]:
# function to detect the content of each cell
def detectSymbol(image):
    # detect stacked horzintal lines
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,1))
    horizontal_mask = cv2.morphologyEx(image, cv2.MORPH_CLOSE, horizontal_kernel, iterations=1)
    ver=1
    if np.sum(horizontal_mask/255)<horizontal_mask.size*.998:
        ver=3
        q=getNumLinesHorz(image)
        return q,ver

    # detect stacked vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,10))
    vertical_mask = cv2.morphologyEx(image, cv2.MORPH_CLOSE, vertical_kernel, iterations=1)
    if np.sum(vertical_mask/255)<vertical_mask.size*.998:
        ver=0

    # detect empty cells 
    if np.sum(image/255)>=image.size*.998:
        return 0,2 

    # resizing the img to 150*70     
    width = int(image.shape[1] )
    height = int(image.shape[0] )
    scale_percentWidth = (150/width)*100
    scale_percentHeight = (70/height)*100
    width = int(image.shape[1] * scale_percentWidth / 100)
    height = int(image.shape[0] * scale_percentHeight / 100)
    dim = (width, height)
    resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
    
    #Erosion then closing then thinning then hough transfor to detect the component of the cell
    img2= cv2.cvtColor(resized,cv2.COLOR_BGR2GRAY)
    imgAfterBlur=cv2.GaussianBlur(img2,(3,3),0)
    thresh = cv2.threshold(imgAfterBlur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,3))
    ero = cv2.erode(thresh,kernel)
    
    closing = cv2.morphologyEx(ero,cv2.MORPH_CLOSE,kernel)
    thinned = cv2.ximgproc.thinning(closing)

    # edges = cv2.Canny(thinned,50,150,apertureSize = 3)
    
    lines = cv2.HoughLines(thinned,1,np.pi/180, 0,min_theta=0,max_theta=np.pi/180) 
    # return the number of lines obtained from hough transform and the opertion variable (ver) 
    if lines is not None:
        if ver==0:
            return len(lines),ver
        if len(lines)>=5:
            return 5,ver
        else:
            return 0,ver 
        

        
    return 0,ver

In [4]:

#reading the img and converting it to binary and removing noise
image=cv2.imread('try3.jpg')
img2= cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
imgAfterBlur=cv2.GaussianBlur(img2,(3,3),0)
thresh = cv2.threshold(imgAfterBlur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]


In [5]:
#detect horizontal and vertical lines in the thresholding image

horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50,1))
horizontal_mask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=1)

# Detect vertical lines
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,50))
vertical_mask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=1)

# Combine masks and remove lines
table_mask = cv2.bitwise_or(horizontal_mask, vertical_mask)
image[np.where(table_mask==255)] = [255,255,255]

cv2.imwrite('thresh.jpg', thresh)
cv2.imwrite('horizontal_mask.jpg', horizontal_mask)
cv2.imwrite('vertical_mask.jpg', vertical_mask)
cv2.imwrite('table_mask.jpg', table_mask)
cv2.imwrite('image.jpg', image)


True

In [6]:
# counting / calculating the positions and number of vertical and horizontal lines
arrh=[]
h,w=horizontal_mask.shape
f=True
for i in range(0,h,1):
    if(i!=0):
        if(horizontal_mask[i][0]==255 and horizontal_mask[i-1][0]==0):
            arrh.append(i)
    else:
        if(horizontal_mask[i][0]==255):
            arrh.append(i)
arrv=[]
for i in range(0,w,1):
    if(i!=0):
        if(vertical_mask[0][i]==255 and vertical_mask[0][i-1]==0):
            arrv.append(i)
    else:
        if(vertical_mask[0][i]==255):
            arrv.append(i)   


In [7]:
# Segmentting the code and names to write them in excel sheet
cv2.imwrite('Code_col.jpg', image[arrh[0]:,:arrv[0]])
cv2.imwrite('AraName_col.jpg', image[arrh[0]:,arrv[0]:arrv[1]])
cv2.imwrite('EngName_col.jpg', image[arrh[0]:,arrv[1]:arrv[2]])

workbook = xlsxwriter.Workbook('Output3.xlsx')
worksheet = workbook.add_worksheet()

# Reading code and names imgs 
path_to_tesseract = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
imgcode = r"Code_col.jpg"
imgaraname = r"AraName_col.jpg"
imgengname = r"EngName_col.jpg"  
# Opening the image & storing it in an image object
openimgcode = Image.open(imgcode)
openimgengname = Image.open(imgengname)
openimgaraname = Image.open(imgaraname)


# Providing the tesseract 
# executable location to pytesseract library
pytesseract.tesseract_cmd = path_to_tesseract
 
# Passing the image object to 
# image_to_string() function
# This function will
# extract the outcode from the image


# outara = pytesseract.image_to_string(openimgengname,lang="ara")

outcode = pytesseract.image_to_string(openimgcode,config='digits')
outeng = pytesseract.image_to_string(openimgengname)
outcode=outcode.split("\n") 
outeng=outeng.split("\n") 
# Displaying the extracted outcode
code=[]
nameenglish=[]
for i in outcode:
    if i != '':
        code.append(int(i))
for i in outeng:
    if i != '':
        nameenglish.append(i)

In [8]:
# Start from the first cell. Rows and columns are zero indexed.



row = 0
col = 0
worksheet.write(0, 0, 'code')
worksheet.write(0, 1, 'Eng_name')

for i in range(2,len(arrv)-2):
    cv2.imwrite('crop7.jpg', image[:arrh[0],arrv[i]:arrv[i+1]])
    imagedegree = r"crop7.jpg" 
    openimgdeg = Image.open(imagedegree)
    txt=pytesseract.image_to_string(openimgdeg,config='any')
    print(txt)
    worksheet.write(0, i,txt)



# printing the code and english name
for i in range(len(code)):
    worksheet.write(i+1, 0,code[i])   
for i in range(len(nameenglish)):
    if nameenglish[i] != '':
        worksheet.write(i+1, 1,nameenglish[i])  













In [9]:
for i in range(2,len(arrv)):
    for j in range(0,len(arrh)):
        if i==len(arrv)-1 and j==len(arrh)-1:
            lines,ver=detectSymbol(image[arrh[j]:,arrv[i]:])
       
        elif j==len(arrh)-1:
            lines,ver=detectSymbol(image[arrh[j]:,arrv[i]:arrv[i+1]])
            
        elif i==len(arrv)-1:
            lines,ver=detectSymbol(image[arrh[j]:arrh[j+1],arrv[i]:])
            
        else:
            lines,ver=detectSymbol(image[arrh[j]:arrh[j+1],arrv[i]:arrv[i+1]])
        if  ver==1:
            worksheet.write(j+1, i,lines)
        elif ver==0:
            worksheet.write(j+1, i,lines)
        elif ver==2:
            worksheet.write(j+1, i,'')
        elif ver==3:
            worksheet.write(j+1, i,5-lines)

workbook.close()