# Heuristic Algorithms for Automated Data Extraction

### Data is the currency of science, but it is packaged in figures

Can we unpackage data and re-use it?

In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import IPython
import scipy.ndimage as ndimage
# try:
#     import Image
# except ImportError:
from PIL import Image
import pytesseract as tes

import sys
sys.path.insert(0, './functions')
import ocr_tools
import find_axes
from thresholding import adaptive_thresh

print('All packages loaded')

All packages loaded


In [23]:
example = 'examples/example103.jpg'
img = cv2.imread(example)
m,n,p=img.shape

%matplotlib notebook
plt.figure()
implot = plt.imshow(img)

<IPython.core.display.Javascript object>

## This is our starting point

We would like to find the axes, calibrate them from pixel to data space, find the data and extract it.

In [24]:
# Convert the image to grayscale, binarise it with the project's
# adaptive_thresh helper, then invert the result.
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
bw4 = adaptive_thresh(gray)
# NOTE(review): presumably bw4 is a 0/255 uint8 mask with dark ink on a white
# background, so the inversion makes ink pixels non-zero for the Hough
# transform used later -- confirm against thresholding.py.
nbw = (255-bw4)

%matplotlib notebook
plt.figure()
implot = plt.imshow(nbw, cmap='gray')

<IPython.core.display.Javascript object>

## Finding Axes

Now we are going to engineer an axis feature score. For example, for the x-axis each candidate line segment is scored on three features:

[cos^2 of the segment's angle, fractional length, fractional y-coord (or x-coord for the y-axis)]

Each feature answers one question:

- "How horizontal is this line segment?"
- "How long is it relative to the image size?"
- "Where is it in the image?"

In [25]:
# Probabilistic Hough transform on the inverted binary image, with a 1-pixel
# distance resolution and an angular resolution of pi/2.  Segments shorter than
# one tenth of the image width are discarded.
linesP = cv2.HoughLinesP(nbw,1,np.pi/2,2, minLineLength = nbw.shape[1]/10, maxLineGap = 3)
# The Python binding returns None (not an empty array) when no segment is
# found; fail with a readable message instead of an opaque reshape error.
if linesP is None:
    raise ValueError('Hough transform found no line segments in the thresholded image')
print('Hough Transform complete')

# Flatten to one row of four values per segment; the rows are later used as
# (x1, y1, x2, y2) end points when the axes are drawn.
segments = np.reshape(linesP,(-1,4))

xax = find_axes.get_xaxis(segments,nbw)
print('x axis')
print(xax)
yax = find_axes.get_yaxis(segments,nbw)
print('y axis')
print(yax)

Hough Transform complete
x axis
[ 99 504 869 504]
y axis
[107 511 107  25]


In [26]:
imlabel = img.copy();
cv2.line(imlabel,(xax[0],xax[1]),(xax[2],xax[3]),(255,0,0),4)
cv2.line(imlabel,(yax[0],yax[1]),(yax[2],yax[3]),(255,0,0),4)

%matplotlib notebook
plt.figure()
implot = plt.imshow(imlabel)
implot.set_cmap('gray')

<IPython.core.display.Javascript object>

## Locating Ticks and Tick Labels

It's actually easier to start by finding the labels - in fact, in many cases it's not necessary to even find the ticks at all.

OCR is difficult. Tesseract, which we call through the pytesseract wrapper, is one of the best open-source OCR engines, but it's not a panacea. We can help it by first isolating the region from which we'd like to extract text.

In [27]:
# Find where the x-ticks stop and crop the text image to be below that
below_xax_bw = np.zeros([m,n]).astype('uint8')
below_xax_bw[xax[1]:,:]=nbw[xax[1]:,:]
row_sums = below_xax_bw.sum(axis=1)
row_sums[0:xax[1]] = 1000
row_sums_list = row_sums.tolist()
text_crop_ind = row_sums_list.index(0)
text_crop_ind

xTextImg = np.ones([m,n,p]).astype('uint8')*255
xTextImg[text_crop_ind+1:,:,:]=img[text_crop_ind+1:,:,:]

cv2.imwrite('temp/xtext.png',xTextImg)
%matplotlib notebook
plt.figure()
implot = plt.imshow(xTextImg)

<IPython.core.display.Javascript object>

### Here, we perform OCR, then pick out only the numbers.

In [28]:
def is_number(s):
    """Return True if `s` can be parsed as a finite floating-point number.

    Parameters
    ----------
    s : object
        Usually an OCR'd token (string); any value accepted by ``float`` works.

    Returns
    -------
    bool
        False for text that does not parse, for values ``float`` cannot accept
        at all (e.g. None), and for tokens such as 'nan' or 'inf', which parse
        as floats but are not usable as tick-label values.
    """
    import math
    try:
        value = float(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or other non-parsable objects; OverflowError: huge ints.
        return False
    return not (math.isnan(value) or math.isinf(value))

In [29]:
import ocr_tools
xocr = tes.image_to_string(Image.open("./temp/xtext.png"),boxes=True)
print('OCR complete')

words = ocr_tools.clean_ocr_results(xocr)
numbers = [w for w in words if is_number(w[0])]

for w in numbers:
    w[0]=float(w[0])
    w[2] = m-w[2]
    w[4] = m-w[4]
    w.append((w[1]+w[3])/2)
    w.append((w[2]+w[4])/2)
    
xt_labels = np.array(numbers)

%matplotlib notebook
plt.figure()
for w in numbers:
    cv2.rectangle(imlabel, (w[1],w[2]), (w[3],w[4]), (255,215,0), thickness=2, lineType=8, shift=0)
implot = plt.imshow(imlabel)

OCR complete


<IPython.core.display.Javascript object>

### Now, we use another Hough transform to find ticks, and match them with their nearest label.

In [30]:
# Collect candidate tick segments around the x-axis, then keep only those that
# find_axes.match_xticks pairs with the numeric OCR labels.
xt_dirty = find_axes.get_xticks(nbw,xax,tickMargin=11,minTickLen=0,maxGap=0)
xt_list=xt_dirty.tolist()
# xt_match is used below as a list of indices into xt_list.
# NOTE(review): presumably there is one index per entry of `numbers`, in the
# same order -- the regression cell relies on that alignment; confirm in find_axes.
xt_match = find_axes.match_xticks(numbers,xt_list)
xt_pts = [xt_list[i] for i in xt_match]

%matplotlib notebook
plt.figure()
# Every matched tick unpacks to four values, drawn as a line between two (x, y) points.
for x1,y1,x2,y2 in xt_pts:
    cv2.line(imlabel,(x1,y1),(x2,y2),(0,255,0),4)
implot = plt.imshow(imlabel)

<IPython.core.display.Javascript object>

## Convert x-axis from pixel space to data space with regression

In [31]:
# Column vectors for the regression: first coordinate of every matched tick
# (pixels) against the numeric value of its OCR label (data units).
xt_pts_array = np.array(xt_pts)
x_pixels = xt_pts_array[:,0].reshape(-1,1)
print('X Axis in pixels')
print(x_pixels)

x_values = xt_labels[:,0].reshape(-1,1)
print('X Axis in Data')
print(x_values)

from sklearn import datasets, linear_model
# Linear map from pixel position to data value: value = coef_ * pixel + intercept_
regrx = linear_model.LinearRegression(fit_intercept=True)
regrx.fit(x_pixels, x_values)

# The coefficients
# Built as a single string so the cell prints the same text under both the
# Python 2 print statement semantics and the Python 3 print function.
print('Coefficients: \n' + str(regrx.coef_))
print('Intercept: \n' + str(regrx.intercept_))

X Axis in pixels
[[344]
 [464]
 [582]
 [702]
 [821]]
X Axis in Data
[[ 10.]
 [ 15.]
 [ 20.]
 [ 25.]
 [ 30.]]
Coefficients: 
[[ 0.04194607]]
Intercept: 
[-4.43778187]


## For Y-Ticks, the rotated text breaks pytesseract OCR

To solve this, we isolate the text, rotate it, remove the axis label, rotate back, then perform OCR.

In [32]:
# Find where the y-ticks stop and crop the text image to be left of that
left_yax_bw = np.zeros([m,n]).astype('uint8')
left_yax_bw[:yax[0],:]=nbw[:yax[0],:]
col_sums = left_yax_bw.sum(axis=0)
col_sums[yax[0]:] = 1000
col_zeros=np.where(col_sums==0)[0]
y_crop_ind=col_zeros[-1]
print('cropping y-axis region')

yTextImg = np.ones([m,n,p]).astype('uint8')*255
yTextImg[:,0:y_crop_ind-1,:]=img[:,0:y_crop_ind-1,:]
cv2.imwrite('temp/ytext.png',yTextImg)

%matplotlib notebook
plt.figure()
yTextRot = ndimage.interpolation.rotate(yTextImg, -90)
cv2.imwrite('temp/ytextrot.png',yTextRot)
implot = plt.imshow(yTextRot)
implot.set_cmap('gray')

cropping y-axis region


<IPython.core.display.Javascript object>

In [33]:
yRotOcr = tes.image_to_string(Image.open('temp/ytextrot.png'),boxes=True)
print('Y-Axis OCR complete')
yrot_words = ocr_tools.clean_ocr_results(yRotOcr,charspace=15,neg_charspace=-3)
word_length = [len(i[0]) for i in yrot_words]
longest_word = np.array(word_length).argmax()
lwbox = yrot_words[longest_word]
x1 = lwbox[1]-1
y1 = lwbox[2]-1
x2 = lwbox[3]+1
y2 = lwbox[4]+1
y1 = n-y1
y2 = n-y2
yTextRot[y2:y1+1,x1:x2+1,:] = np.ones([y1-y2+1,x2-x1+1,3]).astype('uint8')*255
yText_Clean = ndimage.interpolation.rotate(yTextRot, 90)

%matplotlib notebook
plt.figure()
implot = plt.imshow(yText_Clean)

Y-Axis OCR complete


<IPython.core.display.Javascript object>

### Conventional OCR

In [34]:
# Write the cleaned (axis-title-free, upright) strip to disk and OCR it.
cv2.imwrite('temp/ytextclean.png',yText_Clean)
yocr = tes.image_to_string(Image.open('temp/ytextclean.png'),boxes=True)
yl_ocr = ocr_tools.clean_ocr_results(yocr)
# Keep only the entries whose first element parses as a number.
yt_numbers = [w for w in yl_ocr if is_number(w[0])]
for w in yt_numbers:
    w[0]=float(w[0])
    # Elements 2 and 4 are subtracted from the image height m.
    # NOTE(review): presumably this flips bottom-origin OCR box coordinates to
    # the top-origin image convention -- confirm against ocr_tools.
    w[2] = m-w[2]
    w[4] = m-w[4]
    # Append the midpoints of the (1, 3) and (2, 4) coordinate pairs.
    w.append((w[1]+w[3])/2)
    w.append((w[2]+w[4])/2)
    
yt_label_array = np.array(yt_numbers)

%matplotlib notebook
plt.figure()
# Outline every numeric label on the shared annotation image.
for w in yt_numbers:
    cv2.rectangle(imlabel, (w[1],w[2]), (w[3],w[4]), (255,215,0), thickness=2, lineType=8, shift=0)
implot = plt.imshow(imlabel)

<IPython.core.display.Javascript object>

In [35]:
# Collect candidate tick segments around the y-axis, then keep only those
# paired with the numeric OCR labels.
yt_dirty = find_axes.get_yticks(nbw,yax,tickMargin=11,minTickLen=0,maxGap=2)
yt_list=yt_dirty.tolist()
# NOTE(review): this calls match_xticks for the y-axis.  Confirm in find_axes
# that the matcher is axis-agnostic (or that a y-specific variant should be
# used); a matcher comparing x coordinates would pair ticks and labels wrongly.
yt_match = find_axes.match_xticks(yt_numbers,yt_list)
yt_pts = [yt_list[i] for i in yt_match]
# Bare expression: it is not the last statement of the cell, so it displays nothing.
yt_pts

%matplotlib notebook
plt.figure()
# Every matched tick unpacks to four values, drawn as a line between two (x, y) points.
for x1,y1,x2,y2 in yt_pts:
    cv2.line(imlabel,(x1,y1),(x2,y2),(0,255,0),2)
implot = plt.imshow(imlabel)

<IPython.core.display.Javascript object>

In [36]:
# Column vectors for the regression: second coordinate of every matched tick
# (pixels) against the numeric value of its OCR label (data units).
yt_pts_array = np.array(yt_pts)
y_pixels = yt_pts_array[:,1].reshape(-1,1)
print('y axis pixels')
print(y_pixels)
y_values = yt_label_array[:,0].reshape(-1,1)
print('y axis data')
print(y_values)

# `linear_model` is the sklearn module imported by the x-axis regression cell.
# Linear map from pixel position to data value: value = coef_ * pixel + intercept_
regry = linear_model.LinearRegression(fit_intercept=True)
regry.fit(y_pixels, y_values)

# The coefficients
# Built as a single string so the cell prints the same text under both the
# Python 2 print statement semantics and the Python 3 print function.
print('Coefficients: \n' + str(regry.coef_))
print('Intercept: \n' + str(regry.intercept_))

y axis pixels
[[ 57]
 [122]
 [185]
 [249]
 [313]
 [377]
 [441]]
y axis data
[[ 14.  ]
 [  0.12]
 [  1.  ]
 [  0.08]
 [  6.  ]
 [  4.  ]
 [  0.02]]
Coefficients: 
[[-0.01635022]]
Intercept: 
[ 7.67639869]


## Finally, extract the data.

In more advanced approaches, a trained CNN can be used to identify data markers. Here, we can achieve similar performance with binary erosion. Multiple series with different colors can be dealt with through clustering.

In [43]:
# Number of data series expected in the plot; one extra colour cluster is
# requested below.
# NOTE(review): the extra cluster is presumably for the background -- confirm.
num_series = 5

from color_series_scrape import color_series_scrape
# The region of interest is bounded by the x-axis end points horizontally and
# the y-axis end points vertically.
pix_pts, ROI = color_series_scrape(example, [[xax[0],xax[2]]],[[yax[3],yax[1]]], n_colors=num_series+1)
print('Done clustering')

# NOTE(review): skimage.filters is imported but not referenced in this cell.
import skimage.filters
from cluster_data import cluster_colorspace_km, return_series
# Second clustering pass on the first ROI, used here only for the preview below.
im_recon, im_label = cluster_colorspace_km(ROI[0], num_series+1)
labels_as_ims = return_series(im_label)

%matplotlib notebook
plt.figure()
# NOTE(review): index 4 is hard-coded; it will be out of range or show a
# different cluster if num_series changes.
plt.imshow(labels_as_ims[4])

KM: fitting model on a small sub-sample of the data
done fitting
KM: Prediciting color indiced on the full image
done predicting
KM: recreating clustered image
Done clustering
KM: fitting model on a small sub-sample of the data
done fitting
KM: Prediciting color indiced on the full image
done predicting
KM: recreating clustered image


<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x12414dbd0>

In [39]:
imlabel4 = imlabel.copy()
pp_list = []
for series in pix_pts[0]:
    s_list = []
    for tup in series:
        x = int(round(tup[1]))+xax[0]
        y = int(round(tup[0]))+yax[3]
        s_list.append([x,y])
    pp_list.append(s_list)

pp_list

for series in pp_list:
    for pt in series:
        cv2.circle(imlabel4,(pt[0],pt[1]), 3, (255,240,10), -1)
        
%matplotlib notebook
lastplot = plt.imshow(imlabel4)

<IPython.core.display.Javascript object>

## ... and convert the pixel values to the units of the axes

In [18]:
# Fitted pixel -> data maps for each axis: value = slope * pixel + offset.
x_slope, x_offset = regrx.coef_, regrx.intercept_
y_slope, y_offset = regry.coef_, regry.intercept_

data_list = []
for series_pts in pp_list:
    # Each product is an array indexed with [0,0] to pull out the scalar value.
    converted = [
        [(px*x_slope+x_offset)[0,0], (py*y_slope+y_offset)[0,0]]
        for px, py in series_pts
    ]
    # Each series is stored in the reverse of its extraction order.
    converted.reverse()
    data_list.append(converted)

# Show the first series with three decimals.
[[format(coord,'.3f') for coord in row] for row in data_list[0]]

[['40.055', '0.011'],
 ['29.955', '0.021'],
 ['-0.002', '0.035'],
 ['4.945', '0.052'],
 ['20.061', '0.054'],
 ['10.029', '0.055'],
 ['15.045', '0.103']]