# Read a Graph

### Data is the currency of science, but it is packaged in figures

Can we unpack the data from a published figure and reuse it?

In [42]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import IPython
import scipy.ndimage as ndimage
# try:
#     import Image
# except ImportError:
from PIL import Image
import pytesseract as tes

import sys
sys.path.insert(0, './functions')
import ocr_tools
import find_axes
from thresholding import adaptive_thresh

print('All packages loaded')

All packages loaded


In [43]:
example = 'examples/example102.jpg'
img = cv2.imread(example)
m,n,p=img.shape

%matplotlib notebook
implot = plt.imshow(img)

<IPython.core.display.Javascript object>

## This is our starting point

We would like to find the axes, calibrate them from pixel to data space, find the data and extract it.

In [44]:
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
bw4 = adaptive_thresh(gray)
nbw = (255-bw4)

%matplotlib notebook
implot = plt.imshow(nbw, cmap='gray')

<IPython.core.display.Javascript object>

### Every image processing workflow starts with a threshold

Many features can be extracted from the black-and-white image. We will start with a simple one: straight line segments, which are detected with a probabilistic Hough transform.

In [45]:
# Probabilistic Hough transform with an angular step of pi/2 and a minimum
# segment length of one tenth of the image width.
hough_segments = cv2.HoughLinesP(nbw, 1, np.pi/2, 2,
                                 minLineLength=nbw.shape[1]/10, maxLineGap=3)
# OpenCV returns None (not an empty array) when nothing is detected.
if hough_segments is None:
    raise ValueError('Hough transform found no line segments')
# OpenCV 2.x returns shape (1, N, 4) while OpenCV 3+ returns (N, 1, 4).
# Reshaping gives an (N, 4) array of [x1, y1, x2, y2] on either version,
# whereas indexing with [0] would keep a single segment on OpenCV 3+.
linesP = hough_segments.reshape(-1, 4)
print('Hough Transform complete')
print(linesP[0:4])

Hough Transform complete
[[100 341 463 341]
 [100 342 100  17]
 [ 97  18 463  18]
 [462 342 462  17]]


## So we've got some line segments.

Now we are going to engineer an axis feature score, for example, the x-axis:

[$\cos^2\theta$ of the segment's angle $\theta$ to the axis direction, fractional length, fractional y-coordinate (or x-coordinate for the y-axis)]

"How horizontal is this line segment?"
"How long is it relative to the image size?"
"Where is it in the image?"

In [46]:
import find_axes

# Select the x-axis segment from the Hough candidates and report it.
xax = find_axes.get_xaxis(linesP, nbw)
print('x axis\n{}'.format(xax))

# Same for the y-axis.
yax = find_axes.get_yaxis(linesP, nbw)
print('y axis\n{}'.format(yax))

x axis
[102 342 461 342]
y axis
[100 342 100  17]


## Let's draw the results

In [47]:
imlabel = img.copy();
cv2.line(imlabel,(xax[0],xax[1]),(xax[2],xax[3]),(255,0,0),2)
cv2.line(imlabel,(yax[0],yax[1]),(yax[2],yax[3]),(255,0,0),2)

%matplotlib notebook
implot = plt.imshow(imlabel)
implot.set_cmap('gray')

<IPython.core.display.Javascript object>

## Now let's find ticks and tick labels

It's actually easier to start by finding the labels - in fact, in many cases it's not necessary to even find the ticks at all.

OCR is difficult. Pytesseract is the best open source library for it, but it's not a panacea. We can help it by first isolating the region from which we'd like to extract text.

In [48]:
%matplotlib notebook

# Find where the x-ticks stop and crop the text image to be below that
below_xax_bw = np.zeros([m,n]).astype('uint8')
below_xax_bw[xax[1]:,:]=nbw[xax[1]:,:]
row_sums = below_xax_bw.sum(axis=1)
row_sums[0:xax[1]] = 1000
row_sums_list = row_sums.tolist()
text_crop_ind = row_sums_list.index(0)
text_crop_ind

xTextImg = np.ones([m,n,p]).astype('uint8')*255
xTextImg[text_crop_ind+1:,:,:]=img[text_crop_ind+1:,:,:]

cv2.imwrite('temp/xtext.png',xTextImg)
implot = plt.imshow(xTextImg)


<IPython.core.display.Javascript object>

In [50]:
# OCR the cropped label strip; with boxes=True each output line is a
# character followed by its bounding-box coordinates.
xtext_image = Image.open('temp/xtext.png')
xocr = tes.image_to_string(xtext_image, boxes=True)
print('OCR complete')
xocr

OCR complete


'0 106 34 115 47 0\n. 117 34 119 36 0\n0 121 34 130 47 0\n0 178 34 186 47 0\n. 188 34 190 36 0\n2 192 34 201 47 0\n0 249 34 258 47 0\n. 260 34 262 36 0\n4 263 34 273 47 0\n0 320 34 329 47 0\n. 331 34 333 36 0\n6 335 34 344 47 0\n0 391 34 400 47 0\n. 402 34 404 36 0\n8 406 34 415 47 0\nH 185 8 195 21 0\ne 197 8 205 18 0\nl 207 8 209 21 0\na 211 8 220 18 0\nk 220 8 225 20 0\ni 226 8 228 21 0\nv 229 8 238 18 0\ne 239 8 247 18 0\np 254 4 262 18 0\nr 264 8 269 18 0\ne 270 8 278 18 0\ns 280 8 287 18 0\ns 289 8 296 18 0\nu 298 8 306 18 0\nr 308 8 313 18 0\ne 313 8 322 18 0\n( 329 4 333 21 0\nP 336 8 347 21 0\n/ 346 8 351 22 0\nP 352 8 364 21 0\nD 363 3 370 12 0\n) 371 4 375 21 0'

In [51]:
import ocr_tools

# Group the per-character OCR rows into words, each with one bounding box.
words = ocr_tools.clean_ocr_results(
    xocr
)
words

[['0.0', 106, 34, 130, 47],
 ['0.2', 178, 34, 201, 47],
 ['0.4', 249, 34, 273, 47],
 ['0.6', 320, 34, 344, 47],
 ['0.8', 391, 34, 415, 47],
 ['Helakive', 185, 8, 247, 21],
 ['pressure', 254, 4, 322, 18],
 ['(P/PD)', 329, 3, 375, 22]]

### We're looking for numbers

Let's pick those out.

In [52]:
def is_number(s):
    """Return True when `s` can be read as a finite floating-point number.

    Parameters
    ----------
    s : object
        Candidate value, normally an OCR'd word such as '0.2'.

    Returns
    -------
    bool
        False for text that does not parse, for non-string/non-numeric
        inputs such as None, and for 'nan' / 'inf', which parse as floats
        but cannot serve as tick labels in a linear calibration.
    """
    import math
    try:
        value = float(s)
    except (TypeError, ValueError):
        # TypeError covers inputs like None; ValueError covers ordinary text.
        return False
    return not (math.isnan(value) or math.isinf(value))

# Work on copies of the OCR rows. The loop below rewrites coordinates and
# appends columns, so operating on the rows of `words` directly would flip
# the values back and grow the rows again whenever this cell is re-run.
numbers = [list(w) for w in words if is_number(w[0])]
for w in numbers:
    w[0]=float(w[0])
    # The OCR box y values are measured from the bottom of the image;
    # subtracting from the image height m converts them to row indices.
    w[2] = m-w[2]
    w[4] = m-w[4]
    # Box centre (x, y). Floor division keeps whole-pixel values on Python 3
    # as well, matching the integer results of the original Python 2 run.
    w.append((w[1]+w[3])//2)
    w.append((w[2]+w[4])//2)

# Columns: value, x1, y1, x2, y2, centre x, centre y.
xt_labels = np.array(numbers)
xt_labels

array([[  0.00000000e+00,   1.06000000e+02,   3.69000000e+02,
          1.30000000e+02,   3.56000000e+02,   1.18000000e+02,
          3.62000000e+02],
       [  2.00000000e-01,   1.78000000e+02,   3.69000000e+02,
          2.01000000e+02,   3.56000000e+02,   1.89000000e+02,
          3.62000000e+02],
       [  4.00000000e-01,   2.49000000e+02,   3.69000000e+02,
          2.73000000e+02,   3.56000000e+02,   2.61000000e+02,
          3.62000000e+02],
       [  6.00000000e-01,   3.20000000e+02,   3.69000000e+02,
          3.44000000e+02,   3.56000000e+02,   3.32000000e+02,
          3.62000000e+02],
       [  8.00000000e-01,   3.91000000e+02,   3.69000000e+02,
          4.15000000e+02,   3.56000000e+02,   4.03000000e+02,
          3.62000000e+02]])

In [53]:
%matplotlib notebook

for w in numbers:
    cv2.rectangle(imlabel, (w[1],w[2]), (w[3],w[4]), (255,215,0), thickness=2, lineType=8, shift=0)

implot = plt.imshow(imlabel)

<IPython.core.display.Javascript object>

# Find X Ticks


### This is going to match tick labels with their nearest vertical line segment on the x-axis.

In [54]:
# Collect candidate vertical segments along the x-axis; the result still
# contains many non-tick segments, hence "dirty".
xt_dirty = find_axes.get_xticks(
    nbw,
    xax,
    tickMargin=11,
    minTickLen=0,
    maxGap=0,
)
xt_list = xt_dirty.tolist()
xt_list

[[101, 342, 101, 331],
 [102, 342, 102, 341],
 [103, 342, 103, 341],
 [104, 342, 104, 341],
 [105, 342, 105, 341],
 [106, 342, 106, 341],
 [107, 342, 107, 341],
 [108, 342, 108, 341],
 [109, 342, 109, 341],
 [110, 342, 110, 341],
 [111, 342, 111, 341],
 [112, 342, 112, 341],
 [113, 342, 113, 341],
 [114, 342, 114, 341],
 [115, 342, 115, 341],
 [116, 342, 116, 341],
 [117, 347, 117, 341],
 [118, 348, 118, 341],
 [119, 347, 119, 345],
 [119, 343, 119, 341],
 [120, 342, 120, 341],
 [121, 342, 121, 341],
 [122, 342, 122, 341],
 [123, 342, 123, 341],
 [124, 342, 124, 341],
 [125, 342, 125, 341],
 [126, 342, 126, 341],
 [127, 342, 127, 341],
 [128, 342, 128, 341],
 [129, 342, 129, 341],
 [130, 342, 130, 341],
 [131, 342, 131, 341],
 [132, 342, 132, 341],
 [133, 342, 133, 341],
 [134, 342, 134, 341],
 [135, 342, 135, 341],
 [136, 342, 136, 341],
 [137, 342, 137, 341],
 [138, 342, 138, 341],
 [139, 342, 139, 341],
 [140, 342, 140, 341],
 [141, 342, 141, 341],
 [142, 342, 142, 341],
 [143, 342,

In [55]:
# match_xticks returns, for each numeric label, an index into xt_list.
xt_match = find_axes.match_xticks(numbers, xt_list)
xt_pts = []
for tick_index in xt_match:
    xt_pts.append(xt_list[tick_index])
xt_pts

[[118, 348, 118, 341],
 [189, 348, 189, 341],
 [261, 348, 261, 341],
 [332, 348, 332, 341],
 [403, 348, 403, 341]]

In [56]:
%matplotlib notebook
for x1,y1,x2,y2 in xt_pts:
    cv2.line(imlabel,(x1,y1),(x2,y2),(0,255,0),2)

implot = plt.imshow(imlabel)

<IPython.core.display.Javascript object>

In [15]:
xt_pts_array = np.array(xt_pts)
# First column holds each matched tick's x pixel; shape it as a column vector.
xt_pixel_col = xt_pts_array[:,0].reshape(-1,1)

print('X Axis in pixels')
print(xt_pixel_col)

X Axis in pixels
[[118]
 [189]
 [261]
 [332]
 [403]]


In [57]:
# First column of xt_labels is the numeric value read from each tick label.
xt_value_col = xt_labels[:,0].reshape(-1,1)
print('X Axis in Data')
print(xt_value_col)

X Axis in Data
[[ 0. ]
 [ 0.2]
 [ 0.4]
 [ 0.6]
 [ 0.8]]


## Convert x-axis from pixel space to data space with regression

We have a sort of calibration curve between pixel space and data space.

In [58]:
%matplotlib notebook
hcalib = plt.plot(xt_pts_array[:,0].reshape(-1,1), xt_labels[:,0].reshape(-1,1))
plt.xlabel('pixels')
plt.ylabel('x-axis data')

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x11b6b7b90>

In [18]:
from sklearn import datasets, linear_model

# Fit a straight line mapping tick pixel column -> labelled x value.
regrx = linear_model.LinearRegression(fit_intercept=True)

regrx.fit(xt_pts_array[:,0].reshape(-1,1), xt_labels[:,0].reshape(-1,1))

# The coefficients
# print() with str.format works on both Python 2 and 3 and matches the
# print() calls used in the rest of the notebook; the printed text is the
# same as the former print statements produced.
print('Coefficients: \n{}'.format(regrx.coef_))
print('Intercept: \n{}'.format(regrx.intercept_))

Coefficients: 
[[ 0.00280503]]
Intercept: 
[-0.33099148]


## Now Get Y Ticks

In [59]:
# Find where the y-ticks stop and crop the text image to be left of that
left_yax_bw = np.zeros([m,n]).astype('uint8')
left_yax_bw[:yax[0],:]=nbw[:yax[0],:]
col_sums = left_yax_bw.sum(axis=0)
col_sums[yax[0]:] = 1000
col_zeros=np.where(col_sums==0)[0]
y_crop_ind=col_zeros[-1]
print('cropping y-axis region')

cropping y-axis region


In [60]:
%matplotlib notebook
yTextImg = np.ones([m,n,p]).astype('uint8')*255
yTextImg[:,0:y_crop_ind-1,:]=img[:,0:y_crop_ind-1,:]

cv2.imwrite('temp/ytext.png',yTextImg)
implot = plt.imshow(yTextImg)

<IPython.core.display.Javascript object>

## Unfortunately

For OCR to work effectively on the y-axis, we must remove the rotated axis title completely. This is quite challenging.

In [61]:
%matplotlib notebook

yTextRot = ndimage.interpolation.rotate(yTextImg, -90)
cv2.imwrite('temp/ytextrot.png',yTextRot)
implot = plt.imshow(yTextRot)
implot.set_cmap('gray')

<IPython.core.display.Javascript object>

In [64]:
# OCR the rotated strip, again asking for per-character boxes.
rotated_strip = Image.open('temp/ytextrot.png')
yRotOcr = tes.image_to_string(rotated_strip, boxes=True)
print(yRotOcr)

A 109 437 121 450 0
d 122 437 130 450 0
s 132 437 139 447 0
o 141 437 149 447 0
r 151 437 156 447 0
p 157 433 165 447 0
t 166 437 171 449 0
i 172 437 174 450 0
a 172 437 184 450 0
n 186 437 194 447 0
c 200 437 208 447 0
a 209 437 218 447 0
p 220 433 228 447 0
a 229 437 238 447 0
c 239 437 247 447 0
i 249 437 251 450 0
t 249 437 257 450 0
y 257 433 265 446 0
( 272 433 276 450 0
m 278 437 291 447 0
g 292 433 301 447 0
g 307 433 316 447 0
" 317 445 329 454 0
) 332 433 336 450 0
0 163 378 176 386 0
0 163 388 176 396 0
7 163 398 176 407 0
0 249 378 263 386 0
0 249 388 263 397 0
8 249 398 263 406 0
O 336 378 349 386 0
O 336 388 349 396 0
Z 336 398 349 407 0
‘ 334 409 338 411 0
l 336 415 349 420 0


### Find the longest "word" and clear its bounding box

In [65]:
# Merge characters into words (charspace=10), then pick the word with the
# most characters; the first one wins on ties.
yrot_words = ocr_tools.clean_ocr_results(yRotOcr, charspace=10)
word_length = []
for ocr_word in yrot_words:
    word_length.append(len(ocr_word[0]))
longest_word = np.argmax(word_length)
lwbox = yrot_words[longest_word]
lwbox

['Adsorptiancapacity(mgg")', 109, 433, 336, 454]

In [66]:
x1 = lwbox[1]-1
y1 = lwbox[2]-1
x2 = lwbox[3]+1
y2 = lwbox[4]+1
y1 = n-y1
y2 = n-y2
yTextRot[y2:y1+1,x1:x2+1,:] = np.ones([y1-y2+1,x2-x1+1,3]).astype('uint8')*255
yText_Clean = ndimage.interpolation.rotate(yTextRot, 90)

%matplotlib notebook
implot = plt.imshow(yText_Clean)

<IPython.core.display.Javascript object>

### Conventional OCR

In [67]:
# cv2.imwrite signals failure through its return value; check it so the OCR
# below cannot silently read a stale file from an earlier run.
if not cv2.imwrite('temp/ytextclean.png',yText_Clean):
    raise IOError('Could not write temp/ytextclean.png (does the temp/ directory exist?)')
# OCR the title-free strip and merge the characters into words.
yocr = tes.image_to_string(Image.open('temp/ytextclean.png'),boxes=True)
yl_ocr = ocr_tools.clean_ocr_results(yocr)
yl_ocr

[['1200', 44, 334, 86, 349],
 ['800', 58, 249, 86, 263],
 ['400', 57, 163, 86, 176]]

In [68]:
# Work on copies of the OCR rows. The loop below rewrites coordinates and
# appends columns, so operating on the rows of `yl_ocr` directly would flip
# the values back and grow the rows again whenever this cell is re-run.
yt_numbers = [list(w) for w in yl_ocr if is_number(w[0])]
for w in yt_numbers:
    w[0]=float(w[0])
    # The OCR box y values are measured from the bottom of the image;
    # subtracting from the image height m converts them to row indices.
    w[2] = m-w[2]
    w[4] = m-w[4]
    # Box centre (x, y). Floor division keeps whole-pixel values on Python 3
    # as well, matching the integer results of the original Python 2 run.
    w.append((w[1]+w[3])//2)
    w.append((w[2]+w[4])//2)

# Columns: value, x1, y1, x2, y2, centre x, centre y.
yt_label_array = np.array(yt_numbers)
yt_label_array

array([[ 1200.,    44.,    69.,    86.,    54.,    65.,    61.],
       [  800.,    58.,   154.,    86.,   140.,    72.,   147.],
       [  400.,    57.,   240.,    86.,   227.,    71.,   233.]])

In [69]:
%matplotlib notebook

for w in yt_numbers:
    cv2.rectangle(imlabel, (w[1],w[2]), (w[3],w[4]), (255,215,0), thickness=2, lineType=8, shift=0)

implot = plt.imshow(imlabel)

<IPython.core.display.Javascript object>

## Now run the same label/tick matching algorithm for the y-axis

In [70]:
# Candidate tick segments along the y-axis.
yt_dirty = find_axes.get_yticks(
    nbw,
    yax,
    tickMargin=11,
    minTickLen=0,
    maxGap=2,
)
yt_list = yt_dirty.tolist()
# NOTE(review): match_xticks is reused here for the y-axis; confirm that it
# is meant to be axis-agnostic.
yt_match = find_axes.match_xticks(yt_numbers, yt_list)
yt_pts = []
for tick_index in yt_match:
    yt_pts.append(yt_list[tick_index])
yt_pts

[[94, 61, 101, 61], [94, 147, 101, 147], [94, 234, 101, 234]]

In [71]:
%matplotlib notebook
for x1,y1,x2,y2 in yt_pts:
    cv2.line(imlabel,(x1,y1),(x2,y2),(0,255,0),2)

implot = plt.imshow(imlabel)

<IPython.core.display.Javascript object>

In [73]:
yt_pts_array = np.array(yt_pts)
# Second column holds each matched tick's y pixel; shape it as a column vector.
yt_pixel_col = yt_pts_array[:,1].reshape(-1,1)
print('y axis pixels')
print(yt_pixel_col)

y axis pixels
[[ 61]
 [147]
 [234]]


In [74]:
# First column of yt_label_array is the numeric value of each tick label.
yt_value_col = yt_label_array[:,0].reshape(-1,1)
print('y axis data')
print(yt_value_col)

y axis data
[[ 1200.]
 [  800.]
 [  400.]]


## Regress the y tick labels to the y ticks

In [75]:
from sklearn import datasets, linear_model

# Fit a straight line mapping tick pixel row -> labelled y value.
regry = linear_model.LinearRegression(fit_intercept=True)

regry.fit(yt_pts_array[:,1].reshape(-1,1), yt_label_array[:,0].reshape(-1,1))

# The coefficients
# print() with str.format works on both Python 2 and 3 and matches the
# print() calls used in the rest of the notebook; the printed text is the
# same as the former print statements produced.
print('Coefficients: \n{}'.format(regry.coef_))
print('Intercept: \n{}'.format(regry.intercept_))

Coefficients: 
[[-4.62422595]]
Intercept: 
[ 1481.30262396]


## Finally, Extract the Data by Clustering

This is where the problem explodes. This is not a universal approach.

In [76]:
from color_series_scrape import color_series_scrape

# Pixel extents of the plot area, taken from the detected axis segments.
x_pixel_range = [[xax[0], xax[2]]]
y_pixel_range = [[yax[3], yax[1]]]
pix_pts, ROI = color_series_scrape(example, x_pixel_range, y_pixel_range, n_colors=4)

print('Done clustering')

KM: fitting model on a small sub-sample of the data
done fitting
KM: Prediciting color indiced on the full image
done predicting
KM: recreating clustered image
Done clustering


In [77]:
from cluster_data import cluster_colorspace_km, return_series

im_recon, im_label = cluster_colorspace_km(ROI[0], 4)
labels_as_ims = return_series(im_label)

%matplotlib notebook
plt.imshow(im_recon)

KM: fitting model on a small sub-sample of the data
done fitting
KM: Prediciting color indiced on the full image
done predicting
KM: recreating clustered image


<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x11fc84250>

In [35]:
%matplotlib notebook

plt.imshow(labels_as_ims[0])

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x117cb4110>

In [78]:
%matplotlib notebook

plt.imshow(labels_as_ims[1])

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x11ffa0150>

In [80]:
%matplotlib notebook

plt.imshow(labels_as_ims[2])

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x120547490>

In [81]:
%matplotlib notebook
imlabel4 = imlabel.copy()
pp_list = []
for series in pix_pts[0]:
    s_list = []
    for tup in series:
        x = int(round(tup[1]))+xax[0]
        y = int(round(tup[0]))+yax[3]
        s_list.append([x,y])
    pp_list.append(s_list)

pp_list

for series in pp_list:
    for pt in series:
        cv2.circle(imlabel4,(pt[0],pt[1]), 3, (255,240,10), -1)


lastplot = plt.imshow(imlabel4)

<IPython.core.display.Javascript object>

## ... and convert the pixel values to the units of the axes

In [82]:
# Apply the fitted x and y calibration lines to every pixel point. The
# products are (1, 1) arrays, hence the [0,0] to pull out plain scalars.
data_list = []
for series in pp_list:
    converted = [
        [(px*regrx.coef_+regrx.intercept_)[0,0],
         (py*regry.coef_+regry.intercept_)[0,0]]
        for px, py in series
    ]
    # Each series is stored in reverse order of the pixel list.
    data_list.append(converted[::-1])

data_list

[[[0.008417458081876994, 10.798770437029589],
  [0.044882881039868372, 98.659063571969682],
  [0.10378856427970062, 168.02245288902759],
  [0.17671941019568349, 200.39203457032136],
  [0.23843012597074575, 209.64048647926234],
  [0.29733580921057801, 218.88893838820331],
  [0.41514717569024262, 228.13739029714452],
  [0.50210318428237588, 237.3858422060855],
  [0.58905919287450925, 242.0100681605561],
  [0.70406552681894374, 251.25852006949708],
  [0.76858127512923602, 260.50697197843829],
  [0.8302919909042985, 265.13119793290866],
  [0.88919767414413076, 274.37964984184987]],
 [[0.086958369068319996, 232.7616162516149],
  [0.11500869442062106, 302.12500556867303],
  [0.23843012597074575, 500.96672161090578],
  [0.29453077667534799, 570.33011092796369],
  [0.41514717569024262, 639.69350024502171],
  [0.504908216817606, 667.43885597184487],
  [0.58905919287450925, 685.93575978972694],
  [0.70406552681894374, 713.68111551655011],
  [0.76577624259400601, 732.17801933443229],
  [0.8274869

## Broader Problem: How to identify Graphs?

In [40]:
# NOTE(review): absolute path into one user's home directory; this cell only
# runs on that machine. Consider a path relative to the repository.
spat_stats_path = '/Users/Imperssonator/CC/MIC-Pres/SS PCA presentation/Slide1.tiff'
ss_img = cv2.imread(spat_stats_path)
# cv2.imread returns None for a missing or unreadable file instead of
# raising, so fail here with a clear message rather than in the plot cell.
if ss_img is None:
    raise IOError('Could not read image: ' + spat_stats_path)

In [41]:
%matplotlib notebook
plt.imshow(ss_img)

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x117fd8450>