# Preprocessing terms

1. Scaling to the right size
2. Increase contrast
3. Binarize image
4. remove noise and scanning artefacts (black border)
5. Deskew
6. Remove border

In [1]:
import re
import cv2
import glob
import imutils
import tempfile
import argparse
import dateparser
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract as pt
from imutils import contours
from datefinder import find_dates
from dateutil.parser import parse
from matplotlib import pyplot as plt
from dateparser.search import search_dates
from imutils.perspective import four_point_transform
pt.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'

In [None]:
image = cv2.imread("images/4.jpeg")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
edged = cv2.Canny(blurred, 75, 200)

In [None]:

# find contours in the edge map, then initialize
# the contour that corresponds to the document
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
docCnt = None
 
# ensure that at least one contour was found
if len(cnts) > 0:
 # sort the contours according to their size in
# descending order
    cnts = sorted(cnts, key=cv2.contourArea, reverse=True)
 
    # loop over the sorted contours
for c in cnts:
    # approximate the contour
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)
 
    # if our approximated contour has four points,
    # then we can assume we have found the paper
    if len(approx) == 4:
        docCnt = approx
        break

In [None]:
# apply a four point perspective transform to both the
# original image and grayscale image to obtain a top-down
# birds eye view of the paper
paper = four_point_transform(image, docCnt.reshape(4, 2))
warped = four_point_transform(gray, docCnt.reshape(4, 2))

In [None]:
thresh = cv2.threshold(warped, 0, 255,cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

In [None]:
Image.fromarray(thresh)

In [None]:
pattern1 = r"(\d{1,4}([.'’\-/])\d{1,2}([.'’\-/])\d{1,4})"
pattern2 = r"(\d{1,4}([.'’\-/\s])[ADFJMNOSadfjmnos]\w*([.'’\-/\s]*)\d{1,4})"
pattern3 = r"([ADFJMNOSadfjmnos]\w*\s\d{1,4}([,'’.\-/\s]*)([.'’\-/\s])\d{1,4})"
pattern4 = r"[ADFJMNOSadfjmnos]\w*\d{1,4}(['’]*)\d{1,4}"
pattern5 = r"(\d{1,4}([.'’\-/\s])\d{1,4}"

In [2]:
def spread(arg):
    ret = []
    for i in arg:
        if isinstance(i, list):
            ret.extend(i)
        else:
            ret.append(i)
    return ret

def deep_flatten(lst):
    result = []
    result.extend(spread(list(map(lambda x: deep_flatten(x) if type(x) == list else x, lst))))
    return result

In [19]:
def finding_date(img_str):
    imstr = img_str
    pattern1 = r"(\d{1,4}([.'’\-/])\d{1,2}([.'’\-/])\d{1,4})"
    pattern2 = r"(\d{1,4}([.'’\-/\s])[ADFJMNOSadfjmnos]\w*([.'’\-/\s]*)\d{1,4})"
    pattern3 = r"([ADFJMNOSadfjmnos]\w*\s\d{1,4}([,'’.\-/\s]*)([.'’\-/\s])\d{1,4})"
    pattern4 = r"[ADFJMNOSadfjmnos]\w*\d{1,4}(['’]*)\d{1,4}"
    pattern5 = r"(\d{1,4}([.'’\-/\s])\d{1,4}"
    
    date_find = []

    for i in imstr:
        for j in (re.search(regex,i) for regex in [pattern1, pattern2, pattern3, pattern4]):
            if j:
                s = j.group()
                if s.find('.'):
                    date_find.append("-".join(s.split('.')))
                else:
                    date_find.append(s)

    date_find = list(set(date_find))

    dates = []
    for i in date_find:
        try:
            #dates.append(search_dates(i)[0][1].strftime("%Y-%m-%d"))
            dates.append(parse(i).strftime("%Y-%m-%d"))   
        except:
            continue
    
    dates = set(dates)
    
    new_dates = []
    for i in dates:
        if 2010 < int(i[:4]) < 2020:
            new_dates.append(i)
    print(new_dates)
    
    if new_dates:
        return max(new_dates)
    return None

# Instead of feeding single input image with pre-process.
# Now, I'm going to feed more than 4 same image with differently pre-processed.Also called Stacking technique

In [23]:
def globalpreprocess(path):
    
    img_str_1 = pt.image_to_string(path, lang='eng')
    
    gaussian_filter = (5,5)
    img = cv2.imread(path) 
    img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC) 
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_blur = cv2.GaussianBlur(img_gray, gaussian_filter, 0)
    img_str_2 = pt.image_to_string(Image.fromarray(img_blur),lang ="eng")
    
    kernel = np.ones((1, 1), np.uint8)
    img_dilate = cv2.dilate(img_gray, kernel, iterations=1)
    img_erode = cv2.erode(img_dilate, kernel, iterations=1)
    img_gauss = cv2.GaussianBlur(img_erode, gaussian_filter, 0)
    #img_ad_thres = cv2.adaptiveThreshold(img_gauss, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
    #img_str_3 = pt.image_to_string(Image.fromarray(img_ad_thres),lang='eng')

    ret,th1 = cv2.threshold(img_gauss, 100, 225, cv2.THRESH_BINARY)
    img_str_4 = pt.image_to_string(Image.fromarray(th1), lang = 'eng')

    return [img_str_1,img_str_2,img_str_4]

In [None]:
path = 'images/1.jpeg'

In [None]:
ll = globalpreprocess(path)

z = [list(set(map(lambda x: x.strip(),img.split('\n')))) for img in ll]
newstr = set(deep_flatten(z))

date = finding_date(newstr)

In [None]:
print(date)

In [None]:
# 1. Without any filter
path = "images/68.jpeg"
print(pt.image_to_string(path, lang='eng'))

In [None]:
# 2. with filter
# image < resize < grayscale < gaussian 
gaussian_filter = (5,5)
img = cv2.imread(path) 
image = cv2.resize(img, None, fx = 2, fy = 2, interpolation=cv2.INTER_CUBIC) 
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, gaussian_filter, 0)

print(pt.image_to_string(Image.fromarray(blurred)))

In [None]:
# 3. with filter
# image < resize < grayscale < gaussian < Binarization(auto)
gaussian_filter = (5,5)
img = cv2.imread(path) 
img = cv2.resize(img, None, fx = 2, fy = 2, interpolation=cv2.INTER_CUBIC) 
img_grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

kernel = np.ones((1, 1), np.uint8)
img_dilate = cv2.dilate(img_grayscale, kernel, iterations=1)
img_erode = cv2.erode(img_dilate, kernel, iterations=1)
img_gauss = cv2.GaussianBlur(img_erode, gaussian_filter, 0)

# Thresholding
img_thres = cv2.adaptiveThreshold(img_gauss, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)

print(pt.image_to_string(Image.fromarray(img_thres),lang='eng'))

In [None]:
# 4. with filters
# image < resize < grayscale < gaussian < Binarization(threshold)
ret,th1 = cv2.threshold(img_gauss,100,225,cv2.THRESH_BINARY)
ret2,th2 = cv2.threshold(img_gauss,100,220,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
print(pt.image_to_string(Image.fromarray(th1), lang = 'eng'))

# adaptive threshold doesnt works
# also OTSU doesnt works

In [None]:
#Image.fromarray(ret2)


In [None]:
def rectify(h):
    h = h.reshape((4,2))
    hnew = np.zeros((4,2),dtype = np.float32)

    add = h.sum(1)
    hnew[0] = h[np.argmin(add)]
    hnew[2] = h[np.argmax(add)]

    diff = np.diff(h,axis = 1)
    hnew[1] = h[np.argmin(diff)]
    hnew[3] = h[np.argmax(diff)]

    return hnew


In [None]:
img = cv2.imread(path) 
image = cv2.resize(img, None, fx = 2, fy = 2, interpolation=cv2.INTER_CUBIC) 
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, gaussian_filter, 0)

# creating copy of original image
orig = image.copy()

# convert to grayscale and blur to smooth
#blurred = cv2.medianBlur(gray, 5)

# apply Canny Edge Detection
edged = cv2.Canny(blurred, 0, 50)
orig_edged = edged.copy()

# find the contours in the edged image, keeping only the
# largest ones, and initialize the screen contour
(contours, _) = cv2.findContours(img_thres2, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
contours = sorted(contours, key=cv2.contourArea, reverse=True)

#x,y,w,h = cv2.boundingRect(contours[0])
#cv2.rectangle(image,(x,y),(x+w,y+h),(0,0,255),0)

# get approximate contour
for c in contours:
    p = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * p, True)

    if len(approx) == 4:
        target = approx
        break


# mapping target points to 800x800 quadrilateral
approx = rectify(target)
pts2 = np.float32([[0,0],[800,0],[800,800],[0,800]])

M = cv2.getPerspectiveTransform(approx,pts2)
dst = cv2.warpPerspective(orig,M,(800,800))
dst = cv2.drawContours(image, [target], -1, (0, 255, 0), 2)
dst = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY)

In [None]:
Image.fromarray(dst)

In [None]:
print(pt.image_to_string(Image.fromarray(dst)))


In [None]:
img_str = pt.image_to_string('images/38.jpeg', lang='eng')
print(img_str)

In [5]:
raw = pd.read_csv('image_data.csv')

In [6]:
raw.head()

Unnamed: 0.1,Unnamed: 0,actual,converted,predicted
0,1,"May 21, 2019",2019-05-21,
1,2,25-07-19,2019-07-25,2019-07-25
2,3,"Sep 29, 2018",2018-09-29,
3,4,Oct06' 16,2016-10-06,
4,5,5/29/2019,2019-05-29,2019-05-29


In [7]:
raw['predicted_2'] = None

In [8]:
z = raw.columns.tolist()
z[0] = 'images'
raw.columns = z
raw.columns

Index(['images', 'actual', 'converted', 'predicted', 'predicted_2'], dtype='object')

In [None]:
raw.loc[0,'predicted_2']

In [24]:
for i in raw.index:
    path = "images/" + str(i+1) + ".jpeg"
    print(i+1,end=' : ')
    ll = globalpreprocess(path)
    z = [list(set(map(lambda x: x.strip(),img.split('\n')))) for img in ll]
    newstr = set(deep_flatten(z))
    date = finding_date(newstr)
    raw.loc[i,'predicted_2'] = date

1 : []
2 : ['2019-07-25']
3 : ['2018-09-29']
4 : []
5 : ['2019-05-29']
6 : ['2019-07-03']
7 : []
8 : []
9 : []
10 : ['2019-05-22']
11 : ['2019-07-17']
12 : ['2019-07-02']
13 : []
14 : ['2019-07-22']
15 : []
16 : []
17 : ['2019-05-24', '2019-08-24']
18 : ['2016-04-15']
19 : ['2019-02-05']
20 : ['2019-06-20']
21 : ['2019-05-29']
22 : []
23 : ['2019-05-02']
24 : ['2019-06-18']
25 : ['2019-06-28']
26 : ['2019-06-03']
27 : ['2019-06-20']
28 : ['2019-04-13', '2019-07-12']
29 : ['2019-06-07']
30 : ['2019-06-28']
31 : ['2019-04-30']
32 : []
33 : []
34 : ['2019-06-08']
35 : ['2019-07-14']
36 : ['2018-06-30']
37 : ['2019-06-27']
38 : []
39 : ['2016-11-24']
40 : []
41 : ['2019-06-19']
42 : []
43 : ['2019-09-19', '2019-05-19']
44 : ['2019-08-07']
45 : ['2019-05-18']
46 : ['2014-05-22']
47 : []
48 : ['2015-08-22']
49 : ['2019-08-09']
50 : ['2019-06-27']
51 : ['2019-07-20']
52 : ['2019-06-02', '2019-05-22', '2019-03-02']
53 : ['2019-07-19']
54 : ['2019-06-01']
55 : ['2019-07-09']
56 : ['2013-07-06']

In [89]:
(raw.converted[raw.converted.notnull()] == raw.predicted_2[z]).value_counts()

True     52
False    34
dtype: int64

In [92]:
(raw.converted == raw.predicted_2).value_counts()
(52/86)*100

60.46511627906976

# 3rd attempt accuracy is 60%

In [61]:
raw.predicted_2[1]

'2019-07-25'

In [69]:
raw.predicted_2[9:]

9     2019-05-22
10    2019-07-17
11    2019-07-02
12          None
13    2019-07-22
         ...    
95    2019-06-15
96          None
97    2019-07-03
98          None
99    2019-07-18
Name: predicted_2, Length: 91, dtype: object

In [70]:
c = 0
for i,j in enumerate(raw.converted[raw.converted.notnull()]):
    print(i,j , raw.predicted_2[i])
    if j == raw.predicted_2[i]:
        c+=1
print(c)

0 2019-05-21 None
1 2019-07-25 2019-07-25
2 2018-09-29 2018-09-29
3 2016-10-06 None
4 2019-05-29 2019-05-29
5 2019-07-03 2019-07-03
6 2019-05-23 None
7 2019-07-01 None
8 2019-05-22 None
9 2019-07-17 2019-05-22
10 2019-07-02 2019-07-17
11 2015-02-08 2019-07-02
12 2019-07-22 None
13 2019-07-22 2019-07-22
14 2019-06-07 None
15 2019-05-24 None
16 2016-04-15 2019-08-24
17 2019-07-20 2016-04-15
18 2019-05-20 2019-02-05
19 2019-05-29 2019-06-20
20 2019-11-22 2019-05-29
21 2019-05-07 None
22 2019-06-18 2019-05-02
23 2019-06-28 2019-06-18
24 2019-06-03 2019-06-28
25 2019-06-20 2019-06-03
26 2019-04-10 2019-06-20
27 2019-06-07 2019-07-12
28 2019-06-28 2019-06-07
29 2019-04-30 2019-06-28
30 2019-07-25 2019-04-30
31 2017-03-10 None
32 2019-06-08 None
33 2019-07-14 2019-06-08
34 2018-06-30 2019-07-14
35 2019-06-27 2018-06-30
36 2019-06-25 2019-06-27
37 2016-11-24 None
38 2019-06-19 2016-11-24
39 2019-06-20 None
40 2019-05-19 2019-06-19
41 2019-05-18 None
42 2014-05-22 2019-09-19
43 2017-10-01 2019-

In [53]:
if raw.predicted_2[6] == (raw.converted[6]):
    print('asd')
    
#if raw.loc[6,'converted'] is None:
#    raw.loc[6,'converted'] = 0


Unnamed: 0,images,actual,converted,predicted,predicted_2
0,1,"May 21, 2019",2019-05-21,,
1,2,25-07-19,2019-07-25,2019-07-25,2019-07-25
2,3,"Sep 29, 2018",2018-09-29,,2018-09-29
3,4,Oct06' 16,2016-10-06,,
4,5,5/29/2019,2019-05-29,2019-05-29,2019-05-29
5,6,03-Jul-19,2019-07-03,2019-07-03,2019-07-03
6,7,,,,
7,8,23-05-19,2019-05-23,,
8,9,07-01-19,2019-07-01,,
9,10,22-05-19,2019-05-22,,2019-05-22


In [21]:
new_dates = []
for i in raw.converted[raw.converted.notnull()]:
    if 2010 < int(i[:4]) < 2021:
        new_dates.append(i)
print(len(new_dates))    

86


In [None]:
for i in raw.index:
    if raw.loc[i,'converted'] == raw.loc[i,'predicted_2'] and :
        

In [None]:
raw.head(20)