In [1]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string

In [2]:
def cleanText(txt):
    whitespace = string.whitespace
    punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    tableWhitespace = str.maketrans('','',whitespace)
    tablePunctuation = str.maketrans('','',punctuation)
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)

In [3]:
### Load NER model
model_ner = spacy.load('./output/model-best/')

In [4]:
# Load Image
image = cv2.imread('./data/056.jpeg')

# cv2.imshow('businesscard',image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

# extract data using Pytesseract 
tessData = pytesseract.image_to_data(image)
# convert into dataframe
tessList = list(map(lambda x:x.split('\t'), tessData.split('\n')))
df = pd.DataFrame(tessList[1:],columns=tessList[0])
df.dropna(inplace=True) # drop missing values
df['text'] = df['text'].apply(cleanText)

# convet data into content
df_clean = df.query('text != "" ')
content = " ".join([w for w in df_clean['text']])
print(content)
# get prediction from NER model
doc = model_ner(content)

certificate no uc-4228b310-7cde-42de-a5b8-61b67d3ca658 u a e mm certificate url ude.my/uc-4228b310-7cde-42de-a5b8-61b67d3ca658 reference number 0004 certificate of completion data warehouse fundamentals for beginners instructors alan simon priyanshu singh date march 15, 2022 length 5 total hours


In [5]:
from spacy import displacy

In [6]:
displacy.serve(doc,style='ent')
auto_select_port=True




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [7]:
displacy.render(doc,style='ent')

In [8]:
docjson = doc.to_json()
docjson.keys()


dict_keys(['text', 'ents', 'tokens'])

In [9]:
docjson['text']

'certificate no uc-4228b310-7cde-42de-a5b8-61b67d3ca658 u a e mm certificate url ude.my/uc-4228b310-7cde-42de-a5b8-61b67d3ca658 reference number 0004 certificate of completion data warehouse fundamentals for beginners instructors alan simon priyanshu singh date march 15, 2022 length 5 total hours'

In [10]:
docjson['tokens']

[{'id': 0, 'start': 0, 'end': 11},
 {'id': 1, 'start': 12, 'end': 14},
 {'id': 2, 'start': 15, 'end': 26},
 {'id': 3, 'start': 26, 'end': 27},
 {'id': 4, 'start': 27, 'end': 36},
 {'id': 5, 'start': 36, 'end': 37},
 {'id': 6, 'start': 37, 'end': 41},
 {'id': 7, 'start': 41, 'end': 42},
 {'id': 8, 'start': 42, 'end': 54},
 {'id': 9, 'start': 55, 'end': 56},
 {'id': 10, 'start': 57, 'end': 58},
 {'id': 11, 'start': 59, 'end': 60},
 {'id': 12, 'start': 61, 'end': 63},
 {'id': 13, 'start': 64, 'end': 75},
 {'id': 14, 'start': 76, 'end': 79},
 {'id': 15, 'start': 80, 'end': 126},
 {'id': 16, 'start': 127, 'end': 136},
 {'id': 17, 'start': 137, 'end': 143},
 {'id': 18, 'start': 144, 'end': 148},
 {'id': 19, 'start': 149, 'end': 160},
 {'id': 20, 'start': 161, 'end': 163},
 {'id': 21, 'start': 164, 'end': 174},
 {'id': 22, 'start': 175, 'end': 179},
 {'id': 23, 'start': 180, 'end': 189},
 {'id': 24, 'start': 190, 'end': 202},
 {'id': 25, 'start': 203, 'end': 206},
 {'id': 26, 'start': 207, 'e

In [11]:
docjson['ents']

[{'start': 0, 'end': 11, 'label': 'B-DES'},
 {'start': 12, 'end': 14, 'label': 'I-DES'},
 {'start': 15, 'end': 54, 'label': 'I-DES'},
 {'start': 55, 'end': 56, 'label': 'B-COI'},
 {'start': 57, 'end': 58, 'label': 'I-COI'},
 {'start': 59, 'end': 60, 'label': 'I-COI'},
 {'start': 61, 'end': 63, 'label': 'I-COI'},
 {'start': 64, 'end': 75, 'label': 'B-DES'},
 {'start': 76, 'end': 79, 'label': 'I-DES'},
 {'start': 80, 'end': 126, 'label': 'I-DES'},
 {'start': 127, 'end': 136, 'label': 'I-DES'},
 {'start': 137, 'end': 143, 'label': 'I-DES'},
 {'start': 144, 'end': 148, 'label': 'I-DES'},
 {'start': 149, 'end': 160, 'label': 'B-TYP'},
 {'start': 161, 'end': 163, 'label': 'I-TYP'},
 {'start': 164, 'end': 174, 'label': 'I-TYP'},
 {'start': 175, 'end': 179, 'label': 'B-DES'},
 {'start': 180, 'end': 189, 'label': 'I-DES'},
 {'start': 190, 'end': 202, 'label': 'I-DES'},
 {'start': 203, 'end': 206, 'label': 'I-DES'},
 {'start': 207, 'end': 216, 'label': 'I-DES'},
 {'start': 217, 'end': 228, 'labe

In [12]:
doc_text = docjson['text']

In [13]:
datafram_tokens = pd.DataFrame(docjson['tokens'])
datafram_tokens['token'] = datafram_tokens[['start','end']].apply(
        lambda x:doc_text[x[0]:x[1]] , axis = 1)

datafram_tokens.head(10)

  lambda x:doc_text[x[0]:x[1]] , axis = 1)


Unnamed: 0,id,start,end,token
0,0,0,11,certificate
1,1,12,14,no
2,2,15,26,uc-4228b310
3,3,26,27,-
4,4,27,36,7cde-42de
5,5,36,37,-
6,6,37,41,a5b8
7,7,41,42,-
8,8,42,54,61b67d3ca658
9,9,55,56,u


In [14]:
doc_text[49:60]

'ca658 u a e'

In [15]:
right_table = pd.DataFrame(docjson['ents'])[['start','label']]
datafram_tokens = pd.merge(datafram_tokens,right_table,how='left',on='start')

In [16]:
datafram_tokens.fillna('O',inplace=True)
datafram_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,11,certificate,B-DES
1,1,12,14,no,I-DES
2,2,15,26,uc-4228b310,I-DES
3,3,26,27,-,O
4,4,27,36,7cde-42de,O
5,5,36,37,-,O
6,6,37,41,a5b8,O
7,7,41,42,-,O
8,8,42,54,61b67d3ca658,O
9,9,55,56,u,B-COI


In [17]:
#join label to df_clean 'dataframe'
df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() - 1
df_clean['start'] = df_clean[['text','end']].apply(lambda x: x[1] - len(x[0]),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() - 1
  df_clean['start'] = df_clean[['text','end']].apply(lambda x: x[1] - len(x[0]),axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['start'] = df_clean[['text','end']].apply(lambda x: x[1] - len(x[0]),axis=1)


In [18]:
#inner join with start
dataframe_info = pd.merge(df_clean,datafram_tokens[['start','token','label']],how='inner',on='start')

In [19]:
dataframe_info.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,1,1,1,1,2205,221,154,23,96,certificate,11,0,certificate,B-DES
1,5,1,1,1,1,2,2371,227,42,17,92,no,14,12,no,I-DES
2,5,1,1,1,1,3,2428,221,704,23,74,uc-4228b310-7cde-42de-a5b8-61b67d3ca658,54,15,uc-4228b310,I-DES
3,5,1,2,1,1,1,200,267,81,94,86,u,56,55,u,B-COI
4,5,1,2,1,1,2,294,233,96,128,88,a,58,57,a,I-COI


In [20]:
dataframe_info.tail(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
24,5,1,6,1,1,1,210,2005,617,123,92,priyanshu,249,240,priyanshu,B-STU
25,5,1,6,1,1,2,867,2003,362,125,96,singh,255,250,singh,I-STU
26,5,1,7,1,1,1,204,2197,100,35,96,date,260,256,date,B-DAT
27,5,1,7,1,1,2,344,2197,139,35,94,march,266,261,march,I-DAT
28,5,1,7,1,1,3,496,2197,62,43,93,15,270,267,15,I-DAT
29,5,1,7,1,1,4,575,2197,115,35,95,2022,275,271,2022,I-DAT
30,5,1,7,1,2,1,204,2274,150,46,96,length,282,276,length,B-DES
31,5,1,7,1,2,2,392,2274,25,35,49,5,284,283,5,I-DES
32,5,1,7,1,2,3,429,2274,109,35,49,total,290,285,total,I-DES
33,5,1,7,1,2,4,556,2274,127,35,95,hours,296,291,hours,I-DES


## Bounding Box

In [21]:
bb_df = dataframe_info.query("label != 'O' ")
img = image.copy()

for x,y,w,h,label in bb_df[['left','top','width','height','label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
    cv2.putText(img,str(label),(x,y),cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)

cv2.imshow('Predictions',img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [22]:
bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,1,1,1,1,2205,221,154,23,96,certificate,11,0,certificate,DES
1,5,1,1,1,1,2,2371,227,42,17,92,no,14,12,no,DES
2,5,1,1,1,1,3,2428,221,704,23,74,uc-4228b310-7cde-42de-a5b8-61b67d3ca658,54,15,uc-4228b310,DES
3,5,1,2,1,1,1,200,267,81,94,86,u,56,55,u,COI
4,5,1,2,1,1,2,294,233,96,128,88,a,58,57,a,COI


In [23]:
#group the labels
class groupgen():
    def __init__(self):
        self.id = 0
        self.text = ''
        
    def getgroup(self,text):
        if self.text == text:
            return self.id
        else:
            self.id +=1
            self.text = text
            return self.id
grp_gen =  groupgen()               
            
    

In [24]:
bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

In [25]:
#right and bottom of bounding box
bb_df[['left','top','width','height']] = bb_df[['left','top','width','height']].astype(int)
bb_df['right'] = bb_df['left'] + bb_df['width'] 
bb_df['bottom'] = bb_df['top'] + bb_df['height'] 

In [26]:
# tagging: grouby group
col_group = ['left','top','right','bottom','label','token','group']
group_tag_img = bb_df[col_group].groupby(by='group')


In [27]:
img_tagging = group_tag_img.agg({
    'left':min,
    'right':max,
    'top' :min,
    'bottom' :max,
    'label' :np.unique,
    'token' : lambda x:" ".join(x)
})

  img_tagging = group_tag_img.agg({
  img_tagging = group_tag_img.agg({


In [28]:
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2205,3132,221,244,[DES],certificate no uc-4228b310
2,200,595,233,362,[COI],u a e mm
3,2080,3134,281,367,[DES],certificate url ude.my/uc-4228b310-7cde-42de-a...
4,202,1067,646,681,[TYP],certificate of completion
5,204,3046,771,1300,[DES],data warehouse fundamentals for beginners inst...
6,471,736,1265,1300,[NCER],alan simon
7,210,1229,2003,2128,[STU],priyanshu singh
8,204,690,2197,2240,[DAT],date march 15 2022
9,204,683,2274,2320,[DES],length 5 total hours


In [29]:
img_bb = image.copy()
for l, r, t, b, label, token in img_tagging.values:
    cv2.rectangle(img_bb, (l, t), (r, b), (0, 255, 0), 2)

    # Convert label to string
    label = str(label)

    cv2.putText(img_bb, label, (l, t), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 255),2)

cv2.imshow('Bounding Box BusinessCard', img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()


## Parser

In [35]:
import re
def parser (text,label):
    if label == 'DAT':
        text = text.lower()
        allow_special_char = '_./\-'
        text = re.sub(r'^0-9{}\D ]'.format(allow_special_char),'',text)
    elif label == 'NCER':
        text = text.lower()
        allow_special_char = '_.\-'
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_char),'',text)
    elif label in ('STU', 'TYP', 'DES','COI'):
        text = text.lower()
        text = re.sub(r'[^a-z]','',text)
        text = text.title()
    return text
    

In [36]:
parser('Srikanth 1896196','STU')


'Srikanth'

In [37]:
parser('23/02/2023','DAT')

'23/02/2023'

In [38]:
info_array = dataframe_info[['token','label']].values
entities = dict(STU=[],DES=[],COI=[],DAT=[],NCER=[],TYP=[])
previous = 'O'

for token,label in info_array:
    #print(token,label)
    bio_tag = label[:1]
    label_tag = label[2:]
    
    #step 1 parse the token
    text = parser(token,label_tag)
    if bio_tag in ('B','I'):
        if previous != label_tag:
            entities[label_tag].append(text)
        else:
            if bio_tag == "B":
                entities[label_tag].append(text)
            else:
                if label_tag in ("STU",'COI','DES'):
                    entities[label_tag][-1] = entities[label_tag][-1] + " " + text
                else:
                    entities[label_tag][-1] = entities[label_tag][-1] + text
    previous = label_tag
                    

In [39]:
entities

{'STU': ['Priyanshu Singh'],
 'DES': ['Certificate No Ucb',
  'Certificate Url Udemyucbcdedeabbdca Reference Number ',
  'Data Warehouse Fundamentals For Beginners Instructors',
  'Length  Total Hours'],
 'COI': ['U A E Mm'],
 'DAT': ['datemarch152022'],
 'NCER': ['alansimon'],
 'TYP': ['CertificateOfCompletion']}