In [1]:
# Root directories of the two transcript sets. Both end with '/' because
# get_file_path builds its glob pattern by plain string concatenation.
# OCR NAS output: passed as the directory under evaluation.
TXT_OCR_NAS_DIR = 'data/boxes/TXT_OCR_NAS/'
# Crowdsourced transcripts: passed as the ground-truth directory.
TXT_CROWDSOURCE_DIR = 'data/boxes/TXT_Crowdsource/'

# bugs solved:
1. In some cases the case-sensitive comparison gave a higher similarity than the case-insensitive one. The problem was solved by setting autojunk=False, although I still have no idea why calling lower() triggers difflib's autojunk heuristic while the original text doesn't. http://stackoverflow.com/questions/12142339/unexplaned-behavior-with-difflib-sequencematcher-get-matching-blocks

# potential bugs:
1. The character \xc2\x95 is found in 'Box8-MINIDOKA to TULE LAKE-0269.txt' and 'Box8-MINIDOKA to TULE LAKE-0272', in both the crowdsource and the OCR NAS data. I am not sure how often this happens. We could simply remove these characters, but the problem might be related to encoding, so I am leaving them in place for now (http://stackoverflow.com/questions/6609895/efficiently-replace-bad-characters)

In [2]:
import glob
import pandas as pd
import difflib
import re

In [3]:
def get_file_path(gt_dir, ocr_dir):
    """Collect the ``.txt`` files located one directory level below each root.

    Both arguments are used as plain string prefixes of the glob pattern
    ``<dir>*/*.txt``, so each is expected to end with a path separator.

    Returns:
        A ``(gt_paths, ocr_paths)`` tuple of lists of matching paths.
    """
    pattern_suffix = '*/*.txt'
    gt_matches = glob.glob(gt_dir + pattern_suffix)
    ocr_matches = glob.glob(ocr_dir + pattern_suffix)
    return gt_matches, ocr_matches

def get_file_names(gt_paths, ocr_paths):
    """Extract the bare file name of every path and validate both collections.

    The file name is the text after the last path separator. Both the
    forward slash and the backslash are treated as separators, so paths
    produced by ``glob`` on POSIX and on Windows are handled alike.

    Args:
        gt_paths: paths of the ground-truth files.
        ocr_paths: paths of the OCR files under evaluation.

    Returns:
        A ``(gt_fns, ocr_fns)`` tuple of lists, in the same order as the inputs.

    Raises:
        AssertionError: if either list contains a duplicated file name, or if
            an OCR file name has no ground-truth counterpart.
    """
    def _basename(path):
        # Splitting only on the backslash left the full path untouched on
        # POSIX, which made every later name comparison meaningless.
        return re.split(r'[\\/]', path)[-1]

    gt_fns = [_basename(path) for path in gt_paths]
    ocr_fns = [_basename(path) for path in ocr_paths]
    assert len(set(ocr_fns)) == len(ocr_fns), 'you get duplicate file names in ocr files'
    assert len(set(gt_fns)) == len(gt_fns), 'you get duplicate file names in gt files'
    assert set(ocr_fns).issubset(set(gt_fns)), 'ocr file names to be tested are not a subset of ground truth dataset'
    return gt_fns, ocr_fns


def pair_paths(gt_paths, ocr_paths, gt_fns, ocr_fns):
    """Match every OCR file with the ground-truth file of the same name.

    Two tables (path, file name) are built and joined on the file name, so
    only names present on both sides survive.

    Returns:
        A DataFrame with the columns ``fn``, ``gt_path`` and ``ocr_path``.
    """
    gt_table = pd.DataFrame(list(zip(gt_paths, gt_fns)), columns=['gt_path', 'fn'])
    ocr_table = pd.DataFrame(list(zip(ocr_paths, ocr_fns)), columns=['ocr_path', 'fn'])
    # 'fn' is the only column the two tables share, i.e. the join key.
    merged = pd.merge(ocr_table, gt_table, on='fn', how='inner')
    return merged[['fn', 'gt_path', 'ocr_path']]


In [16]:
# Quick check of the character-level score on two short strings; the
# comparison is case-insensitive because case_sensitive keeps its default.
# NOTE: char_similarity is defined in a cell further down, which has to be run first.
char_similarity('abcdefghijk ','abdejk')

0.6666666666666666

In [4]:
def get_diff(seq1, seq2):
    """Return the ``difflib.Differ`` delta between two sequences.

    The value is whatever ``Differ.compare`` returns: an iterable of delta
    entries, each prefixed with a two-character change code.
    """
    return difflib.Differ().compare(seq1, seq2)

def get_diff_str(seq1, seq2):
    """Return the ``get_diff`` delta entries joined by single spaces."""
    delta_entries = get_diff(seq1, seq2)
    return ' '.join(delta_entries)
    
def cal_similarity(seq1, seq2):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of two sequences.

    ``autojunk`` is switched off so that difflib's automatic junk heuristic
    never influences the score.
    """
    matcher = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
    return matcher.ratio()

def char_similarity(seq1, seq2, case_sensitive=False):
    """Return the character-level similarity of two strings.

    Args:
        seq1, seq2: the strings to compare; both must be exactly of type ``str``.
        case_sensitive: when false (the default) both strings are lower-cased
            before they are compared.

    Raises:
        AssertionError: if either argument is not a ``str``.
    """
    assert type(seq1) is str, 'seq1 is not a string'
    assert type(seq2) is str, 'seq2 is not a string'
    if case_sensitive:
        return cal_similarity(seq1, seq2)
    return cal_similarity(seq1.lower(), seq2.lower())

def wordize(s):
    """Split ``s`` into tokens at every single space or newline character.

    Adjacent separators are not merged, so they produce empty-string tokens.
    """
    separator_pattern = '[ \n]'
    return re.split(separator_pattern, s)


def word_similarity(seq1, seq2, case_sensitive=False):
    """Return the word-level similarity of two strings.

    Both strings are tokenised with ``wordize`` and the token lists are
    compared with ``cal_similarity``.

    Args:
        seq1, seq2: the strings to compare; both must be exactly of type ``str``.
        case_sensitive: when false (the default) both strings are lower-cased
            before they are tokenised.

    Raises:
        AssertionError: if either argument is not a ``str``.
    """
    assert type(seq1) is str, 'seq1 is not a string'
    assert type(seq2) is str, 'seq2 is not a string'
    if case_sensitive:
        left, right = seq1, seq2
    else:
        left, right = seq1.lower(), seq2.lower()
    return cal_similarity(wordize(left), wordize(right))

In [5]:
# Number of OCR files whose leading page-marker line has been removed so far.
CNT_OCR_CLEAN = 0
def ocr_nas_remove_page(ocr_lines):
    """Drop the leading ``----Page 1----`` marker line of an OCR file, if any.

    Args:
        ocr_lines: the lines of one OCR file.

    Returns:
        ``ocr_lines`` without its first line when that line contains the
        marker, otherwise ``ocr_lines`` unchanged. An empty list is returned
        as is.

    Side effects:
        Increments the module-level ``CNT_OCR_CLEAN`` for every removed marker.
    """
    global CNT_OCR_CLEAN
    # An empty file has no first line to inspect; guard against IndexError.
    if ocr_lines and '----Page 1----' in ocr_lines[0]:
        CNT_OCR_CLEAN += 1
        return ocr_lines[1:]
    return ocr_lines
def replace_consecutive_space(s):
    """Collapse every run of one or more spaces in ``s`` into a single space."""
    return re.sub(' +', ' ', s)

def refine_lines(lines):
    """Normalise the lines of a file and join them into one string.

    Each line is stripped of surrounding whitespace; lines that end up empty
    are dropped; runs of spaces inside the remaining lines are collapsed with
    ``replace_consecutive_space``. The survivors are joined with newlines.
    """
    stripped_lines = (raw_line.strip() for raw_line in lines)
    kept_lines = [replace_consecutive_space(text) for text in stripped_lines if text]
    return '\n'.join(kept_lines)
        

def get_str(gt_path, ocr_path):
    """Read one ground-truth/OCR file pair and return both as refined strings.

    The OCR lines first go through ``ocr_nas_remove_page``; both line lists
    are then normalised with ``refine_lines``.

    Args:
        gt_path: path of the ground-truth text file.
        ocr_path: path of the OCR text file.

    Returns:
        A ``(gt_str, ocr_str)`` tuple.
    """
    # The context managers close both files even if reading fails; the
    # previous bare open(...).readlines() calls never closed them.
    with open(gt_path) as gt_file:
        gt_lines = gt_file.readlines()
    with open(ocr_path) as ocr_file:
        ocr_lines = ocr_file.readlines()
    ocr_lines = ocr_nas_remove_page(ocr_lines)
    gt_str = refine_lines(gt_lines)
    ocr_str = refine_lines(ocr_lines)
    return gt_str, ocr_str

def get_similarity(gt_str, ocr_str):
    """Compute the four similarity scores of a ground-truth/OCR string pair.

    Returns:
        A dict with word-level and character-level scores; the keys starting
        with ``~`` hold the case-insensitive variants.
    """
    scores = {}
    scores['~case_word'] = word_similarity(gt_str, ocr_str)
    scores['case_word'] = word_similarity(gt_str, ocr_str, case_sensitive=True)
    scores['~case_char'] = char_similarity(gt_str, ocr_str)
    scores['case_char'] = char_similarity(gt_str, ocr_str, case_sensitive=True)
    return scores

def evaluate_quality(gt_dir, ocr_dir):
    """Score every OCR file under ``ocr_dir`` against its ground-truth file.

    Files are discovered with ``get_file_path``, matched by file name with
    ``get_file_names``/``pair_paths`` and scored with ``get_similarity``.

    Returns:
        A DataFrame with one row per file pair: the four similarity scores
        plus the file name in the ``fn`` column.
    """
    gt_paths, ocr_paths = get_file_path(gt_dir, ocr_dir)
    gt_fns, ocr_fns = get_file_names(gt_paths, ocr_paths)
    pairs = pair_paths(gt_paths, ocr_paths, gt_fns, ocr_fns)

    def _score_pair(fn, gt_path, ocr_path):
        # One result row: the scores of this pair tagged with its file name.
        gt_str, ocr_str = get_str(gt_path, ocr_path)
        row = get_similarity(gt_str, ocr_str)
        row['fn'] = fn
        return row

    rows = [_score_pair(*record) for record in pairs.itertuples(index=False)]
    return pd.DataFrame(rows)


# Score every OCR NAS transcript against its crowdsourced counterpart, which
# is passed as the ground-truth directory.
similarities = evaluate_quality(TXT_CROWDSOURCE_DIR, TXT_OCR_NAS_DIR)
# Number of OCR files from which ocr_nas_remove_page removed a leading page-marker line.
print CNT_OCR_CLEAN

248


In [14]:
%matplotlib inline
def print_statistics(similarities, col):
    """Print the max, the mean and a bucketed count of one score column.

    Args:
        similarities: DataFrame holding the similarity scores.
        col: name of the score column to summarise.

    The buckets are the half-open intervals ``(lower, upper]`` built from the
    two threshold lists below; scores of 0.7 or less fall into no bucket.
    The single-argument ``print(...)`` form is used so the function runs on
    both Python 2 and Python 3.
    """
    array = similarities[col]
    print('{} max = {} mean = {}'.format(col, array.max(), array.mean()))
    ratio_lower = [0.7, 0.8, 0.9, 0.95, 0.98, 0.99]
    ratio_upper = [0.8, 0.9, 0.95, 0.98, 0.99, 1.1]
    for l, u in zip(ratio_lower, ratio_upper):
        # Summing the boolean mask counts the hits and gives 0 for an empty
        # bucket, where value_counts()[True] raised KeyError.
        in_bucket = ((array > l) & (array <= u)).sum()
        print('({:4},{:4}] : {} '.format(l, u, in_bucket))
    print('\n')
# Report the statistics of every score column, in the same order as before.
for score_col in ('case_char', '~case_char', 'case_word', '~case_word'):
    print_statistics(similarities, score_col)

case_char max = 1.0 mean = 0.920727261387
( 0.7, 0.8] : 1 
( 0.8, 0.9] : 21 
( 0.9,0.95] : 64 
(0.95,0.98] : 93 
(0.98,0.99] : 29 
(0.99, 1.1] : 29 


~case_char max = 1.0 mean = 0.921705891622
( 0.7, 0.8] : 1 
( 0.8, 0.9] : 21 
( 0.9,0.95] : 62 
(0.95,0.98] : 93 
(0.98,0.99] : 31 
(0.99, 1.1] : 29 


case_word max = 1.0 mean = 0.788253087506
( 0.7, 0.8] : 59 
( 0.8, 0.9] : 81 
( 0.9,0.95] : 42 
(0.95,0.98] : 16 
(0.98,0.99] : 1 
(0.99, 1.1] : 6 


~case_word max = 1.0 mean = 0.789390955376
( 0.7, 0.8] : 59 
( 0.8, 0.9] : 79 
( 0.9,0.95] : 44 
(0.95,0.98] : 16 
(0.98,0.99] : 1 
(0.99, 1.1] : 6 




In [7]:
# Count the files whose case-sensitive score is strictly higher than the
# case-insensitive one; value_counts() tallies the True/False results of
# each comparison (character level first, then word level).
# print (similarities['case_char']<similarities['~case_char']).value_counts()
print (similarities['case_char']>similarities['~case_char']).value_counts()
# print (similarities['case_word']<similarities['~case_word']).value_counts()
print (similarities['case_word']>similarities['~case_word']).value_counts()
# NOTE(review): the commented-out line below uses 'case_ch'/'~case_ch', which
# are not among the keys produced by get_similarity.
# similarities[(similarities['case_ch']>similarities['~case_ch'])]

False    243
True       5
dtype: int64
False    248
dtype: int64


# single str test

In [331]:
# Tokenisation check: split a sample string at every single space or newline.
re.split('[ \n]', '11-4-43 a-7-p389 riot\nimai asaichi and mrs. kagawa:')

['11-4-43', 'a-7-p389', 'riot', 'imai', 'asaichi', 'and', 'mrs.', 'kagawa:']

In [332]:
def count_diff(diff):
    """Print how many entries of a delta are marked as unchanged.

    Args:
        diff: an iterable of delta entries (strings), e.g. the result of
            ``get_diff``. It is consumed once.

    An entry counts as unchanged when it starts with a space. The result is
    printed as ``<same> / <total>``; nothing is returned. The single-argument
    ``print(...)`` form prints the same text on Python 2 and Python 3.
    """
    entries = list(diff)  # materialise: the delta may be a one-shot iterator
    same = sum(1 for entry in entries if entry.startswith(' '))
    print('{} / {}'.format(same, len(entries)))

def wordize(s):
    """Split ``s`` into tokens at every single space or newline character.

    This re-defines the ``wordize`` function of an earlier cell with the same
    behaviour. Adjacent separators are not merged, so they produce
    empty-string tokens.
    """
    separator_pattern = '[ \n]'
    return re.split(separator_pattern, s)


def word_similarity(seq1, seq2, case_sensitive=False):
    """Return the word-level similarity of two strings.

    This re-defines the ``word_similarity`` function of an earlier cell with
    the same behaviour: both strings are tokenised with ``wordize`` and the
    token lists are compared with ``cal_similarity``.

    Args:
        seq1, seq2: the strings to compare; both must be exactly of type ``str``.
        case_sensitive: when false (the default) both strings are lower-cased
            before they are tokenised.

    Raises:
        AssertionError: if either argument is not a ``str``.
    """
    assert type(seq1) is str, 'seq1 is not a string'
    assert type(seq2) is str, 'seq2 is not a string'
    if case_sensitive:
        left, right = seq1, seq2
    else:
        left, right = seq1.lower(), seq2.lower()
    return cal_similarity(wordize(left), wordize(right))
def get_similarity(gt_str, ocr_str):
    """Compute the four similarity scores of a ground-truth/OCR string pair.

    This re-defines the ``get_similarity`` function of an earlier cell with
    the same behaviour.

    Returns:
        A dict with word-level and character-level scores; the keys starting
        with ``~`` hold the case-insensitive variants.
    """
    scores = {}
    scores['~case_word'] = word_similarity(gt_str, ocr_str)
    scores['case_word'] = word_similarity(gt_str, ocr_str, case_sensitive=True)
    scores['~case_char'] = char_similarity(gt_str, ocr_str)
    scores['case_char'] = char_similarity(gt_str, ocr_str, case_sensitive=True)
    return scores

def evaluate_quality_one(gt_dir, ocr_dir, fn):
    """Print one ground-truth/OCR file pair and return its similarity scores.

    Args:
        gt_dir: ground-truth root directory; ``fn`` is appended to it as is.
        ocr_dir: OCR root directory; ``fn`` is appended to it as is.
        fn: file path relative to both roots.

    Prints the refined ground-truth and OCR strings, an empty line, their
    lower-cased versions and finally the four string lengths. The
    single-argument ``print(...)`` form prints the same text on Python 2 and
    Python 3.

    Returns:
        The dict produced by ``get_similarity`` for the pair.
    """
    gt_str, ocr_str = get_str(gt_dir + fn, ocr_dir + fn)
    gt_lower, ocr_lower = gt_str.lower(), ocr_str.lower()
    print(gt_str)
    print(ocr_str)
    print('')
    print(gt_lower)
    print(ocr_lower)
    # Lengths before and after lower-casing, to spot any change in size.
    print('{} {} {} {}'.format(len(gt_str), len(ocr_str), len(gt_lower), len(ocr_lower)))
    return get_similarity(gt_str, ocr_str)

# Select one transcript of the 'Box8-TULE LAKE' box by its four-digit number,
# then print both versions of it together with their similarity scores.
box_dir = 'Box8-TULE LAKE/'
no = '0759'
# no = '0270'
fn = box_dir+'Box8-MINIDOKA to TULE LAKE-%s.txt' % no
evaluate_quality_one(TXT_CROWDSOURCE_DIR, TXT_OCR_NAS_DIR, fn)

11-4-43 A-7-P389 Riot
IMAI ASAICHI and MRS. KAGAWA:
Asaichi, chief cook, and Mrs. Kagawa, Assist.
Chief Cook from #4 Mess Hall stated that:
Hibino Yusaki - 4314-D
Yahumoto Hyomen - 4302-E
Nakamura Mobijiro - 4305-C
were leaders of a group of 'muscle men' are the
following names and addresses:
Oki Kakuma - 4313C
Marsui Takeo - 4306-A
Yamasaki Mineiichi 4305-E
Kumagai Kazao 430-1-E
(over)
11-4-43 A-7-$389 Bio t
IMAI ASAICHI and MRS. KAGAWA;
Asaichi, chief cook, and Mrs. Kagawa, Assist. s
Chief Cook from #4 Mess Hall stited that:
Hihino Yusaki-- 4314-D
Yahumoto Hyomeh - 4302-E
Nakarjura Mohijiro - 4305-C
were leaders of a group of 'muscle men' are the
following names and addresses:
Oki Kakuma - 4313C
Marsui Tpkeo - 4306-A
Yamasaki Mineiichi 4305-E
Kumagai Kazao 430-1 -E (over)

11-4-43 a-7-p389 riot
imai asaichi and mrs. kagawa:
asaichi, chief cook, and mrs. kagawa, assist.
chief cook from #4 mess hall stated that:
hibino yusaki - 4314-d
yahumoto hyomen - 4302-e
nakamura mobijiro - 4305-c

{'case_char': 0.9655172413793104,
 'case_word': 0.8,
 '~case_char': 0.9655172413793104,
 '~case_word': 0.8}

In [81]:
# Sanity check: two strings that differ only in case compare equal once both
# are lower-cased.
string1 = 'Hello'
string2 = 'hello'

if string1.lower() == string2.lower():
    print "The strings are the same (case insensitive)"
else:
    print "The strings are not the same (case insensitive)"


The strings are the same (case insensitive)


In [283]:
 
# Build several variants of one ground-truth/OCR pair (raw, lower-cased,
# UTF-8 decoded) so their similarity scores can be compared in later cells.
box_dir = 'Box8-TULE LAKE/'
no = '0759'
# The assignment above is overwritten immediately; '0664' is the file used.
no = '0664'
# no = '0270'
# no = '0269'
fn = box_dir+'Box8-MINIDOKA to TULE LAKE-%s.txt' % no
gt_lines = list(open(TXT_CROWDSOURCE_DIR+fn).readlines())
# NOTE: this skips the first two OCR lines, whereas ocr_nas_remove_page drops
# at most one (the page-marker line).
ocr_lines = list(open(TXT_OCR_NAS_DIR+fn).readlines())[2:]
# gt_str= ''.join(gt_lines)
# ocr_str=''.join(ocr_lines)
gt_str = refine_lines(gt_lines)
ocr_str = refine_lines(ocr_lines)
gt_str_lower = gt_str.lower()
ocr_str_lower = ocr_str.lower()
# NOTE(review): manually_lower is not defined in any cell shown in this
# notebook; it must come from kernel state - confirm before a fresh run.
gt_str_lower_m =manually_lower(gt_str)
ocr_str_lower_m  = manually_lower(ocr_str)
# Decode the strings as UTF-8 before comparing (str.decode: Python 2 only).
gt_str_decode = gt_str.decode('utf-8')
ocr_str_decode = ocr_str.decode('utf-8')
gt_str_decode_lower = gt_str.decode('utf-8').lower()
ocr_str_decode_lower = ocr_str.decode('utf-8').lower()

In [285]:
# Display the refined ground-truth and OCR strings side by side.
gt_str, ocr_str

('3/7/44 A-172 INFRACTIONS OFPROJECT REGULATIONSARASUNA, Teruois a janitor at Japanese language school.',
 'Teruoft* a jftnttcr at ^ptcael*j!g\xc2\xbb?\xc2\xa3\xc2\xa9')

In [286]:
# Display the same pair after str.lower().
gt_str_lower, ocr_str_lower

('3/7/44 a-172 infractions ofproject regulationsarasuna, teruois a janitor at japanese language school.',
 'teruoft* a jftnttcr at ^ptcael*j!g\xc2\xbb?\xc2\xa3\xc2\xa9')

In [253]:
# Length of the refined ground-truth string.
len(gt_str)

376

In [254]:
# Length of the lower-cased ground-truth string, for comparison with the
# length of the original.
len(gt_str_lower)

376

In [287]:
# Score of the raw strings next to the score of their UTF-8-decoded versions.
cal_similarity(gt_str, ocr_str), cal_similarity(gt_str_decode, ocr_str_decode)

(0.28169014084507044, 0.28776978417266186)

In [299]:
# Same as the previous comparison, but the raw strings are passed in swapped
# order (OCR first). difflib documents that ratio() may depend on the order
# of its arguments.
cal_similarity(ocr_str, gt_str), cal_similarity(gt_str_decode, ocr_str_decode)

(0.2535211267605634, 0.28776978417266186)

In [300]:
# Scores of the three lower-cased variants: decoded then lower(), plain
# lower() (OCR passed first), and manually_lower().
cal_similarity(gt_str_decode_lower, ocr_str_decode_lower),\
cal_similarity(ocr_str_lower, gt_str_lower),\
cal_similarity(gt_str_lower_m, ocr_str_lower_m)

(0.2589928057553957, 0.2535211267605634, 0.2535211267605634)

In [296]:
# difflib is already imported at the top of the notebook; this import only
# binds the bare SequenceMatcher name.
from difflib import SequenceMatcher

# NOTE: no autojunk argument is passed here, so this matcher uses difflib's
# default setting, unlike cal_similarity, which passes autojunk=False.
s = SequenceMatcher(None, gt_str, ocr_str)
# The return value of this call is discarded.
s.get_matching_blocks()
for match in s.get_matching_blocks():
    # Start offset, block length and matched text, first in the ground-truth
    # string and then in the OCR string.
    print match.a, match.size, gt_str[match.a:match.a+match.size]
    print match.b, match.size, ocr_str[match.b:match.b+match.size]
# Show a ten-character slice of the ground-truth string.
gt_str[77:87]

55 5 Teruo
0 5 Teruo
62 4  a j
8 4  a j
67 1 n
14 1 n
69 1 t
15 1 t
71 5 r at 
18 5 r at 
77 1 a
27 1 a
81 1 e
28 1 e
85 1 l
29 1 l
88 1 g
33 1 g
101 0 
41 0 


'apanese la'

In [298]:
# Same matching-block dump for the lower-cased strings. As in the previous
# cell, autojunk is left at difflib's default here.
s = SequenceMatcher(None, gt_str_lower, ocr_str_lower)
print s.get_matching_blocks()
for match in s.get_matching_blocks():
    # Start offset, block length and matched text, first in the ground-truth
    # string and then in the OCR string.
    print match.a, match.size, gt_str_lower[match.a:match.a+match.size]
    print match.b, match.size, ocr_str_lower[match.b:match.b+match.size]
# Show a ten-character slice of the ground truth and the tail of the OCR string.
gt_str_lower[76:86], ocr_str_lower[31:]

[Match(a=55, b=0, size=5), Match(a=62, b=8, size=4), Match(a=67, b=14, size=1), Match(a=69, b=15, size=1), Match(a=71, b=18, size=5), Match(a=76, b=31, size=1), Match(a=88, b=33, size=1), Match(a=101, b=41, size=0)]
55 5 teruo
0 5 teruo
62 4  a j
8 4  a j
67 1 n
14 1 n
69 1 t
15 1 t
71 5 r at 
18 5 r at 
76 1 j
31 1 j
88 1 g
33 1 g
101 0 
41 0 


('japanese l', 'j!g\xc2\xbb?\xc2\xa3\xc2\xa9')