## Assessing the quality of the alignments

This notebook looks at the quality of the alignments between the OCR text and the human-corrected text from the Trove dataset that takes place in the `aligning_trove.ipynb` notebook. It checks the ratio of characters that have been aligned in contrast with the total number of characters, and produces a CSV file for quick evaluation of the alignment of a sample of 50 articles.

In [1]:
import pandas as pd
import ast

In [2]:
sampledf = pd.read_pickle("trove_subsample_aligned.pkl")

In [3]:
sampledf.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length,quality_band,use_corrected,alignment,processed
26185,./trove_overproof/datasets/dataset1/rawTextAnd...,14330048,Article,1900,"WESTERN SUBURBS COTTAGE HOSPITAL. ,-«A-. Tim c...",WESTERN SUBURBS COTTAGE HOSPITAL. — The commit...,WESTERN SUBURBS COTTAGE HOSPITAL. — The commit...,0.920732,1144,1,0,"[(0, 7, 0, 7), (8, 15, 8, 15), (16, 23, 16, 23...",yes
13445,./trove_overproof/datasets/dataset1/rawTextAnd...,13166969,Article,1868,INTERCOLONIAL NEWS. QUEENSLAND. BRISBANE files...,INTERCOLONIAL NEWS. QUEENSLAND. BRISBANE files...,INTERCOLONIAL NEWS. QUEENSLAND. BRISBANE files...,0.924826,2155,1,0,"[(0, 13, 0, 13), (14, 19, 14, 19), (20, 31, 20...",yes
13729,./trove_overproof/datasets/dataset1/rawTextAnd...,14953246,Article,1908,IN DIVORCE. (Before Mr. Justlco Simpson.) O'BR...,IN DIVORCE. (Before Mr. Justice Simpson.) O'BR...,IN DIVORCE. (Before Mr. Justice Simpson.) O'BR...,0.929938,3240,1,0,"[(0, 2, 0, 2), (3, 11, 3, 11), (12, 19, 12, 19...",yes
13979,./trove_overproof/datasets/dataset1/rawTextAnd...,14947605,Article,1908,"I MILITAEY. MELBOURNE, Saturday. The following...","MILITARY. MELBOURNE, Saturday. The following n...","MILITARY. MELBOURNE, Saturday. The following n...",0.921942,2563,1,0,"[(2, 11, 0, 9), (12, 22, 10, 20), (23, 32, 21,...",yes
22979,./trove_overproof/datasets/dataset1/rawTextAnd...,15708305,Article,1917,I WAR CASUALTIES. -» j KILLED. ! PRIVATE W. PR...,WAR CASUALTIES. -» KILLED. PRIVATE W. PRYOR. M...,WAR CASUALTIES. -» KILLED. PRIVATE W. PRYOR. M...,0.915966,578,1,0,"[(6, 17, 4, 15), (23, 30, 19, 26), (44, 50, 38...",yes


### Compare the amount of text that has been aligned vs uncertain

Number of characters that have been aligned vs characters whose alignment has been left as uncertain. Measure is given for each quality band.

In [4]:
sampledf['alignedchars_ocr'] = 0
sampledf['alignedchars_hum'] = 0
sampledf['uncertainchars_ocr'] = 0
sampledf['uncertainchars_hum'] = 0

sampledf = sampledf[(abs(sampledf['ocrText'].str.len() - sampledf['humanText'].str.len()) <= 100)]

for index, row in sampledf.iterrows():
    aligned_chars_ocr = 0
    aligned_chars_hum = 0
    uncertain_chars_ocr = 0
    uncertain_chars_hum = 0
    ocrText = row['ocrText']
    humanText = row['humanText']
    alignment = ast.literal_eval(row['alignment'])
    ocr_index = 0
    hum_index = 0
    alignment.sort(key=lambda tup: tup[0])
    for a in alignment:
        if a[0] != 0 and a[2] != 0:
            uncertain_chars_ocr += len(ocrText[ocr_index : a[0] - 1].strip())
            uncertain_chars_hum += len(humanText[hum_index : a[2] - 1].strip())
            aligned_chars_ocr += len(ocrText[a[0] : a[1]].strip())
            aligned_chars_hum += len(humanText[a[2] : a[3]].strip())
        ocr_index = a[1] + 1
        hum_index = a[3] + 1
    sampledf.loc[index, 'alignedchars_ocr'] = aligned_chars_ocr
    sampledf.loc[index, 'alignedchars_hum'] = aligned_chars_hum
    sampledf.loc[index, 'uncertainchars_ocr'] = uncertain_chars_ocr
    sampledf.loc[index, 'uncertainchars_hum'] = uncertain_chars_hum

In [5]:
sampledf.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length,quality_band,use_corrected,alignment,processed,alignedchars_ocr,alignedchars_hum,uncertainchars_ocr,uncertainchars_hum
26185,./trove_overproof/datasets/dataset1/rawTextAnd...,14330048,Article,1900,"WESTERN SUBURBS COTTAGE HOSPITAL. ,-«A-. Tim c...",WESTERN SUBURBS COTTAGE HOSPITAL. — The commit...,WESTERN SUBURBS COTTAGE HOSPITAL. — The commit...,0.920732,1144,1,0,"[(0, 7, 0, 7), (8, 15, 8, 15), (16, 23, 16, 23...",yes,773,777,206,198
13445,./trove_overproof/datasets/dataset1/rawTextAnd...,13166969,Article,1868,INTERCOLONIAL NEWS. QUEENSLAND. BRISBANE files...,INTERCOLONIAL NEWS. QUEENSLAND. BRISBANE files...,INTERCOLONIAL NEWS. QUEENSLAND. BRISBANE files...,0.924826,2155,1,0,"[(0, 13, 0, 13), (14, 19, 14, 19), (20, 31, 20...",yes,1399,1416,411,409
13729,./trove_overproof/datasets/dataset1/rawTextAnd...,14953246,Article,1908,IN DIVORCE. (Before Mr. Justlco Simpson.) O'BR...,IN DIVORCE. (Before Mr. Justice Simpson.) O'BR...,IN DIVORCE. (Before Mr. Justice Simpson.) O'BR...,0.929938,3240,1,0,"[(0, 2, 0, 2), (3, 11, 3, 11), (12, 19, 12, 19...",yes,2193,2212,566,565
13979,./trove_overproof/datasets/dataset1/rawTextAnd...,14947605,Article,1908,"I MILITAEY. MELBOURNE, Saturday. The following...","MILITARY. MELBOURNE, Saturday. The following n...","MILITARY. MELBOURNE, Saturday. The following n...",0.921942,2563,1,0,"[(2, 11, 0, 9), (12, 22, 10, 20), (23, 32, 21,...",yes,1971,1971,273,265
22979,./trove_overproof/datasets/dataset1/rawTextAnd...,15708305,Article,1917,I WAR CASUALTIES. -» j KILLED. ! PRIVATE W. PR...,WAR CASUALTIES. -» KILLED. PRIVATE W. PRYOR. M...,WAR CASUALTIES. -» KILLED. PRIVATE W. PRYOR. M...,0.915966,578,1,0,"[(6, 17, 4, 15), (23, 30, 19, 26), (44, 50, 38...",yes,339,339,177,160


In [6]:
sampledf_band1 = sampledf[sampledf['quality_band'] == 1]
sampledf_band2 = sampledf[sampledf['quality_band'] == 2]
sampledf_band3 = sampledf[sampledf['quality_band'] == 3]
sampledf_band4 = sampledf[sampledf['quality_band'] == 4]

In [7]:
dfbands = [sampledf_band1, sampledf_band2, sampledf_band3, sampledf_band4]

for band in reversed(range(len(dfbands))):
    ahum = dfbands[band]['alignedchars_hum'].sum(axis=0)
    aocr = dfbands[band]['alignedchars_ocr'].sum(axis=0)
    uhum = dfbands[band]['uncertainchars_ocr'].sum(axis=0)
    uocr = dfbands[band]['uncertainchars_hum'].sum(axis=0)
    
    print("Band:", band, ahum / (ahum + uhum))

Band: 3 0.3051644178703162
Band: 2 0.468855100896861
Band: 1 0.6407480665926935
Band: 0 0.7756302856978652


### Create csv with aligned text of articles in the sample

Format (uncertain text is surrounded by `(((` and `)))` to make it easier to see them when annotating):

In [10]:
to_annotate = pd.DataFrame()
to_annotate = pd.concat([to_annotate, sampledf_band1.sample(2)])
to_annotate = pd.concat([to_annotate, sampledf_band2.sample(2)])
to_annotate = pd.concat([to_annotate, sampledf_band3.sample(3)])
to_annotate = pd.concat([to_annotate, sampledf_band4.sample(4)])
to_annotate.shape

(11, 17)

In [11]:
to_annotate.head()

Unnamed: 0,filePath,articleId,articleType,year,ocrText,humanText,corrected,str_similarity,str_length,quality_band,use_corrected,alignment,processed,alignedchars_ocr,alignedchars_hum,uncertainchars_ocr,uncertainchars_hum
418,./trove_overproof/datasets/dataset1/rawTextAnd...,17948935,Article,1945,"""BRITAIN STILL UNITED"" Conservative M.P. Reass...","""BRITAIN STILL UNITED"" Conservative M.P. Reass...","""BRITAIN STILL UNITED"" Conservative M.P. Reass...",0.905191,2331,1,0,"[(0, 8, 0, 8), (9, 14, 9, 14), (15, 22, 15, 22...",yes,1378,1423,563,576
16951,./trove_overproof/datasets/dataset1/rawTextAnd...,13205688,Article,1870,THE DARLING. Tho correspondent of the Pastoral...,THE DARLING. The correspondent of the Pastoral...,THE DARLING. The correspondent of the Pastoral...,0.966443,1638,1,0,"[(0, 3, 0, 3), (4, 12, 4, 12), (17, 30, 17, 30...",yes,1164,1164,202,201
14637,./trove_overproof/datasets/dataset1/rawTextAnd...,17582605,Article,1939,LATE MR. JUSTICE ' STEPHEN. Supreme Court Trib...,LATE MR. JUSTICE STEPHEN. Supreme Court Tribut...,LATE MR. JUSTICE STEPHEN. Supreme Court Tribut...,0.872858,1809,2,0,"[(0, 4, 0, 4), (5, 8, 5, 8), (9, 16, 9, 16), (...",yes,999,1014,538,535
21641,./trove_overproof/datasets/dataset1/rawTextAnd...,14847840,Article,1907,"SHOCKING FATALITY. ALBURY, Thursday. Benjamin ...","SHOCKING FATALITY. ALBURY, Thursday. Benjamin ...","SHOCKING FATALITY. ALBURY, Thursday. Benjamin ...",0.892356,641,2,0,"[(0, 8, 0, 8), (9, 18, 9, 18), (19, 26, 19, 26...",yes,401,406,135,134
16349,./trove_overproof/datasets/dataset1/rawTextAnd...,14739922,Article,1905,BRITISH íASTEONOMICAL ASSOO&l ATION. » ---?--....,BRITISH ASTRONOMICAL ASSOCI- ATION. ----------...,BRITISH ASTRONOMICAL ASSOCI- ATION. ----------...,0.787679,909,3,0,"[(0, 7, 0, 7), (8, 21, 8, 20), (30, 36, 29, 35...",yes,443,453,279,348


In [13]:
import csv

dAlignmentsToEvaluate = dict()
for index, row in to_annotate.iterrows():
    newArticle = []
    ocrText = row['ocrText']
    humanText = row['corrected']
    alignment = ast.literal_eval(row['alignment'])
    ocr_index = 0
    hum_index = 0
    alignment.sort(key=lambda tup: tup[0])
    for a in alignment:
        if a[0] != 0 and a[2] != 0:
            uncertain_match = ocrText[ocr_index : a[0] - 1].strip() + "\t" + humanText[hum_index : a[2] - 1].strip()
            uncertain_match = uncertain_match.strip()
            if uncertain_match:
                uncertain = str(row['articleId']) + "\t" + str(row['str_similarity']) + "\tUNCERTAIN\t" + "(((" + uncertain_match + ")))\t" + str(ocr_index) + "\t " + str(a[0] - 1)
                newArticle.append(uncertain)
        aligned_match = ocrText[a[0] : a[1]].strip() + "\t" + humanText[a[2] : a[3]].strip()
        aligned = str(row['articleId']) + "\t" + str(row['str_similarity']) + "\t" + "ALIGNED\t" + aligned_match + "\t" + str(a[2]) + "\t " + str(a[3])
        newArticle.append(aligned)
        ocr_index = a[1] + 1
        hum_index = a[3] + 1

    dAlignmentsToEvaluate[(row['articleId'], row['str_similarity'])] = newArticle

with open("alignments.tsv", "w") as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    for article in dAlignmentsToEvaluate:
        for textsnippet in dAlignmentsToEvaluate[article]:
            textsnippet = textsnippet.split("\t")
            writer.writerow(textsnippet)

Assessment of the quality of the evaluation here: https://docs.google.com/spreadsheets/d/1HugVFVYPjtz9rBPNfJwsmL9WXVDCUVw_IuFiCYSHR88/edit#gid=1936002185

### Alignment of just one article

In [None]:
trovedf = pd.read_pickle("trove_artidigh_aligned_human.pkl")
trovedf['quality_band'] = trovedf["str_similarity"].apply(quality)
artidigh_example = trovedf[trovedf['articleId'] == '13031691']

for index, row in artidigh_example.iterrows():
    aligned_chars_ocr = 0
    aligned_chars_hum = 0
    uncertain_chars_ocr = 0
    uncertain_chars_hum = 0
    ocrText = row['ocrText']
    humanText = row['humanText']
    alignment = ast.literal_eval(row['alignment'])
    ocr_index = 0
    hum_index = 0
    alignment.sort(key=lambda tup: tup[0])
    for a in alignment:
        if a[0] != 0 and a[2] != 0:
            uncertain_chars_ocr += len(ocrText[ocr_index : a[0] - 1].strip())
            uncertain_chars_hum += len(humanText[hum_index : a[2] - 1].strip())
            aligned_chars_ocr += len(ocrText[a[0] : a[1]].strip())
            aligned_chars_hum += len(humanText[a[2] : a[3]].strip())
        ocr_index = a[1] + 1
        hum_index = a[3] + 1
    artidigh_example.loc[index, 'alignedchars_ocr'] = aligned_chars_ocr
    artidigh_example.loc[index, 'alignedchars_hum'] = aligned_chars_hum
    artidigh_example.loc[index, 'uncertainchars_ocr'] = uncertain_chars_ocr
    artidigh_example.loc[index, 'uncertainchars_hum'] = uncertain_chars_hum

dAlignmentsToEvaluate = dict()
for index, row in artidigh_example.iterrows():
    newArticle = []
    ocrText = row['ocrText']
    humanText = row['humanText']
    alignment = ast.literal_eval(row['alignment'])
    ocr_index = 0
    hum_index = 0
    alignment.sort(key=lambda tup: tup[0])
    for a in alignment:
        if a[0] != 0 and a[2] != 0:
            uncertain_match = ocrText[ocr_index : a[0] - 1].strip() + "\t" + humanText[hum_index : a[2] - 1].strip()
            uncertain_match = uncertain_match.strip()
            if uncertain_match:
                uncertain = str(row['articleId']) + "\t" + str(row['str_similarity']) + "\tUNCERTAIN\t" + "(((" + uncertain_match + ")))\t" + str(ocr_index) + "\t " + str(a[0] - 1)
                newArticle.append(uncertain)
        aligned_match = ocrText[a[0] : a[1]].strip() + "\t" + humanText[a[2] : a[3]].strip()
        aligned = str(row['articleId']) + "\t" + str(row['str_similarity']) + "\t" + "ALIGNED\t" + aligned_match + "\t" + str(a[2]) + "\t " + str(a[3])
        newArticle.append(aligned)
        ocr_index = a[1] + 1
        hum_index = a[3] + 1

    dAlignmentsToEvaluate[(row['articleId'], row['str_similarity'])] = newArticle

for i in dAlignmentsToEvaluate:
    print(i[0])
    print(i[1])
    for e in dAlignmentsToEvaluate[i]:
        print(e)

In [None]:
import pandas as pd
import ast

dfexample = pd.read_pickle("trove_subsample_aligned.pkl")

In [None]:
for i, row in dfexample.iloc[:1].iterrows():
    ocrText = row['ocrText']
    humanText = row['corrected']
    print(ocrText)
    print(humanText)
    alignments = ast.literal_eval(row['alignment'])
    for a in alignments:
        print(a, ocrText[a[0]:a[1]], "---", humanText[a[2]:a[3]])