## Create a DataFrame from the Trove dataset

### Obtaining the data

You can download the whole Trove dataset using:

`wget --recursive --no-parent http://overproof.projectcomputing.com/datasets/`

Rename the downloaded top-level directory `overproof.projectcomputing.com` to the friendlier name `trove_overproof`; the paths in the code below assume this name.

### Preprocessing of the data

In [None]:
from glob import glob
import jellyfish
import numpy as np
import os
import pandas as pd
import re

In [None]:
# Maps a dataset label to the directory holding that dataset's *.txt files.
# The labels are arbitrary names; the loading loop further down only uses
# them to look up the directory to scan.

dict_dirnames = dict(
    trove_dataset_1='./trove_overproof/datasets/dataset1/rawTextAndHumanCorrectionPairs',
    trove_dataset_2='./trove_overproof/datasets/dataset2/rawTextAndHumanCorrectionAndOverproofCorrectionTriples',
    trove_dataset_3='./trove_overproof/datasets/dataset3/rawTextAndHumanCorrectionAndOverproofCorrectionTriples',
)

In [None]:
# Empty frame with one column per article attribute; rows are added later.
df = pd.DataFrame(
    columns=[
        "filePath",
        "articleId",
        "articleType",
        "year",
        "ocrText",
        "humanText",
        "corrected",
    ],
)

In [None]:
# Header lines have the form
#   *$*OVERPROOF*$* <articleId> year <YYYY> type <articleType> title <title>
# Compiled once so the pattern is not re-evaluated for every header line.
_RE_OVERPROOF_HEADER = re.compile(
    r'^\*\$\*OVERPROOF\*\$\*\s*([0-9]+)\s+year\s+([0-9]{4})\s+type\s(.+)\s+title\s+.+$'
)


def process_header(line):
    """Extract the article metadata from an OVERPROOF header line.

    Args:
        line: One line of text, expected to be an article header.

    Returns:
        A tuple ``(articleId, year, articleType)`` of strings when the line
        matches the header pattern, otherwise ``None``.
    """
    header = _RE_OVERPROOF_HEADER.match(line)
    if header is None:
        return None
    articleId, year, articleType = header.groups()
    return articleId, year, articleType

In [None]:
def process_content(article_content):
    """Join the aligned text lines of one article into whole-article strings.

    Each input line is split on the ``||@@||`` separator. Lines with two
    fields contribute (OCR, human) text; lines with three fields also
    contribute a third text. Lines with any other number of fields (for
    example blank lines or lines without a separator) are skipped, as are
    lines whose first two fields are both empty.

    Args:
        article_content: Iterable of raw text lines belonging to one article.

    Returns:
        A tuple of three strings: the space-joined first, second and third
        fields. The third string is empty when no line had three fields.
    """
    ocr_line = []
    human_line = []
    hcorr_line = []
    for line in article_content:
        fields = [field.strip() for field in line.split("||@@||")]
        # Check the field count first so that a line without a separator
        # can never be indexed past its only field.
        if len(fields) not in (2, 3):
            continue
        # Skip lines that carry neither OCR nor human text.
        if not (fields[0] or fields[1]):
            continue
        ocr_line.append(fields[0])
        human_line.append(fields[1])
        if len(fields) == 3:
            hcorr_line.append(fields[2])

    return " ".join(ocr_line), " ".join(human_line), " ".join(hcorr_line)

In [None]:
# Parse every dataset file into articles and store one row per article in
# ``df``. A line starting with '*$*OVERPROOF*$*' opens a new article; the
# lines up to the next header (or the end of the file) are its content.
article_content = []
year = ""
file_path = ""
articleId = ""
articleType = ""
processed_content = ""
article_counter = 0
for ldir in dict_dirnames:
    # Sorted so that the row order, and therefore the article_counter index,
    # does not depend on the file system's directory listing order.
    list_files = sorted(glob(os.path.join(dict_dirnames[ldir], "*.txt")))
    for lfile in list_files:
        file_path = lfile
        # The context manager closes each file as soon as it has been read.
        with open(lfile, "r") as fio:
            flines = fio.readlines()
        for iline, fline in enumerate(flines):
            if fline.startswith('*$*OVERPROOF*$*'):
                header = process_header(fline)
                if header is None:
                    # Report where the bad header is instead of failing on
                    # an opaque tuple-unpacking error.
                    raise ValueError(
                        "Malformed OVERPROOF header in %s, line %d: %r"
                        % (lfile, iline + 1, fline)
                    )
                articleId, year, articleType = header
                article_counter += 1
            else:
                article_content.append(fline)
                # Flush the article once its last content line is reached.
                is_last_line = iline == len(flines) - 1
                if is_last_line or flines[iline + 1].startswith('*$*OVERPROOF*$*'):
                    processed_content = process_content(article_content)

                    # Insert into dataframe:
                    df.loc[article_counter] = [file_path,
                                               articleId,
                                               articleType,
                                               year,
                                               processed_content[0],
                                               processed_content[1],
                                               processed_content[2]
                                              ]

                    # Clean variables:
                    year = ""
                    articleId = ""
                    articleType = ""
                    processed_content = ""
                    article_content = []

print(article_counter)

In [None]:
# Display the frame filled by the loading loop.
df

### Add string similarity and length

In [None]:
from pandarallel import pandarallel
# Initialization; presumably this must run before any `parallel_apply`
# call on a DataFrame (confirm against the pandarallel documentation).
pandarallel.initialize()

In [None]:
def distance_via_levenshtein(gs_clean, ocr_clean):
    """Return a case-insensitive similarity score between two strings.

    The score is ``(L - d) / L`` where ``d`` is the Levenshtein distance
    between the lower-cased strings and ``L`` is the length of the longer
    one.

    Args:
        gs_clean: First string.
        ocr_clean: Second string.

    Returns:
        A float; 1.0 when the lower-cased strings are equal, including the
        case where both are empty.
    """
    gs_clean = gs_clean.lower()
    ocr_clean = ocr_clean.lower()
    max_sentlength = max(len(gs_clean), len(ocr_clean))
    if max_sentlength == 0:
        # Both strings are empty, hence identical; this also avoids a
        # division by zero below.
        return 1.0
    lev_distance = jellyfish.levenshtein_distance(gs_clean, ocr_clean)
    dist_similarity = (max_sentlength - lev_distance) / float(max_sentlength)
    return dist_similarity

In [None]:
# Similarity between the OCR text and the human text of every row. The two
# texts are passed in the opposite order to the parameter names of
# distance_via_levenshtein; the function treats both arguments the same way.
df['str_similarity'] = df.parallel_apply(lambda row: distance_via_levenshtein(row['ocrText'], row['humanText']), axis=1)
# Character counts via the vectorised string accessor, as used elsewhere in
# this file, instead of a row-wise parallel apply of len().
df['str_length_humanText'] = df['humanText'].str.len()
df['str_length_ocrText'] = df['ocrText'].str.len()


In [None]:
# Number of distinct article IDs among the rows whose string similarity is
# lower than 0.8:
df.loc[df['str_similarity'] < 0.8, 'articleId'].nunique(dropna=False)

In [None]:
def quality(similarity):
    """Map a similarity score to a quality band.

    Returns 1 (good) above 0.9, 2 (soso) above 0.8, 3 (bad) above 0.7 and
    4 (ugly) otherwise. All bounds are exclusive.
    """
    # (exclusive lower bound, band), checked from the best band downwards.
    bands = ((0.9, 1), (0.8, 2), (0.7, 3))
    for lower_bound, band in bands:
        if similarity > lower_bound:
            return band
    return 4

# Assign each row the quality band (1-4) of its similarity score.
df['quality_band'] = df["str_similarity"].apply(quality)

In [None]:
# Number of rows in each quality band.
df['quality_band'].value_counts()

### OCR and GS string similarity distribution

Plot the distribution of articles according to the string similarity between the OCR text and the human-corrected (GS) text:

In [None]:
# Histogram of the similarity scores, binned in steps of 0.1 starting at 0.
df[['str_similarity']].plot(kind='hist',bins=np.arange(0, 1.1, 0.1),rwidth=0.8)

Get some examples from one bin (here, string similarity above 0.7 and up to 0.8):

In [None]:
# Show the OCR and human text of six sampled rows whose similarity is above
# 0.7 and at most 0.8.
# NOTE(review): sample(6) presumably fails when fewer than six rows match;
# confirm against the pandas documentation.
df[(df['str_similarity'] > 0.7) & (df['str_similarity'] <= 0.8)].sample(6).loc[:, ['ocrText', 'humanText']]

In [None]:
# Persist the frame; it is read back from this file further down.
df.to_pickle("db_trove.pkl")

In [None]:
trovedf = df

# For every similarity bin (lower, upper], record how many articles fall in
# it and keep the first one as an example. Only articles whose OCR and human
# texts differ in length by at most 100 characters are considered.
dExamples = dict()
length_gap = abs(trovedf['ocrText'].str.len() - trovedf['humanText'].str.len())
# Edges are derived from integers so that neighbouring bins share exactly
# the same boundary value and print as 0.1, 0.2, ... without float drift.
bin_edges = [k / 10 for k in range(11)]
for lower, upper in zip(bin_edges[:-1], bin_edges[1:]):
    in_bin = (trovedf['str_similarity'] > lower) & (trovedf['str_similarity'] <= upper)
    tempdf = trovedf[in_bin & (length_gap <= 100)]
    if tempdf.shape[0] >= 1:
        first = tempdf.iloc[0]
        dExamples[(lower, upper)] = (tempdf.shape[0], first.filePath, first.ocrText, first.humanText)

for example, (n_articles, example_path, ocr_text, human_text) in dExamples.items():
    print("Range:", example)
    print("Number of articles:", n_articles)
    print("Example: ./" + example_path)
    print("\nOCR text:")
    print(ocr_text)
    print("\nHuman-corrected text:")
    print(human_text)
    print()
    print('==========')
    print()

# Dictionary lookup

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import syntok.segmenter as segmenter

In [None]:
# Reload the frame that was pickled to "db_trove.pkl" earlier in this file.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
df = pd.read_pickle("db_trove.pkl")
df.head()

In [None]:
# Boolean mask of the rows whose `corrected` text is empty. The trailing
# commented-out term is a disabled extra filter on the human-text length.
corrected_cond = (df["corrected"] == '')# & (df["str_length_humanText"] > 10)

In [None]:
# Flag column: 1 where the row has its own `corrected` text, 0 where
# `corrected` was empty and is filled in from `humanText` instead.
df["use_corrected"] = 1
# Fall back to the human text for the rows without a `corrected` text.
df.loc[corrected_cond, 'corrected'] = df.loc[corrected_cond, 'humanText']
df.loc[corrected_cond, 'use_corrected'] = 0
# Show how many rows carry each flag value.
df['use_corrected'].value_counts()

In [None]:
# Absolute difference between the character counts of `ocrText` and
# `corrected`, expressed as a percentage of the longer of the two.
ocr_len = df['ocrText'].str.len()
corrected_len = df['corrected'].str.len()
ocr_corr_diff = abs(ocr_len - corrected_len) / np.maximum(ocr_len.values, corrected_len.values) * 100.

In [None]:
# Line plot of the relative length difference (in %) for every row.
# NOTE(review): the x axis is labelled "ArticleID" but the values plotted
# along it presumably come from the DataFrame index, not the articleId
# column - confirm.
plt.figure(figsize=(20, 10))
plt.plot(ocr_corr_diff, c='k')
plt.grid()
plt.xticks(size=24)
plt.yticks(size=24)
plt.xlabel("ArticleID", size=32)
plt.ylabel("|#ocrText - #corrected| (%)", size=32)
plt.show()

In [None]:
# Same relative length differences, sorted in ascending order, with a
# horizontal reference line at the `perc_diff` threshold.
plt.figure(figsize=(20, 10))

# Threshold (in %) drawn as the dashed red line and used in the summary.
perc_diff = 10.

plt.plot(np.sort(ocr_corr_diff), c='k', lw=3)

plt.grid()
plt.xticks(size=24)
plt.yticks(size=24)
plt.xlabel("#Articles (sorted)", size=32)
plt.ylabel("|#ocrText - #corrected| (%)", size=32)
plt.axhline(perc_diff, 0, 1, color='r', ls='--', lw=3)
# Count and share of rows whose difference is at most `perc_diff` percent.
print("#Articles with character difference lower than %.2f%%: %i, percentage: %.2f%%" % (perc_diff, len(ocr_corr_diff[ocr_corr_diff <= perc_diff]), len(ocr_corr_diff[ocr_corr_diff <= perc_diff])/len(ocr_corr_diff)*100.))
plt.show()

In [None]:
# spacy to do most of the pre-processing
import spacy
# see: https://spacy.io/universe/project/spacy-langdetect
from spacy_langdetect import LanguageDetector

# preprocessing
# Load a spacy model
nlp = spacy.load('en_core_web_lg')
# spacy_dict will be later used for the "Dictionary lookup" evaluation.
# It is only used for membership tests (`token in spacy_dict`), so it is
# stored as a set: each lookup is then constant-time instead of a scan
# over the whole vocabulary.
spacy_dict = set(nlp.vocab.strings)

In [None]:
def dictionary_lookup(myrow, colname="corrected"):
    """Split one row's text into sentences and tag every token.

    Args:
        myrow: Row-like object; ``myrow[colname]`` is the text to process.
        colname: Name of the field holding the text.

    Returns:
        A pair ``(sent_list, found_dict)`` with one entry per sentence.
        ``sent_list`` holds the token strings; ``found_dict`` holds, for
        each token, its length as a string, negated when the lower-cased
        token is not in ``spacy_dict``.
    """
    sent_list = []
    found_dict = []

    for paragraph in segmenter.analyze(myrow[colname]):
        for sentence in paragraph:
            # Token values are kept exactly as they appear in the input.
            words = [token.value for token in sentence]
            tags = [
                str(len(word)) if word.lower() in spacy_dict else str(-len(word))
                for word in words
            ]
            sent_list.append(words)
            found_dict.append(tags)
    return sent_list, found_dict

In [None]:
# Checkpoint the frame before the sentence-splitting columns are added.
df.to_pickle("db_trove_before_sentencizer.pkl")

In [None]:
from pandarallel import pandarallel
# Initialization; presumably this must run before the `parallel_apply`
# call used further down (confirm against the pandarallel documentation).
pandarallel.initialize()

In [None]:
# Create the four result columns up front, each filled with empty strings.
for new_column in ('corrected_sentencizer', 'corrected_dict_lookup',
                   'ocr_sentencizer', 'ocr_dict_lookup'):
    df[new_column] = ''

In [None]:
# Run dictionary_lookup on the `corrected` text of every row; zip(*...)
# transposes the per-row (sent_list, found_dict) pairs into two columns.
# NOTE(review): this uses the serial `apply`, whereas the `ocrText` column
# is processed with `parallel_apply` - confirm whether that is intentional.
df['corrected_sentencizer'], df['corrected_dict_lookup'] = zip(*df.apply(dictionary_lookup, args=["corrected"], axis=1))


In [None]:
# Run dictionary_lookup on the `ocrText` of every row via `parallel_apply`;
# zip(*...) transposes the per-row (sent_list, found_dict) pairs into two
# columns.
df['ocr_sentencizer'], df['ocr_dict_lookup'] = zip(*df.parallel_apply(dictionary_lookup, args=["ocrText"], axis=1))


In [None]:
# Persist the frame including the sentence and dictionary-lookup columns.
df.to_pickle("db_trove_sentence_with_lookup.pkl")

In [None]:
# Preview the first rows, including the newly added columns.
df.head()

# OLD (row-by-row version of the dictionary lookup above)

In [None]:
# Row-by-row variant of the dictionary lookup: fills the same four columns
# one row at a time and prints a running counter as progress output.
df['corrected_sentencizer'] = ''
df['corrected_dict_lookup'] = ''

df['ocr_sentencizer'] = ''
df['ocr_dict_lookup'] = ''

counter = 0
for i_row, myrow in df.iterrows():
    counter += 1
    print(counter, end=" ")

    # dictionary_lookup performs the sentence splitting and per-token
    # tagging for one column, so it is reused here for both texts.
    corrected_sent_list, corrected_found_dict = dictionary_lookup(myrow, "corrected")
    ocr_sent_list, ocr_found_dict = dictionary_lookup(myrow, "ocrText")

    df.at[i_row, 'corrected_sentencizer'] = corrected_sent_list
    df.at[i_row, 'corrected_dict_lookup'] = corrected_found_dict
    df.at[i_row, 'ocr_sentencizer'] = ocr_sent_list
    df.at[i_row, 'ocr_dict_lookup'] = ocr_found_dict
    