# Impact of OCR on linguistic processing

Tasks in this notebook:
- [ ] Part-of-speech tagging
- [ ] Named entity recognition
- [ ] Dependency parsing
- [ ] Semantic role labelling

Not considered here:
- Sentence splitting
- Tokenisation
- Lemmatisation

## Preprocessing the data

In [None]:
import sys
import argparse
from pathlib import Path
import spacy
import glob
import syntok.segmenter as segmenter
from spacy.tokens import Doc
import string

In [None]:
# Load the spaCy pipeline "en_core_web_lg" once; the resulting `nlp` callable
# is applied to each OCR and gold-standard sentence further down.
nlp = spacy.load("en_core_web_lg")

In [None]:
def generic_preprocessing(doc):
    """
    Read the aligned OCR and gold-standard lines from a text file.

    The file at ``doc`` must contain at least three lines. The second and
    third lines are returned with the markers ``[OCR_aligned]`` and
    ``[ GS_aligned]`` removed; the first line (``[OCR_toInput]``) is not
    used. Nothing else is stripped, so whatever surrounds the marker on the
    line (including the line ending) is kept.

    Returns:
        A tuple ``(ocr_aligned, gs_aligned)`` of two strings.

    Raises:
        IndexError: if the file has fewer than three lines.
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which would mis-read non-ASCII characters on some systems.
    # NOTE(review): assumes the data files are UTF-8 -- confirm.
    with open(doc, encoding="utf-8") as fr:
        lines = fr.readlines()

    ocr_aligned = lines[1].replace('[OCR_aligned]', '')
    gs_aligned = lines[2].replace('[ GS_aligned]', '')
    return ocr_aligned, gs_aligned

## Aligned sentence splitting using Syntok

In [None]:
def aligned_sentence_splitting(ocr_aligned, gs_aligned):
    """
    Split the aligned OCR and gold-standard texts into parallel sentences.

    Sentence boundaries are computed with syntok.segmenter on ``gs_aligned``
    only; the same character positions are then used to slice
    ``ocr_aligned``. This presumes both strings are aligned character by
    character, which is not checked here.

    Each sentence is sliced one character to the left of the offsets
    reported by syntok: it starts at the character preceding its first token
    (clamped to the start of the text) and stops one character before the
    first token of the next sentence. For the last sentence the slice stops
    one character before the end of the text, i.e. the final character
    (typically the line ending) is not included.

    Returns:
        A list of tuples ``(begSentence, endSentence, ocr_sentence,
        gs_sentence)``. ``begSentence`` is the unshifted offset of the
        sentence's first token; ``endSentence`` is the offset of the next
        sentence's first token, or ``len(gs_aligned)`` for the last one.
    """
    list_sentence_offsets = []
    sentenceTuples = []

    for paragraph in segmenter.process(gs_aligned):
        for sentence in paragraph:
            list_sentence_offsets.append(sentence[0].offset)

    # A sentence ends where the next one begins; the last one ends with the text.
    list_sentence_ends = list_sentence_offsets[1:] + [len(gs_aligned)]

    for begSentence, endSentence in zip(list_sentence_offsets, list_sentence_ends):
        # Clamp the shifted start: a first token at offset 0 would otherwise
        # give -1, which Python interprets as "last character" and which
        # yields an empty or wrong slice for the first sentence.
        start = max(begSentence - 1, 0)
        stop = endSentence - 1
        ocr_sentence = ocr_aligned[start:stop]
        gs_sentence = gs_aligned[start:stop]
        sentenceTuples.append((begSentence, endSentence, ocr_sentence, gs_sentence))

    return sentenceTuples

In [None]:
# Read the aligned OCR / gold-standard lines of the example file, then cut
# them into parallel sentence tuples
# (begSentence, endSentence, ocr_sentence, gs_sentence).
ocr_aligned, gs_aligned = generic_preprocessing('example.txt')
alignedSentences = aligned_sentence_splitting(ocr_aligned, gs_aligned)

## Evaluate part-of-speech tagging

[In progress]

In [None]:
def evaluate_pos(ocr_sentence, gs_sentence):
    """
    Print the part-of-speech tags of tokens that start at the same offset.

    Both arguments are iterated as token sequences; each token must expose
    ``idx``, ``text`` and ``pos_``. For every gold-standard token whose
    ``idx`` also occurs among the OCR tokens, one line is printed with the
    offset, the OCR ``(text, pos_)`` pair and the gold-standard pair.
    Tokens without a counterpart at the same offset are skipped. Nothing is
    returned.
    """
    # Index both sides by the character offset of the token.
    ocr_by_offset = {tok.idx: (tok.text, tok.pos_) for tok in ocr_sentence}
    gs_by_offset = {tok.idx: (tok.text, tok.pos_) for tok in gs_sentence}

    for offset, gs_entry in gs_by_offset.items():
        if offset not in ocr_by_offset:
            continue
        print(offset, ocr_by_offset[offset], gs_entry)

## Evaluate named entity recognition

[In progress]

In [None]:
def evaluate_ner(ocr_sentence, gs_sentence):
    """
    Print the entity annotations of tokens that start at the same offset.

    Both arguments are iterated as token sequences; each token must expose
    ``idx``, ``text``, ``ent_type_`` and ``ent_iob_``. For every
    gold-standard token whose ``idx`` also occurs among the OCR tokens, one
    line is printed with the offset, the OCR ``(text, ent_type_, ent_iob_)``
    triple and the gold-standard triple. Tokens without a counterpart at the
    same offset are skipped. Nothing is returned.
    """
    # Index both sides by the character offset of the token.
    ocr_by_offset = {
        tok.idx: (tok.text, tok.ent_type_, tok.ent_iob_) for tok in ocr_sentence
    }
    gs_by_offset = {
        tok.idx: (tok.text, tok.ent_type_, tok.ent_iob_) for tok in gs_sentence
    }

    for offset, gs_entry in gs_by_offset.items():
        if offset not in ocr_by_offset:
            continue
        print(offset, ocr_by_offset[offset], gs_entry)

## Compare OCR aligned with GS aligned

[In progress]

In [None]:
# Each entry of alignedSentences is
# (begSentence, endSentence, ocr_sentence, gs_sentence): parse both texts with
# spaCy and print the POS / NER comparison for the pair.
for st in alignedSentences:
    st_boundaries = st[:2]
    ocr_sentence, gs_sentence = nlp(st[2]), nlp(st[3])

    evaluate_pos(ocr_sentence, gs_sentence)
    evaluate_ner(ocr_sentence, gs_sentence)

In [None]:
#     for token in nlp(ocr_sentence):
#         print(token.idx, token.text, token.lemma_, token.pos_, token.dep_, token.head, token.left_edge, token.right_edge, token.ent_type_, token.ent_iob_, token.is_oov)
#     print()
#     for token in nlp(gs_sentence):
#         print(token.idx, token.text, token.lemma_, token.pos_, token.dep_, token.head, token.left_edge, token.right_edge, token.ent_type_, token.ent_iob_, token.is_oov)
#     print()