In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# ground truth vocabulary + number of hapaxes

# read in the gt files
# tokenize
# create terms document matrix (using sklearn)
# get vocabulary
# calculate corpus frequencies
# count hapaxes

In [None]:
import json

import nltk.data

from itertools import chain

from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer

def nltk_tokenize(texts_file, punkt='tokenizers/punkt/dutch.pickle',
                  encoding=None):
    """Tokenize a text file into a flat list of word tokens.

    The whole file is read as a single text, split into sentences with the
    punkt sentence tokenizer loaded from `punkt`, and every sentence is then
    split into words with `nltk.word_tokenize`.

    Inputs:
        texts_file (str): Name of the file that contains the text.
        punkt (str): nltk resource path of the punkt sentence tokenizer to
            be used.
        encoding (str): Encoding used to read the file. The default (None)
            keeps the platform default of `open`.

    Returns:
        list of str: the word tokens of the file, in document order.
    """
    tokenizer = nltk.data.load(punkt)

    # Only the read needs the file handle; close it before tokenizing.
    with open(texts_file, encoding=encoding) as f:
        text = f.read()

    # NOTE(review): word_tokenize is called with its default language, not
    # the language of `punkt` -- confirm this is intended for Dutch texts.
    sentences = tokenizer.tokenize(text)
    return list(chain.from_iterable(word_tokenize(sent)
                                    for sent in sentences))
   

def do_nothing(list_of_words):
    """Identity function: hand back `list_of_words` unchanged.

    Passed as the `tokenizer` of the CountVectorizer so that documents that
    already are lists of tokens are not tokenized again.
    """
    return list_of_words

def terms_documents_matrix_word_lists(word_lists):
    """Build a terms documents matrix from documents that are already tokenized.

    Inputs:
        word_lists: iterator over lists of words (one list per document)
    Returns:
        corpus: a sparse terms documents matrix
        v: the fitted vectorizer object containing the vocabulary (i.e., all
            word forms in the corpus)
    """
    # Tokens are counted as given: the identity tokenizer skips tokenization
    # and lowercasing is switched off.
    vectorizer = CountVectorizer(tokenizer=do_nothing, lowercase=False)
    counts = vectorizer.fit_transform(word_lists)

    return counts, vectorizer


def get_datadivision(json_file):
    """Load the data division stored in `json_file`.

    Inputs:
        json_file (str): Name of a JSON file.
    Returns:
        The parsed JSON content of the file.
    """
    with open(json_file) as handle:
        division = json.load(handle)
    return division
    

In [None]:
%%time

from nlppln.utils import get_files
from ochre.utils import get_files as get_test_files

# Download the NLTK 'punkt' resource; nltk_tokenize loads its sentence
# tokenizer from 'tokenizers/punkt/...'.
nltk.download('punkt')

def texts_iterator(in_files):
    """Lazily tokenize files, yielding one list of tokens per file.

    Inputs:
        in_files: iterable of file names; each is passed to nltk_tokenize.
    Yields:
        The result of nltk_tokenize for every file, in input order.
    """
    for path in in_files:
        tokens = nltk_tokenize(path)
        yield tokens

# JSON file describing the data division (loaded with get_datadivision below).
json_file = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/datadivision-A8P3.json'
        
# Directory with the gs texts.
in_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/gs/'
#in_files = get_files(in_dir)
div = get_datadivision(json_file)
# NOTE(review): presumably selects the files in in_dir that belong to the
# 'test' subset of the data division -- confirm against ochre.utils.get_files.
in_files = get_test_files(in_dir, div, 'test')
print(len(in_files))
# Tokenize the selected files lazily and build the gs terms documents matrix.
gs_corpus, gs_vocabulary = terms_documents_matrix_word_lists(texts_iterator(in_files))

In [None]:
%%time
# Same steps as for the gs texts, now for the files in the ocr directory.
# Relies on `div` from the cell that loaded the data division.
in_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/ocr/'
#in_files = get_files(in_dir)
in_files = get_test_files(in_dir, div, 'test')
print(len(in_files))
ocr_corpus, ocr_vocabulary = terms_documents_matrix_word_lists(texts_iterator(in_files))

In [None]:
%%time
# Same steps as for the gs texts, now for the files in the pred-A8P3 directory.
# Relies on `div` from the cell that loaded the data division.
in_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/pred-A8P3/'
#in_files = get_files(in_dir)
in_files = get_test_files(in_dir, div, 'test')
print(len(in_files))
#print(in_files)
pred_corpus, pred_vocabulary = terms_documents_matrix_word_lists(texts_iterator(in_files))

In [None]:
# Vocabulary size (number of distinct word forms, i.e. types) per corpus.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() -- confirm the pinned version.
print('# types in gs', len(gs_vocabulary.get_feature_names()))
print('# types in ocr', len(ocr_vocabulary.get_feature_names()))
print('# types in pred', len(pred_vocabulary.get_feature_names()))

In [None]:
# Vocabularies as sets, for the set comparisons and Venn diagrams below.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() -- confirm the pinned version.
gs_voc = set(gs_vocabulary.get_feature_names())
ocr_voc = set(ocr_vocabulary.get_feature_names())
pred_voc = set(pred_vocabulary.get_feature_names()) 

In [None]:
# Number of types in the ocr vocabulary that are absent from the pred vocabulary.
len(ocr_voc.difference(pred_voc))

In [None]:
# Number of types shared by the gs and ocr vocabularies.
len(gs_voc.intersection(ocr_voc))

In [None]:
# Number of types shared by the gs and pred vocabularies.
len(gs_voc.intersection(pred_voc))

In [None]:
# Number of types shared by the ocr and pred vocabularies.
len(ocr_voc.intersection(pred_voc))

In [None]:
from matplotlib_venn import venn2
# Venn diagram of the overlap between the gs and pred vocabularies.
venn2(subsets = [gs_voc, pred_voc], set_labels=['GT vocabulary', 'Corrected OCR vocabulary'])

In [None]:
# Venn diagram of the overlap between the gs and ocr vocabularies.
venn2(subsets = [gs_voc, ocr_voc], set_labels=['GT vocabulary', 'Original OCR vocabulary'])

In [None]:
# Venn diagram of the overlap between the ocr and pred vocabularies.
venn2(subsets = [ocr_voc, pred_voc], set_labels=['Original OCR vocabulary', 'Corrected OCR vocabulary'])

In [None]:
import scipy

def get_hapaxes(corpus, vectorizer):
    """Return the hapaxes (words that occur exactly once) of a corpus.

    Prints the shape of the matrix and the number of hapaxes found.

    Inputs:
        corpus: terms documents matrix (documents x terms); anything
            accepted by `scipy.sparse.csr_matrix`.
        vectorizer: fitted vectorizer whose feature names correspond to the
            columns of `corpus`.
    Returns:
        set: the words with a corpus frequency of exactly 1.
    Raises:
        ValueError: if the number of feature names differs from the number
            of columns in `corpus`.
    """
    cx = scipy.sparse.csr_matrix(corpus)
    print(cx.shape)

    # Corpus frequency of every term: sum the columns over all documents.
    word_counts = np.asarray(cx.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and fall back for older versions. The names
    # are looked up only once instead of once per use.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    words = words.tolist() if hasattr(words, 'tolist') else list(words)

    # Fail loudly instead of printing a message and returning None.
    if len(words) != len(word_counts):
        raise ValueError(
            'Unequal lengths: {} word counts, {} vocabulary entries'.format(
                len(word_counts), len(words)))

    hapaxes = {word for word, freq in zip(words, word_counts) if freq == 1}
    print('# hapaxes:', len(hapaxes))

    return hapaxes

# Collect the hapaxes of each corpus; the label printed before each call
# identifies the shape and hapax count that get_hapaxes prints.
print('GS')
gs_hapaxes = get_hapaxes(gs_corpus, gs_vocabulary)
print('OCR')
ocr_hapaxes = get_hapaxes(ocr_corpus, ocr_vocabulary)
print('CORR')
pred_hapaxes = get_hapaxes(pred_corpus, pred_vocabulary)

In [None]:
# Display the complete set of gs hapaxes (output can be long).
gs_hapaxes

In [None]:
for tv in ocr_corpus:
    