In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to them are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

In [None]:
from pattern.nl import parsetree

def parse(text, parsetree):
    """Split ``text`` into words using the supplied ``parsetree`` callable.

    Args:
        text: The text to analyse.
        parsetree: Callable that does the actual parsing (for example
            ``pattern.nl.parsetree``). It is called with ``text`` and the
            keyword options listed below, and its result is iterated as a
            sequence of sentences, each of which is a sequence of words
            exposing ``string``, ``lemma`` and ``type`` attributes.

    Returns:
        A list containing one dict per word, in reading order, with keys:
        ``id`` (index of the word within its sentence), ``word``, ``lemma``,
        ``sentence`` (index of the sentence within the text) and ``pos``.
    """
    options = dict(
        tokenize=True,     # Split punctuation marks from words?
        tags=True,         # Parse part-of-speech tags? (NN, JJ, ...)
        chunks=False,      # Parse chunks? (NP, VP, PNP, ...)
        relations=False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
        lemmata=True,      # Parse lemmata? (ate => eat)
        encoding='utf-8',  # Input string encoding.
        tagset=None,       # Penn Treebank II (default) or UNIVERSAL.
    )
    parsed = parsetree(text, **options)

    # Flatten the sentence/word nesting into a single list of records.
    return [
        {'id': position,
         'word': token.string,
         'lemma': token.lemma,
         'sentence': sentence_no,
         'pos': token.type}
        for sentence_no, words in enumerate(parsed)
        for position, token in enumerate(words)
    ]

In [None]:
import datetime
import io
import os

import pattern

def basic_text_statistics(in_files):
    """Count the tokens and sentences in each text file.

    Args:
        in_files: Iterable of paths to text files. Each file is read as
            UTF-8, the encoding that ``parse`` declares for its input.

    Returns:
        pandas.DataFrame with one row per input file, indexed by the file
        name stripped of its directory and extension. Column ``num_words``
        holds the number of tokens returned by ``parse`` and column
        ``num_sentences`` the number of distinct sentences those tokens
        belong to.
    """
    d = {'num_words': [], 'num_sentences': []}
    text_names = []

    for in_file in tqdm(in_files):
        # Decode explicitly rather than with the platform default encoding,
        # so the counts do not depend on the locale the kernel runs under.
        with io.open(in_file, encoding='utf-8') as f:
            tokens = parse(f.read(), parsetree)

        d['num_words'].append(len(tokens))
        # Every token records the index of its sentence; the number of
        # distinct indices is the number of sentences in the text.
        d['num_sentences'].append(len({t['sentence'] for t in tokens}))

        text_id = os.path.splitext(os.path.basename(in_file))[0]
        text_names.append(text_id)
    return pd.DataFrame(d, index=text_names)

In [None]:
from nlppln.utils import get_files

# NOTE(review): `language` is not referenced by any other cell visible in
# this notebook.
language = 'nl'

# Directory with the input texts for this run.
in_dir = '/home/jvdzwaan/data/kb-ocr/text_aligned_blocks-match_gs/gs'

# get_files is imported from nlppln.utils; presumably it returns the paths of
# the files in in_dir (its result is passed to len() and iterated over
# below) -- TODO confirm.
in_files = get_files(in_dir)
print(len(in_files))

# Compute the per-file word and sentence counts and save them as CSV.
df = basic_text_statistics(in_files)
df.to_csv('/home/jvdzwaan/data/kb-ocr/stats-text_aligned_blocks-match_gs-gs.csv')

In [None]:
# Second run on a different input directory. This overwrites the notebook-level
# `in_dir` and `in_files` set by the previous run.
in_dir = '/home/jvdzwaan/data/kb-ocr/text-not-aligned/gs/'

in_files = get_files(in_dir)
print(len(in_files))

# Compute the per-file word and sentence counts and save them as CSV.
df2 = basic_text_statistics(in_files)
df2.to_csv('/home/jvdzwaan/data/kb-ocr/stats-text_not_aligned-gs.csv')

In [None]:
# Show the statistics computed for the text_aligned_blocks-match_gs/gs files.
df

In [None]:
# Show the statistics computed for the text-not-aligned/gs files.
df2

In [None]:
# NOTE(review): this writes `df` to the same path as the to_csv call in the
# cell that computes `df`; it looks redundant unless `df` is changed in between.
df.to_csv('/home/jvdzwaan/data/kb-ocr/stats-text_aligned_blocks-match_gs-gs.csv')