In [8]:
import os
import json
import string
import spacy

from nltk.corpus import words

# spaCy English pipeline; check_text_quality() uses it to count sentences.
nlp = spacy.load("en_core_web_sm")

# check_text_quality() looks tokens up as `word.lower()`, so every vocabulary
# entry is lower-cased here; an entry stored with a capital letter could
# otherwise never be matched.
nltk_words = {word.lower() for word in words.words()}

**TODO — figure out:**
* how to correlate the English and French texts
* how to split the text into chunks for 1-to-1 comparison
* how to assess text quality and drop tables, page numbers, extra characters, etc.

In [17]:
def check_text_quality(text):
    """Score how closely ``text`` resembles clean English prose.

    Three heuristics are evaluated:

    * the share of characters that are neither ASCII letters nor whitespace
      (digits and punctuation both count towards this share);
    * the share of whitespace-separated tokens found in the module-level
      ``nltk_words`` vocabulary, after surrounding punctuation is stripped and
      the token is lower-cased;
    * a minimum size: at least 5 tokens and at least 1 sentence as segmented
      by the module-level spaCy pipeline ``nlp``.

    Args:
        text: The string to evaluate.

    Returns:
        A ``(quality_score, issues)`` tuple. ``quality_score`` is a float
        clamped to ``[0.0, 1.0]``: 1.0 minus the *entire* non-letter ratio,
        minus the amount by which the valid-word ratio falls short of its
        0.6 threshold. ``issues`` is a list with one message per failed
        heuristic (empty when none failed). The size check only adds an
        issue; it does not change the score.
    """
    non_alpha_ratio_threshold = 0.3
    valid_word_ratio_threshold = 0.6
    minimum_sentence_count = 1
    minimum_word_count = 5
    max_quality_score = 1.0
    min_quality_score = 0.0

    issues = []

    # Build the allowed-character set once rather than re-concatenating the
    # two strings for every character of the input.
    clean_chars = frozenset(string.ascii_letters + string.whitespace)
    total_chars = len(text)
    non_alpha_chars = sum(1 for char in text if char not in clean_chars)
    # Empty input is treated as pure noise (ratio 1), which drives the score to 0.
    non_alpha_ratio = non_alpha_chars / total_chars if total_chars else 1
    if non_alpha_ratio > non_alpha_ratio_threshold:
        issues.append("High ratio of non-alphanumeric characters.")

    words_in_text = text.split()
    # str.split() leaves punctuation glued to tokens ("mat," / "end."), which
    # can never equal a dictionary entry, so strip it before the lookup.
    valid_words = sum(
        1
        for word in words_in_text
        if word.strip(string.punctuation).lower() in nltk_words
    )
    valid_word_ratio = valid_words / len(words_in_text) if words_in_text else 0
    if valid_word_ratio < valid_word_ratio_threshold:
        issues.append("Low proportion of valid dictionary words.")

    # Check the cheap word count first so the spaCy pipeline only runs when
    # the outcome actually depends on the sentence count.
    too_short = len(words_in_text) < minimum_word_count
    if not too_short:
        sentence_count = sum(1 for _ in nlp(text).sents)
        too_short = sentence_count < minimum_sentence_count
    if too_short:
        issues.append("Text too short or lacks coherent sentences.")

    # The dictionary penalty applies only to the shortfall below the threshold.
    if valid_word_ratio < valid_word_ratio_threshold:
        word_penalty = valid_word_ratio_threshold - valid_word_ratio
    else:
        word_penalty = 0
    quality_score = max(
        min_quality_score,
        max_quality_score - non_alpha_ratio - word_penalty,
    )
    quality_score = min(max_quality_score, quality_score)

    return quality_score, issues


def _report_low_quality(label, text, quality_score, issues, show_debug):
    """Print the score line for one flagged text, plus its text and issues if requested."""
    print(f"{label} quality score: {quality_score}")
    if show_debug:
        print(f"Text: {text}")
        print(f"Issues: {', '.join(issues)}")


def process_json_file(json_file_path, debug=False, show_debug=False, minimum_quality_score=1.0):
    """Score the ``"text"`` field(s) of one JSON file and report low scores.

    The file may hold a single object or a list of objects. Each object's
    ``"text"`` value (missing or ``null`` is treated as an empty string) is
    passed to ``check_text_quality``; anything scoring below
    ``minimum_quality_score`` is reported.

    Reporting is print-only: nothing is printed unless ``debug`` is true, and
    the function returns ``None`` either way.

    Args:
        json_file_path: Path of the UTF-8 encoded JSON file to read.
        debug: When true, print one line per low-scoring text, a notice for
            entries that cannot be scored, and a notice when the top-level
            JSON value is neither a list nor an object.
        show_debug: When true (and ``debug`` is true), also print the text and
            the list of issues for each low-scoring text.
        minimum_quality_score: Texts scoring strictly below this value are
            reported. Defaults to 1.0.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Normalise both supported shapes into (label, entry) pairs so that the
    # scoring/reporting logic exists only once.
    if isinstance(data, list):
        labelled_entries = (
            (f"{json_file_path} \tentry {i + 1}", entry)
            for i, entry in enumerate(data)
        )
    elif isinstance(data, dict):
        labelled_entries = [(f"{json_file_path}", data)]
    else:
        if debug:
            print(f"\nUnexpected JSON structure in file: {json_file_path}")
        return

    for label, entry in labelled_entries:
        text = None
        if isinstance(entry, dict):
            text = entry.get("text", "")
            if text is None:
                # An explicit JSON null is handled like a missing key.
                text = ""
        if not isinstance(text, str):
            # Non-object entries or non-string text cannot be scored; skip
            # them instead of crashing the whole run.
            if debug:
                print(f"{label} skipped: no usable 'text' field")
            continue

        quality_score, issues = check_text_quality(text)
        if debug and quality_score < minimum_quality_score:
            _report_low_quality(label, text, quality_score, issues, show_debug)

def process_all_json_in_folder(folder_path, debug=False, show_debug=False, debug_stop_dir="1978"):
    """Run ``process_json_file`` on every ``.json`` file below ``folder_path``.

    Sub-folders and files are visited in sorted order so that runs are
    repeatable and the debug stop point below is well defined.

    Args:
        folder_path: Root folder to walk recursively.
        debug: Forwarded to ``process_json_file``. When true, the walk also
            stops as soon as it reaches a folder named ``debug_stop_dir``.
        show_debug: Forwarded to ``process_json_file``.
        debug_stop_dir: Folder name at which a debug run stops (default
            ``"1978"``). Pass ``None`` to walk the whole tree in debug mode.
    """
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place steers os.walk's top-down traversal; without it the
        # visiting order (and therefore the early stop) depends on the OS.
        dirs.sort()

        # basename() honours the platform's separator, unlike a hard-coded
        # split on a backslash; normpath() drops any trailing separator first.
        current_dir = os.path.basename(os.path.normpath(root))
        if debug and debug_stop_dir is not None and current_dir == debug_stop_dir:
            print(f'BREAK AT {debug_stop_dir}')
            break

        for file in sorted(files):
            if file.endswith('.json'):
                json_file_path = os.path.join(root, file)
                print(f"Processing: {json_file_path}")
                process_json_file(json_file_path, debug=debug, show_debug=show_debug)


# Parsed publications live in a sibling folder, one level above the working
# directory. debug=True prints a score line for every text scoring below the
# reporting threshold.
folder_path = os.path.join("..", "ParsedPublications")
process_all_json_in_folder(folder_path=folder_path, debug=True)

Processing: ..\ParsedPublications\1977\75586.pdf.json
..\ParsedPublications\1977\75586.pdf.json quality score: 0.9110455588331694
Processing: ..\ParsedPublications\1977\75587.pdf.json
..\ParsedPublications\1977\75587.pdf.json quality score: 0.8330854060718138
Processing: ..\ParsedPublications\1977\75588.pdf.json
..\ParsedPublications\1977\75588.pdf.json quality score: 0.8146835645498188
Processing: ..\ParsedPublications\1977\75589.pdf.json
..\ParsedPublications\1977\75589.pdf.json quality score: 0.8677001699203717
Processing: ..\ParsedPublications\1977\75590.pdf.json
..\ParsedPublications\1977\75590.pdf.json quality score: 0.33586310356286
Processing: ..\ParsedPublications\1977\75591.pdf.json
..\ParsedPublications\1977\75591.pdf.json quality score: 0.8990321633978872
Processing: ..\ParsedPublications\1977\75593.pdf.json
..\ParsedPublications\1977\75593.pdf.json quality score: 0.26831109714861656
Processing: ..\ParsedPublications\1977\75594.pdf.json
..\ParsedPublications\1977\75594.pdf.