In [1]:
import difflib
import json
import os
import random
import re
import pandas as pd

from nltk.corpus import wordnet as wn, stopwords
from textblob import TextBlob

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # jupyter notebook full-width display
display(HTML("<style>.dataframe td { white-space: nowrap; }</style>")) # no text wrapping

# pandas formatting
pd.set_option('display.float_format', '{:.1f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

In [12]:
# import nltk
# nltk.download("stopwords")
# nltk.download("wordnet")
# nltk.download("omw-1.4")

english_words = {w for w in wn.all_lemma_names(lang='eng')}
english_words.update(stopwords.words("english"))
french_words = {w for w in wn.all_lemma_names(lang='fra')}
french_words.update(stopwords.words("french"))

# remove overlapping words
english_words.difference_update(french_words)
french_words.difference_update(english_words)

# remove numeric
english_words.difference_update({w for w in english_words if w.isnumeric()})
french_words.difference_update({w for w in french_words if w.isnumeric()})

In [3]:
parsed_docs_folder = os.path.join("..", "ParsedPublications")
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"

# import data

In [4]:
fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)

# weblinks for previewing / testing
weblinks_df = fr_eng_correlation_df.copy()
weblinks_df = weblinks_df[['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']]

# simplified correlation table
fr_eng_correlation_df = fr_eng_correlation_df[['pub_number', 'filename_fr', 'filename_en']]

# helper functions

In [5]:
def preview_publication(pub_number):
    if type(pub_number) is pd.DataFrame and pub_number.shape[0] == 1:
        try:
            pub_number = pub_number['pub_number'].values[0]
        except ValueError:
            return None
    elif type(pub_number) is pd.Series:
        try:
            pub_number = pub_number.values[0]
        except ValueError:
            return None
    
    try:
        output_df = weblinks_df[weblinks_df.pub_number == pub_number].T
    except Exception as e:
        print(e)
        return None
        
    return output_df


In [15]:
# DATA CLEANING FUNCTIONS
debug = True

# Function 1: Locate Both Files for a Publication
# This function retrieves both files (French and English) associated with a publication number from fr_eng_correlation_df.
def get_files_for_publication(pub_number, fr_eng_correlation_df):
    row = fr_eng_correlation_df.loc[fr_eng_correlation_df['pub_number'] == pub_number]
    if not row.empty:
        filename_fr = row['filename_fr'].values[0]
        filename_en = row['filename_en'].values[0]
        return filename_fr, filename_en
    return None, None


# Function 2: Locate JSON Files
# This function converts a PDF filename to a JSON filename in the parsed_docs_folder.
def get_json_file_link(parsed_docs_folder, pdf_filename):
    if pdf_filename.endswith(".pdf"):
        json_filename = pdf_filename + ".json"
        for root, _, files in os.walk(parsed_docs_folder):
            if json_filename in files:
                return os.path.join(root, json_filename)
    return None


# TODO: get a working language classifier before this task
# Function 3: Extract English and French Text from Two Files (Scenario 1)
# This function reads two JSON files and extracts English and French text.
def extract_text_from_two_files(json_file_fr, json_file_en):
    with open(json_file_fr, 'r', encoding='utf-8') as fr_file, open(json_file_en, 'r', encoding='utf-8') as en_file:
        data_fr = json.load(fr_file)
        data_en = json.load(en_file)
        text_fr = " ".join(data_fr.get('text', []))
        text_en = " ".join(data_en.get('text', []))
        return text_fr, text_en


# TODO: get a working language classifier before this task
# Function 4: Extract English and French Text from One File (Scenario 2)
# This function extracts English and French text from a single JSON file by separating text blocks based on language.
def extract_text_from_single_file(json_file):
    min_words_in_language = 3
    max_words_in_other_language = 0
    
    
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if 'text' not in data:
        raise KeyError(f"The key 'text' is missing in the JSON file: {json_file}")
    
    full_text = data['text']
    text_blocks = re.split(r'[.\n?]', full_text)
    text_fr, text_en = [], []

    for block in text_blocks:
        block = block.strip()
        if not block:
            continue
            
        en_count = sum(1 for word in block if word in english_words)
        fr_count = sum(1 for word in block if word in french_words)
        
        if fr_count > 40:
            debug = True
        
        if fr_count >= min_words_in_language and en_count <= max_words_in_other_language:
            text_fr.append(block)        
        elif en_count >= min_words_in_language and fr_count <= max_words_in_other_language:
            text_en.append(block)
        else:
            if debug:
                print(f'Mixed/missing: {block}\n\tfr: {fr_count}, en: {en_count}\n')

    return " ".join(text_fr), " ".join(text_en)


# Function 5: Assess Text Quality
# This function assigns a quality score to text blocks using NLP techniques such as language model perplexity or OCR error detection.
def assess_text_quality(text, is_french=True):
    min_valid_word_ratio = 0.7
    min_non_alphabet_ratio = 0.5
        
    if not text.strip():
        return 0
    
    # proportion of characters that are not letters
    total_chars = len(text)
    non_alphabet_chars = len(re.findall(r'[^a-zA-ZÀ-ÿ]', text))
    non_alphabet_ratio = non_alphabet_chars / max(1, total_chars)
    if non_alphabet_ratio > min_non_alphabet_ratio:
        return 0
    
    # word-to-sentence ratio
    blob = TextBlob(text)
    word_sentence_ratio = len(blob.words) / max(1, len(blob.sentences))
    
    # valid word ratio
    valid_words = []
    for word in blob.words:
        try:
            if detect(word) == ('fr' if is_french else 'en'):
                valid_words.append(word)
        except Exception as e:
            continue

    valid_word_ratio = len(valid_words) / max(1, len(blob.words))
    if valid_word_ratio < min_valid_word_ratio:
        return 0

    return word_sentence_ratio * valid_word_ratio * (1 - non_alphabet_ratio)


# Function 6: Correlate and Clean Text
# This function matches English and French text chunks and cleans extra characters, returning aligned data.
def correlate_and_clean_text(text_fr, text_en):
    matcher = difflib.SequenceMatcher(None, text_fr.split(), text_en.split())
    matched_fr, matched_en = [], []
    
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':  # Matches only equal segments
            matched_fr.append(" ".join(text_fr.split()[i1:i2]))
            matched_en.append(" ".join(text_en.split()[j1:j2]))
    
    return matched_fr, matched_en


# Function 7: Process All Rows
# This function orchestrates the processing of all rows in fr_eng_correlation_df and creates a master dataframe.
def process_all_rows(fr_eng_correlation_df, parsed_docs_folder):
    master_data = []
    min_text_quality = 1
    
    for _, row in fr_eng_correlation_df.iterrows():
        pub_number = row['pub_number']
        filename_fr, filename_en = row['filename_fr'], row['filename_en']
        
        if filename_fr == "WITHDRAWN" and filename_en == "WITHDRAWN":
            continue
        
        json_file_fr = get_json_file_link(parsed_docs_folder, filename_fr)
        json_file_en = get_json_file_link(parsed_docs_folder, filename_en)
        
        if filename_fr != filename_en:  # Scenario 1
            text_fr, text_en = extract_text_from_two_files(json_file_fr, json_file_en)
        else:  # Scenario 2
            text_fr, text_en = extract_text_from_single_file(json_file_fr)
        
        # Assess and correlate text
        if assess_text_quality(text_fr) > min_text_quality and assess_text_quality(text_en) > min_text_quality:
            matched_fr, matched_en = correlate_and_clean_text(text_fr, text_en)
            master_data.append({'pub_number': pub_number, 'text_fr': matched_fr, 'text_en': matched_en})
    
    return pd.DataFrame(master_data)


# Function 8: Debugging and Evaluation
# This function tracks metrics for debugging and evaluation.
def track_debug_data(pub_number, filename, success, num_rejected, quality_score):
    debug_data = {
        'pub_number': pub_number,
        'filename': filename,
        'success': success,
        'num_rejected': num_rejected,
        'quality_score': quality_score
    }
    return debug_data


# testing

In [19]:
def extract_text_from_single_file(json_file):
    min_words_in_language = 3
    max_words_in_other_language = 0
    
    
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if 'text' not in data:
        raise KeyError(f"The key 'text' is missing in the JSON file: {json_file}")
    
    full_text = data['text']
    text_blocks = re.split(r'[.\n?]', full_text)
    text_fr, text_en = [], []

    for block in text_blocks:
        block = block.strip()
        if not block:
            continue
            
        en_count = sum(1 for word in block.split() if word in english_words)
        fr_count = sum(1 for word in block.split() if word in french_words)
        
        if fr_count > 1:
            debug = True
        
        if fr_count >= min_words_in_language and en_count <= max_words_in_other_language:
            text_fr.append(block)        
        elif en_count >= min_words_in_language and fr_count <= max_words_in_other_language:
            text_en.append(block)
        else:
            if debug:
                print(f'Mixed/missing: {block}\n\tfr: {fr_count}, en: {en_count}\n')

    return " ".join(text_fr), " ".join(text_en)

json_file = '..\\ParsedPublications\\1993\\165710.pdf.json'
pub_number = 'RES 1993/064'
output_fr, output_en = extract_text_from_single_file(json_file)

Mixed/missing: Not to be cited without permission Ne pas citer sans autorisation des
	fr: 6, en: 2

Mixed/missing: of the authors l auteurs l
	fr: 2, en: 2

Mixed/missing: Research Document 93/64 peches dans l'Atlantique 93/64
	fr: 1, en: 0

Mixed/missing: by
	fr: 0, en: 1

Mixed/missing: Roderick Morin and Alan Sinclair
	fr: 0, en: 1

Mixed/missing: Department of Fisheries and Oceans
	fr: 0, en: 2

Mixed/missing: Marine and Anadromous Fish Division
	fr: 1, en: 1

Mixed/missing: Science Branch, Gulf Region
	fr: 0, en: 0

Mixed/missing: P
	fr: 1, en: 0

Mixed/missing: O
	fr: 1, en: 0

Mixed/missing: Box 5030
	fr: 0, en: 0

Mixed/missing: Moncton, New Brunswick
	fr: 1, en: 0

Mixed/missing: E1C 9B6
	fr: 0, en: 0

Mixed/missing: IThis series documents the lLa presente serie documente les
	fr: 1, en: 2

Mixed/missing: scientific basic for the bases scientifiques des
	fr: 2, en: 4

Mixed/missing: evaluation of fisheries resources evaluations des ressources
	fr: 1, en: 2

Mixed/missing: in A

In [21]:
block = "mortality for most age classes in 1991"
en_count = sum(1 for word in block.split() if word in english_words)
fr_count = sum(1 for word in block.split() if word in french_words)

en_count, fr_count

(4, 2)

In [23]:
print(list(word for word in block.split() if word in english_words))
print(list(word for word in block.split() if word in french_words))

# oh come on! 'age' and 'classes' are french but not english...

['mortality', 'for', 'most', 'in']
['age', 'classes']


In [25]:
block = 'Total mortalities at age, based on survey data, are presented in Table'  # this is french :(

en_count = sum(1 for word in block.split() if word in english_words)
fr_count = sum(1 for word in block.split() if word in french_words)

print('english words:', list(word for word in block.split() if word in english_words))
print('french words:', list(word for word in block.split() if word in french_words))

en_count, fr_count

english words: ['based', 'survey', 'in']
french words: ['at', 'on', 'are', 'Table']


(3, 4)

In [64]:
[x for x in stopwords.words("french") if x in ['at', 'on', 'are', 'Table']]

['on']

In [27]:
# what is 'rupicola_rupicola'? what is with these underscored words?
french_words

{'derviche',
 'capitainerie_générale_;_capitainerie',
 'polyèdre',
 'bataille_de_Little_Big_Horn',
 'explosif',
 'oukaze',
 'campanula_rotundifolia',
 'extatique',
 'bataille_de_bunker_hill',
 'Effets_spéciaux',
 'réouvrir',
 'James_Bowie',
 'clique',
 'goulache',
 'sapin_de_vancouver',
 'amplitude',
 'inaliénablement',
 'spermatique',
 'Sparidae',
 'baleine_grise',
 'glaucienne_jaune',
 'tad',
 'vague_de_froid',
 'euphausiacea',
 'Croix_du_Sud',
 'provençal',
 'Alcée',
 'ceratonia_siliqua',
 'couronnement',
 'admonester',
 'rosier_de_Chine',
 'Polypedates',
 'onzième',
 'ricinus',
 'baie_de_sureau',
 'Federal_Communications_Commission',
 'étrusque',
 'belliciste',
 'agathis',
 'doris',
 'serais',
 'Pistolet_à_eau',
 'rougeâtre',
 'Jerry_Lee_Lewis',
 'Ohio',
 'Rétroprojecteur',
 'subcontinent',
 "état_d'oxydation",
 'jarretelle',
 'essencerie',
 "capacité_d'emport",
 'Directeur_:',
 'tritium',
 'monographie',
 'Heliamphora',
 'moraine',
 'anesthésie_péridurale',
 'John_Muir',
 'corpore

In [13]:
[x for x in french_words if x.isnumeric()]

[]

In [212]:
sum(1 for word in eg1.lower().split() if word in french_words)

0

In [209]:
[word for word in eg1.lower().split() if word in english_words]

['comparison', 'of', 'observed', 'and', 'in', 'the', 'plaice']

In [210]:
[word for word in eg1.lower().split() if word not in english_words]

['estimated', 'discards', '1984', '4t']

In [208]:
# words in both lists?

[word for word in english_words if word in french_words]

[]

# old junk

In [6]:
fr_eng_correlation_df.sample(1)

Unnamed: 0,pub_number,filename_fr,filename_en
2680,RES 1993/064,165710.pdf,165710.pdf


In [7]:
get_files_for_publication("RES 1993/064", fr_eng_correlation_df)

('165710.pdf', '165710.pdf')

In [8]:
get_json_file_link(parsed_docs_folder, '165710.pdf')

'..\\ParsedPublications\\1993\\165710.pdf.json'

In [45]:
json_file = '..\\ParsedPublications\\1993\\165710.pdf.json'
pub_number = 'RES 1993/064'

In [57]:
output_fr, output_en = extract_text_from_single_file(json_file)

In [58]:
output_fr[:500]


"Research Document 93/64 peches dans l'Atlantique 93/64 IThis series documents the lLa presente serie documente les scientific basic for the bases scientifiques des As such, it halieutiques sur la cote Elle traite documents it contains are not echeanciers dictes Les documents qu'elle contient ne doivent pas rather as progress reports on definitifs sur les sujets traites mais plutot comme des rapports d'etape sur les etudes en cours plaice ages 7-19 at 0 Selon les donnees provisoires, les debarque"

In [62]:
output_en[:1000]

'Not to be cited without permission Ne pas citer sans autorisation des of the authors l auteurs l DFO Atlantic Fisheries MPO Document de recherche sur les status of American plaice in NAFO Division 4T Roderick Morin and Alan Sinclair Department of Fisheries and Oceans Marine and Anadromous Fish Division Science Branch, Gulf Region Moncton, New Brunswick evaluation of fisheries resources evaluations des ressources in Atlantic Canada addresses the issues of the day in Atlantique du Canada the time frames required and the des problemes courants selon les intended as definitive statements on the subjects addressed but etre consideres comme des enonces ongoing investigations Research documents are produced in Les documents de recherche sont the official language in which publies dans la langue officielle they are provided to the utilisee dans le manuscrit envoye Provisional landings of American plaice in NAFO Division 4T were 5,229 t in 1991 and 5,140 t in 1992 These are the lowest landings

In [42]:
# it takes 5s to extract data from 1 file. there are 13k files
f'this may take greater than {5*13000/60/60:0.0f} hours to run'

'this may take greater than 18 hours to run'

In [55]:
preview_publication(pub_number)

Unnamed: 0,2680
pub_number,RES 1993/064
nom,
name,Status of American plaice in NAFO Division 4T
url_fr,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/1993/1993_064-fra.html
url_en,http://www.dfo-mpo.gc.ca/csas-sccs/Publications/ResDocs-DocRech/1993/1993_064-eng.html
file_url_fr,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/165710.pdf
file_url_en,https://waves-vagues.dfo-mpo.gc.ca/library-bibliotheque/165710.pdf


In [63]:
from langdetect import detect_langs, DetectorFactory, LangDetectException

# Ensure consistent results across runs
DetectorFactory.seed = 0

def detect_language_with_certainty(block, threshold=0.8):
    try:
        lang_probs = detect_langs(block)
        if len(lang_probs) == 1 and lang_probs[0].prob >= threshold:
            return lang_probs[0].lang, 
        else:
            return 'mixed'
    except LangDetectException:
        return 'unknown'


In [65]:
block = 'Not to be cited without permission Ne pas citer sans autorisation des of the authors l auteurs l DFO Atlantic'
detect_language_with_certainty(block)

'en'

In [66]:
detect_langs(block)

[en:0.999994878914645]

In [68]:
block = 'Not to be cited without permission Ne pas citer sans autorisation des of the authors'
detect_langs(block)

[en:0.9999960972906797]

In [74]:
block = 'Ne pas citer sans autorisation des auteurs'
detect_langs(block)

[fr:0.9999963298348985]

In [107]:
block = 'Ne pas citer sans autorisation des of the authors'
detect_langs(block)

[en:0.9999982385409765]

In [85]:
from polyglot.detect import Detector

detector = Detector(block)
detector.languages

ModuleNotFoundError: No module named 'polyglot'

In [91]:
!pip install polyglot


Collecting polyglot
  Using cached polyglot-16.7.4.tar.gz (126 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [10 lines of output]
  Traceback (most recent call last):
    File "<string>", line 2, in <module>
    File "<pip-setuptools-caller>", line 34, in <module>
    File "C:\Users\CARRK\AppData\Local\Temp\1\pip-install-a2712izn\polyglot_0a7d5708da6c40d2a3270cae58349a98\setup.py", line 15, in <module>
      readme = readme_file.read()
               ^^^^^^^^^^^^^^^^^^
    File "C:\Program Files\Python312\Lib\encodings\cp1252.py", line 23, in decode
      return codecs.charmap_decode(input,self.errors,decoding_table)[0]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 4941: character maps to <undefined>
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

Encountered error while generating package met

In [108]:

# Load English and French models
nlp_en = spacy.load("en_core_web_md")
nlp_fr = spacy.load("fr_core_news_md")

# Example texts
english_text = "This is an example sentence in English."
french_text = "Ceci est une phrase d'exemple en français."

# Process texts with the respective models
doc_en = nlp_en(english_text)
doc_fr = nlp_fr(french_text)

# Analyze and print tokens, part-of-speech tags, etc.
print("English Text Analysis:")
for token in doc_en:
    print(f"Token: {token.text}, POS: {token.pos_}, Lemma: {token.lemma_}")

print("\nFrench Text Analysis:")
for token in doc_fr:
    print(f"Token: {token.text}, POS: {token.pos_}, Lemma: {token.lemma_}")


English Text Analysis:
Token: This, POS: PRON, Lemma: this
Token: is, POS: AUX, Lemma: be
Token: an, POS: DET, Lemma: an
Token: example, POS: NOUN, Lemma: example
Token: sentence, POS: NOUN, Lemma: sentence
Token: in, POS: ADP, Lemma: in
Token: English, POS: PROPN, Lemma: English
Token: ., POS: PUNCT, Lemma: .

French Text Analysis:
Token: Ceci, POS: PRON, Lemma: ceci
Token: est, POS: AUX, Lemma: être
Token: une, POS: DET, Lemma: un
Token: phrase, POS: NOUN, Lemma: phrase
Token: d', POS: ADP, Lemma: de
Token: exemple, POS: NOUN, Lemma: exemple
Token: en, POS: ADP, Lemma: en
Token: français, POS: ADJ, Lemma: français
Token: ., POS: PUNCT, Lemma: .


In [109]:
list(nlp_en("this blorrrbbb is englitch").sents)

[this blorrrbbb is englitch]

In [103]:
type(doc_en)

spacy.tokens.doc.Doc

In [114]:
english_text = "This is an example sentence in English."
french_text = "Ceci est une phrase d'exemple en français."

doc_en = nlp_en(english_text)
doc_fr = nlp_fr(french_text)

similarity = doc_en.similarity(doc_fr)
print(f"Similarity between English and French texts: {similarity:.2f}")


Similarity between English and French texts: 1.00


In [115]:
nlp_fr(doc_en)

This is an example sentence in English.

In [144]:
mixed_sentence = "This is an example sentence in English Ceci est une phrase d'exemple en français"

In [167]:
detect_langs(" ".join(random.sample(mixed_sentence.split(), len(mixed_sentence.split()))))

[fr:0.5714283365339268, en:0.4285707441167784]

"phrase is d'exemple une This English Ceci français sentence in est an en example"

In [170]:
'word' in nlp_en

TypeError: argument of type 'English' is not iterable

# wordnet from nltk

In [171]:
import nltk
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CARRK\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\CARRK\AppData\Roaming\nltk_data...


True

In [175]:


mixed_sentence = "This is an example sentence in English Ceci est une phrase d'exemple en français"

words = mixed_sentence.lower().split()

english_count = sum(1 for word in words if word in english_words)
french_count = sum(1 for word in words if word in french_words)

print(f"English words: {english_count}")
print(f"French words: {french_count}")

English words: 8
French words: 7


# more dictionary options

In [29]:
# !pip install wordlist

In [32]:
# this looks like an AI hallucination, the module doesn't do that 

# from wordlist import words
# 
# english_words = set(words.get_words('en'))
# french_words = set(words.get_words('fr'))

In [34]:
# !pip install pyenchant

In [63]:
# this also looks like an AI hallucination, but the docs do say 'fr' as an example... it just doesn't work 

import enchant

english_dict = enchant.Dict("en_US")
# english_words = {word for word in english_dict}
french_dict = enchant.Dict("fr")
# french_words = {word for word in french_dict}

DictNotFoundError: Dictionary for language 'fr' could not be found
Please check https://pyenchant.github.io/pyenchant/ for details

In [45]:
help(enchant)

Help on package enchant:

NAME
    enchant

DESCRIPTION
    enchant:  Access to the enchant spellchecking library

    This module provides several classes for performing spell checking
    via the Enchant spellchecking library.  For more details on Enchant,
    visit the project website:

        https://abiword.github.io/enchant/

    Spellchecking is performed using 'Dict' objects, which represent
    a language dictionary.  Their use is best demonstrated by a quick
    example::

        >>> import enchant
        >>> d = enchant.Dict("en_US")   # create dictionary for US English
        >>> d.check("enchant")
        True
        >>> d.check("enchnt")
        False
        >>> d.suggest("enchnt")
        ['enchant', 'enchants', 'enchanter', 'penchant', 'incant', 'enchain', 'enchanted']

    Languages are identified by standard string tags such as "en" (English)
    and "fr" (French).  Specific language dialects can be specified by
    including an additional code - for example, "e

In [62]:
english_dict.check('plaice')

True

In [61]:
from enchant.checker import SpellChecker

en_chkr = SpellChecker("en_US")
fr_chkr = SpellChecker("fr")

DefaultLanguageNotFoundError: fr

In [58]:
block = 'Total mortalities at age, based on survey data, are presented in Table'

chkr = SpellChecker("en_US")
chkr.set_text("This is sme sample txt with erors.")

ERROR: sme
ERROR: txt
ERROR: erors


In [59]:
chkr = SpellChecker("en_US")
chkr.set_text("This is text without errors.")
for err in chkr:
    print("ERROR:", err.word)

In [72]:
# https://github.com/dwyl/english-words
en_words_filename = os.path.join("word_lists", "en_words.txt")
with open(en_words_filename, 'r', encoding='utf-8') as f:
    english_words = set(f.read().splitlines())

# https://github.com/51413resu/full-list-of-french-words
fr_words_filename = os.path.join("word_lists", "fr_words.txt")
with open(fr_words_filename, 'r', encoding='utf-8') as f:
    french_words = set(f.read().splitlines())
    
# remove overlapping words
english_words.difference_update(french_words)
french_words.difference_update(english_words)

# remove numeric and "words" with "." in them
english_words.difference_update({w for w in english_words if w.isnumeric() or '.' in w})
french_words.difference_update({w for w in french_words if w.isnumeric() or '.' in w})

In [73]:
block = 'Total mortalities at age, based on survey data, are presented in Table'  # this is french :(

en_count = sum(1 for word in block.split() if word in english_words)
fr_count = sum(1 for word in block.split() if word in french_words)

print('english words:', list(word for word in block.split() if word in english_words))
print('french words:', list(word for word in block.split() if word in french_words))

en_count, fr_count

english words: ['mortalities', 'based', 'survey', 'presented']
french words: ['on', 'are', 'in']


(4, 3)

set()