In [1]:
# Import glob, a module that helps with file management.
import glob
import pandas as pd
from spellchecker import SpellChecker
from pathlib import Path
from nltk import wordpunct_tokenize
from collections import Counter
import os
import re
from openpyxl import load_workbook
from openpyxl.styles import Alignment, Font
import numpy as np

In [2]:
# Define the output path and create the directory if it doesn't exist
outputpath = "./final"
outputfile_path = Path(outputpath)
outputfile_path.mkdir(exist_ok=True)

# Get the current working directory
texts_folder = Path.cwd()

# Find all .txt files in the current directory
texts_list = sorted(glob.glob("*.txt"))
print("Text files to be spellchecked:", texts_list)

current_directory = os.path.basename(os.getcwd())

print(f"current_directory = '{current_directory}'")

Text files to be spellchecked: ['Traité Justice IV.txt', 'Traité Justice I.txt', 'Traité Justice III.txt', 'Traite Justice VII.txt', 'Traite Justice V.txt', 'Traite Justice VI.txt', 'Traité Justice II.txt']
current_directory = 'Traité Justice'


In [3]:
# Load the spellchecker dictionary.
# Replace the language attribute with another 2 letter code
# to select another language. Options are: English - ‘en’, Spanish - ‘es’,
# French - ‘fr’, Portuguese - ‘pt’, German - ‘de’, Russian - ‘ru’.

spell = SpellChecker(language='fr')

In [4]:
# Initialize an empty list to collect data
data_list = []

with pd.ExcelWriter(f'{current_directory}_spellcheck_data.xlsx', engine='openpyxl') as writer:
    for txt_file in sorted(texts_folder.glob('*.txt')):
        
        # Extract the relevant parts of the file path
        parts = txt_file.parts[-3:]
        file_name = os.path.join(parts[0], parts[1], parts[2])
        
        # Open each text file and read text into `ocrText`
        with open(txt_file, 'r') as inputFile:
            ocrText = inputFile.read()
            
        # Join hyphenated words that are split between lines
        ocrText = ocrText.replace("-\n", "")
        
        # Tokenize the text
        tokens = wordpunct_tokenize(ocrText)
        
        # Lowercase all tokens and filter out non-alphabetic tokens
        tokens = [token.lower() for token in tokens if token.isalpha()]
        
        # Identify unknown words
        spell = SpellChecker()
        unknown = spell.unknown(tokens)
        
        # Calculate readability
        if len(unknown) != 0:
            readability = round(100 - (float(len(unknown)) / float(len(tokens)) * 100), 2)
        else:
            readability = 100
        
        # Sort the unknown words first by 'counts' descending and then by 'unknown_tokens' alphabetically
        sorted_unknown = sorted(Counter(unknown).items(), key=lambda item: (-item[1], item[0]))
        
        # Append the data for this file to the list
        data_list.append({
            "file_name": file_name,
            "token_count": len(tokens),
            "unknown_count": len(unknown),
            "readability": readability,
        })
        
        # Write the DataFrame to a new sheet in the Excel file
        pd.DataFrame([data_list[-1]]).to_excel(writer, sheet_name=os.path.splitext(txt_file.name)[0], index=False)
        
        # Write the unknown tokens and counts vertically
        sheet = writer.sheets[os.path.splitext(txt_file.name)[0]]
        start_row = 6  # Add two extra rows of space
        sheet.cell(row=start_row, column=1, value="unknown_tokens").font = Font(bold=True)
        sheet.cell(row=start_row, column=2, value="counts").font = Font(bold=True)
        for i, (token, count) in enumerate(sorted_unknown, start=start_row + 1):
            sheet.cell(row=i, column=1, value=token)
            sheet.cell(row=i, column=2, value=count)
        
        # Print a message indicating the file has been processed
        print(txt_file, "checked for readability.")
    
    # Create a DataFrame from the collected data
    df = pd.DataFrame(data_list)

  
    # Write the combined DataFrame to a summary sheet
    df.to_excel(writer, sheet_name='Summary', index=False)

# Load the workbook to adjust cell alignment
wb = load_workbook(f'{current_directory}_spellcheck_data.xlsx')

# Align cells in the summary sheet
summary_sheet = wb['Summary']
for row in summary_sheet.iter_rows(min_row=2, max_row=summary_sheet.max_row, min_col=1, max_col=4):
    for cell in row:
        cell.alignment = Alignment(vertical='top')

# Align cells in each individual sheet
for sheet_name in wb.sheetnames:
    if sheet_name != 'Summary':
        sheet = wb[sheet_name]
        for row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, min_col=1, max_col=4):
            for cell in row:
                cell.alignment = Alignment(vertical='top')

# Save the workbook with the updated alignment
wb.save(f'{current_directory}_spellcheck_data.xlsx')

df.head(20)

/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Traité Justice/Traité Justice IV.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Traité Justice/Traité Justice I.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Traité Justice/Traité Justice III.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Traité Justice/Traite Justice VII.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Traité Justice/Traite Justice V.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Traité Justice/Traite Justice VI.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Traité Justice/Traité Justice II.txt checked for readability.


Unnamed: 0,file_name,token_count,unknown_count,readability
0,L'Hospital/Traité Justice/Traité Justice IV.txt,42378,6680,84.24
1,L'Hospital/Traité Justice/Traité Justice I.txt,10832,2430,77.57
2,L'Hospital/Traité Justice/Traité Justice III.txt,23220,4174,82.02
3,L'Hospital/Traité Justice/Traite Justice VII.txt,6821,1593,76.65
4,L'Hospital/Traité Justice/Traite Justice V.txt,21050,4094,80.55
5,L'Hospital/Traité Justice/Traite Justice VI.txt,41921,6418,84.69
6,L'Hospital/Traité Justice/Traité Justice II.txt,13036,2723,79.11


In [5]:
unknown_words = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
known_words = ['', '', '', '', '', '', '', '', '', '']

In [6]:
first_file_iteration = True  # Flag to control the print statement

for file in texts_list:
    
    # Identify the output file path for each text file
    outputfile = f'{outputfile_path}/{Path(file).stem}_corrected.txt'
    
    # Open a file in "read" (r) mode
    with open(file, "r") as text:
        # Read in the contents of that file
        word_correction = text.read()

    word_correction = word_correction.lower()
    
    # Find instances of an unknown word and replace with a known word
    for i in range(len(known_words)):
        unknown_word = unknown_words[i]
        known_word = known_words[i]
        word_correction = word_correction.replace(unknown_word, known_word)
        
        if first_file_iteration:
            print(f"All instances of '{unknown_word}' replaced with '{known_word}'.")
    
    # Open the file in "write" (w) mode
    with open(outputfile, "w") as file:
        # Add the changed word into the reopened file
        file.write(word_correction)
    
    print(f"Corrected file saved as {outputfile}.")
    first_file_iteration = False  # Disable the flag after the first file

All instances of '1' replaced with ''.
All instances of '2' replaced with ''.
All instances of '3' replaced with ''.
All instances of '4' replaced with ''.
All instances of '5' replaced with ''.
All instances of '6' replaced with ''.
All instances of '7' replaced with ''.
All instances of '8' replaced with ''.
All instances of '9' replaced with ''.
All instances of '0' replaced with ''.
All instances of 'estran- 
gers' replaced with 'estrangers'.
All instances of 'rap- 
port' replaced with 'rapport'.
All instances of 'parle- 
ment' replaced with 'parlement'.
All instances of 'présen- 
tement' replaced with 'présentement'.
All instances of 'céré- 
monies' replaced with 'cérémonies'.
All instances of 'sei- 
gneur' replaced with 'seigneur'.
All instances of 'nor- 
mandie' replaced with 'normandie'.
All instances of 'reli- 
gions' replaced with 'religions'.
All instances of 'seule- 
ment' replaced with 'seulement'.
All instances of 'rai- 
sons' replaced with 'raisons'.
All instances of 'ad

In [7]:
import locale

# Set the locale to your desired setting (e.g., 'fr_FR.UTF-8' for French)
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')

# Assume spell is a previously defined spell checker instance
# outputfile_path is the directory path to save the Excel file
# outputpath is the directory path containing the .txt files

# Create an Excel writer object
with pd.ExcelWriter(f'{outputfile_path}/{Path.cwd().name}_corrected_spellcheck_data.xlsx', engine='openpyxl') as writer:

    # Get list of files and sort them alphabetically using locale-aware sorting
    files = [f for f in os.listdir(outputpath) if f.endswith('.txt')]
    sorted_files = sorted(files, key=locale.strxfrm)

    for filename in sorted_files:
        file_path = os.path.join(outputpath, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text_data = file.read()

        text_data = text_data.replace("-\n", "")

        words = wordpunct_tokenize(text_data)

        misspelled = spell.unknown(words)

        if len(misspelled) != 0:
            readability = round(100 - (float(len(misspelled)) / float(len(words)) * 100), 2)
        else:
            readability = 100

        # Count the frequency of each misspelled word
        word_counts = Counter(misspelled)

        # Sort word_counts first by 'count' descending and then by 'word' alphabetically
        sorted_word_counts = sorted(word_counts.items(), key=lambda item: (-item[1], item[0]))

        # Create a DataFrame and sort by frequency
        misspelled_df = pd.DataFrame(sorted_word_counts, columns=['word', 'count'])

        # Add readability as the first row
        readability_header = pd.DataFrame([["Readability", readability]], columns=['word', 'count'])
        blank_rows = pd.DataFrame([["", ""]], columns=['word', 'count'])
        header_row = pd.DataFrame([["word", "count"]], columns=['word', 'count'])
        misspelled_df = pd.concat([readability_header, blank_rows, header_row, misspelled_df], ignore_index=True)

        # Write the DataFrame to a new sheet in the Excel file
        sheet_name = os.path.splitext(filename)[0]
        misspelled_df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)

        print(f'Spellchecked {filename}. Readability = {readability}')

print(f"All data combined into {outputfile_path}/{Path.cwd().name}_corrected_spellcheck_data.xlsx")


Spellchecked Traité Justice II_corrected.txt. Readability = 82.72
Spellchecked Traité Justice IV_corrected.txt. Readability = 87.01
Spellchecked Traité Justice III_corrected.txt. Readability = 85.17
Spellchecked Traite Justice V_corrected.txt. Readability = 84.13
Spellchecked Traité Justice I_corrected.txt. Readability = 81.58
Spellchecked Traite Justice VI_corrected.txt. Readability = 87.43
Spellchecked Traite Justice VII_corrected.txt. Readability = 80.9
All data combined into final/Traité Justice_corrected_spellcheck_data.xlsx
