In [1]:
# Import glob, a module that helps with file management.
import glob
import pandas as pd
from spellchecker import SpellChecker
from pathlib import Path
from nltk import wordpunct_tokenize
from collections import Counter
import os
import re
from openpyxl import load_workbook
from openpyxl.styles import Alignment, Font
import numpy as np

In [2]:
# Define the output path and create the directory if it doesn't exist
outputpath = "./final"
outputfile_path = Path(outputpath)
outputfile_path.mkdir(exist_ok=True)

# Get the current working directory
texts_folder = Path.cwd()

# Find all .txt files in the current directory
texts_list = glob.glob("*.txt")
print("Text files to be spellchecked:", texts_list)

# Set doc_name to the name of the first .txt file found in texts_list (without the extension)
if texts_list:
    doc_name = Path(texts_list[0]).stem
else:
    doc_name = None

print(f"doc_name = '{doc_name}'")

Text files to be spellchecked: ['Le paradoxe.txt']
doc_name = 'Le paradoxe'


In [3]:
# Load the spellchecker dictionary.
# Replace the language attribute with another 2 letter code
# to select another language. Options are: English - ‘en’, Spanish - ‘es’,
# French - ‘fr’, Portuguese - ‘pt’, German - ‘de’, Russian - ‘ru’.

spell = SpellChecker(language='fr')

In [4]:
# Initialize an empty list to collect data
data_list = []

# Set the folder for the input text files
with pd.ExcelWriter(f'{doc_name}_spellcheck_data.xlsx', engine='openpyxl') as writer:
    for txt_file in texts_folder.glob('*.txt'):
        
        # Open each text file and read text into `ocrText`
        with open(txt_file, 'r') as inputFile:
            ocrText = inputFile.read()
            
        # Join hyphenated words that are split between lines
        ocrText = ocrText.replace("-\n", "")
        
        # Tokenize the text
        tokens = wordpunct_tokenize(ocrText)
        
        # Lowercase all tokens and filter out non-alphabetic tokens
        tokens = [token.lower() for token in tokens if token.isalpha()]
        
        # Identify unknown words
        unknown = spell.unknown(tokens)
        
        # Calculate readability
        if len(unknown) != 0:
            readability = round(100 - (float(len(unknown)) / float(len(tokens)) * 100), 2)
        else:
            readability = 100
        
        # Sort the unknown words first by 'counts' descending and then by 'unknown_tokens' alphabetically
        sorted_unknown = sorted(Counter(unknown).items(), key=lambda item: (-item[1], item[0]))
        
        # Append the data for this file to the list
        data_list.append({
            "file_name": txt_file.as_posix(),
            "token_count": len(tokens),
            "unknown_count": len(unknown),
            "readability": readability,
        })
        
        # Write the DataFrame to a new sheet in the Excel file
        pd.DataFrame([data_list[-1]]).to_excel(writer, sheet_name=os.path.splitext(txt_file.name)[0], index=False)
        
        # Write the unknown tokens and counts vertically
        sheet = writer.sheets[os.path.splitext(txt_file.name)[0]]
        start_row = 6  # Add two extra rows of space
        sheet.cell(row=start_row, column=1, value="unknown_tokens").font = Font(bold=True)
        sheet.cell(row=start_row, column=2, value="counts").font = Font(bold=True)
        for i, (token, count) in enumerate(sorted_unknown, start=start_row + 1):
            sheet.cell(row=i, column=1, value=token)
            sheet.cell(row=i, column=2, value=count)
        
        # Print a message indicating the file has been processed
        print(txt_file, "checked for readability.")
    
    # Create a DataFrame from the collected data
    df = pd.DataFrame(data_list)
    
    # Write the combined DataFrame to a summary sheet
    df.to_excel(writer, sheet_name='Summary', index=False)

# Load the workbook to adjust cell alignment
wb = load_workbook(f'{doc_name}_spellcheck_data.xlsx')

# Align cells in the summary sheet
summary_sheet = wb['Summary']
for row in summary_sheet.iter_rows(min_row=2, max_row=summary_sheet.max_row, min_col=1, max_col=4):
    for cell in row:
        cell.alignment = Alignment(vertical='top')

# Align cells in each individual sheet
for sheet_name in wb.sheetnames:
    if sheet_name != 'Summary':
        sheet = wb[sheet_name]
        for row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, min_col=1, max_col=4):
            for cell in row:
                cell.alignment = Alignment(vertical='top')

# Save the workbook with the updated alignment
wb.save(f'{doc_name}_spellcheck_data.xlsx')

# Preview the DataFrame
df

/home/lucas-jerusalimiec/Documents/OCR Text/Text/Bodin/Le paradoxe/Le paradoxe.txt checked for readability.


Unnamed: 0,file_name,token_count,unknown_count,readability
0,/home/lucas-jerusalimiec/Documents/OCR Text/Te...,16319,3811,76.65


In [5]:
unknown_words = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
                 'ainfi', 'chofe', 'chofes', 'fapience', 'fouuerain', 'fou- \nuarain', 'fou- \nuerain', 'magnani- \nmité']
known_words = ['', '', '', '', '', '', '', '', '', '',
               'ainsi', 'chose', 'choses', 'sapience', 'souverain', 'souverain', 'souverain', 'magnanimité']

In [6]:
# Identify the sample_output file path.
outputfile = f'{outputfile_path.as_posix()}/{doc_name}_corrected.txt'

# Apply the following loop to one file at a time in filePath.
for file in texts_folder.glob('*.txt'):
    
    # Open a file in "read" (r) mode.
    text = open(file, "r")
    
    # Read in the contents of that file.
    word_correction = text.read()

    word_correction = word_correction.lower()
    
    # Find instances of and unknown word and replace
    # with a known word
    for i in range(len(known_words)):
          
        unknown_word = unknown_words[i]
    
        known_word = known_words[i]

        word_correction = word_correction.replace(unknown_word, known_word)
      
        print(f"All instances of '{unknown_word}' replaced with '{known_word}'.")
 
    # Reopen the file in "write" (w) mode.
    file = open(outputfile, "w")
    
    # Add the changed word into the reopened file.
    file.write(word_correction)
    
    # Close the file.
    file.close()

All instances of '1' replaced with ''.
All instances of '2' replaced with ''.
All instances of '3' replaced with ''.
All instances of '4' replaced with ''.
All instances of '5' replaced with ''.
All instances of '6' replaced with ''.
All instances of '7' replaced with ''.
All instances of '8' replaced with ''.
All instances of '9' replaced with ''.
All instances of '0' replaced with ''.
All instances of 'ainfi' replaced with 'ainsi'.
All instances of 'chofe' replaced with 'chose'.
All instances of 'chofes' replaced with 'choses'.
All instances of 'fapience' replaced with 'sapience'.
All instances of 'fouuerain' replaced with 'souverain'.
All instances of 'fou- 
uarain' replaced with 'souverain'.
All instances of 'fou- 
uerain' replaced with 'souverain'.
All instances of 'magnani- 
mité' replaced with 'magnanimité'.


In [7]:
# Process each .txt file in the folder
for filename in os.listdir(outputpath):
    if filename.endswith('.txt'):
        file_path = os.path.join(outputpath, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text_data = file.read()

        text_data = text_data.replace("-\n", "")
    
        words = wordpunct_tokenize(text_data)
    
        misspelled = spell.unknown(words)

        if len(misspelled) != 0:
            readability = round(100 - (float(len(misspelled)) / float(len(words)) * 100), 2)
        else:
            readability = 100

        # Count the frequency of each misspelled word
        word_counts = Counter(misspelled)

        # Sort word_counts first by 'count' descending and then by 'word' alphabetically
        sorted_word_counts = sorted(word_counts.items(), key=lambda item: (-item[1], item[0]))

        # Create a DataFrame and sort by frequency
        misspelled_df = pd.DataFrame(sorted_word_counts, columns=['word', 'count'])

        # Add readability as the first row
        readability_header = pd.DataFrame([["Readability", readability]], columns=['word', 'count'])
        blank_rows = pd.DataFrame([["", ""]], columns=['word', 'count'])
        header_row = pd.DataFrame([["word", "count"]], columns=['word', 'count'])
        misspelled_df = pd.concat([readability_header, blank_rows, header_row, misspelled_df], ignore_index=True)

        # Save to a CSV file named after the .txt file
        csv_filename = os.path.splitext(filename)[0] + '_spellchecker.csv'
        misspelled_df.to_csv(os.path.join(outputpath, csv_filename), index=False, header=False)
        
        print(f'Spellchecked {filename}. Readability = {readability}')

Spellchecked Le paradoxe_corrected.txt. Readability = 82.48
