In [1]:
# Import glob, a module that helps with file management.
import glob
import pandas as pd
from spellchecker import SpellChecker
from pathlib import Path
from nltk import wordpunct_tokenize
from collections import Counter
import os
import re
from openpyxl import load_workbook
from openpyxl.styles import Alignment, Font
import numpy as np

In [2]:
# Define the output path and create the directory if it doesn't exist
outputpath = "./final"
outputfile_path = Path(outputpath)
outputfile_path.mkdir(exist_ok=True)

# Get the current working directory
texts_folder = Path.cwd()

# Find all .txt files in the current directory
texts_list = glob.glob("*.txt")
print("Text files to be spellchecked:", texts_list)

current_directory = os.path.basename(os.getcwd())

print(f"current_directory = '{current_directory}'")

Text files to be spellchecked: ['Harangue - Saint Germain.txt', 'Harangue - Orléans 2.txt', 'Harangue - septembre.txt', 'Harangue - Fontainebleau.txt', 'Harangue - lit de justice.txt', 'Harangue - religion.txt', 'Harangue - Orléans.txt', 'Harangue - ouverture de parlement.txt', 'Harangue - parlement.txt', 'Lit de justice.txt', 'Harangue - parlement 3.txt', 'Harangue - Rouen.txt', 'Harangue - Poissy.txt', 'Harangue - parlement 2.txt']
current_directory = 'Harangues'


In [3]:
# Load the spellchecker dictionary.
# Replace the language attribute with another 2 letter code
# to select another language. Options are: English - ‘en’, Spanish - ‘es’,
# French - ‘fr’, Portuguese - ‘pt’, German - ‘de’, Russian - ‘ru’.

spell = SpellChecker(language='fr')

In [4]:
# Initialize an empty list to collect data
data_list = []

with pd.ExcelWriter(f'{current_directory}_spellcheck_data.xlsx', engine='openpyxl') as writer:
    for txt_file in texts_folder.glob('*.txt'):
        
        # Extract the relevant parts of the file path
        parts = txt_file.parts[-3:]
        file_name = os.path.join(parts[0], parts[1], parts[2])
        
        # Open each text file and read text into `ocrText`
        with open(txt_file, 'r') as inputFile:
            ocrText = inputFile.read()
            
        # Join hyphenated words that are split between lines
        ocrText = ocrText.replace("-\n", "")
        
        # Tokenize the text
        tokens = wordpunct_tokenize(ocrText)
        
        # Lowercase all tokens and filter out non-alphabetic tokens
        tokens = [token.lower() for token in tokens if token.isalpha()]
        
        # Identify unknown words
        spell = SpellChecker()
        unknown = spell.unknown(tokens)
        
        # Calculate readability
        if len(unknown) != 0:
            readability = round(100 - (float(len(unknown)) / float(len(tokens)) * 100), 2)
        else:
            readability = 100
        
        # Sort the unknown words first by 'counts' descending and then by 'unknown_tokens' alphabetically
        sorted_unknown = sorted(Counter(unknown).items(), key=lambda item: (-item[1], item[0]))
        
        # Append the data for this file to the list
        data_list.append({
            "file_name": file_name,
            "token_count": len(tokens),
            "unknown_count": len(unknown),
            "readability": readability,
        })
        
        # Write the DataFrame to a new sheet in the Excel file
        pd.DataFrame([data_list[-1]]).to_excel(writer, sheet_name=os.path.splitext(txt_file.name)[0], index=False)
        
        # Write the unknown tokens and counts vertically
        sheet = writer.sheets[os.path.splitext(txt_file.name)[0]]
        start_row = 6  # Add two extra rows of space
        sheet.cell(row=start_row, column=1, value="unknown_tokens").font = Font(bold=True)
        sheet.cell(row=start_row, column=2, value="counts").font = Font(bold=True)
        for i, (token, count) in enumerate(sorted_unknown, start=start_row + 1):
            sheet.cell(row=i, column=1, value=token)
            sheet.cell(row=i, column=2, value=count)
        
        # Print a message indicating the file has been processed
        print(txt_file, "checked for readability.")
    
    # Create a DataFrame from the collected data
    df = pd.DataFrame(data_list)

  
    # Write the combined DataFrame to a summary sheet
    df.to_excel(writer, sheet_name='Summary', index=False)

# Load the workbook to adjust cell alignment
wb = load_workbook(f'{current_directory}_spellcheck_data.xlsx')

# Align cells in the summary sheet
summary_sheet = wb['Summary']
for row in summary_sheet.iter_rows(min_row=2, max_row=summary_sheet.max_row, min_col=1, max_col=4):
    for cell in row:
        cell.alignment = Alignment(vertical='top')

# Align cells in each individual sheet
for sheet_name in wb.sheetnames:
    if sheet_name != 'Summary':
        sheet = wb[sheet_name]
        for row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, min_col=1, max_col=4):
            for cell in row:
                cell.alignment = Alignment(vertical='top')

# Save the workbook with the updated alignment
wb.save(f'{current_directory}_spellcheck_data.xlsx')

df.head(20)

/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - Saint Germain.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - Orléans 2.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - septembre.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - Fontainebleau.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - lit de justice.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - religion.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - Orléans.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - ouverture de parlement.txt checked for readability.
/home/lucas-jerus



/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - parlement 3.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - Rouen.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - Poissy.txt checked for readability.
/home/lucas-jerusalimiec/Documents/OCR Text/Text/L'Hospital/Harangues/Harangue - parlement 2.txt checked for readability.


Unnamed: 0,file_name,token_count,unknown_count,readability
0,L'Hospital/Harangues/Harangue - Saint Germain.txt,2930,836,71.47
1,L'Hospital/Harangues/Harangue - Orléans 2.txt,1306,406,68.91
2,L'Hospital/Harangues/Harangue - septembre.txt,1047,405,61.32
3,L'Hospital/Harangues/Harangue - Fontainebleau.txt,738,281,61.92
4,L'Hospital/Harangues/Harangue - lit de justice...,3566,884,75.21
5,L'Hospital/Harangues/Harangue - religion.txt,2259,646,71.4
6,L'Hospital/Harangues/Harangue - Orléans.txt,7221,1675,76.8
7,L'Hospital/Harangues/Harangue - ouverture de p...,2505,738,70.54
8,L'Hospital/Harangues/Harangue - parlement.txt,3992,1052,73.65
9,L'Hospital/Harangues/Lit de justice.txt,2971,714,75.97


In [5]:
unknown_words = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'ad- \nvient', 'aul- \ntres', 'autun', 
                 'baul- \ndoin', 'céré- \nmonies', 'concile', 'conciles', 'créan- \nciers',
                 'esdict', 'estran- \ngers',  'hon- \nneur',
                 'jus- \ntice', 'llesdict', 'llesdictz', 'nor- \nmandie',  'of- \nficierz',
                 'paiz', 'parle- \nment', 'pra- \nticque', 'présen- \ntement',
                'quel- \nques',  'rai- \nsons', 'rap- \nport', 'réfor- \nmation', 'reli- \ngions',
                 'sei- \ngneur',  'seule- \nment', 'suc- \ncession',
                've- \nnoient']
known_words = ['', '', '', '', '', '', '', '', '', '', 'advient',  'aultres', 'aucun',
               'bauldoin', 'cérémonies', 'conseil', 'conseils', 'créanciers', 'lesdict',
               'estrangers', 'honneur',
               'justice', 'lesdict', 'lesdictz', 'normandie', 'officierz',
               'paix', 'parlement', 'praticque', 'présentement',
               'quelques', 'raisons', 'rapport', 'réformation', 'religions',
               'seigneur',  'seulement', 'succession', 
               'venoient'
               ]

In [6]:
first_file_iteration = True  # Flag to control the print statement

for file in texts_list:
    
    # Identify the output file path for each text file
    outputfile = f'{outputfile_path}/{Path(file).stem}_corrected.txt'
    
    # Open a file in "read" (r) mode
    with open(file, "r") as text:
        # Read in the contents of that file
        word_correction = text.read()

    word_correction = word_correction.lower()
    
    # Find instances of an unknown word and replace with a known word
    for i in range(len(known_words)):
        unknown_word = unknown_words[i]
        known_word = known_words[i]
        word_correction = word_correction.replace(unknown_word, known_word)
        
        if first_file_iteration:
            print(f"All instances of '{unknown_word}' replaced with '{known_word}'.")
    
    # Open the file in "write" (w) mode
    with open(outputfile, "w") as file:
        # Add the changed word into the reopened file
        file.write(word_correction)
    
    print(f"Corrected file saved as {outputfile}.")
    first_file_iteration = False  # Disable the flag after the first file

All instances of '1' replaced with ''.
All instances of '2' replaced with ''.
All instances of '3' replaced with ''.
All instances of '4' replaced with ''.
All instances of '5' replaced with ''.
All instances of '6' replaced with ''.
All instances of '7' replaced with ''.
All instances of '8' replaced with ''.
All instances of '9' replaced with ''.
All instances of '0' replaced with ''.
All instances of 'aul- 
tres' replaced with 'aultres'.
All instances of 'esdict' replaced with 'lesdict'.
All instances of 'estran- 
gers' replaced with 'estrangers'.
All instances of 'rap- 
port' replaced with 'rapport'.
All instances of 'parle- 
ment' replaced with 'parlement'.
All instances of 'présen- 
tement' replaced with 'présentement'.
All instances of 'céré- 
monies' replaced with 'cérémonies'.
All instances of 'sei- 
gneur' replaced with 'seigneur'.
All instances of 'nor- 
mandie' replaced with 'normandie'.
All instances of 'reli- 
gions' replaced with 'religions'.
All instances of 'seule- 
me

In [7]:
# Create an Excel writer object
with pd.ExcelWriter(f'{outputfile_path}/{Path.cwd().name}_corrected_spellcheck_data.xlsx', engine='openpyxl') as writer:

    for filename in os.listdir(outputpath):
        if filename.endswith('.txt'):
            file_path = os.path.join(outputpath, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text_data = file.read()

            text_data = text_data.replace("-\n", "")

            words = wordpunct_tokenize(text_data)

            misspelled = spell.unknown(words)

            if len(misspelled) != 0:
                readability = round(100 - (float(len(misspelled)) / float(len(words)) * 100), 2)
            else:
                readability = 100

            # Count the frequency of each misspelled word
            word_counts = Counter(misspelled)

            # Sort word_counts first by 'count' descending and then by 'word' alphabetically
            sorted_word_counts = sorted(word_counts.items(), key=lambda item: (-item[1], item[0]))

            # Create a DataFrame and sort by frequency
            misspelled_df = pd.DataFrame(sorted_word_counts, columns=['word', 'count'])

            # Add readability as the first row
            readability_header = pd.DataFrame([["Readability", readability]], columns=['word', 'count'])
            blank_rows = pd.DataFrame([["", ""]], columns=['word', 'count'])
            header_row = pd.DataFrame([["word", "count"]], columns=['word', 'count'])
            misspelled_df = pd.concat([readability_header, blank_rows, header_row, misspelled_df], ignore_index=True)

            # Write the DataFrame to a new sheet in the Excel file
            sheet_name = os.path.splitext(filename)[0]
            misspelled_df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)

            print(f'Spellchecked {filename}. Readability = {readability}')

print(f"All data combined into {outputfile_path}/{Path.cwd().name}_corrected_spellcheck_data.xlsx")


Spellchecked Harangue - Saint Germain_corrected.txt. Readability = 76.68
Spellchecked Harangue - Orléans 2_corrected.txt. Readability = 74.02
Spellchecked Harangue - religion_corrected.txt. Readability = 76.34
Spellchecked Harangue - ouverture de parlement_corrected.txt. Readability = 75.84
Spellchecked Lit de justice_corrected.txt. Readability = 80.22
Spellchecked Harangue - parlement 3_corrected.txt. Readability = 71.32
Spellchecked Harangue - lit de justice_corrected.txt. Readability = 79.88
Spellchecked Harangue - Fontainebleau_corrected.txt. Readability = 69.61
Spellchecked Harangue - septembre_corrected.txt. Readability = 68.67
Spellchecked Harangue - parlement 2_corrected.txt. Readability = 76.77
Spellchecked Harangue - parlement_corrected.txt. Readability = 78.38
Spellchecked Harangue - Orléans_corrected.txt. Readability = 81.0
Spellchecked Harangue - Poissy_corrected.txt. Readability = 75.79
Spellchecked Harangue - Rouen_corrected.txt. Readability = 79.78




All data combined into final/Harangues_corrected_spellcheck_data.xlsx
