In [1]:
# Import glob, a module that helps with file management.
import glob
import pandas as pd
from spellchecker import SpellChecker
from pathlib import Path
from nltk import wordpunct_tokenize
from collections import Counter
import os

In [2]:
#doc_name = 'Démonomanie'
outputpath = "./final"
outputfile_path = Path(outputpath)
outputfile_path.mkdir(exist_ok=True)
texts_folder = Path.cwd()
doc_name = texts_folder.name
texts_list = glob.glob("*.txt")
print("Text files to be spellchecked:", texts_list)

Text files to be spellchecked: ['Le paradoxe.txt']


In [3]:
# Load the spellchecker dictionary.
# Replace the language attribute with another 2 letter code
# to select another language. Options are: English - ‘en’, Spanish - ‘es’,
# French - ‘fr’, Portuguese - ‘pt’, German - ‘de’, Russian - ‘ru’.

spell = SpellChecker(language='fr')

In [4]:
### Dictionary Test a Folder of .txt Files ###

# We'll use Pandas to create a dataframe (a table) that can hold 
# information about an OCR'ed page and display it in a tabular format.
# This dataframe will start out empty with only its column headers 
# defined. We'll add information to it one page at a time. So each
# row will represent 1 page.

placeholder_values = ['placeholder', 0, 0, 0, 'placeholder', 'placeholder']

df = pd.DataFrame([placeholder_values], columns=["file_name","token_count","unknown_count","readability","unknown_words","text"])

# Set the folder for the input images

for txt_file in texts_folder.glob('*.txt'):
    
    # Open each text file and read text into `ocrText`
    with open(txt_file, 'r') as inputFile:
        ocrText = inputFile.read()
        
    # Join hyphenated words that are split between lines by 
    # looking for a hyphen followed by a newline character: "-\n"
    # "\n" is an "escape character" and represents the 
    # "newline," a character that is usually invisible 
    # to human readers but that computers use to mark the 
    # end/beginning of a line. Each time you press the 
    # Enter/Return key on your keyboard, an invisible "\n" 
    # is created to mark the beginning of a new line.
    ocrText = ocrText.replace("-\n","")
    
    # First, we'll use NLTK to "tokenize" text. 
            # "Tokenize" here means to take a page of our OCR'ed text,
            # which Python is currently reading as one big glob of data,
            # and separate each word out so that it can be read as an
            # individual piece of data within a larger data structure 
            # (a list). This process also removes punctuation.
    tokens = wordpunct_tokenize(ocrText)
    
    # Lowercase all tokens
    tokens = [token.lower() for token in tokens if token.isalpha()]
    
    # Now we can get all of the words that don't match the 
    # spellchecker dictionary or our list of place names--
    # these are the potential spelling errors.
    unknown = spell.unknown(tokens)
    
    # Let's use a little math to find out how many potential 
    # spelling errors were identified. As part of this process, 
    # we'll create a "readability" score that will give us a 
    # percentage of how readable each file is--how much of the 
    # OCR'ed is "correct."
        
    # If the list of unknown tokens (words) is greater than 0 
    # (i.e. if the list is not empty):
    if len(unknown) != 0:
            
               # Following order of operations, here's what's happening 
               # in the readability variable below:
               # 1. Divide the number of unknown tokens (len(unknown)) 
                    # by the total number of tokens on the page
                    # (len(tokens)). Use "float" to specify that Python
                    # returns a decimal number:
                        # (float(len(unknown))/float(len(tokens))
               # 2. Multiply the number from step 1 by 100.
                    # (float(len(unknown))/float(len(tokens)) * 100)
               # 3. Subtract the number from step 2 from 100.
                    # 100 - (float(len(unknown))/float(len(tokens)) * 100)
               # 4. Round the number from step 3 to 2 decimal places
                    # round(100 - (float(len(unknown))/float(len(tokens)) * 100), 2)
            
        readability = round(100 - (float(len(unknown))/float(len(tokens)) * 100), 2)
        
        # If the list of unknown tokens is empty (or equal to 0), then readability is 100!
    else:
        readability = 100
    
    # Let's create a record of the readability information 
    # for this page that we'll add to the dataframe. 
    # The following is a Python dictionary, another way of 
    # storing data. Each word or phrase to the left of the : is a
    # "key" -- think of it as a column header. Each piece of 
    # information to the right is a "value" -- information 
    # written in a single cell below each header. 

    df2 = pd.DataFrame({
            "file_name" : txt_file.as_posix(),
            "token_count" : len(tokens),
            "unknown_count" : len(unknown),
            "readability" : readability,
            "unknown_words" : [unknown],
            "text" : ocrText
            })

    df = pd.concat([df, df2])

    # This statement lets us know if a page has been successfully 
    # checked for readability.
    print(txt_file, "checked for readability.")
    
# This time, instead of creating individual .txt files for each page,
# we're going to save all of the OCR'ed text and readability 
# information to a single .csv ("comma separated value") file. 
# We can view this file format as a table. Having everything stored 
# like this will help us with clean up and future analysis.
df.to_csv('spellcheck_data.csv', header=True, index=False, sep=',')

# We have the data stored in a file now, but we can also 
# preview it here:
df

/home/lucas-jerusalimiec/Documents/OCR Text/Text/Bodin/Le paradoxe/Le paradoxe.txt checked for readability.


Unnamed: 0,file_name,token_count,unknown_count,readability,unknown_words,text
0,placeholder,0,0,0.0,placeholder,placeholder
0,/home/lucas-jerusalimiec/Documents/OCR Text/Te...,16319,3811,76.65,"{décenraifon, yautre, mcs, recouurer, nousauós...",\n\n LE PARADOXE MORAL \nDE. JEAN BODIN ANG...


In [5]:
unknown_words = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'magnani-', '\nuerain', 'uarain', 'fou-', ' mité', '\nmité']
known_words = ['', '', '', '', '', '', '', '', '', '', 'magnanimité', 'fouuerain', 'fouuerain', '', '', '']

In [6]:
# Identify the sample_output file path.
# Remember that our readability output is also stored 
# in this file as a .csv. We don't want to change it, 
# so we'll use glob to look for only .txt files.
outputfile = f'{outputfile_path.as_posix()}/{doc_name}_corrected.txt'

# Apply the following loop to one file at a time in filePath.
for file in texts_folder.glob('*.txt'):
    
    # Open a file in "read" (r) mode.
    text = open(file, "r")
    
    # Read in the contents of that file.
    word_correction = text.read()

    word_correction = word_correction.lower()
    
    # Find instances of and unknown word and replace
    # with a known word
    for i in range(len(known_words)):
          
        unknown_word = unknown_words[i]
    
        known_word = known_words[i]

        word_correction = word_correction.replace(unknown_word, known_word)

        print("All instances of " + unknown_word + " replaced with " + known_word + ".")    
 
    # Reopen the file in "write" (w) mode.
    file = open(outputfile, "w")
    
    # Add the changed word into the reopened file.
    file.write(word_correction)
    
    # Close the file.
    file.close()

All instances of 1 replaced with .
All instances of 2 replaced with .
All instances of 3 replaced with .
All instances of 4 replaced with .
All instances of 5 replaced with .
All instances of 6 replaced with .
All instances of 7 replaced with .
All instances of 8 replaced with .
All instances of 9 replaced with .
All instances of 0 replaced with .
All instances of magnani- replaced with magnanimité.
All instances of 
uerain replaced with fouuerain.
All instances of uarain replaced with fouuerain.
All instances of fou- replaced with .
All instances of  mité replaced with .
All instances of 
mité replaced with .


In [7]:
# Process each .txt file in the folder
for filename in os.listdir(outputpath):
    if filename.endswith('.txt'):
        file_path = os.path.join(outputpath, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text_data = file.read()

        text_data = text_data.replace("-\n","")
    
        words = wordpunct_tokenize(text_data)
    
        misspelled = spell.unknown(words)

        if len(misspelled) != 0:
                                
            readability = round(100 - (float(len(misspelled))/float(len(words)) * 100), 2)
        
        # If the list of unknown tokens is empty (or equal to 0), then readability is 100!
        else:
            readability = 100

        # Count the frequency of each misspelled word
        word_counts = Counter(misspelled)

        # SCreate a DataFrame and sort by frequency
        misspelled_df = pd.DataFrame(word_counts.items(), columns=['word', 'count'])
        misspelled_df = misspelled_df.sort_values(by='count', ascending=False)

        # Save to a CSV file named after the .txt file
        csv_filename = os.path.splitext(filename)[0] + '_spellechecker.csv'
        misspelled_df.to_csv(os.path.join(outputpath, csv_filename), index=False)
        
        print(f'Spellchecked {filename}. Readability = {readability}')

Spellchecked Le paradoxe_corrected.txt. Readability = 82.46
