# Import libraries

In [1]:
from text_matcher.matcher import Text, Matcher
import json
import pandas as pd
import random
from IPython.display import clear_output
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
#pd.set_option('display.max_colwidth', None)
import os, json, uuid

# Load in our Data Files

🛑 Set the path to the JSON Lines (`.jsonl`) file of articles to run the spell-check on


In [4]:
# Read the JSTOR article records into a DataFrame. The file is in JSON Lines
# format (one JSON object per line), hence `lines=True`.
# (Note: the path is relative to the directory this notebook is run from)
articles = pd.read_json(
    '../algorithm-testing/jstor-gender-trouble-all-articles.jsonl',
    lines=True,
)

# Importing the spell-check package

Languages tested:
* English - ‘en’
* Spanish - ‘es’
* French - ‘fr’
* German - ‘de’

Make sure to run `pip install pyspellchecker` in terminal before running the cell below

Recommended accuracy bound, derived from the 75th percentile of the misspelled-word fraction in a 30-article sample (assuming a normal distribution):

    mean = 0.12764241550612537
    n = 30
    std = 0.02330421008
    [25 percentile, 75 percentile]: [0.1119237258,0.1433611052]

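    Recommended bound: 85.66 (= 100 × (1 − 0.1433611052), i.e. the 75th-percentile error rate expressed as an accuracy percentage)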

Measured running times:

20 articles in 12.7 seconds: 0.64 seconds/article

50 articles in 39.7 seconds: 0.79 seconds/article

100 articles in 59.4 seconds: 0.594 seconds/article

500 articles in 385.5 seconds: 0.77 seconds/article

~ 0.775 seconds/article from median of 100 articles

In [13]:
from spellchecker import SpellChecker

# ‼️ 🛑 Make sure to change the variable below to the desired accuracy bound 🛑  ‼️
# An article is flagged when its share of recognised words, in percent, is below `bound`.
bound = 85.66

# How many leading rows of `articles` to score; set to None to score every row.
max_articles = 30

# pyspellchecker language codes tried when detecting an article's language
languages = ['en', 'fr', 'es', 'de']

# Build one checker per language up front instead of five per article:
# the checkers do not depend on the article, so constructing them inside
# the loop only repeats the same work.
spell_checkers = {lg: SpellChecker(language=lg) for lg in languages}

# Private, seeded generator: the language-detection sample (and therefore
# the set of flagged articles) is the same on every Restart & Run All,
# and the global `random` state is left untouched.
sampler = random.Random(0)

# initialize dictionary pairing article numbers with
# [article id, row index, title, misspelled fraction, detected language]
articles_read_scores = {}

# `head` never reaches past the end of the frame, so a data file with fewer
# than `max_articles` rows no longer raises a KeyError.
rows_to_score = articles if max_articles is None else articles.head(max_articles)

for article_index in rows_to_score.index:

    # defining variables
    article_id = articles.loc[article_index, 'id']
    article_text = articles.loc[article_index, 'fullText']
    article_title = articles.loc[article_index, 'title']

    # get articleID number (last path component of the stable URL)
    article_number = article_id.split('/')[-1]

    # Tokenize the full text of this article with text-matcher's Text class
    word_list = Text(article_text, article_title).getTokens()

    # An article without tokens has no score; skip it instead of dividing by zero.
    if not word_list:
        print(f"Skipping {article_number}: no tokens found")
        continue

    # finding the document language: spell-check a 1-in-40 random sample of
    # the tokens in every language and keep the one with the fewest unknown
    # words (ties resolve to the earliest entry in `languages`)
    sample_words = sampler.sample(word_list, len(word_list) // 40)
    unknown_counts = [len(spell_checkers[lg].unknown(sample_words)) for lg in languages]
    lang = languages[unknown_counts.index(min(unknown_counts))]

    # find those words that may be misspelled, using the detected language
    misspelled = spell_checkers[lang].unknown(word_list)

    # fraction of the article reported as misspelled (the readability score)
    incorrect_percentage = len(misspelled) / len(word_list)

    # adds article number: [id, index, title, score, language] to the dictionary
    # when the accuracy is below the bound; only articles detected as English
    # are flagged
    if (1 - incorrect_percentage) < float(bound) / 100 and lang == 'en':
        articles_read_scores[article_number] = [article_id, article_index, article_title, incorrect_percentage, lang]

# articles and scores stored in dictionary articles_read_scores
articles_scores = pd.DataFrame.from_dict(articles_read_scores, orient='index', columns=['ArticleID', 'OriginalIndex', 'Title', 'Score', 'Language'])

# Visualizing the article scores as a table

In [8]:
# Render the table of flagged articles built by the spell-check cell
# (one row per article whose accuracy fell below `bound`).
display(articles_scores)

Unnamed: 0,ArticleID,OriginalIndex,Title,Score,Language
40978757,http://www.jstor.org/stable/40978757,2,Références bibliographiques des ouvrages et ar...,0.277379,en


# Run OCR On PDF

## Preparing for rerunning OCR

Download the PDFs of the articles listed above and place them into a folder called `incorrect_articles_pdfs`, located in the same directory as this notebook.

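Name each PDF with the corresponding article ID number, e.g. `40978757.pdf` (JSTOR should do this automatically when you download the files). The file name, minus the `.pdf` extension, is used to look the article up in the table above.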

## Rerunning OCR and Spellcheck

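This cell iterates through all the PDFs in the `incorrect_articles_pdfs` folder, reruns OCR on them, runs the spell-check on the newly OCR-ed text, and, when the result is a sufficient improvement, replaces the article text in the in-memory copy of the data (the JSON data file itself is rewritten in the next section).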
   
Before this can work, you need to install Tesseract and all Tesseract language packs. On a Mac this can be done with `brew install tesseract` and `brew install tesseract-lang`. For other platforms, check the documentation: https://tesseract-ocr.github.io/

You also need to install pdf2image, which converts each PDF page into an image usable by Tesseract OCR. Installation guide here: https://pypi.org/project/pdf2image/

In [None]:
# ‼️ 🛑 Make sure to change the variable below to the desired replacement bound (i.e. how much the new OCR must improve on the old OCR before the old text is replaced) 🛑  ‼️
# presumably a fraction on the same scale as the 'Score' column (0.10 = ten percentage points) — TODO confirm

replace = 0.10


In [14]:
import pytesseract
from pdf2image import convert_from_path
import os

# mapping of language shortcut for spellcheck to tesseract
lang_mapping = {
    "en": "eng",
    "es": "spa",
    "fr": "fra",
    "de": "deu"
}

folder = "incorrect_articles_pdfs"

# Fresh copy of the article data; improved OCR text is written into this frame.
articles_chg = pd.read_json('../algorithm-testing/jstor-gender-trouble-all-articles.jsonl', lines=True)

# article number -> [article number, title, article id, old score, language]
# for every PDF whose new OCR was not a sufficient improvement
no_improvement = {}

# one spell checker per language, created on first use and then reused
ocr_spell_checkers = {}

# sorted() makes the processing (and printing) order independent of the file system
for pdf in sorted(os.listdir(folder)):

    # remove the ".pdf" to have the article_number; anything that is not a
    # PDF (e.g. .DS_Store) is ignored instead of being handed to pdf2image
    article_number, extension = os.path.splitext(pdf)
    if extension.lower() != ".pdf":
        continue

    # a PDF without a matching flagged article cannot be scored against an old value
    if article_number not in articles_read_scores:
        print(f"Skipping {pdf}: no flagged article with this number")
        continue

    # entry layout written by the spell-check cell:
    # [article id, row index, title, misspelled fraction, language]
    article_id, original_index, article_title, old_error, pdf_lang = articles_read_scores[article_number]

    # converts the pdf into images that are used for OCR
    pdf_page_images = convert_from_path(os.path.join(folder, pdf))

    # run ocr on every page except the first
    # NOTE(review): page 1 is skipped as in the original code — presumably the
    # JSTOR cover page; confirm
    new_ocr_text = "".join(
        pytesseract.image_to_string(image, lang=lang_mapping[pdf_lang])
        for image in pdf_page_images[1:]
    )

    # rerun spellcheck for readability score
    word_list = Text(new_ocr_text, article_title).getTokens() if new_ocr_text.strip() else []

    if word_list:
        if pdf_lang not in ocr_spell_checkers:
            ocr_spell_checkers[pdf_lang] = SpellChecker(language=pdf_lang)
        misspelled = ocr_spell_checkers[pdf_lang].unknown(word_list)
        incorrect_percentage = len(misspelled) / len(word_list)
        print(f"New incorrect percentage for:{article_number}", incorrect_percentage)
        improvement = old_error - incorrect_percentage
    else:
        # OCR produced no tokens (e.g. a one-page PDF): nothing to score, so
        # treat it as "no improvement" rather than dividing by zero
        improvement = 0.0

    # TODO: replace texts within json file (decide when we should do this and whether user input should be involved)
    # `old_error` and `improvement` are plain floats, so the comparison yields a
    # single bool; the threshold is the `replace` value set in the cell above
    if improvement > replace:
        # .at writes exactly one cell: the full text of the article's original row
        articles_chg.at[original_index, 'fullText'] = new_ocr_text
        print('+')
    else:
        no_improvement[article_number] = [article_number, article_title, article_id, old_error, pdf_lang]
        print('-')
    

New incorrect percentage: 0.1407035175879397


## Replace fixable articles with new article text

In [None]:
# create randomly named temporary file (in the same directory as the target)
# to avoid interference with other thread/asynchronous request
filename = '../algorithm-testing/jstor-gender-trouble-all-articles.jsonl'

tempfile = os.path.join(os.path.dirname(filename), str(uuid.uuid4()))

try:
    # A DataFrame cannot be serialised by json.dump. to_json with
    # orient='records', lines=True writes one JSON object per row, which is
    # the JSON Lines layout that pd.read_json(..., lines=True) reads back.
    # NOTE(review): read_json's default type/date conversion may have altered
    # some columns on load — spot-check the written file against the original.
    articles_chg.to_json(tempfile, orient='records', lines=True)

    # rename temporary file replacing old file; os.replace overwrites an
    # existing destination on every platform (os.rename fails on Windows)
    os.replace(tempfile, filename)
except BaseException:
    # do not leave a half-written temporary file next to the data
    if os.path.exists(tempfile):
        os.remove(tempfile)
    raise

## Display the articles that could not be fixed by rerunning OCR

In [None]:
# Build the table from `no_improvement` (article number -> [number, title, id,
# old score, language]), the dictionary filled by the OCR cell for every article
# whose new OCR was not a sufficient improvement. The five columns below match
# that five-element layout; `articles_chg` is the full article frame and does not.
no_improvement_pd = pd.DataFrame.from_dict(no_improvement, orient='index', columns=['ArticleNumber', 'ArticleTitle', 'ArticleID', 'Score', 'Language'])

display(no_improvement_pd)