# Import libraries

In [2]:
from text_matcher.matcher import Text, Matcher
import json
import pandas as pd

from IPython.display import clear_output
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
#pd.set_option('display.max_colwidth', None)

# Load in our Data Files

In [3]:
# Load Middlemarch .txt file 
# (Note: must have 'middlemarch.txt' in this directory)
with open('../algorithm-testing/middlemarch.txt') as f: 
    rawMM = f.read()

mm = Text(rawMM, 'Middlemarch')

# Load in the JSON file with our JSTOR articles and data from TextMatcher
# (Note: must have the file 'default.json' in the same directory as this notebook)
df = pd.read_json('../algorithm-testing/jstor-middlemarch-articles.json')

# Importing the spell-check package

## Input an article_id to run the spell-check on

Languages tested:
* English - ‘en’
* Spanish - ‘es’
* French - ‘fr’
* German - ‘de’

In [34]:
from spellchecker import SpellChecker

# ‼️ 🛑 Make sure to change the variable below to the correct article id 🛑  ‼️
article_id  = 'http://www.jstor.org/stable/439034' # CHANGE THIS to article id

spell_check_language = 'en'

spell = SpellChecker(language = spell_check_language)

# Use article_id to get the index of the article in our DataFrame
article_index = df[df['id'] == article_id].index[0]
article_text = df['fullText'].loc[article_index]
article_title = df['title'].loc[article_index]

# Assign the full text of this article to a variable called `cleaned_article_text`, with text-matcher's Text function
cleaned_article_text = Text(article_text, article_title)

word_list = ((" ").join(article_text)).split(" ")

# find those words that may be misspelled
incorrect = []
for lang in ['en','fr','es','de']:
    spell = SpellChecker(language = lang)
    misspelled = spell.unknown(word_list)
    incorrect.append(len(misspelled))

incorrect_percentage = float(min(incorrect))/len(word_list)



['From', 'Good', 'Looks', 'to', 'Good', 'Thoughts:', 'Popular', "Women's", 'History', 'and', 'the', 'Invention', 'of', 'Modernity,', 'ca.', '1830-1870', 'MIRIAM', 'ELIZABETH', 'BURSTEIN', 'State', 'University', 'of', 'New', 'York-Brockport', 'In', 'the', 'famous', '"Prelude"', 'and', 'conclusion', 'to', 'Middlemarch', '(1871-72)', 'George', 'Eliot', 'addresses', 'the', 'question', 'of', "woman's", 'work', 'in', 'the', 'modern', 'world', 'via', 'the', 'figure', 'of', 'Saint', 'Theresa.', 'The', '"later-born', 'Theresas,"', 'Eliot', 'writes,', '"were', 'helped', 'by', 'no', 'coherent', 'social', 'faith', 'and', 'order', 'which', 'could', 'perform', 'the', 'function', 'of', 'knowledge', 'for', 'the', 'ardently', 'willing', 'soul";', 'sim-', 'ilarly,', 'she', 'says', 'in', 'the', 'conclusion,', '"[a]', 'new', 'Theresa', 'will', 'hardly', 'have', 'the', 'opportunity', 'of', 'reforming', 'a', 'conventual', 'life,', 'any', 'more', 'than', 'a', 'new', 'Anti-', 'gone', 'will', 'spend', 'her', '

# Run OCR On PDF

This cell converts a pdf to an image and then runs ocr on it.

In [37]:
from PIL import Image
import pytesseract
import urllib.request
from pdf2image import convert_from_path

pdf = "test_pdf.pdf"
pages = convert_from_path(pdf)

new_ocr_text = ""
for page in pages:
    new_ocr_text += pytesseract.image_to_string(page, lang="eng")

['@', '|', 'THE', 'UNIVERSITY', 'OF', 'CHICAGO', 'PRESS', 'JOURNALS\n\nFrom', 'Good', 'Looks', 'to', 'Good', 'Thoughts:', 'Popular', "Women's", 'History', 'and', 'the', 'Invention', 'of\nModernity,', 'ca.', '1830-1870\n\nAuthor(s):', 'Miriam', 'Elizabeth', 'Burstein\nSource:', 'Modern', 'Philology,', 'Aug.,', '1999,', 'Vol.', '97,', 'No.', '1', '(Aug.,', '1999),', 'pp.', '46-75\nPublished', 'by:', 'The', 'University', 'of', 'Chicago', 'Press\n\nStable', 'URL:', '|https://www.jstor.org/stable/', '439034\n\nREFERENCES\n\nJSTOR', 'is', 'a', 'not-for-profit', 'service', 'that', 'helps', 'scholars,', 'researchers,', 'and', 'students', 'discover,', 'use,', 'and', 'build', 'upon', 'a', 'wide\nrange', 'of', 'content', 'in', 'a', 'trusted', 'digital', 'archive.', 'We', 'use', 'information', 'technology', 'and', 'tools', 'to', 'increase', 'productivity', 'and\nfacilitate', 'new', 'forms', 'of', 'scholarship.', 'For', 'more', 'information', 'about', 'JSTOR,', 'please', 'contact', 'support@jstor.o

# Rerun Spellcheck

In [41]:
spell_check_language = 'en'
print(new_ocr_text[0])
word_list = new_ocr_text.split(" ")
spell = SpellChecker(language = spell_check_language)

misspelled = spell.unknown(word_list)
print(misspelled)
incorrect_percentage = float(len(misspelled))/len(word_list)

print(incorrect_percentage)

@
{'hilton,', 'century,', 'so\nfar', 'catalogs,', 'at\nleast', 'exemplarity\nwith', 'demarcates', 'character;', 'world.\n\naccordingly,', 'england.', 'by\nirrational', 'mcleod,', 'of\n(and', 'particularity.', 'visible\nwoman,', 'enthusiasm”;', 'rohan\nmaitzen', 'evangelicalism', 'maitzen,', 'https://about.jstor.org/terms\n72', 'elizabeth’s', '(1877),', 'masculinization', 'mill,', '1850;', 'purposes.', '1851),', 'l.', '“m.a.k.”', 'anachronism;', 'past\nages,', '262;', 'claims,', 'silence,', 'miriam,', '1837-1876', 'contemporary\nmorals—because', 'accumulation.', 'the\nimpossibility', 'ours,”\nin', 'diffusive:', 's.', 'along\nlines', 'was\npublished,', 'factors:\nthe', 'do-\nmesticated', 'future\nparticipation', 'manuals—\nalso', '10-11.\n\n60.', 'caused,”', 'bowles,', 'in\n\n12.', 'prog-\nress', 'best,', 'time,', 'words\nof', 'whose\nvirtues,', 'practices.', 'mod-\nern', 'events,', 'for”', 'cox’s\nfemale', 'with\nher', 'magazine\n102', 'anything\nother', 'women”', 'feminists,', 'faculti