# OCR Metrics - CER and WER (Example Notebook)

Example notebook for Medium article - Evaluating OCR Output with CER and WER 

*Highly recommended to run this notebook in Google Colab!*

Link to article: https://towardsdatascience.com/evaluating-ocr-output-quality-with-character-error-rate-cer-and-word-error-rate-wer-853175297510       
Link to GitHub repo: https://github.com/Hanqing15/OCR-ancient-coin-legend-recognition.git

Author: **YANG Hanqing**

Based on the original work by: **Kenneth Leung**

___
## 1. Install and import necessary libraries

In [1]:
# Globally ignore every warning raised from this point on.
# NOTE: this also silences DeprecationWarning / FutureWarning messages emitted by
# the libraries used below, so API deprecations will not be visible in the output.
import warnings
warnings.filterwarnings('ignore')

In [2]:

!pip install pybind11
!pip install fastwer
!pip install pytesseract
!sudo apt install tesseract-ocr
!pip install pandas
!pip install xrld
!pip install openpyxl



[notice] A new release of pip available: 22.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2 -> 22.2.2




[notice] To update, run: python.exe -m pip install --upgrade pip


'sudo' 不是内部或外部命令，也不是可运行的程序
或批处理文件。



[notice] A new release of pip available: 22.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


ERROR: Could not find a version that satisfies the requirement xrld (from versions: none)
ERROR: No matching distribution found for xrld



[notice] A new release of pip available: 22.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip




In [3]:
import cv2
import pytesseract
import fastwer
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image, display
import xlrd
import openpyxl
import sklearn


___
## 2. Simple example of CER calculation

In [5]:
# Toy example: `output` is the OCR hypothesis, `ref` is the ground-truth legend
output = " IMP ab"
ref = 'IMP CONSTANTINVS P F AVG '

# Sentence-level Character Error Rate (CER): char_level=True compares characters
fastwer.score_sent(
    output,
    ref,
    char_level=True,
)

100.0

In [2]:
# Obtain Sentence-Level Word Error Rate (WER) for the same `output` / `ref` pair
fastwer.score_sent(output, ref, char_level=False) # char_level=False selects WER (word level); True would give CER

SyntaxError: invalid character '，' (U+FF0C) (Temp/ipykernel_16116/1126390725.py, line 2)

___
## 3. Read reference text from Excel

In [4]:
# Open the workbook that holds the reference legends
file_path = r"C://Users//Ana//Desktop//ancient_monaie//Monnaies_copy.xlsx"
data = openpyxl.load_workbook(file_path)

# Select the worksheets by title. Workbook.get_sheet_by_name() is deprecated in
# openpyxl; indexing the workbook with the sheet title is the supported form.
sheet_monnaies = data["Monnaies_copy"]
mintMark = data["mintMark"]

# Print the reference values, one worksheet column after the other.
# Each entry is (worksheet, column index, exclusive stop row); reading starts at
# row 2 (row 1 is presumably a header row - TODO confirm against the workbook).
for sheet, column, stop_row in (
    (sheet_monnaies, 4, 863),  # legend_obverse
    (sheet_monnaies, 7, 863),  # legend_reverse
    (mintMark, 1, 796),        # mintMark
):
    for i in range(2, stop_row):
        print(sheet.cell(row=i, column=column).value)


D N MAXIMIANO FELICISSIMO SEN AVG
I MP MAXIMIANVS AVG
IMP MAXIMIANVS AV G
DIVO CONSTANTIO PIO
IMP MAXIMIANVS P F AVG
CONSTANTINVS P F AVG
IMP C CONSTANTIVS P F AVG
 
IMP DIOCLETIANVS AVG
IMP DIOCLETIANVS P F AVG
CONSTANTIVS NOB CAES
MAXIMIANVS NOB CAES
IMP DIOCLETIANVS AVG
IMP C CONSTANTINVS P F AVG
IMP MAXIMIANVS P F AVG
FL VAL CONSTANTI 
IMP MAXIMIANVS P F AVG
GAL VAL MAXIMINVS NOBIL C
IMP CONSTANTINVS P F AVG
DIVO CONSTANTIO AVG
D N MAXIMIANO BAEATISSIMO SEN AVG
IMP MAXIMIANVS P F AVG
IMP DIOCLETIANVS P F AVG
CONSTANTIVS NOB C
IMP C CONSTANTINVS P F AVG
MAXIMIANVS NOB CAES
IMP DIOCLETIANVS P F AVG
CONSTANTIVS NOB CAES
MAXIMIANVS NOBIL CAES
MAXIMIANVS NOB CAES
CONSTANTIVS NOB CAES
IMP MAXIMIANVS P AVG
FL VAL CONSTANTIVS NOB C
IMP C MAXIMIANVS P F AVG 
IMP MAXIMIANVS AVG
IMP C VAL MAXIMIANVS P F AVG
IMP C MAXIMIANVS P F AVG
MAXIMIANVS NOB CAES
IMP DIOCLETIANVS P F AVG
IMP MAXIMIANVS P AVG
CONSTANTIVS NOB CAES
IMP MAXIMIANVS AVG
DIVO CONSTANTIO PIO
IMP CONSTANTINVS P F AVG
IMP MAXIMIAN

___
## 4.1 Obverse

In [8]:
# Show the sample images that we will work on (Need to create a folder to upload sample images in Colab environment)
# 1. Obverse image list, sorted by the integer embedded in 'toulouse_1_<N>_obverse.jpg'
img_list_obverse = os.listdir('./toulouse_coins_1_obverse')
img_list_obverse.sort(key=lambda x: int(x.split('toulouse_1_')[1].split('_obverse.jpg')[0]))
# img_list_obverse  (uncomment to see the list of images)

# 2. OCR configuration
pytesseract.pytesseract.tesseract_cmd = r'D://Users//Tesseract//tesseract.exe'  # add the path of your tesseract.exe
custom_config_obverse = r'--oem 0 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTVXY'  # test when psm = 6/7/8/10 respectively

# 3. OCR each image, pair it with its reference text and score it.
# One record per image is collected and the DataFrame is built once at the end:
# DataFrame.append() was removed in pandas 2.0 and copies the whole frame on every call.
records_obverse = []
for i, img in enumerate(img_list_obverse):
    im = cv2.imread(f'./toulouse_coins_1_obverse/{img}')
    if im is None:
        # cv2.imread() returns None instead of raising when the file cannot be read
        raise FileNotFoundError(f'Could not read image: ./toulouse_coins_1_obverse/{img}')
    img_rgb = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

    # To test with your own language package, use lang='lat_cus' instead
    output_obverse = pytesseract.image_to_string(img_rgb, lang='lat', config=custom_config_obverse)
    output_obverse = str(output_obverse).replace('\n', '')  # Replace new lines in output

    # The i-th image (in sorted order) is paired with worksheet row i + 2, column 4
    # NOTE(review): an empty worksheet cell yields None here - confirm fastwer accepts it
    ref = sheet_monnaies.cell(row=i + 2, column=4).value

    # 4. CER and WER for obverse, rounded to 2 decimal places
    cer = fastwer.score_sent(output_obverse, ref, char_level=True)
    wer = fastwer.score_sent(output_obverse, ref, char_level=False)
    records_obverse.append({
        'img_filename': img,
        'ocr_output': output_obverse,
        'ref_text': ref,
        'cer': round(cer, 2),
        'wer': round(wer, 2),
    })

# Explicit column list keeps the column order (and the columns themselves when no image was found)
df_output_obverse = pd.DataFrame(
    records_obverse,
    columns=['img_filename', 'ocr_output', 'ref_text', 'cer', 'wer'],
)

df_output_obverse

Unnamed: 0,img_filename,ocr_output,ref_text,cer,wer
0,toulouse_1_1_obverse.jpg,NR KFETMJT J M VA T J JNQ SQKFRQ Q...,D N MAXIMIANO FELICISSIMO SEN AVG,315.15,800.0
1,toulouse_1_2_obverse.jpg,M N NQTTVFNIVQPVQQMAVQQMVE VDVQ V V VDQVTVV ...,I MP MAXIMIANVS AVG,694.74,750.0
2,toulouse_1_3_obverse.jpg,QN LQQ E QSJNHQ Q Q Q Q VA Q VT EF F AL JQJQLL...,IMP MAXIMIANVS AV G,1736.84,2175.0
3,toulouse_1_4_obverse.jpg,A S QSF PAPA M APIJNREP KMIMN I A A A I TAMSBQ...,DIVO CONSTANTIO PIO,1068.42,1866.67
4,toulouse_1_5_obverse.jpg,JLFJ MRVE AFP JQI VINVJFTI II VSVL I L I V J V...,IMP MAXIMIANVS P F AVG,1004.55,1560.0
...,...,...,...,...,...
856,toulouse_1_857_obverse.jpg,QSJ P X S SQ MS FI MSNJSIMFAAQQPTV V C GNIKM I...,FL VAL CONSTANTINVS NOB C,756.0,940.0
857,toulouse_1_858_obverse.jpg,M M MM PM MV V M MMTNPM M A M M M MMMMV MEPTM ...,MAXIMIANVS NOB CAES,2200.0,3633.33
858,toulouse_1_859_obverse.jpg,VA M I Q V V T VIQV I AEPF P NMVV IQ V N V ...,IMP CONSTANTINVS P F AVG,579.17,1320.0
859,toulouse_1_860_obverse.jpg,JJ MM P EMP N II MF KNINF TESL Q VEQL QT...,IMP CONSTANTINVS P F AVG,325.0,640.0


In [9]:
# 5. Overall performances for obverse
mean_cer = df_output_obverse['cer'].mean()
mean_wer = df_output_obverse['wer'].mean()
print(f'Mean obverse CER = {mean_cer}%, Mean obverse WER = {mean_wer}%')


Mean obverse CER = 1243.0216027874571%, Mean obverse WER = 1776.291277584203%


## 4.2 Reverse

In [None]:
# 1. Reverse image list, sorted by the integer embedded in 'toulouse_1_<N>_reverse.jpg'
img_list_reverse = os.listdir('./toulouse_coins_1_reverse')
img_list_reverse.sort(key=lambda x: int(x.split('toulouse_1_')[1].split('_reverse.jpg')[0]))
# img_list_reverse  (uncomment to see the list of images)

# 2. OCR configuration
custom_config_reverse = r'--oem 0 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTVXY'  # test when psm = 6/7/8/10 respectively

# 3. OCR each image, pair it with its reference text and score it.
# One record per image is collected and the DataFrame is built once at the end:
# DataFrame.append() was removed in pandas 2.0 and copies the whole frame on every call.
records_reverse = []
for i, img in enumerate(img_list_reverse):
    im = cv2.imread(f'./toulouse_coins_1_reverse/{img}')
    if im is None:
        # cv2.imread() returns None instead of raising when the file cannot be read
        raise FileNotFoundError(f'Could not read image: ./toulouse_coins_1_reverse/{img}')
    img_rgb = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

    # To test with your own language package, use lang='lat_cus' instead
    output_reverse = pytesseract.image_to_string(img_rgb, lang='lat', config=custom_config_reverse)
    output_reverse = str(output_reverse).replace('\n', '')  # Replace new lines in output

    # The i-th image (in sorted order) is paired with worksheet row i + 2, column 7
    # NOTE(review): an empty worksheet cell yields None here - confirm fastwer accepts it
    ref = sheet_monnaies.cell(row=i + 2, column=7).value

    # 4. CER and WER for reverse, rounded to 2 decimal places
    cer = fastwer.score_sent(output_reverse, ref, char_level=True)
    wer = fastwer.score_sent(output_reverse, ref, char_level=False)
    records_reverse.append({
        'img_filename': img,
        'ocr_output': output_reverse,
        'ref_text': ref,
        'cer': round(cer, 2),
        'wer': round(wer, 2),
    })

# Explicit column list keeps the column order (and the columns themselves when no image was found)
df_output_reverse = pd.DataFrame(
    records_reverse,
    columns=['img_filename', 'ocr_output', 'ref_text', 'cer', 'wer'],
)

df_output_reverse

In [None]:
# 5.Overall performances for reverse
mean_cer = df_output_reverse['cer'].mean()
mean_wer = df_output_reverse['wer'].mean()
print(f'Mean reverse CER = {mean_cer}%, Mean reverse WER = {mean_wer}%')


## 4.3 MintMark

In [11]:
# 1.mintMark image list
img_list_mintMark = os.listdir('./toulouse_coins_1_mintMark')
img_list_mintMark.sort(key=lambda x:int(x.split('toulouse_1_')[1].split('_mintMark.jpg')[0]))
#img_list_mintMark （uncommenting to see the list of images）

pytesseract.pytesseract.tesseract_cmd = r'D://Users//Tesseract//tesseract.exe'
# 2.Create empty dataframe to store output
df_output_mintMark = pd.DataFrame(columns = ['img_filename', 'ocr_output'])

#Loop through mintMark images

custom_config_mintMark = r'--oem 0 --psm 10 -c tessedit_char_whitelist=ABCDEFGHIKLMNOPQRSTXΔϛЄΓΘ'#test when psm = 6/7/8/10 respectively
for img in img_list_mintMark:
  im = cv2.imread(f'./toulouse_coins_1_mintMark/{img}')
  img_rgb = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
  output_mintMark = pytesseract.image_to_string( img_rgb, config=custom_config_mintMark ) #config=custom_config
  #output_mintMark = pytesseract.image_to_string( img_rgb, lang = 'lat_cus', config=custom_config_mintMark )
  dictionary_mintMark = {'img_filename':img, 'ocr_output':str(output_mintMark)}
  df_output_mintMark = df_output_mintMark.append(dictionary_mintMark, ignore_index=True)
    
    
# 3. For mintMark
df_output_mintMark['ref_text'] = ''
df_output_mintMark['cer'] = ''
df_output_mintMark['wer'] = ''

# Replace new lines in output
df_output_mintMark['ocr_output'] = df_output_mintMark['ocr_output'].apply(lambda x: x.replace('\n',''))

for i,img in enumerate(img_list_mintMark):
    df_output_mintMark.loc[df_output_mintMark['img_filename'] == img, 'ref_text'] = mintMark.cell(row = i + 2, column = 1).value

#df_output_mintMark（uncommenting to see the results util this step）


# 4. CER and WER for mintMark
for index, row in df_output_mintMark.iterrows():
  filename = row['img_filename']
  ref = row['ref_text']
  output_mintMark = row['ocr_output']
  cer = fastwer.score_sent(output_mintMark, ref, char_level=True)
  wer = fastwer.score_sent(output_mintMark, ref, char_level=False)
  df_output_mintMark.loc[df_output_mintMark['img_filename'] == filename, 'cer'] = round(cer,2) # Round value to 2 decimal places
  df_output_mintMark.loc[df_output_mintMark['img_filename'] == filename, 'wer'] = round(wer,2)

df_output_mintMark

Unnamed: 0,img_filename,ocr_output,ref_text,cer,wer
0,toulouse_1_1_mintMark.jpg,,TT,100.0,100.0
1,toulouse_1_2_mintMark.jpg,M,PLG,100.0,100.0
2,toulouse_1_3_mintMark.jpg,,PLG,100.0,100.0
3,toulouse_1_4_mintMark.jpg,M,PTR,100.0,100.0
4,toulouse_1_5_mintMark.jpg,M,B,200.0,100.0
...,...,...,...,...,...
789,toulouse_1_857_mintMark.jpg,,PLG,100.0,100.0
790,toulouse_1_858_mintMark.jpg,,Q,100.0,100.0
791,toulouse_1_859_mintMark.jpg,,PLG,100.0,100.0
792,toulouse_1_860_mintMark.jpg,,PTR,100.0,100.0


In [12]:
# 5. Overall performances for mintMark
mean_cer = df_output_mintMark['cer'].mean()
mean_wer = df_output_mintMark['wer'].mean()
print(f'Mean mintMark CER = {mean_cer}%, Mean mintMark WER = {mean_wer}%')


Mean mintMark CER = 103.31654911838791%, Mean mintMark WER = 100.0%
