<a href="https://colab.research.google.com/github/milmor/deep-puma/blob/main/Image-preprocessing/Evaluacion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluación del rendimiento del OCR

### Acceso a Drive

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# NOTE(review): later cells use relative 'drive/MyDrive/...' paths, which reach
# this mount point only when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

### Bibliotecas

In [None]:
!pip install datasets
!pip install jiwer jiwer

In [None]:
from datasets import load_metric
import pandas as pd

### Character Error Rate (CER)
$$CER = \frac{S + D + I}{N} = \frac{S + D + I}{S + D + C}$$

Donde:

* S: Número de sustituciones.
* D: Número de eliminaciones.
* I: Número de inserciones.
* N: Número de caracteres en la referencia.
* C: Número de caracteres correctos.


In [None]:
# Metric object for the Character Error Rate; its compute() method is invoked
# later through get_ocr_evaluation.
cer = load_metric(path='cer')

### Word Error Rate (WER)

$$ WER = \frac{S + D + I}{N} = \frac{S + D + I}{S + D + C} $$

Donde:

* S: Número de palabras sustituidas.
* D: Número de palabras eliminadas.
* I: Número de palabras insertadas.
* N: Número de palabras en la referencia.
* C: Número de palabras correctas.

In [None]:
# Metric object for the Word Error Rate; its compute() method is invoked later
# through get_ocr_evaluation.
wer = load_metric(path='wer')

In [None]:
def get_prediction_text(path, encoding='utf-8'):
  """Return the full contents of the text file at `path`.

  Args:
    path: Location of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale settings.

  Returns:
    The file contents as a single string.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the contents are not valid in `encoding`.
  """
  # The context manager closes the handle even when read() raises.
  with open(path, 'r', encoding=encoding) as text_file:
    return text_file.read()

In [None]:
def get_ocr_evaluation(outputs_path, files_info, references, metric):
  """Score the OCR text files found under `outputs_path` with `metric`.

  Args:
    outputs_path: Prefix placed in front of every output file name. No
      separator is inserted, so it must already end with one.
    files_info: Iterable of file names. Only the text before the first '.' of
      each name is kept, and '_text.txt' is appended to locate the OCR output.
    references: Reference texts, handed to the metric unchanged.
      Assumed to follow the same order as `files_info` — TODO confirm.
    metric: Object exposing `compute(predictions=..., references=...)`.

  Returns:
    Whatever `metric.compute` returns.
  """
  # NOTE(review): a name such as 'a.b.jpg' is reduced to 'a' here; confirm this
  # matches how the OCR output files were named.
  stems = []
  for file_name in files_info:
    stems.append(file_name.split('.')[0])

  predictions = []
  for stem in stems:
    prediction_path = '{}{}_text.txt'.format(outputs_path, stem)
    predictions.append(get_prediction_text(prediction_path))

  return metric.compute(predictions=predictions, references=references)

### Lectura de datos

In [None]:
# CSV file that is loaded into `transcriptions_df` below. The path is relative,
# so it resolves against the current working directory.
# NOTE(review): presumably the notebook runs from /content, where Drive is
# mounted — confirm.
TRANSCRIPTIONS_PATH = 'drive/MyDrive/Datos - Hackathon JusticIA/JusticIA_DatosTranscripciones.csv'

In [None]:
# Load the whole transcriptions table; later cells filter it by 'Conjunto'.
transcriptions_df = pd.read_csv(filepath_or_buffer=TRANSCRIPTIONS_PATH)

### Evaluación de Fichas_manual

In [None]:
# Keep only the rows whose 'Conjunto' column equals 'Fichas_manual'.
is_manual = transcriptions_df['Conjunto'] == 'Fichas_manual'
manual_df = transcriptions_df.loc[is_manual]

In [None]:
# File names and reference texts of the Fichas_manual subset.
files_info, references = manual_df['NombreArchivo'], manual_df['Texto']

In [None]:
# Character Error Rate for the Fichas_manual outputs.
# The columns are read straight from manual_df rather than from the shared
# `files_info` / `references` globals: those names are reassigned in the
# Fichas_auto section, so re-running this cell afterwards would mix
# Fichas_manual outputs with Fichas_auto inputs.
cer_manual = get_ocr_evaluation(outputs_path='drive/MyDrive/HackathonRIIAA2021/Texts/Fichas_manual/',
                                files_info=manual_df['NombreArchivo'],
                                references=manual_df['Texto'],
                                metric=cer)

In [None]:
# Word Error Rate for the Fichas_manual outputs.
# The columns are read straight from manual_df rather than from the shared
# `files_info` / `references` globals: those names are reassigned in the
# Fichas_auto section, so re-running this cell afterwards would mix
# Fichas_manual outputs with Fichas_auto inputs.
wer_manual = get_ocr_evaluation(outputs_path='drive/MyDrive/HackathonRIIAA2021/Texts/Fichas_manual/',
                                files_info=manual_df['NombreArchivo'],
                                references=manual_df['Texto'],
                                metric=wer)

In [None]:
# Report both scores for the Fichas_manual directory.
manual_summary = 'Evaluación directorio Fichas_manual:\nCER = {}\nWER={}'.format(cer_manual, wer_manual)
print(manual_summary)

### Evaluación de Fichas_auto

In [None]:
# Keep only the rows whose 'Conjunto' column equals 'Fichas_auto'.
is_auto = transcriptions_df['Conjunto'] == 'Fichas_auto'
auto_df = transcriptions_df.loc[is_auto]

In [None]:
# File names and reference texts of the Fichas_auto subset.
files_info, references = auto_df['NombreArchivo'], auto_df['Texto']

In [None]:
# Character Error Rate for the Fichas_auto outputs.
# The columns are read straight from auto_df rather than from the shared
# `files_info` / `references` globals: those names are also assigned in the
# Fichas_manual section, so an out-of-order run would mix Fichas_auto outputs
# with Fichas_manual inputs.
cer_auto = get_ocr_evaluation(outputs_path='drive/MyDrive/HackathonRIIAA2021/Texts/Fichas_auto/',
                              files_info=auto_df['NombreArchivo'],
                              references=auto_df['Texto'],
                              metric=cer)

In [None]:
# Word Error Rate for the Fichas_auto outputs.
# The columns are read straight from auto_df rather than from the shared
# `files_info` / `references` globals: those names are also assigned in the
# Fichas_manual section, so an out-of-order run would mix Fichas_auto outputs
# with Fichas_manual inputs.
wer_auto = get_ocr_evaluation(outputs_path='drive/MyDrive/HackathonRIIAA2021/Texts/Fichas_auto/',
                              files_info=auto_df['NombreArchivo'],
                              references=auto_df['Texto'],
                              metric=wer)

In [None]:
# Report both scores for the Fichas_auto directory.
auto_summary = 'Evaluación directorio Fichas_auto:\nCER = {}\nWER={}'.format(cer_auto, wer_auto)
print(auto_summary)