## Perform OCR with Google Cloud Vision instead of Tesseract to improve dataset quality

### Get text for all the scanned PDFs

In [1]:
from utils import ocr
import pandas as pd
from tqdm import tqdm

# Directory that holds the downloaded corpus PDFs
path = 'corpus/descargados/'

import os

# Keep only the directory entries whose name ends in '.pdf'
pdf_files = list(filter(lambda name: name.endswith('.pdf'), os.listdir(path)))

print(len(pdf_files), 'files found')

4201 files found


### Count total number of scanned PDF pages in entire corpus

In [2]:
# Add up the page counts of every PDF that ocr.is_scanned_pdf reports as
# scanned; PDFs for which it returns a falsy value are skipped.
total_pages = 0
for file in tqdm(pdf_files):
    pdf_path = path + file
    if not ocr.is_scanned_pdf(pdf_path):
        continue
    total_pages += ocr.count_pages(pdf_path)

print('Total pages:', total_pages)

  0%|          | 0/4201 [00:00<?, ?it/s]

100%|██████████| 4201/4201 [12:59<00:00,  5.39it/s]

Total pages: 28639





Total scanned PDF pages: 28639

First 1k pages: free  
Next: 29.1966 MXN per unit (1 unit = 1,000 pages)  
  
Billable pages = 28639 − 1000 (free) = 27639  
Total Google Cloud Vision price = 27639 / 1000 × 29.1966 ≈ 806.96 MXN

### Convert PDFs to text using OCR from Google Cloud Vision

In [None]:
from utils import ocr
import pandas as pd
from tqdm import tqdm

path = 'corpus/descargados/'

# get all the PDF files in the directory
import os
pdf_files = [f for f in os.listdir(path) if f.endswith('.pdf')]

# Collect one record per PDF in a plain list and build the DataFrame once at
# the end. Calling pd.concat inside the loop re-copies all previously stored
# rows on every iteration (quadratic in the number of files), and concatenating
# onto an initially empty frame is deprecated in recent pandas versions.
rows = []

# NOTE(review): the page count used for the cost estimate only included PDFs
# flagged by ocr.is_scanned_pdf, while this loop sends every PDF to
# ocr.pdf2text -- confirm whether pdf2text skips non-scanned files itself.
for file in tqdm(pdf_files, desc="Processing PDFs"):
    text = ocr.pdf2text(path + file)
    # The document code is the part of the file name before the first dot
    codigo = file.split('.')[0]
    rows.append({'codigo': codigo, 'text': text})

# Explicit columns keep the 'codigo'/'text' header even if no PDF was found
df_google_ocr = pd.DataFrame(rows, columns=['codigo', 'text'])

# Save the dataframe to a CSV file for future use
df_google_ocr.to_csv('google_ocr_results.csv', index=False)

print("All files processed and saved to google_ocr_results.csv.")


### Process single PDF with Google Cloud Vision just to show the difference

In [3]:
# OCR a single sample document through ocr.pdf2text so its output can be
# compared with the text already stored in the corpus for the same document.
google_text = ocr.pdf2text('corpus/descargados/47095.pdf')

# Write with an explicit UTF-8 encoding: without it, open() falls back to the
# platform's locale encoding, which can fail on or mangle accented characters.
with open('corpus/google_text_47095_PDF.txt', 'w', encoding='utf-8') as f:
    f.write(google_text)

Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...


In [6]:
# Load the existing corpus table to export its stored text for the same
# document that was OCR'd with Google Cloud Vision.
df = pd.read_csv('corpus/corpus.csv')

# Get the text of the row with codigo 47095.
# A single .loc selection replaces the chained df[mask]['text'] indexing;
# like the original, this raises IndexError when no row matches.
text = df.loc[df['Codigo'] == 47095, 'text'].iloc[0]

# Explicit UTF-8 so the output does not depend on the platform's default
# locale encoding.
with open('corpus/47095_PDF.txt', 'w', encoding='utf-8') as f:
    f.write(text)