In [1]:
import os
import pandas as pd
from google.cloud import storage

# Initialize the Google Cloud Storage client.
# storage.Client() is called with no arguments, so nothing about the project or
# credentials is configured here.
# NOTE(review): presumably they are resolved from the kernel's environment —
# confirm credentials are set up before running the download cells.
client = storage.Client()

In [2]:
# Remote side: the bucket and the folder (object-name prefix) inside it that
# holds the source images.
bucket_name = 'declaraciones-renta-ai'
source_folder = 'data_pruebas/luvan_base_400_resultados/DEC_RENTA'

# Local side: where the files listed as "good" and as "bad" are written.
correct_folder = 'data_calidad_tercer_filtro/correctos_files'
incorrect_folder = 'data_calidad_tercer_filtro/incorrectos_files'

In [3]:
# Create both local destination folders up front. exist_ok=True makes
# re-running this cell harmless when the folders are already there.
for _local_dir in (incorrect_folder, correct_folder):
    os.makedirs(_local_dir, exist_ok=True)

In [5]:
# Function to download files from GCP bucket
def download_files_from_bucket(file_names, download_folder, bucket_name, source_folder, storage_client=None):
    """Download the named objects from a bucket folder into a local folder.

    Each entry of ``file_names`` is looked up in the bucket as
    ``"{source_folder}/{file_name}"`` and, when it exists, written to
    ``download_folder`` under the same file name. Objects that do not exist
    are reported and skipped.

    Args:
        file_names: Iterable of object base names to fetch.
        download_folder: Local directory that receives the files. It is
            created if it does not exist yet.
        bucket_name: Name of the bucket to read from.
        source_folder: Prefix (folder) inside the bucket.
        storage_client: Optional client object exposing ``bucket(name)``.
            When omitted, the module-level ``client`` is used, which is the
            original behavior.

    Returns:
        list: The file names that were found and downloaded, in input order,
        so callers can count real downloads instead of requested names.
    """
    gcs = client if storage_client is None else storage_client

    # Make the function usable on its own, without relying on an earlier cell
    # having created the folder. An empty string means "current directory".
    if download_folder:
        os.makedirs(download_folder, exist_ok=True)

    bucket = gcs.bucket(bucket_name)
    downloaded = []
    for file_name in file_names:
        source_blob_name = f"{source_folder}/{file_name}"
        destination_file_name = os.path.join(download_folder, file_name)
        blob = bucket.blob(source_blob_name)
        if blob.exists():
            blob.download_to_filename(destination_file_name)
            downloaded.append(file_name)
            print(f"Downloaded {file_name} to {download_folder}")
        else:
            print(f"File {file_name} not found in bucket.")
    return downloaded

In [5]:
# Step 1: Read the bad files CSV and extract file names
df_bad = pd.read_csv('quality_score_ocr_premium_5 - bad_second_version.csv')
bad_files = df_bad['file_name'].tolist()

# Step 2: Download bad files
download_files_from_bucket(bad_files, incorrect_folder, bucket_name, source_folder)

# Step 3: Save the amount of downloaded files
# Count the requested files that are actually present on disk instead of
# len(bad_files): names reported as "not found in bucket" must not be counted,
# because this number is reused later to size the sample of good files.
# A set is used so a name repeated in the CSV is counted once.
bad_files_on_disk = {
    name for name in bad_files
    if os.path.isfile(os.path.join(incorrect_folder, name))
}
amount_files = len(bad_files_on_disk)
print(f"Number of bad files downloaded: {amount_files}")

Downloaded 1110518358_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded archivo4 (76)_dr_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded 41778530-mariela ochoa fonseca_dr_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded archivo4 (78)_dr_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded 79536746_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded archivo4 - 2023-12-22t162748.693_dr_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded 42826276_dr_inconsistentes_page_002.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded archivo4 - 2023-12-21t151937.837_dr_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded 1022972783_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Downloaded 7222437_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/incorrectos_files
Do

In [6]:
# Step 4: Read the good files CSV and extract file names
df_good = pd.read_csv('quality_score_ocr_premium_5 - good.csv')
good_files = df_good['file_name'].tolist()

# Step 5: Reduce the good files list to the number of bad files
# (amount_files comes from the bad-files cell, which must have been run first)
reduced_good_files = good_files[:amount_files]

# Step 6: Download the reduced list of good files
download_files_from_bucket(reduced_good_files, correct_folder, bucket_name, source_folder)

# Report the requested files that are really present on disk rather than the
# number requested, so names missing from the bucket are not counted.
good_files_on_disk = {
    name for name in reduced_good_files
    if os.path.isfile(os.path.join(correct_folder, name))
}
print(f"Downloaded {len(good_files_on_disk)} good files to {correct_folder}")

Downloaded 23646877_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 1013635023-2022_dr_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 39794630_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded renta 2022 (1)_dr_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 1045675943_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 19260000_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 22527776_dr_consistentes_page_002.png to data_calidad_tercer_filtro/correctos_files
Downloaded archivo4 - 2023-12-21t141506.629_dr_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 33311851_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 37274270_dr_consistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 17653660_dr_consistentes_page

In [2]:
# Reload the list of good file names from the CSV. This reads the same file
# and column as the earlier "Step 4" cell, so good_files again holds every
# row of the CSV (not the reduced list).
df_good = pd.read_csv('quality_score_ocr_premium_5 - good.csv')
good_files = df_good['file_name'].tolist()


# NOTE(review): Step 7 below is disabled (commented out). As written it would
# slice the 50 entries of good_files that follow reduced_good_files; remove it
# or re-enable it once it is decided whether that subset is still needed.
# # Step 7: Extract 50 files after the reduced_good_files
# start_index = len(reduced_good_files)
# end_index = start_index + 50
# reduced_good_files_50 = good_files[start_index:end_index]

In [3]:
# Display how many file names were loaded into good_files.
len(good_files)

367

In [8]:
# Step 8: Download the full list of good files (every name in good_files,
# not the reduced list used in Step 6)
download_files_from_bucket(good_files, "data_calidad_tercer_filtro/correctos_files", bucket_name, source_folder)

# Report the requested files that are really present on disk rather than the
# number requested, so names missing from the bucket are not counted.
all_good_on_disk = {
    name for name in good_files
    if os.path.isfile(os.path.join("data_calidad_tercer_filtro/correctos_files", name))
}
print(f"Downloaded {len(all_good_on_disk)} good files to data_calidad_tercer_filtro/correctos_files")

Downloaded 23646877_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 1013635023-2022_dr_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 39794630_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded renta 2022 (1)_dr_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 1045675943_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 19260000_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 22527776_dr_consistentes_page_002.png to data_calidad_tercer_filtro/correctos_files
Downloaded archivo4 - 2023-12-21t141506.629_dr_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 33311851_dr_inconsistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 37274270_dr_consistentes_page_001.png to data_calidad_tercer_filtro/correctos_files
Downloaded 17653660_dr_consistentes_page

In [5]:
import os
import cv2
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import ProcessPoolExecutor, as_completed

# Functions to apply image transformations
def apply_blur(image, ksize=(5, 5)):
    """Return a Gaussian-blurred copy of ``image``.

    Args:
        image: Image array accepted by ``cv2.GaussianBlur``.
        ksize: Kernel size passed straight to ``cv2.GaussianBlur``.

    Returns:
        The blurred image. The sigma argument is 0, which leaves the choice
        of standard deviation to OpenCV.
    """
    blurred = cv2.GaussianBlur(image, ksize, 0)
    return blurred

def add_noise(image, mean=0, sigma=10):
    """Add Gaussian noise to an 8-bit image, saturating at 0 and 255.

    The noise is sampled as floats and added in floating point, then the sum
    is rounded and clipped to ``[0, 255]``. Casting the raw noise to ``uint8``
    first (as was done previously) makes every negative sample wrap around to
    a large positive value, so the noise could only brighten pixels and
    produced bright speckles instead of symmetric Gaussian noise.

    Args:
        image: NumPy array holding the image; values are treated as 8-bit
            intensities (assumes ``uint8`` input such as ``cv2.imread``
            returns — TODO confirm no other dtypes are passed).
        mean: Mean of the Gaussian noise. May be negative.
        sigma: Standard deviation of the Gaussian noise (must be >= 0).

    Returns:
        A new ``uint8`` array with the same shape as ``image``. The input is
        not modified.
    """
    noise = np.random.normal(mean, sigma, image.shape)
    noisy_image = np.clip(np.rint(image.astype(np.float64) + noise), 0, 255)
    return noisy_image.astype(np.uint8)

# Function to reduce contrast
def reduce_contrast(image, factor=0.5):
    """Lower the contrast of a BGR image by compressing its lightness channel.

    The image is converted to LAB, the L channel is rescaled as
    ``L * factor + 128 * (1 - factor)``, and the result is converted back to
    BGR. The a and b channels are passed through unchanged.

    Args:
        image: BGR image accepted by ``cv2.cvtColor``.
        factor: Weight applied to the L channel.

    Returns:
        The contrast-reduced image in BGR.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2LAB))

    # The second addWeighted input has weight 0, so only the scaled L channel
    # and the constant offset contribute to the result.
    offset = 128*(1-factor)
    lightness = cv2.addWeighted(lightness, factor, lightness, 0, offset)

    merged = cv2.merge((lightness, chan_a, chan_b))
    return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

# Function to apply JPEG compression (simulates compression artifacts)
def jpeg_compression(image, quality=30):
    """Round-trip an image through an in-memory JPEG encode and decode.

    Args:
        image: Image array accepted by ``cv2.imencode``.
        quality: JPEG quality setting passed as ``cv2.IMWRITE_JPEG_QUALITY``.

    Returns:
        The image decoded back from the JPEG buffer as a colour image.

    Raises:
        ValueError: If OpenCV reports that the JPEG encoding failed. The
            success flag was previously ignored, so a failed encode would
            only surface later as a confusing decode error or a ``None``.
    """
    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), int(quality)]
    encoded_ok, encimg = cv2.imencode('.jpg', image, encode_param)
    if not encoded_ok:
        raise ValueError(f"JPEG encoding failed (quality={quality})")
    # cv2.IMREAD_COLOR is the named constant for the literal 1 used before.
    return cv2.imdecode(encimg, cv2.IMREAD_COLOR)

def process_image(file_name, input_folder, output_folder):
    """Degrade one image (blur, noise, slight contrast loss) and save it.

    The image is read from ``input_folder/file_name`` and the degraded
    version is written to ``output_folder/file_name``.

    Args:
        file_name: Base name of the image file.
        input_folder: Directory the image is read from.
        output_folder: Directory the degraded image is written to.

    Returns:
        The output file path on success, or ``None`` when the image could not
        be loaded or could not be written (an error line is printed in both
        cases).
    """
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    # Read the image; cv2.imread signals failure by returning None.
    image = cv2.imread(input_file_path)

    if image is None:
        print(f"Error loading image: {input_file_path}")
        return None

    # Apply transformations
    image_mod = apply_blur(image, ksize=(5, 5))
    image_mod = add_noise(image_mod, mean=5, sigma=5)
    image_mod = reduce_contrast(image_mod, factor=0.99)

    # Save the transformed image to the output directory. cv2.imwrite reports
    # failure through its return value, so check it instead of announcing a
    # save that did not happen.
    if not cv2.imwrite(output_file_path, image_mod):
        print(f"Error saving image: {output_file_path}")
        return None

    print(f"Processed and saved image: {output_file_path}")
    return output_file_path

# Parallel processing function
def parallel_process_images(png_files, input_folder, output_folder, max_workers):
    """Run ``process_image`` over many files using a pool of worker processes.

    Args:
        png_files: Iterable of file names to process.
        input_folder: Directory the images are read from.
        output_folder: Directory the processed images are written to.
        max_workers: Maximum number of worker processes.

    Returns:
        None. One status line is printed per file as its task finishes.
    """
    # Each worker reseeds NumPy's global RNG on start-up: when workers are
    # created by forking, they would otherwise all inherit the parent's RNG
    # state and draw the same noise sequence in add_noise.
    with ProcessPoolExecutor(max_workers=max_workers, initializer=_reseed_worker_rng) as executor:
        # Submit all tasks to the process pool, remembering which file each
        # future belongs to so failures can be attributed.
        futures = {
            executor.submit(process_image, file_name, input_folder, output_folder): file_name
            for file_name in png_files
        }

        # Report each task as it completes (completion order, not input order).
        for future in as_completed(futures):
            file_name = futures[future]
            try:
                result = future.result()
            except Exception as exc:
                print(f"Exception occurred while processing {file_name}: {exc}")
                continue

            # process_image returns None when it could not load the image,
            # so do not report that as a completed file.
            if result is None:
                print(f"Failed processing: {file_name}")
            else:
                print(f"Completed processing: {result}")


def _reseed_worker_rng():
    """Give the calling worker process a fresh NumPy global RNG state."""
    np.random.seed()

# Example usage
# Read every PNG from the "correctos" folder and write its degraded version,
# under the same file name, into the "incorrectos" folder.
input_folder = 'data_calidad_tercer_filtro/correctos_files'
output_folder = 'data_calidad_tercer_filtro/incorrectos_files'

# Keep only directory entries whose name ends in '.png' (case-sensitive).
png_files = list(filter(lambda entry: entry.endswith('.png'), os.listdir(input_folder)))

parallel_process_images(
    png_files,
    input_folder,
    output_folder,
    max_workers=32,
)

Processed and saved image: data_calidad_tercer_filtro/incorrectos_files/17655718_dr_consistentes_page_001.png
Completed processing: data_calidad_tercer_filtro/incorrectos_files/17655718_dr_consistentes_page_001.png
Processed and saved image: data_calidad_tercer_filtro/incorrectos_files/544.declarenta - 2023-12-09t140615.325_dr_page_001.png
Processed and saved image: data_calidad_tercer_filtro/incorrectos_files/24370790_dr_consistentes_page_002.png
Processed and saved image: data_calidad_tercer_filtro/incorrectos_files/51894444_dr_inconsistentes_page_002.png
Processed and saved image: data_calidad_tercer_filtro/incorrectos_files/copia de 544.declarenta(15)_dr_page_001.png
Processed and saved image: data_calidad_tercer_filtro/incorrectos_files/544.declarenta - 2023-12-17t103926.630_dr_page_001.pngCompleted processing: data_calidad_tercer_filtro/incorrectos_files/544.declarenta - 2023-12-09t140615.325_dr_page_001.png
Completed processing: data_calidad_tercer_filtro/incorrectos_files/24370