# A. Librerías y rutas

In [8]:
import os
import re
import time
import json
import platform
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [9]:
# 1. Detect the operating system (lowercased result of platform.system())
os_platform = platform.system().lower()

# File where the last processed position is persisted between runs
# (read by read_last_position / written by write_last_position)
LOG_FILE_PATH = "./scraping_log.txt"

# 2. Per-OS settings: (default project root, chromedriver binary file name)
_CHROMEDRIVER_SETTINGS = {
    "darwin": ("/Volumes/Projects/GitHubProjects/UNI-SCRAPING", "chromedriver"),
    "windows": ("D:\\GithubProjects\\Tesis\\Transparencia", "chromedriver.exe"),
    "linux": ("/home/brew_test_gcp_01/Desktop/WallyScraper", "chromedriver"),
}

# Fail early with a clear message instead of an opaque NameError further down
if os_platform not in _CHROMEDRIVER_SETTINGS:
    raise RuntimeError(
        f"Unsupported platform '{os_platform}'; expected one of {sorted(_CHROMEDRIVER_SETTINGS)}"
    )

_default_base_path, _chromedriver_binary = _CHROMEDRIVER_SETTINGS[os_platform]

# The CHROMEDRIVER_BASE_PATH environment variable, when set and non-empty,
# overrides the machine-specific default above
CHROMEDRIVER_BASE_PATH = os.environ.get("CHROMEDRIVER_BASE_PATH") or _default_base_path

# Layout: <base>/chromedriver/<os>/<binary>
CHROMEDRIVER_PATH = os.path.join(
    CHROMEDRIVER_BASE_PATH, "chromedriver", os_platform, _chromedriver_binary
)


# 3. Print the resolved values
print("ESTAMOS EN ---> ", os_platform)
print("CHROME_DRIVER_PATH: ", CHROMEDRIVER_PATH)
print("")

ESTAMOS EN --->  darwin
CHROME_DRIVER_PATH:  /Volumes/Projects/GitHubProjects/UNI-SCRAPING/chromedriver/darwin/chromedriver



# B. Funciones

In [10]:
def get_chrome_driver(chromedriver_path, print_view=False, headless=False):
    """Create a Chrome WebDriver using the chromedriver binary at ``chromedriver_path``.

    Args:
        chromedriver_path: Path to the chromedriver executable.
        print_view: When true, adds the ``--disable-print-preview`` flag.
        headless: When true, adds the ``--headless=new`` flag.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    # Flags that are always applied, in the order they are passed to Chrome
    chrome_flags = [
        "--no-sandbox",
        "--start-maximized",
        "--disable-dev-shm-usage",
        "--ignore-certificate-errors",
    ]

    # Optional flags
    if print_view:
        chrome_flags.append("--disable-print-preview")
    if headless:
        chrome_flags.append("--headless=new")

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    # Start the browser through a Service bound to the given driver binary
    return webdriver.Chrome(
        service=Service(executable_path=chromedriver_path),
        options=options,
    )

def read_last_position():
    """Return the position stored in ``LOG_FILE_PATH``, or 0 when none is usable.

    Returns:
        The stored non-negative integer. 0 is returned when the file does not
        exist or when its content (ignoring surrounding whitespace) is not a
        plain run of decimal digits.
    """
    try:
        with open(LOG_FILE_PATH, 'r') as log_file:
            # Strip so a trailing newline (e.g. from a manual edit) does not
            # invalidate an otherwise valid number
            last_position = log_file.read().strip()
    except FileNotFoundError:
        # No log yet: start from the beginning
        return 0

    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts characters such as superscripts, which make int() raise
    return int(last_position) if last_position.isdecimal() else 0

def write_last_position(position):
    """Overwrite ``LOG_FILE_PATH`` with the string form of ``position``."""
    with open(LOG_FILE_PATH, mode='w') as position_file:
        serialized = str(position)
        position_file.write(serialized)

def to_snake_case(s):
    """Turn a label into a lowercase, underscore-separated key.

    Combining marks (accents) are dropped, characters that are neither word
    characters nor whitespace are removed, surrounding whitespace is trimmed,
    and each run of whitespace becomes a single ``_``.
    """
    # Decompose so accents become separate combining marks, then drop them
    decomposed = unicodedata.normalize('NFD', s)
    without_marks = ''.join(
        filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
    )

    # Remove special characters and trim the ends
    cleaned = re.sub(r'[^\w\s]', '', without_marks).strip()

    # Lowercase, then collapse whitespace runs into underscores
    return re.sub(r'\s+', '_', cleaned.lower())

# C. Prueba

In [26]:
# Start a visible (non-headless) Chrome session for the manual walkthrough below
driver = get_chrome_driver(chromedriver_path=CHROMEDRIVER_PATH, print_view=False, headless=False)

In [27]:
# Load the search form page in the browser session
url = 'https://e-consultaruc.sunat.gob.pe/cl-ti-itmrconsruc/FrameCriterioBusquedaWeb.jsp'
driver.get(url)

In [29]:
# Type the RUC into the text field located with By.ID, "txtRuc"
driver.find_element(By.ID, "txtRuc").send_keys("10000002858")

In [30]:
# Click the button located with By.ID, "btnAceptar"
driver.find_element(By.ID, "btnAceptar").click()

In [31]:
# Grab the HTML of the current page and parse it with BeautifulSoup
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# soup  (uncomment to inspect the parsed document)

In [32]:
# Close the browser session opened for the manual walkthrough
driver.quit()

# D. Automatizar

In [11]:
def _parse_ruc_panel(html):
    """Extract the fields of the results panel from a page's HTML.

    Args:
        html: HTML source of the page shown after submitting the search.

    Returns:
        A dict mapping snake_case labels to their text values, or ``None``
        when no ``div`` is found with ``class_='panel panel-primary'``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    panel_div = soup.find('div', class_='panel panel-primary')
    if panel_div is None:
        return None

    data = {}

    # A missing heading is tolerated so the remaining rows are still collected
    # instead of losing the whole record to an AttributeError
    heading_div = panel_div.find('div', class_='panel-heading')
    if heading_div is not None:
        data[to_snake_case("Resultado de la Búsqueda")] = heading_div.get_text(strip=True)

    for item in panel_div.find_all('div', class_='list-group-item'):
        row = item.find('div', class_='row')
        if row is None:
            continue

        # Rows with a col-sm-5 / col-sm-7 pair: label and value
        label_div = row.find('div', class_='col-sm-5')
        value_div = row.find('div', class_='col-sm-7')
        if label_div is not None and value_div is not None:
            data[to_snake_case(label_div.get_text(strip=True))] = value_div.get_text(strip=True)

        # Rows with exactly four col-sm-3 cells: two label/value pairs side by side
        cells = row.find_all('div', class_='col-sm-3')
        if len(cells) == 4:
            for label_cell, value_cell in ((cells[0], cells[1]), (cells[2], cells[3])):
                data[to_snake_case(label_cell.get_text(strip=True))] = value_cell.get_text(strip=True)

    return data


def scrape_ruc(driver, ruc, timeout=10, settle_seconds=2):
    """Search one RUC on the consultation page and return the scraped fields.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        ruc: RUC value typed into the search field.
        timeout: Seconds to wait for the input field and the search button.
        settle_seconds: Fixed pause after clicking the search button, before
            the page source is read.

    Returns:
        A dict of snake_case labels to text values; an empty dict when the
        results panel is not found; ``None`` when any exception is raised
        while interacting with the page or parsing it (the error is printed).
    """
    url = 'https://e-consultaruc.sunat.gob.pe/cl-ti-itmrconsruc/FrameCriterioBusquedaWeb.jsp'
    driver.get(url)

    try:
        # Type the RUC into the text field once it is visible
        ruc_input = WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.ID, "txtRuc"))
        )
        ruc_input.clear()
        ruc_input.send_keys(ruc)

        # Click the search button once it is clickable
        search_button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.ID, "btnAceptar"))
        )
        search_button.click()
        time.sleep(settle_seconds)  # Give the result page a moment to load

        # Parse whatever page is currently displayed
        data = _parse_ruc_panel(driver.page_source)
        if data is None:
            print(f"No se encontró el panel para RUC {ruc}")
            return {}

        return data

    except Exception as e:
        # Best-effort: report the failure and let the caller move on to the next RUC
        print(f"Error durante la operación con RUC {ruc}: {e}")
        return None

In [12]:
# Maximum number of RUCs to process in this run; set to None to use the whole file
MAX_RUCS = 4

# One RUC per line; surrounding whitespace is trimmed and blank lines are
# skipped so they do not turn into empty search queries
with open('../data/ids.txt', 'r') as file:
    rucs = [line.strip() for line in file.read().splitlines() if line.strip()]

rucs = rucs[:MAX_RUCS]

In [13]:
# Folder where one JSON file per scraped record is written
RESULTS_DIR = "./results"

driver = get_chrome_driver(chromedriver_path=CHROMEDRIVER_PATH, print_view=False, headless=False)

try:
    # Make sure the output folder exists before the loop starts
    os.makedirs(RESULTS_DIR, exist_ok=True)

    for index, ruc in enumerate(rucs):
        record_id = f"record_{index}"  # Identifier derived from the position in the list

        # Scrape the current RUC
        data = scrape_ruc(driver, ruc)

        if data:
            # Save the record as a JSON file inside RESULTS_DIR
            file_name = f"{record_id}_data.json"
            file_path = os.path.join(RESULTS_DIR, file_name)

            with open(file_path, 'w', encoding='utf-8') as json_file:
                json.dump(data, json_file, indent=4, ensure_ascii=False)

            print(f"Datos de RUC {ruc} guardados en {file_name}")
        else:
            print(f"No se pudieron obtener datos para RUC {ruc}")
finally:
    # Always close the browser, even if the loop is interrupted or fails,
    # so no Chrome/chromedriver process is left running
    driver.quit()


Datos de RUC 10000002858 guardados en record_0_data.json
Datos de RUC 10000003412 guardados en record_1_data.json
Datos de RUC 10000015968 guardados en record_2_data.json
Datos de RUC 10000019581 guardados en record_3_data.json
