In [1]:
import pandas as pd
from pandas import DataFrame
import json
import urllib
from dotenv import dotenv_values
import os
import re
import pathlib



In [2]:
FUENTES_RAW_ID = dotenv_values()['FUENTES_RAW_ID']
FUENTES_CLEAN_ID = dotenv_values()['FUENTES_CLEAN_ID']
DATA_REPO_FOLDER = "argendata-data-fork"
TRANSFORMER_REPO_FOLDER = "argendata-transformers-fork"
GRAFICOS_REPO_FOLDER = "argendata-graficos"


In [3]:
def list_files_from_data(path:str, full_path:bool = False, ext:str=""):
    prefix = ""
    if full_path:
        prefix = path + "/"
    return [prefix + x for x in os.listdir(path) if x.endswith(ext)]

def get_fuentes(metadata_dict:dict):
    fuentes : str | list[str] = metadata_dict['fuentes']   
    return fuentes

def read_json(json_path:str):
    with open(json_path, "r") as f:
        data : dict = json.load(f)
    return data

def get_mappings(ruta:str): 
    mappings = read_json(ruta)
    return mappings

def get_metadata(metadata_path:str):
    metadata = read_json(json_path=metadata_path)
    return metadata

def parse_id(fuente_id:str): 
    
    match = re.match(r'R(\d+)C(\d+)', fuente_id)
    if match:
        raw , clean   = int(match.group(1)), int(match.group(2))
        if clean == 0: 
            raw = fuente_id
            clean = None
        else: 
            raw = f"R{raw}C0"
            clean = fuente_id
        return raw, clean
    else:
        raise ValueError("String no coincide con el formato esperado")

def drop_ext_from_name(filename:str): 
    return "".join(filename.split(".")[:-1])

def get_filename_from_plot_id(plot_id:str, mappings_dict:str):
    not_found = True
    for k,list_values in mappings_dict.items(): 
        for d in list_values: 
            if d['public'] == plot_id:
                not_found = False
                return drop_ext_from_name(filename = k)
    if not_found: 
        raise(ValueError(f"No se ha encontrado nada con codigo {plot_id}"))


def get_pipeline_from_plot_id(plot_id, mappings_dict):
    subtopico = plot_id.split("_")[0]
    output_name = get_filename_from_plot_id(plot_id=plot_id, mappings_dict=mappings_dict)
    json_filename = output_name + ".json"
    metadata_path = f"../{DATA_REPO_FOLDER}/{subtopico}/{json_filename}"
    pipeline = []
    if pathlib.Path(metadata_path).exists():
        try:
            metadata_dict = get_metadata(metadata_path=metadata_path)
            fuentes = get_fuentes(metadata_dict=metadata_dict)
            for fuente_id in fuentes: 
                raw, clean = parse_id(fuente_id=fuente_id)
                pipeline.append((subtopico, plot_id, output_name, clean, raw))
        except UnicodeDecodeError:
            print("--- No se pudo leer adecuadamente el json")
            pipeline.append((subtopico, output_name, None, None))
    else:
        print("--- No existe el JSON")
        pipeline.append((subtopico, plot_id, output_name, None, None))
    return pipeline


def list_plot_id_from_subtopic(subtopic_id): 
    subtopic_path = f"../{GRAFICOS_REPO_FOLDER}/{subtopic_id}"
    plot_id_list = list_files_from_data(path=subtopic_path)
    return sorted(plot_id_list)


def get_subtopic_pipelines(subtopic_id):
    plot_id_list = list_plot_id_from_subtopic(subtopic_id=subtopic_id)
    subtop_pipelines = []
    ruta_mappings = f"../{TRANSFORMER_REPO_FOLDER}/{subtopic_id}/mappings.json"
    if pathlib.Path(ruta_mappings).exists():
        mappings_dict = get_mappings(ruta=ruta_mappings)
        for plot_id in plot_id_list:
            print(f"--{plot_id}") 
            if mappings_dict:
                plot_pipelines = get_pipeline_from_plot_id(plot_id=plot_id, mappings_dict=mappings_dict)
                subtop_pipelines.extend(plot_pipelines)
    else:
        subtop_pipelines = [(subtopic_id, plot_id, None, None) for plot_id in plot_id_list]
    
    return subtop_pipelines

def get_all_pipelines(subtopic_list:list[str]): 
    all_pipelines = []
    for subtop_id in subtopic_list: 
        print("\n", subtop_id)
        subtop_pipelines = get_subtopic_pipelines(subtopic_id=subtop_id)
        all_pipelines.extend(subtop_pipelines)
    return all_pipelines

In [4]:
SUBTOPICOS_SCRIPTING = ["ACECON", "PRECIO", "CAMCLI", "TRANEN", "MERTRA", "INFDES", "POBREZ", "SALING", "DESIGU"]
pipelines = get_all_pipelines(subtopic_list=SUBTOPICOS_SCRIPTING)


 ACECON

 PRECIO
--PRECIO_g01
--PRECIO_g02
--PRECIO_g03
--- No existe el JSON
--PRECIO_g04
--PRECIO_g05
--PRECIO_g06
--PRECIO_g07
--PRECIO_g08
--- No existe el JSON
--PRECIO_g09
--- No existe el JSON
--PRECIO_g10
--PRECIO_g11
--PRECIO_g12
--PRECIO_g13
--PRECIO_g14
--PRECIO_g15
--PRECIO_g16
--PRECIO_g17

 CAMCLI
--CAMCLI_g01
--- No existe el JSON
--CAMCLI_g02
--CAMCLI_g03
--CAMCLI_g04
--- No se pudo leer adecuadamente el json
--CAMCLI_g05
--- No se pudo leer adecuadamente el json
--CAMCLI_g06
--- No existe el JSON
--CAMCLI_g07
--- No se pudo leer adecuadamente el json
--CAMCLI_g08
--CAMCLI_g09
--- No se pudo leer adecuadamente el json
--CAMCLI_g10
--- No se pudo leer adecuadamente el json
--CAMCLI_g11
--- No se pudo leer adecuadamente el json
--CAMCLI_g12
--- No existe el JSON
--CAMCLI_g13
--- No se pudo leer adecuadamente el json
--CAMCLI_g14
--- No se pudo leer adecuadamente el json
--CAMCLI_g15
--- No se pudo leer adecuadamente el json
--CAMCLI_g16
--- No existe el JSON
--CAMCLI_g17

In [5]:
pipelines_df = pd.DataFrame(pipelines, columns=['subtopico', 'grafico','dataset_analista','raw','clean'])
pipelines_df

Unnamed: 0,subtopico,grafico,dataset_analista,raw,clean
0,ACECON,ACECON_g01,,,
1,ACECON,ACECON_g02,,,
2,ACECON,ACECON_g03,,,
3,ACECON,ACECON_g04,,,
4,ACECON,ACECON_g05,,,
...,...,...,...,...,...
298,DESIGU,DESIGU_g25,brecha_horas_trabajadas_genero,,R66C0
299,DESIGU,DESIGU_g25,brecha_horas_trabajadas_genero,,R67C0
300,DESIGU,DESIGU_g25,brecha_horas_trabajadas_genero,,R68C0
301,DESIGU,DESIGU_g25,brecha_horas_trabajadas_genero,,R69C0
