In [44]:
import json
import os
import pathlib
import re
import urllib

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from dotenv import dotenv_values
from pandas import DataFrame


In [2]:
# Parse the .env file a single time and read both identifiers from the same
# snapshot (a missing key still raises KeyError, as before).
_ENV_VALUES = dotenv_values()
FUENTES_RAW_ID = _ENV_VALUES['FUENTES_RAW_ID']
FUENTES_CLEAN_ID = _ENV_VALUES['FUENTES_CLEAN_ID']

# Folder names of the sibling repositories; the helpers below build
# "../<folder>/<subtopic>/..." paths from them.
DATA_REPO_FOLDER = "argendata-data-fork"
TRANSFORMER_REPO_FOLDER = "argendata-transformers-fork"
GRAFICOS_REPO_FOLDER = "argendata-graficos"


In [72]:
def list_files_from_data(path:str, full_path:bool = False, ext:str=""):
    """Return the entries of directory ``path`` whose names end with ``ext``.

    Args:
        path: Directory handed to ``os.listdir``.
        full_path: When True, every returned name is prefixed with ``path + "/"``.
        ext: Suffix filter; the default empty string keeps every entry.

    Returns:
        A list of names, in whatever order ``os.listdir`` yields them.
    """
    lead = f"{path}/" if full_path else ""
    selected = []
    for entry in os.listdir(path):
        if entry.endswith(ext):
            selected.append(lead + entry)
    return selected

def get_fuentes(metadata_dict:dict):
    """Return the value stored under the ``'fuentes'`` key of a metadata dict.

    The value is expected to be a source id string or a list of them.
    Raises ``KeyError`` when the key is absent.
    """
    return metadata_dict['fuentes']

def read_json(json_path:str):
    """Load a JSON document from disk.

    The file is always decoded as UTF-8 (tolerating an optional leading BOM)
    instead of the platform's locale encoding, so accented text is read the
    same way on every operating system. Undecodable bytes are still skipped
    (``errors='ignore'``), preserving the original best-effort behaviour.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # "utf-8-sig" strips a BOM when present and is plain UTF-8 otherwise.
    with open(json_path, "r", encoding="utf-8-sig", errors='ignore') as f:
        data : dict = json.load(f)
    return data

def get_mappings(ruta:str): 
    """Read the mappings JSON file located at ``ruta`` and return its content."""
    return read_json(ruta)

def get_metadata(metadata_path:str):
    """Read the metadata JSON file at ``metadata_path`` and return its content."""
    return read_json(json_path=metadata_path)

def parse_id(fuente_id:str): 
    """Split a source id of the form ``R<digits>C<digits>`` into raw/clean ids.

    Only the beginning of ``fuente_id`` has to match the pattern.

    Returns:
        ``(raw, clean)`` where ``raw`` is ``"R<n>C0"`` (``n`` being the integer
        value of the first digit group) and ``clean`` is ``fuente_id`` itself,
        or ``None`` when the second digit group equals zero.

    Raises:
        ValueError: If ``fuente_id`` does not start with the expected pattern.
    """
    found = re.match(r'R(\d+)C(\d+)', fuente_id)
    if found is None:
        raise ValueError("String no coincide con el formato esperado")

    raw_number = int(found.group(1))
    clean_number = int(found.group(2))

    # The raw id is the same whichever clean step the source refers to.
    raw_id = f"R{raw_number}C0"
    clean_id = fuente_id if clean_number != 0 else None
    return raw_id, clean_id

def drop_ext_from_name(filename:str): 
    """Return ``filename`` without its last extension.

    Only the text after the final dot is removed, so interior dots are kept
    (``"serie.v2.csv"`` -> ``"serie.v2"``) and a name without any dot is
    returned unchanged instead of collapsing to an empty string.
    """
    return filename.rsplit(".", 1)[0]

def get_filename_from_plot_id(plot_id:str, mappings_dict:dict):
    """Find the dataset file mapped to a plot id and return its name without extension.

    Args:
        plot_id: Public plot code to look for.
        mappings_dict: Maps a file name to a list of dicts; each dict must have
            a ``'public'`` key holding a plot code.

    Returns:
        The first key of ``mappings_dict`` having an entry whose ``'public'``
        value equals ``plot_id``, passed through ``drop_ext_from_name``.

    Raises:
        ValueError: If no entry references ``plot_id``.
    """
    for filename, entries in mappings_dict.items():
        for entry in entries:
            if entry['public'] == plot_id:
                return drop_ext_from_name(filename=filename)
    # Reaching this point means every entry was inspected without a hit.
    raise ValueError(f"No se ha encontrado nada con codigo {plot_id}")


def get_pipeline_from_plot_id(plot_id, mappings_dict):
    """Build the pipeline rows (one per declared source) for a single plot.

    The subtopic is the part of ``plot_id`` before the first underscore. The
    analyst dataset name is resolved through ``mappings_dict`` and its metadata
    is looked up at ``../{DATA_REPO_FOLDER}/{subtopico}/{output_name}.json``.

    Args:
        plot_id: Plot identifier such as ``"PRECIO_g01"``.
        mappings_dict: Mappings forwarded to ``get_filename_from_plot_id``.

    Returns:
        A list of 5-tuples ``(subtopico, plot_id, output_name, raw, clean)``:
        one per source id found in the metadata; a single row with
        ``"sin json"`` in the last two fields when the metadata file does not
        exist; and a row with ``"error"`` in the last two fields when reading
        or parsing the metadata fails (rows built before the failure are kept).

    Raises:
        ValueError: Propagated from ``get_filename_from_plot_id`` when the
            plot id is not present in the mappings.
    """
    subtopico = plot_id.split("_")[0]
    output_name = get_filename_from_plot_id(plot_id=plot_id, mappings_dict=mappings_dict)
    json_filename = output_name + ".json"
    metadata_path = f"../{DATA_REPO_FOLDER}/{subtopico}/{json_filename}"

    if not pathlib.Path(metadata_path).exists():
        print(f"--- No existe el JSON: {output_name}.json")
        return [(subtopico, plot_id, output_name, "sin json", "sin json")]

    pipeline = []
    try:
        metadata_dict = get_metadata(metadata_path=metadata_path)
        fuentes = get_fuentes(metadata_dict=metadata_dict)
        # A lone source id may come as a plain string; wrap it so the loop
        # below does not iterate over its characters.
        if isinstance(fuentes, str):
            fuentes = [fuentes]
        for fuente_id in fuentes:
            raw, clean = parse_id(fuente_id=fuente_id)
            pipeline.append((subtopico, plot_id, output_name, raw, clean))
    except Exception as e:
        # Best effort: report the problem and flag the plot instead of aborting.
        print(repr(e))
        # Same 5-field layout as every other row so DataFrame columns line up.
        pipeline.append((subtopico, plot_id, output_name, "error", "error"))
    return pipeline


def list_plot_id_from_subtopic(subtopic_id): 
    """Return the sorted entry names found in the subtopic's graphics folder.

    The folder inspected is ``../{GRAFICOS_REPO_FOLDER}/{subtopic_id}``.
    """
    return sorted(
        list_files_from_data(path=f"../{GRAFICOS_REPO_FOLDER}/{subtopic_id}")
    )


def get_subtopic_pipelines(subtopic_id):
    """Collect the pipeline rows of every plot belonging to one subtopic.

    Mappings are read from
    ``../{TRANSFORMER_REPO_FOLDER}/{subtopic_id}/mappings.json``.

    Args:
        subtopic_id: Subtopic code, also used as folder name in the repos.

    Returns:
        A list of 5-tuples ``(subtopico, plot_id, output_name, raw, clean)``.
        When the mappings file is missing or empty, every plot of the subtopic
        gets one row with ``"sin mapping"`` in the last three fields, so no
        plot silently disappears from the result.
    """
    plot_id_list = list_plot_id_from_subtopic(subtopic_id=subtopic_id)
    ruta_mappings = f"../{TRANSFORMER_REPO_FOLDER}/{subtopic_id}/mappings.json"

    mappings_dict = None
    if pathlib.Path(ruta_mappings).exists():
        mappings_dict = get_mappings(ruta=ruta_mappings)

    # The emptiness check does not depend on the plot, so it is done once here.
    if not mappings_dict:
        return [
            (subtopic_id, plot_id, "sin mapping", "sin mapping", "sin mapping")
            for plot_id in plot_id_list
        ]

    subtop_pipelines = []
    for plot_id in plot_id_list:
        print(f"--{plot_id}")
        plot_pipelines = get_pipeline_from_plot_id(plot_id=plot_id, mappings_dict=mappings_dict)
        subtop_pipelines.extend(plot_pipelines)
    return subtop_pipelines

def get_all_pipelines(subtopic_list:list[str]): 
    """Concatenate the pipeline rows of every subtopic in ``subtopic_list``.

    Each subtopic id is printed (preceded by a newline) before it is processed.
    Returns a flat list holding the rows of all subtopics, in input order.
    """
    collected = []
    for subtop_id in subtopic_list:
        print("\n", subtop_id)
        collected += get_subtopic_pipelines(subtopic_id=subtop_id)
    return collected


def genSankey(df:DataFrame, cat_cols:list[str], output_title:str = 'Sankey Diagram'):
    """Build a plotly Sankey figure linking consecutive categorical columns.

    Every distinct non-null value found in ``cat_cols`` becomes a node, and
    every (value in column i, value in column i+1) pair becomes a link whose
    width is the number of rows of ``df`` showing that pair. Pairs with a
    missing value on either side are not drawn.

    Requires ``plotly.graph_objects`` to be available as ``go`` in the
    module namespace.

    Args:
        df: Table with one row per observed path.
        cat_cols: Ordered column names (any sequence, e.g. a list or a pandas
            Index), from the left-most to the right-most level of the diagram.
        output_title: Title placed on the figure.

    Returns:
        The ``go.Figure`` holding the Sankey trace.
    """
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
    cat_cols = list(cat_cols)

    # Register nodes and their colours together so both lists always have the
    # same length and order. A value present in several columns keeps the
    # colour of the first column where it shows up. Order of appearance is
    # used (instead of a set) so node order is stable between runs.
    label_to_id = {}
    colorList = []
    for level, catCol in enumerate(cat_cols):
        # Cycle through the palette when there are more levels than colours.
        level_color = colorPalette[level % len(colorPalette)]
        for label in df[catCol].dropna().unique():
            if label not in label_to_id:
                label_to_id[label] = len(label_to_id)
                colorList.append(level_color)
    labelList = list(label_to_id)

    # transform df into source-target pairs, one frame per adjacent column pair
    pair_frames = [
        DataFrame({'source': df[src].values, 'target': df[dst].values})
        for src, dst in zip(cat_cols, cat_cols[1:])
    ]
    if pair_frames:
        sourceTargetDf = (
            pd.concat(pair_frames, ignore_index=True)
            .groupby(['source','target'], dropna=True)
            .size()
            .reset_index(name='count')
        )
    else:
        # Fewer than two columns: nodes only, no links.
        sourceTargetDf = DataFrame(columns=['source','target','count'])

    # add node index for each side of the pair (dict lookup, not list.index)
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(label_to_id)
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(label_to_id)

    fig = go.Figure(data = [go.Sankey(
        node = dict(
            pad = 15,
            thickness = 20,
            line = dict(
                color = "black",
                width = 0.5),
            label = labelList,
            color = colorList
        ),
        link = dict(
            source = sourceTargetDf['sourceID'],
            target = sourceTargetDf['targetID'],
            value = sourceTargetDf['count']
        )
    )])

    fig.update_layout(title_text = output_title,
        font_size = 10,
        margin=dict(l=20, r=20, t=20, b=20))

    return fig

In [73]:
# Subtopics whose plot -> dataset -> source pipelines are collected below.
SUBTOPICOS_SCRIPTING = [
    "ACECON",
    "PRECIO",
    "CAMCLI",
    "TRANEN",
    "MERTRA",
    "INFDES",
    "POBREZ",
    "SALING",
    "DESIGU",
]

# Flat list of rows returned by get_all_pipelines for the subtopics above.
pipelines = get_all_pipelines(subtopic_list=SUBTOPICOS_SCRIPTING)

# Tabular view of those rows.
pipelines_df = DataFrame(
    data=pipelines,
    columns=[
        'subtopico',
        'grafico',
        'dataset_analista',
        'raw',
        'clean',
    ],
)


 ACECON

 PRECIO
--PRECIO_g01
--PRECIO_g02
--PRECIO_g03
--- No existe el JSON: 3_tasa_de_inflacion_anual_argentina_1935_2022.json
--PRECIO_g04
--PRECIO_g05
--PRECIO_g06
--PRECIO_g07
--PRECIO_g08
--- No existe el JSON: 8_evol_precios_aperturas_relativos_a_evol_precios_generales_argentina_1947_2022.json
--PRECIO_g09
--- No existe el JSON: 9_inflacion_acumulada_argentina_dic2001_dic2002.json
--PRECIO_g10
--PRECIO_g11
--PRECIO_g12
--PRECIO_g13
--PRECIO_g14
--PRECIO_g15
--PRECIO_g16
--PRECIO_g17

 CAMCLI
--CAMCLI_g01
--- No existe el JSON: emisiones_anuales_co2_region_2021.json
--CAMCLI_g02
--CAMCLI_g03
--CAMCLI_g04
--CAMCLI_g05
--CAMCLI_g06
--- No existe el JSON: emisiones_subsec_arg_2018.json
--CAMCLI_g07
--CAMCLI_g08
--CAMCLI_g09
--CAMCLI_g10
--CAMCLI_g11
--CAMCLI_g12
--- No existe el JSON: emisiones_piup_1990_2018.json
--CAMCLI_g13
--CAMCLI_g14
--CAMCLI_g15
--CAMCLI_g16
--- No existe el JSON: emisiones_anuales_co2_ch4_n20_1850_2022.json
--CAMCLI_g17
--- No existe el JSON: emisiones_anu

In [75]:
# Column names of the pipelines table (read straight from the DataFrame so
# this cell does not rely on a variable assigned in a later cell).
pipelines_df.columns

Index(['subtopico', 'grafico', 'dataset_analista', 'raw', 'clean'], dtype='object')

In [80]:
# Subtopic whose pipelines are drawn.
subtop_visu = "DESIGU"

# Keep the rows of that subtopic, leaving out the placeholder rows that mark
# a missing mapping or a missing metadata JSON.
visu_mask = (
    (pipelines_df.subtopico == subtop_visu)
    & (pipelines_df.dataset_analista != "sin mapping")
    & (pipelines_df.raw != "sin json")
)
visu_df = pipelines_df[visu_mask]

# Every column except the subtopic becomes one level of the diagram.
cols = visu_df.columns
cols = cols[cols != "subtopico"]

fig = genSankey(df=visu_df, cat_cols=cols, output_title=f"Pipelines Diagram: {subtop_visu}")
fig.show()