In [3]:
# Test run 9-7-2024 00:05

In [36]:
# Library imports
import numpy as np
import pandas as pd
import uuid
import random 
from datetime import datetime, timedelta 

In [38]:
# Function to generate unique IDs
def generate_unique_id():
    """Return a newly generated random UUID (``uuid.uuid4``) as a string."""
    new_id = uuid.uuid4()
    return str(new_id)
    

In [49]:
# Functions to build the dataframes

def _date_values(col):
    """Return every day from ``values['min']`` to ``values['max']`` (inclusive)."""
    first_day = datetime.strptime(col['values']['min'], "%Y-%m-%d")
    last_day = datetime.strptime(col['values']['max'], "%Y-%m-%d")
    span_days = (last_day - first_day).days
    # An inverted range (max before min) yields an empty list here; the caller
    # reports it when it tries to fit the column to the row count.
    return [first_day + timedelta(days=offset) for offset in range(span_days + 1)]


def _cycle_to_length(values, n_rows, col_name):
    """Repeat (or truncate) ``values`` in order so the result has ``n_rows`` items."""
    values = list(values)
    if not values:
        raise ValueError(f"Column {col_name} has no values to generate rows from.")
    repeats, remainder = divmod(n_rows, len(values))
    return values * repeats + values[:remainder]


def _foreign_values(col, existing_dataframes, n_rows):
    """Sample ``n_rows`` values (with replacement) from a column of an earlier dataframe."""
    dataset_name, column_name = col['values'].split('.')
    if dataset_name not in existing_dataframes:
        raise ValueError(f"Dataset {dataset_name} not found.")
    if column_name not in existing_dataframes[dataset_name].columns:
        valid_columns = ", ".join(existing_dataframes[dataset_name].columns)
        raise ValueError(f"Column {column_name} not found in dataset {dataset_name}. Valid columns are: {valid_columns}.")
    foreign_values = existing_dataframes[dataset_name][column_name].unique()
    if n_rows and len(foreign_values) == 0:
        # random.choices would fail with an opaque IndexError on an empty population.
        raise ValueError(f"Column {column_name} in dataset {dataset_name} has no values to sample from.")
    return random.choices(foreign_values, k=n_rows)


def _numeric_values(bounds, n_rows):
    """Draw ``n_rows`` random floats inside ``[bounds['min'], bounds['max']]``.

    When both ``mean`` and ``std`` are present the values are drawn from a
    normal distribution and clipped to the bounds; otherwise they are drawn
    uniformly between the bounds.
    """
    low, high = bounds['min'], bounds['max']
    if low > high:
        raise ValueError(f"Numeric range is inverted: min {low} is greater than max {high}.")
    if 'mean' in bounds and 'std' in bounds:
        mean, std = bounds['mean'], bounds['std']
        return [min(max(random.gauss(mean, std), low), high) for _ in range(n_rows)]
    return [random.uniform(low, high) for _ in range(n_rows)]


def create_dataframe(config, existing_dataframes):
    """Build one synthetic DataFrame from a declarative column configuration.

    The row count is settled *before* any column is generated, so every column
    ends up with the same length regardless of the order in which the columns
    are declared:

    * if ``config['random']`` is truthy and ``config['random_rows']`` is given,
      that value is the row count;
    * otherwise the row count is the longest "natural" length among the
      ``category`` columns (number of values) and ``date`` columns (number of
      days in the range), or 0 when there are none.

    Supported column types (``col['type']``):

    * ``category`` - ``col['values']`` (a list) cycled in order to the row count.
    * ``date``     - each day between ``values['min']`` and ``values['max']``
      (``YYYY-MM-DD``, inclusive), cycled/truncated to the row count.
    * ``unique``   - one ``generate_unique_id()`` per row.
    * ``foreign``  - ``"<dataset>.<column>"``; random picks (with replacement)
      from the distinct values of that column in ``existing_dataframes``.
    * ``numeric``  - random floats within ``values['min']``/``values['max']``;
      normal-distributed and clipped when ``mean`` and ``std`` are also given,
      uniform otherwise.

    Args:
        config: dict with a ``'columns'`` list of column specs and optional
            ``'random'`` / ``'random_rows'`` entries.
        existing_dataframes: mapping of dataset name to already-built
            DataFrame, used to resolve ``foreign`` columns.

    Returns:
        pandas.DataFrame with one column per column spec.

    Raises:
        ValueError: unknown dataset/column in a ``foreign`` reference, an
            unsupported column type, an empty value list/date range, a
            negative ``random_rows`` or an inverted numeric range.
    """
    columns = config['columns']

    # Pass 1: collect the values of the columns that have a natural length.
    # Sizing a column while the length is still being discovered is what used
    # to produce columns of different lengths.
    fixed_values = {}
    for position, col in enumerate(columns):
        if col['type'] == 'category':
            fixed_values[position] = list(col['values'])
        elif col['type'] == 'date':
            fixed_values[position] = _date_values(col)

    if config.get('random') and config.get('random_rows') is not None:
        n_rows = int(config['random_rows'])
        if n_rows < 0:
            raise ValueError(f"random_rows must be non-negative, got {n_rows}.")
    else:
        n_rows = max((len(values) for values in fixed_values.values()), default=0)

    # Pass 2: generate every column with exactly n_rows entries.
    data = {}
    for position, col in enumerate(columns):
        col_name, col_type = col['name'], col['type']
        if col_type in ('category', 'date'):
            data[col_name] = _cycle_to_length(fixed_values[position], n_rows, col_name)
        elif col_type == 'unique':
            data[col_name] = [generate_unique_id() for _ in range(n_rows)]
        elif col_type == 'foreign':
            data[col_name] = _foreign_values(col, existing_dataframes, n_rows)
        elif col_type == 'numeric':
            data[col_name] = _numeric_values(col['values'], n_rows)
        else:
            # Dropping the column silently would hide configuration typos.
            raise ValueError(f"Unsupported column type {col_type!r} for column {col_name!r}.")

    return pd.DataFrame(data)

def build_dataframes(config_list):
    """Build every dataframe described in ``config_list``, in order.

    Each configuration is handed to ``create_dataframe`` together with the
    dataframes built so far, and the result is stored under the
    configuration's ``'ds'`` name.

    Returns:
        dict mapping each ``'ds'`` name to its DataFrame.
    """
    built = {}
    for spec in config_list:
        name = spec['ds']
        built[name] = create_dataframe(spec, built)
    return built

In [71]:
# Configuration for the first dataframe, "Empleados":
# a category column of departments plus a unique-ID column.
d1 = {
    "ds": "Empleados",
    "columns": [
        {"name": "departamento", "type": "category",
         "values": ["RRHH", "Finanzas", "IT", "Marketing"]},
        {"name": "Empleado_id", "type": "unique"},
    ],
    "random": False,
}

# Configuration for the second dataframe, "Empleado_det":
# a unique ID, a foreign reference to Empleados.departamento and a role category.
d2 = {
    "ds": "Empleado_det",
    "columns": [
        {"name": "det_id", "type": "unique"},
        {"name": "departamento", "type": "foreign",
         "values": "Empleados.departamento"},
        {"name": "role", "type": "category",
         "values": ["Gerente", "Analista", "Developer", "Desarrollador"]},
    ],
    "random": False,
}

# Configuration for the third dataframe, "salarios":
# a unique ID, a foreign reference to Empleado_det.role and two numeric columns.
# It is the only configuration flagged as random and carries random_rows = 1000.
d3 = {
    "ds": "salarios",
    "columns": [
        {"name": "salary_id", "type": "unique"},
        {"name": "role", "type": "foreign", "values": "Empleado_det.role"},
        {"name": "salario", "type": "numeric",
         "values": {"min": 30000, "max": 120000}},
        {"name": "bonificaciones", "type": "numeric",
         "values": {"min": 5000, "max": 20000, "std": 3000, "mean": 10000}},
    ],
    "random": True,
    "random_rows": 1000,
}

# The configurations in the order they are listed; d2 references d1's dataset
# and d3 references d2's.
conf_list = [d1, d2, d3]
