In [None]:
import uuid
import random
import numpy as np
from datetime import datetime, timedelta
from scipy.stats import truncnorm
import pandas as pd


In [None]:
def generate_unique_id(size):
    """Return a list of `size` randomly generated identifiers.

    Each identifier is the first 16 hexadecimal characters of a freshly
    generated UUID4, i.e. a 16-character string.

    Args:
        size: Number of identifiers to generate.

    Returns:
        A list of `size` strings.
    """
    identifiers = []
    for _ in range(size):
        # Only the leading half of the 32-character hex form is kept.
        identifiers.append(uuid.uuid4().hex[:16])
    return identifiers

In [None]:
def get_random_dates(start_date, end_date, size):
    """Draw `size` random dates between two dates, both bounds included.

    Args:
        start_date: Lower bound as a 'YYYY-MM-DD' string.
        end_date: Upper bound as a 'YYYY-MM-DD' string.
        size: Number of dates to generate.

    Returns:
        A list of `size` 'YYYY-MM-DD' strings; the same date may appear
        more than once.
    """
    first_day = datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.strptime(end_date, '%Y-%m-%d')
    span_days = (last_day - first_day).days

    picked = []
    for _ in range(size):
        # randint includes both endpoints, so end_date itself can be drawn.
        offset = random.randint(0, span_days)
        picked.append((first_day + timedelta(days=offset)).strftime("%Y-%m-%d"))
    return picked

In [None]:
def get_random_category(values, size):
    """Pick `size` items at random from `values`, with replacement.

    Args:
        values: Non-empty sequence to choose from.
        size: Number of items to pick.

    Returns:
        A list of `size` elements taken from `values`.
    """
    sampled = []
    for _ in range(size):
        sampled.append(random.choice(values))
    return sampled

In [None]:
def generate_truncated_normal_data(mean, std, min_val, max_val, size):
    """Sample from a normal distribution truncated to [min_val, max_val].

    Args:
        mean: Location of the underlying normal distribution.
        std: Scale (standard deviation) of the underlying normal distribution.
        min_val: Lower truncation bound.
        max_val: Upper truncation bound.
        size: Number of samples to draw.

    Returns:
        The samples produced by `truncnorm(...).rvs(size)`.
    """
    # truncnorm takes its bounds in units of standard deviations from the mean.
    lower = (min_val - mean) / std
    upper = (max_val - mean) / std
    distribution = truncnorm(lower, upper, loc=mean, scale=std)
    return distribution.rvs(size)

In [None]:
def simpleNormal(min,max,n):
    """Draw `n` samples from a normal distribution centred on a range.

    The mean is the midpoint of [min, max] and the standard deviation is one
    sixth of the range width, so the range spans three standard deviations on
    each side of the mean. Samples are NOT clipped: values outside
    [min, max] can occur.

    The parameter names shadow the `min`/`max` builtins; they are kept as-is
    so existing callers (positional or keyword) continue to work.

    Args:
        min: Lower end of the range.
        max: Upper end of the range.
        n: Number of samples (passed to `np.random.normal` as `size`).

    Returns:
        The array returned by `np.random.normal(mean, std, n)`.
    """
    mean = (min + max) / 2
    std = (max - min) / 6
    return np.random.normal(mean, std, n)

In [None]:

def encontrar_maximo(config):
    """Return the largest size implied by the columns of a non-random config.

    For each column the candidate value is:
      * "date": the number of days between `values["min"]` and `values["max"]`;
      * "category": the number of entries in `values`;
      * "numeric": `values["max"]`.
    Columns of any other type (e.g. "unique") do not contribute.

    A config without a "random" key is treated as non-random, which is the
    same default the generation loop uses when deciding to call this function.
    Previously the default here was the opposite, so such configs yielded 0.

    Args:
        config: Dataset configuration dict with a "columns" list and an
            optional boolean "random" flag.

    Returns:
        The maximum candidate over all columns, or 0 when the config is
        random or no column contributes.
    """
    if config.get("random", False):
        return 0

    maximo = 0
    for columna in config["columns"]:
        tipo = columna["type"]
        if tipo == "date":
            fecha_min = datetime.strptime(columna["values"]["min"], '%Y-%m-%d')
            fecha_max = datetime.strptime(columna["values"]["max"], '%Y-%m-%d')
            # NOTE(review): this is the day difference, i.e. one less than the
            # number of distinct dates in the inclusive range - confirm intent.
            candidato = (fecha_max - fecha_min).days
        elif tipo == "category":
            candidato = len(columna["values"])
        elif tipo == "numeric":
            # NOTE(review): the column's upper bound is used as-is; confirm it
            # is always an integer when the result is used as a row count.
            candidato = columna["values"]["max"]
        else:
            continue
        maximo = max(maximo, candidato)

    return maximo

In [None]:

# Non-random config: a categorical "area" column and a generated "id" column.
d1 = {
    "ds": "dataset",
    "columns": [
        {"name": "area", "type": "category", "values": ["TI", "FIN", "HR"]},
        {"name": "id", "type": "unique"},
    ],
    "random": False,
}
# Random config with an explicit row count ("random_rows").
d3 = {
    "ds": "dataset3",
    "columns": [
        {"name": "id", "type": "unique"},
        {
            "name": "subarea",
            "type": "category",
            # NOTE(review): looks like a reference to another dataset's column
            # rather than a list of categories - confirm how it is resolved.
            "values": "dataset2.id",
        },
        {
            "name": "income",
            "type": "numeric",
            "values": {"min": 10, "max": 20},
        },
        {
            "name": "goal",
            "type": "numeric",
            # Has explicit "mean"/"std" in addition to the min/max range.
            "values": {"min": 10, "max": 100, "std": 0.5, "mean": 110 / 2},
        },
    ],
    "random": True,
    "random_rows": 1000,
}
# Non-random config with a single date column bounded by "min" and "max".
d4 = {
    "ds": "dataset",
    "columns": [
        {
            "name": "Fecha",
            "type": "date",
            "values": {"min": "2024-01-01", "max": "2024-02-28"},
        },
    ],
    "random": False,
}

In [None]:
# Configurations to materialise; each one contributes one DataFrame to
# dataFrameList, in the same order.
config_list = [d1, d4]
dataFrameList = []

for config in config_list:
    dataElement = {}

    # Row count: the explicit "random_rows" for random configs, otherwise the
    # value encontrar_maximo derives from the column definitions.
    if config.get("random", False):
        size = config["random_rows"]
    else:
        size = encontrar_maximo(config)

    for column in config["columns"]:
        col_type = column["type"]

        if col_type == "numeric":
            values = column["values"]
            if "mean" not in values:
                # Only a range was given: use the plain normal sampler.
                dataElement[column["name"]] = simpleNormal(values["min"], values["max"], size)
            else:
                dataElement[column["name"]] = generate_truncated_normal_data(
                    values["mean"], values["std"], values["min"], values["max"], size)
        elif col_type == "category":
            dataElement[column["name"]] = get_random_category(column["values"], size)
        elif col_type == "date":
            dataElement[column["name"]] = get_random_dates(
                column["values"]["min"], column["values"]["max"], size)
        elif col_type == "unique":
            dataElement[column["name"]] = generate_unique_id(size)
        # Columns of any other type produce no data.

    dataFrameList.append(pd.DataFrame(dataElement))

In [None]:
# Print the generated DataFrames (one per entry in config_list) as plain text.
print(dataFrameList)
