In [1]:
import pandas as pd
import numpy as np
import random
import json
from IPython.display import display
from typing import Dict, List, Tuple
import os
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool, cpu_count

In [2]:
# CSV file loaded (with its first column as the index) as the base dataframe
# from which every synthetic dataset is derived.
source = "../data/normalized_new_zeland.csv"
# Base output directory; matches the "../generated/<i>" per-task directories
# built in the task-generation cell.
sink = "../generated"
# JSON file mapping each column name to a list of alternative names; read by load_synonims.
synonims_path = "../data/synonims.json"

In [3]:
def load_synonims(path:str) -> Dict[str, List[str]]:
    """Load the synonims dictionary stored as JSON at the given path.
        path:str, path of the JSON file to read; it must contain an object
            mapping each name to a list of alternative names.

        Returns a dictionary with the same keys, where every list of
        synonims is extended with the key itself, so that the original
        name is always one of the available choices.
    """
    # JSON is UTF-8 by specification: decode it explicitly instead of relying
    # on the platform default encoding, which breaks non-ASCII synonims on
    # systems whose locale is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return {
        key: value + [key] for key, value in data.items()
    }

In [4]:
def sample(array:np.ndarray) -> list:
    """Return a random subset of the given array, without repetitions.
        array:np.ndarray, values to sample from; any object exposing
            `.size` and `.tolist()` (e.g. a pandas Index) works as well.

        The subset size is drawn uniformly between 1 and the number of
        elements, and the returned list is in random order.
        An empty input returns an empty list.
    """
    # random.randint(1, 0) would raise an unhelpful ValueError on empty input.
    if array.size == 0:
        return []
    items = array.tolist()
    size = random.randint(1, array.size)
    return random.sample(items, size)

In [5]:
def wiggle_names(name:str, word_drop:float):
    """Randomly remove comma-separated words from a name.
        name:str, the name to alter; its words are separated by commas.
        word_drop:float, probability of dropping each single word.

        When nothing survives, one word picked at random from the
        original name is returned instead.
    """
    words = name.split(",")
    survivors = []
    for word in words:
        # One random draw per word: the word is kept when the draw exceeds the threshold.
        if random.random() > word_drop:
            survivors.append(word)
    shortened = ",".join(survivors)
    if shortened:
        return shortened
    return random.choice(words)

In [6]:
def generate_dataframe(original:pd.DataFrame, names_column:str, synonims:Dict[str, List[str]], path:str, nan_percentage:float=0.01, gaussian_noise:float=0.1, noise_base_variance:float=0, name_word_drop:float=0.1):
    """Generate a random noisy pandas DataFrame and save it with its provenance.
        original:pd.DataFrame, the original dataframe from which to generate the new dataset.
            The names column is expected to be its first column: every column
            after the first one is treated as a numeric value column.
        names_column:str, the name of the column containing the row names or description.
        synonims:Dict[str, List[str]], dictionary of synonims; it must contain an entry
            for every selected column (a missing entry raises KeyError).
        path:str, base directory to which save results (created if missing).
        nan_percentage:float=0.01, probability of turning each value to NaN.
        gaussian_noise:float=0.1, gaussian noise intensity (mean).
        noise_base_variance:float=0, constant added to the per-column noise spread.
            The spread passed to the normal distribution (as its scale) is the
            cube root of the column variance plus this value.
        name_word_drop:float=0.1, probability to drop a word from the name.

        Files written into `path`:
            params.json, the parameters used for the generation.
            columns.csv, mapping between original and generated column names.
            index.csv, mapping between original and generated index labels.
            generated.csv, the generated dataset.
    """
    assert names_column in original.columns, "names_column must be a column of the original dataframe"
    params = {
        "nan_percentage":nan_percentage,
        "gaussian_noise":gaussian_noise,
        "noise_base_variance":noise_base_variance,
        "name_word_drop":name_word_drop
    }
    os.makedirs(
        path,
        exist_ok=True
    )
    # Random subset of value columns and of rows, both in random order.
    cols = [names_column] + sample(original.columns[1:])
    index = sample(original.index)
    value_cols = cols[1:]
    # Explicit copy: the selection is mutated below and must never alias `original`.
    new = original[cols].loc[index].copy()
    # Dropping words in name
    new[names_column] = [
        wiggle_names(name, name_word_drop) for name in new[names_column]
    ]
    # Applying gaussian noise.
    # All numeric work is done while the columns still carry their original,
    # unique labels: after the synonim renaming two columns may share the
    # same label, which would make label-based selection ambiguous.
    values = new[value_cols].values
    variance = np.power(np.nanvar(values, axis=0), 1/3)
    noise = np.vstack([
        np.random.normal(gaussian_noise, var, new.shape[0]) for var in variance+noise_base_variance
    ]).T
    # The absolute value keeps the noisy values non-negative.
    values = np.abs(values + noise)
    # Applying a mask of NaN
    nan_mask = np.random.choice([True, False], size=values.shape, p=[nan_percentage, 1-nan_percentage])
    # np.nan rather than the np.NaN alias, which was removed in NumPy 2.0.
    values[nan_mask] = np.nan
    new[value_cols] = values
    # Renaming columns (and converting index labels to strings)
    new = new.rename(index=str, columns={
        c:random.choice(synonims[c]) for c in cols
    })
    # Saving parameters
    with open(os.path.join(path, "params.json"), "w") as f:
        json.dump(params, f)
    # Saving original columns and indices
    pd.DataFrame({
        "original": cols,
        "generated": new.columns
    }).to_csv(os.path.join(path, "columns.csv"))
    pd.DataFrame({
        "original": index,
        "generated": new.index
    }).to_csv(os.path.join(path, "index.csv"))
    # Saving generated dataset
    new.to_csv(os.path.join(path, "generated.csv"))
    

In [7]:
def job(task):
    """Run a single dataset generation task; meant to be used as a Pool worker function.
        task:tuple, positional arguments forwarded to generate_dataframe.
    """
    # Worker processes created by fork start with an identical copy of NumPy's
    # global random state, so without reseeding different workers would draw
    # the same noise and NaN masks. Calling seed() without arguments reseeds
    # from fresh OS entropy.
    np.random.seed()
    generate_dataframe(*task)

In [8]:
# Build one task per point of the parameter grid (10 values per parameter,
# 10 * 10 * 10 * 10 tasks in total). Every task is the tuple of positional
# arguments expected by generate_dataframe, and gets its own numbered output
# directory inside `sink`.
tasks = []
dataframe = pd.read_csv(source, index_col=0)
synonims = load_synonims(synonims_path)
i=0
for nan_percentage in tqdm(np.linspace(0, 0.75, 10), desc="Task generation"):
    for gaussian_noise in np.linspace(0, 20, 10):
        for noise_base_variance in np.linspace(0, 10, 10):
            for name_word_drop in np.linspace(0, 1, 10):
                tasks.append((
                    dataframe,
                    "short food name",
                    synonims,
                    # Use the configured output directory instead of repeating its literal value.
                    "{sink}/{i}".format(sink=sink, i=i),
                    nan_percentage,
                    gaussian_noise,
                    noise_base_variance,
                    name_word_drop
                ))
                i+=1

HBox(children=(IntProgress(value=0, description='Task generation', max=10, style=ProgressStyle(description_wid…




In [9]:
# Run every task across one worker process per CPU. imap yields results in
# task order and lazily, so the iterator is consumed through list() to drive
# the progress bar until all tasks are done.
with Pool(processes=cpu_count()) as pool:
    progress = tqdm(
        pool.imap(job, tasks),
        desc="Rendering",
        total=len(tasks)
    )
    list(progress)

HBox(children=(IntProgress(value=0, description='Rendering', max=10000, style=ProgressStyle(description_width=…




