# Create a synthetic version of your own CSV or DataFrame

This blueprint uses Gretel's premium SDKs to create a synthetic version of your own data. Our SDKs create automatic data validators to help ensure the generated data has the same semantics as the source data. Additionally, the SDKs perform automatic header clustering to help maintain statistical relationships between columns.

In [20]:
!pip install -U gretel-client gretel-synthetics pandas
!pip install gretel-synthetics

Requirement already up-to-date: gretel-client in /usr/local/lib/python3.7/dist-packages (0.7.12)
Requirement already up-to-date: gretel-synthetics in /usr/local/lib/python3.7/dist-packages (0.15.6)
Collecting pandas
  Using cached https://files.pythonhosted.org/packages/51/51/48f3fc47c4e2144da2806dfb6629c4dd1fa3d5a143f9652b141e979a8ca9/pandas-1.2.4-cp37-cp37m-manylinux1_x86_64.whl
[31mERROR: gretel-helpers 0.8.3 has requirement pandas<1.2,>=1.0.0, but you'll have pandas 1.2.4 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement pandas~=1.1.0; python_version >= "3.0", but you'll have pandas 1.2.4 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement requests~=2.23.0, but you'll have requests 2.25.1 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 1.1.5
    Uninstalling pandas-1.1.5:
      Successfully uninstalled pandas-1.1.5
Successfully installed pandas-1.2.4




In [21]:
# Load your Gretel API key. You can acquire this from the Gretel Console
# @ https://console.gretel.cloud

import pandas as pd
from gretel_client import get_cloud_client

# A max column width of None disables truncation of long cell values when
# DataFrames are displayed.
pd.set_option('max_colwidth', None)

# With api_key="prompt" the key is requested interactively (the cell output
# shows an "Enter Gretel API key" prompt).
client = get_cloud_client(prefix="api", api_key="prompt")
# Authenticates with the package manager and installs the Gretel packages
# (gretel-helpers, per the install log) into this environment.
client.install_packages()

Enter Gretel API key: ··········


INFO pkg_installers.py: Authenticating with package manager
INFO pkg_installers.py: Installing packages (this might take a while)
ERROR pkg_installers.py: /usr/bin/python3 -m pip --disable-pip-version-check install https://gretel-opt-prod-usw2.s3.amazonaws.com/priv/pip/gretel-helpers/0.8.3/gretel_helpers-0.8.3-py3-none-any.whl?AWSAccessKeyId=ASIARC2BUADHTDHTC443&Signature=v%2F74qVH6S77EmHL1GJ4ecKL8WAg%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEP3%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJGMEQCIDTUS8kQqFo7YuukXFWjbDow%2FnMjfiowe02CiIW32kOMAiAueF6lxWyjsLPKtNFCnhRo69niLSByxjYkHds%2BbgX4ISrnAQiG%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA3NDc2MjY4MjU3NSIMYPaWZGVm42KouTK6KrsBcyPh1w3BHyqgkUBmzK8s6fdYpg9GHqZ0ibivaYDNDka2zwp%2F6%2F6wROjxSLjXVHHi1bptPcvb3%2Fn4rCRfBP15ckOfoDjJo3%2Bo5%2FZN3U5BHkoG9fdCmCAG2Ng3%2FUBUVwtFF%2BwSm6u1m8MfpxyA0oVPxgT1Y9SjtORrsVBxbjp%2FCK44jWfKz6QfozuUHD2bcejTPQIqbX0l7JuH3rV3zdxoqXhviFKwrUDauUnIZ6nbHkunMbkxP%2BjiTGmryzDq6%2BKEBjrhAU5Si2vCAI1w%2BAxtpOwc0Se5t5mDeLV8qt%2Fwvh1

In [159]:
# Load and preview dataset

import pandas as pd

dataset_path = './datosTesisCorrected.csv'
nrows = 1000  # We will use this later when generating data
# Only the first `nrows` rows of the CSV are read into the training DataFrame.
training_df = pd.read_csv(dataset_path, nrows=nrows)
# Print the first few rows as a quick sanity check of the columns and values.
print(training_df.head())

      CLL      NDVI       CVI     GNDVI      NRVI       RVI       GCI
0  114.51  0.227151  1.458211  0.519744  0.227151  1.589461  1.519888
1  111.03  0.232475  1.504808  0.550758  0.232475  1.607123  1.550893
2  117.61  0.227241  1.433650  0.505362  0.227241  1.589634  1.505504
3  103.48  0.229360  1.445648  0.515255  0.229360  1.596564  1.515392
4  132.56  0.228474  1.515317  0.550987  0.228474  1.593706  1.551143


In [160]:
# Create the Gretel Synthetics Training / Model Configuration
#
# Gretel now offers Configuration Templates that provide starting points for a variety
# of training data characteristics.
#
# You may browse the options here: https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics
#
# The helper function below will fetch the configuration based on the filename *WITHOUT the file extension*

from pathlib import Path

checkpoint_dir = str(Path.cwd() / "checkpoints")

# Start from an empty config so the custom parameters below still apply when
# the remote template cannot be loaded.
config_template = {}

try:
    from gretel_client import get_synthetics_config

    # NOTE: Replace the "default" param with any of the configuration filenames (minus extension)
    #
    # https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics
    #
    # example: get_synthetics_config("low-record-count")

    config_template = get_synthetics_config("default")
    print(f"Loaded config: {config_template}")
except ImportError:
    print("ERROR: Could not load remote template, using default params. Please ensure you have the latest gretel-client installed.")

# Set or update any custom parameters here.
#
# update() merges these values into the loaded template rather than replacing
# it, so settings that came from the template (for example the epoch count)
# are kept, while the keys listed here take precedence.
config_template.update({
    "checkpoint_dir": checkpoint_dir,
    "vocab_size": 2000,
    "overwrite": True,
})



Loaded config: {'epochs': 100}


In [161]:
# Capture transient import errors in Google Colab
#
# If the first import attempt raises FileNotFoundError, the identical import is
# retried once; a failure on the retry is not caught and propagates.
# NOTE(review): presumably the transient error is related to gretel_helpers
# having just been installed by client.install_packages() -- confirm.

try:
    from gretel_helpers.synthetics import SyntheticDataBundle
except FileNotFoundError:
    from gretel_helpers.synthetics import SyntheticDataBundle

In [162]:
# Build the Gretel Synthetic Data Bundle from the training DataFrame

from gretel_helpers.synthetics import create_df, SyntheticDataBundle

# Collect the constructor arguments first so each one can carry its own note.
bundle_options = dict(
    # the source records the model learns from
    training_df=training_df,
    # ``None`` means the delimiter is detected automatically; set it explicitly to override
    delimiter=None,
    # build per-column record validators, used to ensure generated records
    # have the same composition as the original
    auto_validate=True,
    # the config for Synthetics
    synthetic_config=config_template,
)

model = SyntheticDataBundle(**bundle_options)

INFO synthetics.py: Detecting record field delimiter...


In [163]:
# Prepare the bundle for training. Per the log output this analyzes the
# DataFrame for column batches, creates the model/data directories, writes the
# training data and creates the data validators.
model.build()

INFO synthetics.py: Analyzing DataFrame for optimal column batches and ordering...
INFO synthetics.py: Creating model and data storage directories...
INFO batch.py: Creating directory structure for batch jobs...
INFO synthetics.py: Generating training data from source dataset...
INFO batch.py: Generating training DF and CSV for batch 0
INFO synthetics.py: Creating data validators...
INFO synthetics.py: Creating validator for synthetic batch 0


In [164]:
# Train the model for the bundle built in the previous cell.
model.train()





100%|██████████| 1000/1000 [00:00<00:00, 53459.23it/s]


Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (64, None, 256)           233728    
_________________________________________________________________
dropout_42 (Dropout)         (64, None, 256)           0         
_________________________________________________________________
lstm_28 (LSTM)               (64, None, 256)           525312    
_________________________________________________________________
dropout_43 (Dropout)         (64, None, 256)           0         
_________________________________________________________________
lstm_29 (LSTM)               (64, None, 256)           525312    
_________________________________________________________________
dropout_44 (Dropout)         (64, None, 256)           0         
_________________________________________________________________
dense_14 (Dense)             (64, None, 913)         

In [165]:
# num_lines: how many rows to generate
# max_invalid: the number of rows that are allowed to fail semantic validation; if this number is exceeded,
# generation will stop
# NOTE: both limits are set to nrows here, so generation can end with fewer than nrows valid rows.
model.generate(num_lines=nrows, max_invalid=nrows)

HBox(children=(FloatProgress(value=0.0, description='Valid record count ', max=1000.0, style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='Invalid record count ', max=1000.0, style=ProgressStyle(d…

In [166]:
# Display the generated synthetic records as a DataFrame (the notebook renders the value of the cell's last expression).
model.get_synthetic_df()

Unnamed: 0,CLL,NDVI,CVI,GNDVI,NRVI,RVI,GCI
0,129.89,0.234308,1.620920,0.615024,0.234308,1.613943,1.615195
1,133.44,0.230433,1.410069,0.499809,0.230433,1.599938,1.499957
2,104.41,0.229379,1.456347,0.521475,0.229379,1.596693,1.521607
3,140.40,0.232754,1.556912,0.579622,0.232754,1.608176,1.579778
4,117.81,0.227241,1.433650,0.505362,0.227241,1.589634,1.505504
...,...,...,...,...,...,...,...
76,129.36,0.232307,1.536267,0.568191,0.232307,1.607055,1.568373
77,117.72,0.238740,1.607450,0.613232,0.238740,1.630465,1.613431
78,129.89,0.226654,1.382172,0.478727,0.226654,1.587226,1.478877
79,129.36,0.233843,1.491940,0.545247,0.233843,1.612762,1.545430


In [167]:
# Generate report that shows the statistical performance between the training and synthetic data
import IPython

report_path = './report.html'
# Write the report to disk as an HTML file...
model.generate_report(report_path=report_path)
# ...then render that file inline as the cell's output.
IPython.display.HTML(filename=report_path)

INFO synthetics.py: Creating report...
INFO synthetics.py: Report saved to: ./report.html


0,1,2,3,4,5
Synthetic Data Use Cases,Excellent,Good,Moderate,Poor,Very Poor
Significant tuning required to improve model,,,,,
Improve your model using our tips and advice,,,,,
Demo environments or mock data,,,,,
Pre-production testing environments,,,,,
Balance or augment machine learning data sources,,,,,
Machine learning or statistical analysis,,,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,1000,81
Column Count,7,7
Training Lines Duplicated,--,5

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
CLL,56,0,5.91,Numeric,Moderate
RVI,58,0,13.68,Numeric,Moderate
NRVI,58,0,13.12,Numeric,Moderate
NDVI,58,0,13.54,Numeric,Moderate
CVI,58,0,13.27,Numeric,Moderate
GCI,58,0,13.44,Numeric,Good
GNDVI,58,0,12.16,Numeric,Good


In [168]:
# Optionally save your model
#
# The bundle is written as a tarball to the given path (relative to the
# current working directory).

model.save("my_model.tar.gz")

INFO synthetics.py: Building temporary Tarball...
INFO synthetics.py: Copying Tarball to target location...


In [169]:
# Save synthetic dataframe locally and to a private Gretel project

df = model.get_synthetic_df()
# index=False keeps the DataFrame's row index out of the CSV file.
df.to_csv('synthetic-data.csv', index=False)

# Publish newly created synthetic data to a new private Gretel project
# create=True: presumably creates the project when it does not exist yet -- confirm against the gretel-client docs.
project = client.get_project(display_name="Blueprint: Create Synthetic Data", create=True)
# Upload the synthetic records to the project.
project.send_dataframe(df, detection_mode="all")
print(f"View this project at: {project.get_console_url()}")







82 records [00:01, 54.12 records/s]        

View this project at: https://console.gretel.cloud/narteagal-21790





In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')