# TEST - DATA Generator
This notebook sets up functions for generating independent data from scratch. The other, more powerful methods in this repo generate synthetic data based on an already existing dataset. The functions here take user inputs, so they could easily be ported into an app, and many more features could be added simply by using different prompts.

Types of data, features and how to randomize choosing them:

* Integer: min/max, distribution, specific probabilities (weighted random)
* Float: min/max, distribution, specific probabilities (weighted random)
* Categorical (types): specific probabilities (weighted random), pseudo random (coin toss)
* Dates/timestamps: start date/end date, amount per day, amount at specific times
* Boolean: specific probabilities (weighted random), pseudo random (coin toss)

## Imports & Configuration

In [1]:
# Imports
import warnings # Must be first

import pandas as pd
import numpy as np

from datetime import datetime
from CastConverter import *
from DataGenerator import DatasetGenerator, DatetimeGenerator, GanGenerator, NumberGenerator

# Configuration
# Column name used for the numbers DataFrame built in the "Complete example" cell.
field_name = 'numbers'
# Default sample count passed to the generators; the "Manual" example cell
# later rebinds this name to 100.
size = 3
# Shared generator instances reused by the cells below.
# dg: used for get_dataframe() in the "Automated" example.
dg = DatasetGenerator()
# nr: used for every get_numbers() call (numbers, categories, booleans).
nr = NumberGenerator()
# dtr: used for get_timestamps() / get_timestamps_pd().
dtr = DatetimeGenerator()
# gg: instantiated here but not used by any cell visible in this notebook.
gg = GanGenerator()

## Numbers creation

### 1. Number

In [None]:
# Uncomment one params/get_numbers pair below, plus the final `numbers` line,
# to try that distribution. Every pair assigns to the same `numbers` name.
#params = { 'df': 1 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.CHISQUARE,
#                            size=size,
#                            params=params)

#params = { 'scale': 1 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.EXPONENTIAL,
#                            size=size,
#                            params=params)

#params = { 'shape': 1.99 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.GAMMA,
#                            size=size,
#                            params=params)

#params = { 'mu': 0, 'beta': 0.1 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.GUMBEL,
#                            size=size,
#                            params=params)

#params = { 'loc': 0, 'scale': 1 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.LAPLACE,
#                            size=size,
#                            params=params)

#params = { 'loc': 0, 'scale': 0.1 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.LOGISTIC,
#                            size=size,
#                            params=params)

#params = { 'df': 3, 'nonc': 20 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.NONCENTRALCHISQUARE,
#                            size=size,
#                            params=params)

#params = { 'dfnum': 3, 'dfden': 20, 'nonc': 3 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.NONCENTRALF,
#                            size=size,
#                            params=params)

#params = { 'mean': 10, 'std': 5 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.NORMAL,
#                            size=size,
#                            params=params)

#params = { 'shape': 3 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.PARETO,
#                            size=size,
#                            params=params)

#params = { 'lam': 5 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.POISSON,
#                            size=size,
#                            params=params)

#params = { 'a': 5 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.POWER,
#                            size=size,
#                            params=params)

#params = { 'scale': 1.1 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.RAYLEIGH,
#                            size=size,
#                            params=params)

#params = { 'df': 5 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.STDT,
#                            size=size,
#                            params=params)

#params = { 'left': -5, 'mode': 0, 'right': 5 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.TRIANGULAR,
#                            size=size,
#                            params=params)

#params = { 'min': 0, 'max': 1 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.UNIFORM,
#                            size=size,
#                            params=params)

#params = { 'mu': 0, 'kappa': 4 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.VONMISES,
#                            size=size,
#                            params=params)

#params = { 'mean': 3, 'scale': 2 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.WALD,
#                            size=size,
#                            params=params)

#params = { 'shape': 5 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.WEIBULL,
#                            size=size,
#                            params=params)

#params = { 'a': 5, 'weights': [0.1, 0, 0.3, 0.6, 0] }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.WEIGHTED,
#                            size=size,
#                            params=params)

#params = { 'a': 4 }
#numbers = nr.get_numbers(distribution=NumberGenerator.Distribution.ZIPF,
#                            size=size,
#                            params=params)

#numbers

### 2. Number - Complete example

In [3]:
# End-to-end number example: request `size` samples with the NORMAL
# distribution, run them through the float-to-int converter and wrap the
# result in a one-column DataFrame named after `field_name`.
params = dict(mean=10, std=5, min=0, max=20)

numbers = nr.get_numbers(
    distribution=NumberGenerator.Distribution.NORMAL, size=size, params=params
)

# Convert floats to integers if required
int_array = convert_floats_to_ints(numbers)

# Single-column DataFrame holding the converted values
dfNumbers = pd.DataFrame(data=int_array, columns=[field_name])

# Cleaning: drop the temporaries created by this cell
del dfNumbers
del int_array
del numbers

### 3. Categorical

In [None]:
# Categorical example: request 10 samples with the WEIGHTED distribution,
# using the five weights listed in `params`.
params = dict(a=5, weights=[0.1, 0.1, 0.3, 0.4, 0.1])
categories = nr.get_numbers(
    distribution=NumberGenerator.Distribution.WEIGHTED, size=10, params=params
)

# Convert floats to integers if required
int_categories = convert_floats_to_ints(categories)

# Last expression of the cell: displayed as the cell output
int_categories

### 4. Boolean

In [None]:
# Boolean example: request 10 samples with the WEIGHTED distribution, passing
# [True, False] as 'a' with weights 0.5/0.5 (presumably an even coin toss
# between the two values; confirm against NumberGenerator).
params = { 'a': [True, False], 'weights': [0.5, 0.5] }
categories = nr.get_numbers(distribution=NumberGenerator.Distribution.WEIGHTED,
                            size=10,
                            params=params)

# Convert floats to integers if required
# NOTE(review): the variable names and this conversion step mirror the
# categorical cell; the sampled values here are presumably booleans, so
# confirm whether converting them is intended.
int_categories = convert_floats_to_ints(categories)
int_categories

### 5. Timestamp

In [None]:
# Timestamp example: generate timestamps for the "tstamp" field with both
# DatetimeGenerator entry points, using the same parameters.
params = { 'start': '1/1/2024', 'end': '1/11/2024', 'periods': 4, 'freq': None }
#params = { 'start': '1/1/2024', 'end': None, 'periods': 4, 'freq': "D" }

# Keep the get_timestamps() result under its own name; previously it was
# assigned to `values` and overwritten by the next line before it could be
# inspected.
raw_values = dtr.get_timestamps("tstamp", params)

# Result of the *_pd variant; this is what the cell displays.
values = dtr.get_timestamps_pd("tstamp", params)
values

## Dataframe Creation

### 1. "Manual" example

In [None]:
# "Manual" dataset: build a timestamp column, a category column and a price
# column with separate generator calls, then join them side by side.
# NOTE: this rebinds the notebook-level `size` to 100 for any cell run afterwards.
size = 100

# One shared parameter dict; each generator call below receives all keys.
params = { 'a': 5,
          'end': '1/11/2024',
          'freq': None,
          'mean': 10,
          'periods': size,
          'start': '1/1/2024',
          'std': 5,
          'weights': [0.1, 0.1, 0.3, 0.4, 0.1]
          }

# Timestamp
timestamps = dtr.get_timestamps_pd("when", params)

# Category
tmp_categories = nr.get_numbers(distribution=NumberGenerator.Distribution.WEIGHTED,
                                size=size,
                                params=params)

int_categories = convert_floats_to_ints(tmp_categories)
categories = pd.DataFrame({'category':int_categories})

# Pricing
prices = nr.get_numbers(distribution=NumberGenerator.Distribution.NORMAL,
                        size=size,
                        params=params)

# Concatenate all
# Previously only `timestamps` was concatenated, leaving `categories` and
# `prices` unused. pd.concat needs pandas objects, so the raw price values are
# wrapped in a DataFrame first; ignore_index stays False so column names survive.
# Assumes `timestamps` carries the default 0..size-1 row index like the other
# two frames (axis=1 aligns on the index) - TODO confirm.
data = pd.concat([timestamps, categories, pd.DataFrame({'price': prices})],
                 ignore_index=False, axis=1)
data.head()


### 2. "Automated" example

In [None]:

# "Automated" dataset: describe each column with a parameter dict and let
# DatasetGenerator.get_dataframe assemble the DataFrame.

# Single row count shared by the datetime 'periods' and the requested frame
# size. Previously 'periods' used the notebook-level `size` (3, or 100 after
# the "Manual" cell) while get_dataframe was asked for 10 rows.
# NOTE(review): whether get_dataframe overrides 'periods' is not visible here;
# keeping both equal is correct either way.
n_rows = 10

# Timestamp - When
params_a = { 'field': 'when',
          'fieldtype': DatasetGenerator.FieldType.DATETIME,
          'end': '1/11/2024',
          'freq': None,
          'periods': n_rows,
          'start': '1/1/2024',
          }

# Category - Risk
params_b = { 'field': 'risk',
          'fieldtype': DatasetGenerator.FieldType.NUMBER,
          'distribution': NumberGenerator.Distribution.WEIGHTED,
          'a': 3,
          'weights': [0.1, 0.3, 0.6]
          }

# Number - Score
params_c = { 'field': 'score',
          'fieldtype': DatasetGenerator.FieldType.NUMBER,
          'distribution': NumberGenerator.Distribution.NORMAL,
          'max': 20,
          'mean': 10,
          'min': 0,
          'std': 5
          }

# One entry per output column, in the order listed.
arr_params = [params_a, params_b, params_c]

df = dg.get_dataframe(arr_params=arr_params, size=n_rows)
df.head()


## Dataframe - Synthetic

In [None]:
# NOTE(review): this expression only looks up `get_metadata` on the
# GanGenerator class; there are no parentheses, so nothing is called and the
# cell just displays the attribute itself. Confirm whether a call (possibly on
# the `gg` instance created in the configuration cell) was intended.
GanGenerator.get_metadata