# Generate synthetic data 

Generation of synthetic data is required to protect private information. Data will be generated for two aspects of the model: 

* Synthetic production of converted customer flags
* Synthetic customer behaviour in the specified columns

Import libraries and create the SQLite database engine

In [None]:
import numpy as np 
import pandas as pd
import sklearn.datasets as dt 

from sqlalchemy import create_engine

In [None]:
# SQLAlchemy engine for the SQLite database file; the path is relative to
# the directory this notebook is run from.
database_url = 'sqlite:///../data/orgs_customer_behaviours.db'
engine = create_engine(database_url)

## Generate synthetic converted customers

In [None]:
# Load the organisation table and attach a synthetic CONVERTED flag to each row.
df_org = pd.read_csv('../data/hd2019.csv', encoding='latin1')

# Seeded generator so the synthetic flags are reproducible between runs
# (the synthetic behaviour data below is likewise pinned with random_state=42).
rng = np.random.default_rng(42)

# One integer 0/1 flag per organisation row. Integers are written to the CSV
# exactly as the former '0'/'1' strings were, but are usable as numeric labels.
synth_conversion = rng.integers(0, 2, size=len(df_org))

# Assign the flags positionally as a new last column. This avoids the previous
# self-merge on UNITID, which would multiply rows if a UNITID were repeated.
df_org_data = df_org.assign(CONVERTED=synth_conversion)
df_org_data.to_csv('../data/hd2019_convert.csv', index=False)

## Generate synthetic customer behaviour

In [None]:
# Names given to the synthetic customer-behaviour feature columns, in order.
behaviour_column_names = [
    'DID_TRIAL',
    'PURCHASED_PREVIOUS_PRODUCT',
    'AMOUNT_OF_LICENSES',
    'TERM_OF_LICENSE',
    'HAS_TECH_DEPT',
    'AMOUNT_OF_INTERACTIONS_W_SALES',
    'AMOUNT_OF_CALLS',
    'AMOUNT_OF_MESSAGES',
    'ENGAGED_WITH_MESSAGING',
    'REACHED_NOT_ENGAGED_WITH_MESSAGING',
    'ATTENDED_WEBINARS',
    'WEBINAR_ATTENDANCE_SIZE',
]

In [None]:
# Generate a synthetic classification data set: x is the feature matrix and
# y the class labels. random_state is fixed so reruns give the same data.
x, y = dt.make_classification(
    n_samples=1000,
    n_features=12,
    n_informative=6,
    n_redundant=4,
    n_repeated=0,
    random_state=42,
)

In [None]:
# Wrap the generated features and labels in DataFrames.
df_x, df_y = pd.DataFrame(data=x), pd.DataFrame(data=y)

In [None]:
# Name the feature columns after the behaviour list and the label column CONVERTED.
df_x.columns, df_y.columns = behaviour_column_names, ['CONVERTED']

## Save data to an SQLite database for further processing

In [None]:
# Write features and target to separate tables. An existing table of the same
# name is replaced, and the DataFrame index is not stored as a column.
sql_write_options = {'con': engine, 'if_exists': 'replace', 'index': False}
df_x.to_sql('synth_customer_behaviour_data', **sql_write_options)
df_y.to_sql('synth_customer_target_classifier_data', **sql_write_options)