In [1]:
import pandas as pd
from datetime import datetime
import psutil
import os
import gc

from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import run_diagnostic

# importing matplotlib module
import matplotlib.pyplot as plt
plt.style.use('default')
# %matplotlib inline: only draw static
# images in the notebook
%matplotlib inline



# Global settings

In [2]:

pd.set_option('max_row', None)
#pd.options.display.max_rows = 1000

# Set a value for the home folder.
home_folder = "."

# Set values for the various paths.
input_path = home_folder + "\data"

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')
os.path.abspath(os.getcwd())

2024-08-05 22:19:26.165819
svmem(total=16756752384, available=5674205184, percent=66.1, used=11082547200, free=5674205184)


'C:\\Users\\m033\\OneDrive - GWLE\\Documents\\GitHub\\Data-Science-Thesis'

# Use SDV - Synthetic Data Vault
This is a Python package to generate synthetic data based on the dataset provided. The generated data could be single-table, multi-table, or time-series, depending on the scheme provided in the environment. Also, the generated data would have the same format properties and statistics as the provided dataset.¶

SDV generates synthetic data by applying mathematical techniques and machine learning models such as the deep learning model. Even if the data contain multiple data types and missing data, SDV will handle it, so we only need to provide the data (and the metadata when required).¶

The SDV creates synthetic data using machine learning. A synthesizer is an object that you can use to accomplish this task.¶
1.  You'll start by creating a synthesizer based on your metadata
2.  Next, you'll train the synthesizer using real data. In this phase, the synthesizer will learn patterns from the real data.
3.  Once your synthesizer is trained, you can use it to generate new, synthetic data.

Of the original data, only the contract number needs to synthesised as everything else is either a boolean that already has obfuscated the fund data or is a calculation based on PII data with the PII data removed.

https://docs.sdv.dev/sdv/single-table-data/data-preparation/single-table-metadata-api

# 1. Read back the second half of the full set of data.  There are 6,186,149 records.
# The first tranche synthesised the first 3,100,000.

# This tranche will synthesise 3,086,149.

# This is done to reduce a memory constraint on the runtime environment, which cannot be virtualised because of the sensitive nature of the data.

In [3]:
# giving directory name
filename = input_path + '\Full_Population_With_Markex_Index_Data.csv'

df_Switches_tranche_2 = pd.read_csv(filename,dtype = {'ContractNumber': str,
                                           'ProdCat': str,
                                           'ProdCode': str,
                                           'PolicyStatus': str,
                                           'Switch_Flag': bool,
                                           'ServicingBroker': str,
                                           'InitialBroker': str,
                                           'BrokerCategory': str,
                                           'Policy_Fee_Type_Desc': str,      
                                           'PricingVariant': int,
                                           'PayFreq': int,
                                           'VIPType': str,
                                          'Nationality': str,
#                                          'IsSmoker': bool,
                                          'IsSmoker': str,
                                          'PoliticallyExposed': str,
                                          'BirthPlace': str,
                                          'Gender': str,
                                          'MaritalStatus': str,
                                          'AddressCity': str,
                                          'PartnerType': str,
                                          'EUSanctioned': str,
#                                          'EUSanctioned': bool,
                                          'AddressDistrict': str,
                                          'Postal_Code_Name': str,
                                          'District': str,
                                          'State': str,
                                          'VN_MILIEU_CODE': str,
                                          'VN_TARGET_GROUP_CODE': str,
                                          'VN_MILIEU_DESCRIPTION': str,
                                          'VP_MILIEU_DESCRIPTION': str,  
# Tranche 1.                              'AnniversaryMth': int},low_memory=False, header=0, nrows=3100000)
# Tranche 2. - 3086149
                                           'AnniversaryMth': int},low_memory=False, header=0, skiprows=range(1, 3100000))


# Fill NULLs
df_Switches_tranche_2 = df_Switches_tranche_2.fillna(0)

# Track runtime and encironment.
print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

# print stats.
print(len(df_Switches_tranche_2.index))
print(df_Switches_tranche_2.dtypes)

# Turn off dislays to protect PII
#df_Switches_tranche_2.head(10)

FileNotFoundError: [Errno 2] No such file or directory: '.\\data\\Full_Population_With_Markex_Index_Data.csv'

In [None]:
df_Switches_table = df_Switches_tranche_2[['ContractNumber','Switch_Flag','Term_Passed','Term_Remaining','TermRemainingLTEQ_5',
                                       'TermRemainingLTEQ_10','Age','AnniversaryMth','ProdCat','ProdCode','PolicyStatus',
                                       'SnapshotDt','PolicyFeeType','PricingVariant','PayFreq','IndexationPercent',
                                       'Premium','PremiumCalcMethod','ServicingBroker','InitialBroker','BrokerCategory',
                                       'InitialRate','RenewalRate','VIPType','Nationality','IsSmoker','PoliticallyExposed',
                                       'BirthPlace','Gender','MaritalStatus','PensionDisability','WOPDisability',
                                       'AddressCity','PartnerType','EUSanctioned','AddressDistrict','Postal_Code',
                                       'Postal_Code_Name','District','State','VN_MILIEU_CODE','VN_TARGET_GROUP_CODE',
                                       'VN_MILIEU_DESCRIPTION','VP_MILIEU_DESCRIPTION','Term']]

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

# print stats.
print(len(df_Switches_tranche_2.index))
del df_Switches_tranche_2

print(len(df_Switches_table.index))

# Turn off dislays to protect PII
#df_Switches_table.head(10)

 # Look at the distribution of the target variable.

In [5]:
print(df_Switches_table['Switch_Flag'].value_counts())

False    3082121
True        4029
Name: Switch_Flag, dtype: int64


# 2. Generate single table metadata from the dataframe.

In [6]:
metadata = SingleTableMetadata()

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

#metadata
print("metadata-",metadata) 

2024-07-27 02:06:40.759653
svmem(total=16756752384, available=4876382208, percent=70.9, used=11880370176, free=4876382208)
metadata- {
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}


# 3. Auto detect metadata.
## The real fund switch data comes with no pre-prepared metadata automatically detect and generate the metadata based on the real data. 


In [7]:
metadata.detect_from_dataframe(data=df_Switches_table)

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')


#metadata
print("metadata-",metadata) 
metadata

2024-07-27 02:06:45.827300
svmem(total=16756752384, available=4819750912, percent=71.2, used=11937001472, free=4819750912)
metadata- {
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "ContractNumber": {
            "sdtype": "categorical"
        },
        "Switch_Flag": {
            "sdtype": "boolean"
        },
        "Term_Passed": {
            "sdtype": "numerical"
        },
        "Term_Remaining": {
            "sdtype": "numerical"
        },
        "TermRemainingLTEQ_5": {
            "sdtype": "numerical"
        },
        "TermRemainingLTEQ_10": {
            "sdtype": "numerical"
        },
        "Age": {
            "sdtype": "numerical"
        },
        "AnniversaryMth": {
            "sdtype": "numerical"
        },
        "ProdCat": {
            "sdtype": "categorical"
        },
        "ProdCode": {
            "sdtype": "categorical"
        },
        "PolicyStatus": {
            "sdtype": "categorical"
        },
        "Sna

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "ContractNumber": {
            "sdtype": "categorical"
        },
        "Switch_Flag": {
            "sdtype": "boolean"
        },
        "Term_Passed": {
            "sdtype": "numerical"
        },
        "Term_Remaining": {
            "sdtype": "numerical"
        },
        "TermRemainingLTEQ_5": {
            "sdtype": "numerical"
        },
        "TermRemainingLTEQ_10": {
            "sdtype": "numerical"
        },
        "Age": {
            "sdtype": "numerical"
        },
        "AnniversaryMth": {
            "sdtype": "numerical"
        },
        "ProdCat": {
            "sdtype": "categorical"
        },
        "ProdCode": {
            "sdtype": "categorical"
        },
        "PolicyStatus": {
            "sdtype": "categorical"
        },
        "SnapshotDt": {
            "sdtype": "categorical"
        },
        "PolicyFeeType": {
            "sdtype": "categorical"
        },


In [8]:
metadata.update_column(
    column_name='SnapshotDt',
    sdtype='datetime',
    datetime_format='%Y-%m-%d')

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

metadata

2024-07-27 02:06:45.849252
svmem(total=16756752384, available=4819193856, percent=71.2, used=11937558528, free=4819193856)


{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "ContractNumber": {
            "sdtype": "categorical"
        },
        "Switch_Flag": {
            "sdtype": "boolean"
        },
        "Term_Passed": {
            "sdtype": "numerical"
        },
        "Term_Remaining": {
            "sdtype": "numerical"
        },
        "TermRemainingLTEQ_5": {
            "sdtype": "numerical"
        },
        "TermRemainingLTEQ_10": {
            "sdtype": "numerical"
        },
        "Age": {
            "sdtype": "numerical"
        },
        "AnniversaryMth": {
            "sdtype": "numerical"
        },
        "ProdCat": {
            "sdtype": "categorical"
        },
        "ProdCode": {
            "sdtype": "categorical"
        },
        "PolicyStatus": {
            "sdtype": "categorical"
        },
        "SnapshotDt": {
            "sdtype": "datetime",
            "datetime_format": "%Y-%m-%d"
        },
        "PolicyFeeType": {
       

# Create a synthesizer




## Now reload this synthesiser from the pickle file to use on the real data.

In [9]:
synthesizer = GaussianCopulaSynthesizer.load(
    filepath=home_folder + '\switch_data_synthesizer.pkl'
)

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

2024-07-27 02:06:46.020674
svmem(total=16756752384, available=4780642304, percent=71.5, used=11976110080, free=4780642304)


## Create the synthetic data from the synthesiser.

In [10]:
synthetic_data = synthesizer.sample(num_rows=3100000)

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

#synthetic_data
print("synthetic_data-",synthetic_data) 

2024-07-27 02:14:14.100538
svmem(total=16756752384, available=8145747968, percent=51.4, used=8611004416, free=8145747968)
synthetic_data- 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Save the syntheised data to file.

In [11]:
# save the data as a CSV
synthetic_data.to_csv('data/synthetic_data_tranche2.csv', index=False)

print('==========================')
now = datetime.now()
print(now)
print('==========================')
print(psutil.virtual_memory())
print('==========================')

print(len(synthetic_data.index))

2024-07-27 02:33:55.876193
svmem(total=16756752384, available=10393202688, percent=38.0, used=6363549696, free=10393202688)
3100000


In [12]:
print(len(synthetic_data.index))

3100000


In [13]:
%whos DataFrame

Variable            Type         Data/Info
------------------------------------------
df_Switches_table   DataFrame            ContractNumber  S<...>086150 rows x 45 columns]
synthetic_data      DataFrame            ContractNumber  S<...>100000 rows x 45 columns]


In [14]:
del df_Switches_table

## Create a Python dictionary that corresponds to the metadata.

In [15]:
python_dict = metadata.to_dict()
python_dict

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'ContractNumber': {'sdtype': 'categorical'},
  'Switch_Flag': {'sdtype': 'boolean'},
  'Term_Passed': {'sdtype': 'numerical'},
  'Term_Remaining': {'sdtype': 'numerical'},
  'TermRemainingLTEQ_5': {'sdtype': 'numerical'},
  'TermRemainingLTEQ_10': {'sdtype': 'numerical'},
  'Age': {'sdtype': 'numerical'},
  'AnniversaryMth': {'sdtype': 'numerical'},
  'ProdCat': {'sdtype': 'categorical'},
  'ProdCode': {'sdtype': 'categorical'},
  'PolicyStatus': {'sdtype': 'categorical'},
  'SnapshotDt': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
  'PolicyFeeType': {'sdtype': 'categorical'},
  'PricingVariant': {'sdtype': 'numerical'},
  'PayFreq': {'sdtype': 'numerical'},
  'IndexationPercent': {'sdtype': 'numerical'},
  'Premium': {'sdtype': 'numerical'},
  'PremiumCalcMethod': {'sdtype': 'categorical'},
  'ServicingBroker': {'sdtype': 'categorical'},
  'InitialBroker': {'sdtype': 'categorical'},
  'BrokerCategory': {'sdtype': 'ca

In [16]:
#from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=df_Switches_table,
    synthetic_data=synthetic_data,
    metadata=metadata)

NameError: name 'df_Switches_table' is not defined

In [None]:
diagnostic_report = run_diagnostic(
    real_data=df_Switches_table,
    synthetic_data=synthetic_data,
    metadata=metadata)

# Diagnostics.

In [None]:
diagnostic_report = run_diagnostic(
    real_data=df_Switches_table,
    synthetic_data=synthetic_data,
    metadata=metadata)

## Data Quality - checks for statistical similarity between the real and the synthetic data. Use this to discover which patterns the synthetic data has captured from the real data.

https://docs.sdv.dev/sdv/multi-table-data/evaluation/data-quality

In [None]:
quality_report = evaluate_quality(
    real_data=df_Switches_table,
    synthetic_data=synthetic_data,
    metadata=metadata)

##  Visualising the evaluation of the synthesisation model by comparing the original data and the synthetic data.

https://github.com/sdv-dev/SDMetrics

In [62]:
%whos DataFrame

Variable         Type         Data/Info
---------------------------------------
Switches_table   DataFrame              POL_NUMB  API_T<...>[102975 rows x 4 columns]
synthetic_data   DataFrame          POL_NUMB           <...>18527.459563     202205  


In [63]:
psutil.virtual_memory().available * 100 / psutil.virtual_memory().total

27.75071840555521