## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import string

### Models

In [2]:
from sdv.tabular import GaussianCopula # Importing 'GaussianCoupla' Model
from sdv.tabular import CTGAN # Importing 'CTGAN' Model
from sdv.tabular import CopulaGAN # Importing 'CopulaGAN' Model

### Evaluation

In [3]:
from sdv.evaluation import evaluate # Importing 'evaluate' Class for Synthetic Dataset Evaluation

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Removing Warning Messages

In [4]:
warnings.filterwarnings(action = 'ignore') # Removing Warning Messages

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Importing Dataset

In [5]:
dataset = pd.read_spss('Data/bsa19_for_ukda.sav') # Importing Dataset

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Dataset Context

In [6]:
dataset.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3224 entries, 0 to 3223
Data columns (total 572 columns):
 #    Column                Dtype   
---   ------                -----   
 0    Sserial               category
 1    spoint                float64 
 2    StratID               category
 3    WtFactor              category
 4    OldWt                 category
 5    ABCVer                category
 6    Country               category
 7    GOR_ID                category
 8    ruralEW               category
 9    ruralSc               category
 10   HousehldE             category
 11   RSex                  category
 12   RageE                 category
 13   RAgeCat               category
 14   RAgeCat2              category
 15   RAgecat3              category
 16   RAgecat4              category
 17   RAgecat5              category
 18   RSexAge               category
 19   RSexAge2              category
 20   MarStat               category
 21   Married               category
 22 

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Data Preprocessing

In [7]:
dataset.isnull().sum() # Total NULL Values Before Preprocessing

Sserial                   0
spoint                    0
StratID                   0
WtFactor                  0
OldWt                     0
                       ... 
NatEuro                   0
eq_inc_deciles          932
eq_inc_quintiles        932
eq_bhcinc2_deciles      865
eq_bhcinc2_quintiles    865
Length: 572, dtype: int64

In [8]:
dataset = dataset.dropna(axis = 1, inplace = False) # Removing Columns with NULL Values

In [9]:
dataset.isnull().sum() # Total NULL Values After Preprocessing

Sserial     0
spoint      0
StratID     0
WtFactor    0
OldWt       0
           ..
HHIncQ      0
SelfComp    0
NatBrit     0
NatEng      0
NatEuro     0
Length: 167, dtype: int64

In [10]:
# Changing Data Type of every Column in the Dataset

for col in dataset.columns:
    dataset[col] = dataset[col].astype(object)

**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**

## Synthetic Data Generation

### Gaussian Copula Model

In [None]:
syn_model = GaussianCopula() # Initializing the Model
syn_model.fit(dataset) # Applying the Model on the Data

#### Saving The Synthetic Data Generator Model

In [None]:
syn_model.save('Saved Model/Gaussian.pkl')

#### Loading The Saved Synthetic Data Generator Model

In [11]:
gaussian = GaussianCopula.load('Saved Model/Gaussian.pkl')

In [12]:
syn_data = gaussian.sample(num_rows = 100) # Generating Synthetic Instances
syn_data # Printing the Data

Unnamed: 0,Sserial,spoint,StratID,WtFactor,OldWt,ABCVer,Country,GOR_ID,HousehldE,RSex,...,HEdQual3,EUParVot,AnyBN3,MainInc5,HHIncD,HHIncQ,SelfComp,NatBrit,NatEng,NatEuro
0,310505.0,342.0,117.0,1.379188,0.559625,Version A,England,Wales,3.0,Female,...,Degree,No,No,State retirement or widow's pension(s),"£4,911 - 6,610 p.m.","Less than £1,410 p.m.",yes,Mentioned,Not mentioned,Not mentioned
1,311327.0,404.0,156.0,0.591229,1.678875,Version A,England,Wales,1.0,Female,...,Degree,No,Yes,Earnings from employment (own or spouse / part...,"£2,561 - 3,160 p.m.","£2,561 - 4,350 p.m.",yes,Mentioned,Mentioned,Not mentioned
2,312459.0,190.0,166.0,1.153092,1.11925,Version B,England,London,2.0,Female,...,O level or equiv/CSE,No,Yes,Private pension(s),Less than £920 p.m.,"£2,561 - 4,350 p.m.",no self completion,Mentioned,Not mentioned,Not mentioned
3,312545.0,127.0,159.0,0.384759,1.678875,Version C,Scotland,South West,4.0,Male,...,No qualification,No,Yes,State retirement or widow's pension(s),"£6,611 or more p.m.",Refused information,no self completion,Not mentioned,Mentioned,Not mentioned
4,313171.0,412.0,218.0,0.529731,1.11925,Version B,England,London,4.0,Female,...,Higher educ below degree/A level,"Yes, voted",Yes,Earnings from employment (own or spouse / part...,"£1,591 - 2,030 p.m.","Less than £1,410 p.m.",yes,Mentioned,Not mentioned,Not mentioned
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,311140.0,263.0,215.0,1.110789,1.11925,Version B,England,North West,3.0,Male,...,Degree,"Yes, voted",Refusal,Earnings from employment (own or spouse / part...,"£921 - 1,240 p.m.","£1,411 - 2,560 p.m.",yes,Mentioned,Not mentioned,Mentioned
96,312056.0,340.0,241.0,0.601899,0.559625,Version B,England,East Midlands,2.0,Male,...,Higher educ below degree/A level,No,Yes,Incapacity benefit / Employment and Support Al...,Don`t know,Refused information,yes,Refusal,Not mentioned,Mentioned
97,310770.0,116.0,148.0,2.274837,1.678875,Version A,England,South East,3.0,Female,...,No qualification,"Yes, voted",Refusal,Jobseeker's Allowance (CAN INCLUDE PENSIONER P...,"£1,241 - 1,590 p.m.","£1,411 - 2,560 p.m.",yes,Mentioned,Mentioned,Mentioned
98,311730.0,124.0,127.0,0.940085,0.559625,Version C,England,East Midlands,5+,Female,...,Degree,No,Yes,Earnings from employment (own or spouse / part...,"£3,891 - 4,910 p.m.","£2,561 - 4,350 p.m.",yes,Mentioned,Refusal,Not mentioned


#### Evaluating Synthetic Instances

Statistical Metrics

In [17]:
general_evaluation = evaluate(syn_data, dataset) # Evaluating on All Metrics
CS_test = evaluate(syn_data, dataset, metrics = ['CSTest']) # Chi-Squared Test
KS_test = evaluate(syn_data, dataset, metrics = ['KSTest']) # Inverted Kolmogorov-Smirnov Test

In [18]:
evaluation = [general_evaluation, CS_test, KS_test] # Creating a List of the Results
evaluation = pd.DataFrame(evaluation) # Creating a Data Frame of the Results
evaluation.columns = ['Results'] # Gicing Column Name
evaluation.rename({0:'General Evaluation', 1:'Chi-Squared Test', 2:'Inverted Kolmogorov-Smirnov Test'}) # Renaming Row Name

Unnamed: 0,Results
General Evaluation,0.515714
Chi-Squared Test,0.865854
Inverted Kolmogorov-Smirnov Test,


In [19]:
p_CS_test = 1-CS_test # Calculating 'p-value' for Chi-Squared Test
p_KS_test = 1-KS_test # Calculating 'p-value' for Inverted Kolmogorov-Smirnov Test

In [20]:
p_value = [p_CS_test, p_KS_test] # Creating a List of the Results
p_value = pd.DataFrame(p_value) # Creating a Data Frame of the Results
p_value.columns = ['P Values'] # Gicing Column Name
p_value.rename({0:'Chi-Squared P Value', 1:'Inverted Kolmogorov-Smirnov P Value'}) # Renaming Row Name

Unnamed: 0,P Values
Chi-Squared P Value,0.134146
Inverted Kolmogorov-Smirnov P Value,


**xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx**