# CTGAN Model For Tabular Data

In [629]:
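# %pip install sdv  # NOTE: the cells below use the legacy `sdv.tabular` / `sdv.constraints` / `sdv.evaluation` API; pin the SDV version this notebook was developed with (TODO confirm exact version)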

In [630]:
import warnings; warnings.simplefilter('ignore')
import pandas as pd
from sdv.tabular import CTGAN
from sdv.evaluation import evaluate
from sdv.constraints import *

### Import dataset here...

In [631]:
# Path of the input CSV file; it is read into `real_data` with pd.read_csv.
data_path = './credit_merge_clean.csv'

In [632]:
# Load the CSV at `data_path` and print its (rows, columns) shape as a sanity check.
real_data = pd.read_csv(data_path)
print("DATA SHAPE:", real_data.shape)

DATA SHAPE: (36457, 21)


In [633]:
# Preview the first rows of the freshly loaded frame.
real_data.head()

Unnamed: 0.1,Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,...,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,target
0,0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,...,-12005,-4542,1,1,0,0,,2.0,-15.0,1
1,1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,...,-12005,-4542,1,1,0,0,,2.0,-14.0,1
2,2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,...,-21474,-1134,1,0,0,0,Security staff,2.0,-29.0,0
3,3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,...,-19110,-3051,1,0,1,1,Sales staff,1.0,-4.0,0
4,4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,...,-19110,-3051,1,0,1,1,Sales staff,1.0,-26.0,0


In [634]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV --
# TODO confirm) and then every row that still contains a missing value.
# `errors='ignore'` makes the cell safe to re-run: once the column is gone a
# second execution no longer raises KeyError.
real_data = (
    real_data
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .dropna()
)
print(real_data.shape)

(25134, 20)


In [635]:
# Display the cleaned frame (the row index keeps its original labels after dropna).
real_data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,target
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,-29.0,0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-4.0,0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-26.0,0
5,5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-26.0,0
6,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-38.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5149828,M,Y,Y,0,315000.0,Working,Secondary / secondary special,Married,House / apartment,-17348,-2420,1,0,0,0,Managers,2.0,-11.0,1
36453,5149834,F,N,Y,0,157500.0,Commercial associate,Higher education,Married,House / apartment,-12387,-1325,1,0,1,1,Medicine staff,2.0,-23.0,1
36454,5149838,F,N,Y,0,157500.0,Pensioner,Higher education,Married,House / apartment,-12387,-1325,1,0,1,1,Medicine staff,2.0,-32.0,1
36455,5150049,F,N,Y,0,283500.0,Working,Secondary / secondary special,Married,House / apartment,-17958,-655,1,0,0,0,Sales staff,2.0,-9.0,1


Going to create 2 GAN models:
* One that emulates the original data, trained on the complete real dataset;
* Another one trained on the real data restricted to target value 1 (positive mortgage default), for dataset-balancing purposes.

## 1 - GAN model using complete dataset...

In [636]:
# https://pypi.org/project/sdv/
# https://sdv.dev/SDV/user_guides/single_table/handling_constraints.html#load-a-tabular-demo

####################
# Constraints List #
####################

# Minimum applicant age in days. 7300 days is 20 * 365, i.e. roughly 20 years.
# NOTE(review): the variable name below still says 7665 (21 * 365, i.e. 21
# years), which does not match the value that is actually enforced -- confirm
# which threshold is intended before changing the number.
MIN_AGE_DAYS = 7300

# DAYS_BIRTH holds negative day counts, so "at least MIN_AGE_DAYS old" is
# expressed as DAYS_BIRTH being bounded above by -MIN_AGE_DAYS. The name
# `age_lt_7665` is kept because the `constraints` list refers to it.
age_lt_7665 = GreaterThan(
    low='DAYS_BIRTH',
    high=-MIN_AGE_DAYS,
    scalar='high',  # `high` is a constant, `low` is a column name
    handling_strategy='reject_sampling'
)

# Days employed may not exceed days lived. Both columns hold negative day
# counts, hence DAYS_BIRTH is the lower bound and DAYS_EMPLOYED the upper one.
days_birth_gt_days_employed = GreaterThan(
    handling_strategy='reject_sampling',
    high='DAYS_EMPLOYED',
    low='DAYS_BIRTH',
)

# DAYS_BIRTH and begin_month (months balance) must never be positive.
# With `strict=True` the comparison is strict (value < 0, zero is rejected);
# with `strict=False` it is inclusive (value <= 0, zero is accepted). Each
# flag is set to match its variable name.
always_negative_no_zero = Negative(
    columns='DAYS_BIRTH',
    strict=True,   # strictly below zero: zero is NOT allowed
    handling_strategy='reject_sampling'
)
always_negative_with_zero = Negative(
    columns='begin_month',
    strict=False,  # zero is allowed
    handling_strategy='reject_sampling'
)

# Rule of thumb: a positive DAYS_EMPLOYED marks an unemployed person / pensioner.
def NAME_INCOME_TYPE(data):
    """Flag the rows whose ``DAYS_EMPLOYED`` value is positive.

    Args:
        data: table providing a ``DAYS_EMPLOYED`` column.

    Returns:
        The element-wise result of ``DAYS_EMPLOYED > 0`` (a boolean Series
        when ``data`` is a DataFrame).

    NOTE(review): this function is handed to ``ColumnFormula`` as the formula
    for the ``NAME_INCOME_TYPE`` column. A formula presumably has to return
    the column's values (income-type labels), not a boolean mask -- confirm
    before enabling that constraint.
    """
    positive_mask = data['DAYS_EMPLOYED'] > 0
    return positive_mask
    
# Constraint that fills the NAME_INCOME_TYPE column from the NAME_INCOME_TYPE
# formula function, using the 'transform' handling strategy.
name_income_type_pensioner = ColumnFormula(
    handling_strategy='transform',
    formula=NAME_INCOME_TYPE,
    column='NAME_INCOME_TYPE',
)

# Constraints passed to the CTGAN models. `always_negative_with_zero` and
# `name_income_type_pensioner` are defined in this cell but currently not enabled.
constraints = [age_lt_7665, days_birth_gt_days_employed, always_negative_no_zero]

In [637]:
# CTGAN model for the complete dataset. Only the constraints are passed
# explicitly; the optional `epochs` / `cuda` overrides are left unset.
complete_ctgan = CTGAN(constraints=constraints)

In [638]:
# Fit the complete-data model on the cleaned real dataset.
complete_ctgan.fit(real_data)

In [652]:
# Draw 20,000 synthetic rows from the fitted complete-data model.
synthetic_all_data = complete_ctgan.sample(num_rows=20000)

In [653]:
# Inspect the generated sample.
synthetic_all_data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,target
0,5144135,M,Y,N,0,159787.7,Working,Secondary / secondary special,Married,With parents,-15297,-981,1,0,0,0,Security staff,2.0,-24.0,0
1,5068349,M,Y,Y,0,305604.7,Commercial associate,Secondary / secondary special,Married,Rented apartment,-17361,-1076,1,0,0,0,Laborers,2.0,-19.0,0
2,5026434,F,N,Y,0,143568.7,State servant,Secondary / secondary special,Married,House / apartment,-10395,-3425,1,0,1,0,Sales staff,2.0,-13.0,0
3,5031188,M,Y,Y,1,123765.0,Working,Secondary / secondary special,Separated,Rented apartment,-10569,-216,1,0,0,0,Laborers,1.0,-19.0,1
4,5114643,M,Y,Y,1,342192.3,Commercial associate,Higher education,Married,House / apartment,-12266,-801,1,0,0,1,Accountants,3.0,-23.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,5009530,M,N,N,0,135375.9,Commercial associate,Higher education,Separated,House / apartment,-11976,-1300,1,1,1,1,Drivers,2.0,-21.0,0
19996,5046422,M,N,N,0,108572.5,Working,Secondary / secondary special,Married,Rented apartment,-12997,-3369,1,0,0,0,Laborers,2.0,-38.0,0
19997,5050208,F,N,Y,0,160162.1,Working,Secondary / secondary special,Widow,House / apartment,-21226,-5334,1,1,0,0,Medicine staff,2.0,-12.0,1
19998,5091480,F,N,N,0,162075.5,Working,Higher education,Married,House / apartment,-13878,-3284,1,1,0,0,Accountants,2.0,-47.0,0


Evaluate synthetic data created...

In [654]:
# Score the synthetic sample against the real data:
# 0 is the worst and 1 the best possible score.
evaluate(synthetic_all_data, real_data)

0.7127094189411184

Save model and data...

In [655]:
# Save the synthetic data in CSV format. `index=False` keeps the row index out
# of the file; otherwise it is written as an unnamed first column that shows
# up as 'Unnamed: 0' when the file is read back with pd.read_csv.
synthetic_all_data.to_csv('synthetic_all_data.csv', index=False)

# Persist the fitted model.
complete_ctgan.save('gan_model_complete_default.pkl')

## 2 - GAN model trained on mortgage-default individuals only...

In [643]:
# Class balance: number of rows per value of the `target` column.
real_data['target'].value_counts()

0    22045
1     3089
Name: target, dtype: int64

In [644]:
# Subset of the real data restricted to default cases (rows where target == 1).
on_default_data = real_data.loc[real_data['target'] == 1]

In [645]:
# Preview the first rows of the default-only subset.
on_default_data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,target
18,5008825,F,Y,N,0,130500.0,Working,Incomplete higher,Married,House / apartment,-10669,-1103,1,0,0,0,Accountants,2.0,-25.0,1
19,5008826,F,Y,N,0,130500.0,Working,Incomplete higher,Married,House / apartment,-10669,-1103,1,0,0,0,Accountants,2.0,-30.0,1
20,5008830,F,N,Y,0,157500.0,Working,Secondary / secondary special,Married,House / apartment,-10031,-1469,1,0,1,0,Laborers,2.0,-31.0,1
21,5008831,F,N,Y,0,157500.0,Working,Secondary / secondary special,Married,House / apartment,-10031,-1469,1,0,1,0,Laborers,2.0,-19.0,1
22,5008832,F,N,Y,0,157500.0,Working,Secondary / secondary special,Married,House / apartment,-10031,-1469,1,0,1,0,Laborers,2.0,-34.0,1


In [646]:
# CTGAN model for the default-only subset. Only the constraints are passed
# explicitly; the optional `epochs` / `cuda` overrides are left unset.
def_ctgan = CTGAN(constraints=constraints)

In [647]:
# Fit the second model on the default-only subset.
def_ctgan.fit(on_default_data)

In [648]:
# Draw 15,000 synthetic rows from the fitted default-only model.
synthetic_on_default_data = def_ctgan.sample(num_rows=15000)

In [649]:
# Inspect the generated default-only sample.
synthetic_on_default_data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,target
0,5065193,M,Y,Y,0,162472.5,Working,Higher education,Married,House / apartment,-12242,-11213,1,1,0,1,Laborers,2.0,-18.0,1
1,5087936,F,Y,Y,1,190337.7,Working,Incomplete higher,Married,House / apartment,-19262,-606,1,1,0,0,Laborers,2.0,-28.0,1
2,5106025,M,N,Y,0,153333.4,Working,Secondary / secondary special,Married,House / apartment,-15296,-5478,1,0,1,0,Laborers,2.0,-2.0,1
3,5044945,F,N,N,1,231938.0,State servant,Secondary / secondary special,Single / not married,House / apartment,-16471,-1223,1,0,0,0,Laborers,1.0,-2.0,1
4,5111347,F,Y,Y,1,376239.5,Commercial associate,Higher education,Married,Municipal apartment,-18415,-697,1,0,0,0,Accountants,2.0,-5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,5058756,M,Y,Y,1,367135.9,Working,Higher education,Married,House / apartment,-8374,-1301,1,0,0,0,High skill tech staff,3.0,-1.0,1
14996,5027978,M,Y,Y,1,226358.8,Working,Incomplete higher,Separated,House / apartment,-8677,-1367,1,0,0,0,Laborers,2.0,-44.0,1
14997,5060237,F,Y,Y,0,435857.5,Working,Secondary / secondary special,Married,House / apartment,-18796,-2588,1,1,0,0,Cooking staff,2.0,-41.0,1
14998,5012790,F,Y,N,0,238882.8,Working,Higher education,Single / not married,Rented apartment,-15810,-2435,1,0,0,0,Accountants,1.0,-8.0,1


Evaluate model...

In [650]:
# Score the synthetic default-only sample against the real default-only data:
# 0 is the worst and 1 the best possible score.
evaluate(synthetic_on_default_data, on_default_data)

0.6357171170999111

Save model and data...

In [651]:
# Save the synthetic data in CSV format. `index=False` keeps the row index out
# of the file; otherwise it is written as an unnamed first column that shows
# up as 'Unnamed: 0' when the file is read back with pd.read_csv.
synthetic_on_default_data.to_csv('on_default_mortgage_individuals_data.csv', index=False)

# Persist the fitted model.
def_ctgan.save('gan_model_on_default_mortgage_individuals.pkl')