In [27]:
# sdv installation
# The version is pinned to an exact pre-release so the SDV API used below
# (SingleTableMetadata, SingleTablePreset) matches what gets installed.
# NOTE(review): the execution count shows this cell was last run after the
# sdv import cell; on a fresh runtime it presumably has to run first — confirm.
%pip install sdv==1.0.0b1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sdv==1.0.0b1
  Downloading sdv-1.0.0b1-py2.py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sdv
  Attempting uninstall: sdv
    Found existing installation: sdv 1.0.0b0
    Uninstalling sdv-1.0.0b0:
      Successfully uninstalled sdv-1.0.0b0
Successfully installed sdv-1.0.0b1


In [1]:
# general imports
import pandas as pd

# SDV imports
from sdv.metadata import SingleTableMetadata
from sdv.lite import SingleTablePreset


In [2]:
# mount to google drive to read in data
# The drive is mounted at /content/drive; the CSV paths used when loading the
# train/test data are rooted at this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Folder on the mounted Google Drive that holds the project CSV files.
# Kept in one place so the location only has to be edited once (the original
# repeated the full path per file and contained a stray '//' after 'drive').
DATA_DIR = '/content/drive/MyDrive/University/Year 4/DSA4266/DSA4266 Project 2/Data'

# read in the train data
train_data = pd.read_csv(f'{DATA_DIR}/fraudTrain.csv')

# read in test data
test_data = pd.read_csv(f'{DATA_DIR}/fraudTest.csv')

In [4]:
# remove unique identifier column
# The identifier is the first column of each CSV. A bare positional slice
# (iloc[:, 1:]) is not idempotent: running this cell a second time would
# silently drop a real feature column. The slice is therefore applied only
# while the first real feature has not yet moved to position 0.
# NOTE(review): assumes test_data has the same column layout as train_data —
# confirm.
FIRST_FEATURE_COLUMN = 'trans_date_trans_time'

if train_data.columns[0] != FIRST_FEATURE_COLUMN:
    train_data = train_data.iloc[:, 1:]

if test_data.columns[0] != FIRST_FEATURE_COLUMN:
    test_data = test_data.iloc[:, 1:]

In [5]:
# recode the is_fraud column of the train data from 0/1 to False/True
fraud_map = {0: False, 1: True}
is_fraud_bool = train_data['is_fraud'].map(fraud_map)
train_data['is_fraud'] = is_fraud_bool

In [19]:
# generate metadata for the train data
# Start from an empty SingleTableMetadata and let SDV infer the column types
# from train_data; columns it infers incorrectly are overridden afterwards.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=train_data)

In [21]:
# Override the sdtypes that automatic detection got wrong.
# Each entry maps a column name to the keyword arguments passed to
# metadata.update_column; entries are applied in the order listed.
column_overrides = {
    'trans_date_trans_time': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S'},
    'cc_num': {'sdtype': 'credit_card_number', 'pii': True},
    'first': {'sdtype': 'first_name', 'pii': True},
    'last': {'sdtype': 'last_name', 'pii': True},
    'street': {'sdtype': 'street_address', 'pii': True},
    'city': {'sdtype': 'city', 'pii': True},
    'job': {'sdtype': 'job', 'pii': True},
    'dob': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
    'trans_num': {'sdtype': 'text'},
    'is_fraud': {'sdtype': 'boolean'},
}

for column_name, column_kwargs in column_overrides.items():
    metadata.update_column(column_name=column_name, **column_kwargs)

In [22]:
# set primary key
# trans_num (registered as sdtype 'text' in the column overrides) is used as
# the table's primary key.
metadata.set_primary_key(column_name='trans_num')

In [23]:
# validate the metadata
# Run after all column overrides and the primary key have been set, so the
# final specification is what gets checked.
# NOTE(review): presumably raises if the specification is inconsistent — confirm.
metadata.validate()

In [24]:
# display the final metadata to eyeball the column sdtypes and primary key
metadata

{
    "primary_key": "trans_num",
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "trans_date_trans_time": {
            "sdtype": "datetime",
            "datetime_format": "%Y-%m-%d %H:%M:%S"
        },
        "cc_num": {
            "sdtype": "credit_card_number",
            "pii": true
        },
        "merchant": {
            "sdtype": "categorical"
        },
        "category": {
            "sdtype": "categorical"
        },
        "amt": {
            "sdtype": "numerical"
        },
        "first": {
            "sdtype": "first_name",
            "pii": true
        },
        "last": {
            "sdtype": "last_name",
            "pii": true
        },
        "gender": {
            "sdtype": "categorical"
        },
        "street": {
            "sdtype": "street_address",
            "pii": true
        },
        "city": {
            "sdtype": "city",
            "pii": true
        },
        "state": {
            "sdtype": "catego

In [25]:
# Build a SingleTablePreset synthesizer from the metadata using the 'FAST_ML'
# preset, then fit it on the training frame.
synthesizer = SingleTablePreset(metadata, name='FAST_ML')
synthesizer.fit(data=train_data)

In [26]:
# Draw 500 synthetic rows from the fitted synthesizer and preview the first few.
synthetic_data = synthesizer.sample(num_rows=500)
synthetic_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-06-29 16:51:40,180090934066211,fraud_Morissette LLC,shopping_pos,68.830152,Amber,Ryan,F,093 Williams Lane Apt. 170,Cherylborough,...,42.81844,-105.092327,357435,"Solicitor, Scotland",1980-04-09,AAAAA,1340948598,43.343106,-104.974026,False
1,2019-07-26 07:04:12,30083012986584,fraud_Nolan-Williamson,shopping_pos,27.808836,Keith,Roberts,F,12986 Nichols Cliff,Brandonfurt,...,38.238537,-88.306243,139191,Horticultural consultant,1980-03-17,AAAAB,1343309088,37.871285,-88.213639,False
2,2019-05-19 12:54:37,2290394691481974,"fraud_Robel, Cummerata and Prosacco",shopping_pos,323.54647,Anthony,Johnson,M,39469 Emily Route Apt. 974,North Steven,...,39.163729,-76.080441,103275,Tree surgeon,1997-08-26,AAAAC,1337410226,39.018708,-77.011731,False
3,2019-09-14 07:24:12,4444812351068428276,fraud_Kub PLC,food_dining,117.126842,Desiree,Valdez,F,4481 John Island,North Charles,...,33.407842,-89.676512,233095,Air traffic controller,1971-09-07,AAAAD,1347528136,33.702352,-90.432062,False
4,2020-01-02 22:34:13,4875851017747494,fraud_Kilback LLC,gas_transport,1.0,Jason,Clark,M,68428 Thomas Isle,Port Lisa,...,45.479035,-85.232964,23,Wellsite geologist,1956-10-29,AAAAE,1357077945,44.867904,-85.425634,False


In [28]:
# Persist the fitted synthesizer to 'model_test.pkl'; the path is relative, so
# the file lands in the notebook's current working directory.
# NOTE(review): on Colab that directory is presumably not on the mounted
# drive, so the file may not outlive the runtime — confirm.
synthesizer.save('model_test.pkl')