In [2]:
import sys
sys.path.insert(0, '/users/fs2/hmehri/pythonproject/Thesis/synthetic')

from lib.field_info_v2 import FieldInfo_v2
from lib.tensor_encoder import TensorEncoder
from lib.prepare_data import preprocess_data_czech
from lib.modules import Encoder_Decoder_lstm
from sklearn.model_selection import train_test_split
from trainlstm import Train
import pandas as pd
import tensorflow as tf
import numpy as np
import unittest
pd.set_option('display.max_rows', 80)

In [None]:
class TestEncoderDecoderLSTM(unittest.TestCase):
    def setUp(self, info):
        # Common setup for all tests, if any
        self.config = {}
        self.config["ORDER"] = info.DATA_KEY_ORDER
        self.config["FIELD_STARTS_IN"] = info.FIELD_STARTS_IN
        self.config["FIELD_DIMS_IN"] = info.FIELD_DIMS_IN
        self.config["FIELD_STARTS_NET"] = info.FIELD_STARTS_NET
        self.config["FIELD_DIMS_NET"] = info.FIELD_DIMS_NET
        self.config["ACTIVATIONS"] = info.ACTIVATIONS
        self.inp_feat = sum( self.config["FIELD_DIMS_IN"].values())

    def test_banksformer_encoding(self):
        self._test_model_with_strategy(FieldInfo_v2('banksformer'))

    def test_banksformer_v2_encoding(self):
        self._test_model_with_strategy(FieldInfo_v2('banksformer_v2'))  

    def test_dates_onehot_encoding(self):
        self._test_model_with_strategy(FieldInfo_v2('dateonehot'))

    def test_clock_inp_out_encoding(self):
        self._test_model_with_strategy(FieldInfo_v2('dateclock'))

    def _test_model_with_strategy(self, info):
        # Common test logic for all encoding strategies

        

        model = Encoder_Decoder_lstm(config, self.inp_feat, conditional=True)

        # Test if the model has the expected number of dense layers
        self.assertEqual(len(model.dense_layers), len(config["FIELD_DIMS_NET"]))

        # Test each dense layer's configuration
        for name, dim in config["FIELD_DIMS_NET"].items():
            layer = model.dense_layers.get(name)
            self.assertIsNotNone(layer, f"Dense layer for {name} not found")
            self.assertEqual(layer.units, dim, f"Layer {name} does not have the expected units/dimensions")
            expected_activation = config["ACTIVATIONS"].get(name)
            if expected_activation:
                self.assertEqual(layer.activation.__name__, expected_activation, f"Layer {name} does not have the expected activation function")


In [2]:
raw_data = pd.read_csv('../DATA/tr_by_acct_w_age.csv')
data, LOG_AMOUNT_SCALE, TD_SCALE,ATTR_SCALE, START_DATE, TCODE_TO_NUM, NUM_TO_TCODE = preprocess_data_czech(raw_data)
data2 = data[['account_id','age','age_sc', 'tcode', 'tcode_num', 'datetime', 'month', 'dow', 'day','td', 'dtme', 'log_amount','log_amount_sc','td_sc']]
df= data2.copy()

n_tcodes = len(TCODE_TO_NUM)

info = FieldInfo_v2('dateonehot')

max_seq_len = 80
min_seq_len = 20

encoder = TensorEncoder(df, info, max_seq_len, min_seq_len)


In [3]:
encoder.encode()

2023-12-28 12:17:30.925864: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 47372 MB memory:  -> device: 0, name: Quadro RTX 8000, pci bus id: 0000:3b:00.0, compute capability: 7.5
2023-12-28 12:17:30.926365: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 47372 MB memory:  -> device: 1, name: Quadro RTX 8000, pci bus id: 0000:af:00.0, compute capability: 7.5


Finished encoding 2000 of 14354 seqs
Finished encoding 4000 of 14354 seqs
Finished encoding 6000 of 14354 seqs
Finished encoding 8000 of 14354 seqs
Finished encoding 10000 of 14354 seqs
Finished encoding 12000 of 14354 seqs
Finished encoding 14000 of 14354 seqs
Took 29.31 secs


In [5]:
encoder.inp_tensor.shape

(14354, 81, 99)

In [6]:
def make_batches(ds, buffer_size, batch_size):
    return ds.cache().shuffle(buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)

n_seqs, n_steps, n_feat_inp = encoder.inp_tensor.shape
x_tr, x_cv, inds_tr, inds_cv, targ_tr, targ_cv = train_test_split(encoder.inp_tensor, np.arange(n_seqs), encoder.tar_tensor, test_size=0.2)

# Create TensorFlow dataset
ds_all = tf.data.Dataset.from_tensor_slices((encoder.inp_tensor.astype(np.float32), encoder.tar_tensor.astype(np.float32)))
ds_tr = tf.data.Dataset.from_tensor_slices((x_tr.astype(np.float32), targ_tr.astype(np.float32)))
ds_cv = tf.data.Dataset.from_tensor_slices((x_cv.astype(np.float32), targ_cv.astype(np.float32)))

BUFFER_SIZE = ds_all.cardinality().numpy()
bs = 64  # batch size


train_batches = make_batches(ds_tr, BUFFER_SIZE, bs)
val_batches =  make_batches(ds_cv, BUFFER_SIZE, bs)

In [7]:
i = 1
for (batch_no, (inp, tar)) in enumerate(train_batches):
    if i == 1:
        break

In [8]:
config = {}
config["ORDER"] = info.DATA_KEY_ORDER
config["FIELD_STARTS_IN"] = info.FIELD_STARTS_IN
config["FIELD_DIMS_IN"] = info.FIELD_DIMS_IN
config["FIELD_STARTS_NET"] = info.FIELD_STARTS_NET
config["FIELD_DIMS_NET"] = info.FIELD_DIMS_NET
config["ACTIVATIONS"] = info.ACTIVATIONS
inp_feat = sum(info.FIELD_DIMS_IN.values())
lstm = Encoder_Decoder_lstm(config, inp_feat, conditional=False)
train = Train(lstm)

In [9]:
lstm(inp)

2023-12-28 12:21:13.902654: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600


{'tcode_num': <tf.Tensor: shape=(64, 80, 16), dtype=float32, numpy=
 array([[[-4.52196635e-02,  2.07667984e-02,  1.74494684e-01, ...,
           8.57742652e-02, -4.78449836e-02,  2.20244937e-02],
         [-4.06274619e-03, -1.26845855e-02,  1.86814159e-01, ...,
           8.04816559e-02, -3.74172889e-02,  1.60355642e-02],
         [ 2.39455905e-02, -2.61929631e-02,  1.83879301e-01, ...,
           7.55529702e-02, -3.67278978e-02,  2.01259982e-02],
         ...,
         [ 6.74544275e-02,  8.76125973e-03,  4.11684625e-02, ...,
          -3.63205038e-02, -2.17446387e-02,  4.86738980e-03],
         [ 6.74544722e-02,  8.76125321e-03,  4.11682464e-02, ...,
          -3.63203101e-02, -2.17443854e-02,  4.86778188e-03],
         [ 6.74544945e-02,  8.76126438e-03,  4.11680974e-02, ...,
          -3.63201462e-02, -2.17441805e-02,  4.86810552e-03]],
 
        [[-2.90129823e-03,  2.33692583e-03,  1.00408092e-01, ...,
           5.73414266e-02, -5.17890677e-02,  2.11411156e-02],
         [ 1.116130

: 

In [8]:
epochs = 5
early_stop = 3
train.train(train_batches, val_batches, epochs, early_stop)

2023-12-27 23:04:08.029628: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-12-27 23:04:08.304819: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x23cf2990 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-27 23:04:08.304843: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Quadro RTX 8000, Compute Capability 7.5
2023-12-27 23:04:08.304846: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (1): Quadro RTX 8000, Compute Capability 7.5
2023-12-27 23:04:08.329707: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-12-27 23:04:08.544273: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 1 Batch0 Loss 14.9649
Epoch 1 Batch50 Loss 7.2448
Epoch 1 Batch100 Loss 6.8731
Epoch 1 Batch150 Loss 6.7448


ValueError: Exception encountered when calling layer 'encoder__decoder_lstm' (type Encoder_Decoder_lstm).

Input 0 of layer "lstm" is incompatible with the layer: expected shape=(64, None, 26), found shape=(27, 80, 26)

Call arguments received by layer 'encoder__decoder_lstm' (type Encoder_Decoder_lstm):
  • inp=tf.Tensor(shape=(27, 81, 26), dtype=float32)

In [8]:
lstm.summary()

NameError: name 'lstm' is not defined

In [None]:
max_length = 25
n_seqs_to_generate = 3
attributes = encoder.attributes
n_seqs, seq_len, n_feat_inp = encoder.inp_tensor.shape
train.generate_synthetic_data(max_length, n_seqs_to_generate, df, attributes, n_feat_inp)

## evaluate

In [1]:
import sys
sys.path.insert(0, '/users/fs2/hmehri/pythonproject/Thesis/synthetic')

import pandas as pd
import matplotlib.pyplot as plt
from lib.prepare_data import preprocess_data_czech
from lib.eval import comapre_unidist_cont, compute_ngram_metrics, comapre_unidist_cat, compute_2d_categorical_metrics

In [2]:
raw_data = pd.read_csv('../DATA/tr_by_acct_w_age.csv')
raw_data = raw_data.sort_values(by = ["account_id", "date"])
data, LOG_AMOUNT_SCALE, TD_SCALE,ATTR_SCALE, START_DATE, TCODE_TO_NUM, NUM_TO_TCODE = preprocess_data_czech(raw_data)
real = data[['account_id','tcode', 'datetime','year', 'month', 'dow', 'day','td', 'dtme', 'amount', 'raw_amount']]
real_cf = real[["account_id", "month", "raw_amount", "year"]].groupby(["account_id", "month", "year"],as_index=False)["raw_amount"].sum()
real_sorted = real.sort_values(['account_id', 'year', 'month', 'day'])
real

Unnamed: 0,account_id,tcode,datetime,year,month,dow,day,td,dtme,amount,raw_amount
0,1,CREDIT__CREDIT IN CASH__nan,1995-03-24,1995,3,4,24,0.0,7,1000.0,1000.0
1,1,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1995-04-13,1995,4,3,13,20.0,17,3679.0,3679.0
2,1,CREDIT__CREDIT IN CASH__nan,1995-04-23,1995,4,6,23,10.0,7,12600.0,12600.0
3,1,CREDIT__nan__INTEREST CREDITED,1995-04-30,1995,4,6,30,7.0,0,19.2,19.2
4,1,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1995-05-13,1995,5,5,13,13.0,18,3679.0,3679.0
...,...,...,...,...,...,...,...,...,...,...,...
1056315,11382,DEBIT__CASH WITHDRAWAL__nan,1998-12-02,1998,12,2,2,2.0,29,25600.0,-25600.0
1056316,11382,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1998-12-10,1998,12,3,10,8.0,21,46248.0,46248.0
1056317,11382,DEBIT__CASH WITHDRAWAL__nan,1998-12-25,1998,12,4,25,15.0,6,6300.0,-6300.0
1056318,11382,CREDIT__nan__INTEREST CREDITED,1998-12-31,1998,12,3,31,6.0,0,311.3,311.3


In [3]:
synth = pd.read_csv('synth_lstm_v1.csv')
synth.rename(columns={'days_passed': 'td', 'transaction_code': 'tcode'}, inplace=True)
synth['type'] = synth['tcode'].str.split('__').str[0]
synth['raw_amount'] = synth.apply(lambda row: row['amount'] if row['type'] == 'CREDIT' else -row['amount'], axis=1)

synth_sorted = synth.sort_values(['account_id', 'year', 'month', 'day'])

synth_cf = synth[["account_id", "month", "raw_amount", "year"]].groupby(["account_id", "month", "year"],as_index=False)["raw_amount"].sum()

In [4]:
synth 

Unnamed: 0,amount,tcode,account_id,year,month,day,date,td,type,raw_amount
0,7.99,DEBIT__CASH WITHDRAWAL__,0,1997,4,4,1997-04-04,0,DEBIT,-7.99
1,16.51,DEBIT__CASH WITHDRAWAL__HOUSEHOLD,0,1997,4,4,1997-04-04,0,DEBIT,-16.51
2,64.24,CREDIT__CREDIT IN CASH__nan,0,1997,4,12,1997-04-12,8,CREDIT,64.24
3,12.33,CREDIT__nan__INTEREST CREDITED,0,1997,4,18,1997-04-18,6,CREDIT,12.33
4,1992.07,CREDIT__CREDIT IN CASH__nan,0,1997,4,28,1997-04-28,10,CREDIT,1992.07
...,...,...,...,...,...,...,...,...,...,...
399995,3679.16,CREDIT__CREDIT IN CASH__nan,4999,2001,10,28,2001-10-28,58,CREDIT,3679.16
399996,3650.27,CREDIT__CREDIT IN CASH__nan,4999,2001,11,30,2001-11-30,33,CREDIT,3650.27
399997,1979.39,DEBIT__CASH WITHDRAWAL__nan,4999,2001,12,4,2001-12-04,4,DEBIT,-1979.39
399998,4390.42,CREDIT__CREDIT IN CASH__nan,4999,2002,3,6,2002-03-06,92,CREDIT,4390.42


In [5]:
CONT_FIELDS = ["amount", "td"]

CF_FIELD = 'raw_amount'

#compare univariate distribution of continuous columns
comapre_unidist_cont(CONT_FIELDS,CF_FIELD, real, synth, real_cf, synth_cf)

{'amount': {'wasser': 3758.2805235507876,
  'ks': 0.19091477506816124,
  'energy_d': 30.26799531316632},
 'td': {'wasser': 9.372185990608907,
  'ks': 0.18496081793395935,
  'energy_d': 1.5359866134349014},
 'CF': {'wasser': 3024.956730379958,
  'ks': 0.13474387289010603,
  'energy_d': 17.09733937403023}}

In [6]:
# JSD between the distributions of tcode 3-grams
combo_df, result = compute_ngram_metrics(real_sorted, synth_sorted, 'tcode', 3)
result

{'jsd': 0.22626377669517486,
 'entr_r': 5.425261658301507,
 'entr_g': 5.723924410498564,
 'NED': -0.29866275219705685,
 'l1': 1.0895311024867378,
 'l2': 0.09509572510477242,
 'jac': 0.5037678975131876,
 'count_r': 1431,
 'coverage_r': 0.349365234375,
 'count_g': 2540,
 'coverage_g': 0.6201171875,
 'count_max': 4096,
 'field': 'tcode',
 'n': 3,
 'pseudo_counts': 0.0}