In [17]:
import pandas as pd
from prepare_data import preprocess_data_czech
from eval import comapre_unidist_cont, compute_ngram_metrics, comapre_unidist_cat, compute_2d_categorical_metrics

In [21]:
# Read the raw transaction table and order it by account, then by date.
raw_data = pd.read_csv('../DATA/tr_by_acct_w_age.csv').sort_values(
    by=["account_id", "date"]
)

# preprocess_data_czech hands back the processed frame plus six auxiliary values
# (scales, start date and the tcode <-> number lookups), unpacked here by position.
(
    data,
    LOG_AMOUNT_SCALE,
    TD_SCALE,
    ATTR_SCALE,
    START_DATE,
    TCODE_TO_NUM,
    NUM_TO_TCODE,
) = preprocess_data_czech(raw_data)

# Keep only the columns used by the evaluation cells.
real = data[
    [
        'account_id', 'tcode', 'datetime', 'year', 'month', 'dow', 'day',
        'td', 'dtme', 'amount', 'raw_amount',
    ]
]

# Cash flow: sum of raw_amount for every (account_id, month, year) group.
real_cf = (
    real[["account_id", "month", "raw_amount", "year"]]
    .groupby(["account_id", "month", "year"], as_index=False)["raw_amount"]
    .sum()
)

# Ordered copy (account, then calendar date) used for the sequence metrics.
real_sorted = real.sort_values(['account_id', 'year', 'month', 'day'])

real

Unnamed: 0,account_id,tcode,datetime,year,month,dow,day,td,dtme,amount,raw_amount
0,1,CREDIT__CREDIT IN CASH__nan,1995-03-24,1995,3,4,24,0.0,7,1000.0,1000.0
1,1,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1995-04-13,1995,4,3,13,20.0,17,3679.0,3679.0
2,1,CREDIT__CREDIT IN CASH__nan,1995-04-23,1995,4,6,23,10.0,7,12600.0,12600.0
3,1,CREDIT__nan__INTEREST CREDITED,1995-04-30,1995,4,6,30,7.0,0,19.2,19.2
4,1,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1995-05-13,1995,5,5,13,13.0,18,3679.0,3679.0
...,...,...,...,...,...,...,...,...,...,...,...
1056315,11382,DEBIT__CASH WITHDRAWAL__nan,1998-12-02,1998,12,2,2,2.0,29,25600.0,-25600.0
1056316,11382,CREDIT__COLLECTION FROM ANOTHER BANK__nan,1998-12-10,1998,12,3,10,8.0,21,46248.0,46248.0
1056317,11382,DEBIT__CASH WITHDRAWAL__nan,1998-12-25,1998,12,4,25,15.0,6,6300.0,-6300.0
1056318,11382,CREDIT__nan__INTEREST CREDITED,1998-12-31,1998,12,3,31,6.0,0,311.3,311.3


In [16]:
# Load the generated (synthetic) transactions and rename its columns to the
# names used for the real data: 'days_passed' -> 'td', 'transaction_code' -> 'tcode'.
synth = pd.read_csv('../DATA/synth_transformerv1.csv').rename(
    columns={'days_passed': 'td', 'transaction_code': 'tcode'}
)

# The transaction type is the text before the first '__' separator in tcode.
synth['type'] = synth['tcode'].str.split('__').str[0]

# Signed amount: keep 'amount' on CREDIT rows and negate it on every other row.
# Series.where does this in one vectorised pass instead of calling a Python
# lambda once per row through DataFrame.apply(axis=1).
synth['raw_amount'] = synth['amount'].where(synth['type'] == 'CREDIT', -synth['amount'])

# Ordered copy (account, then calendar date) used for the sequence metrics.
synth_sorted = synth.sort_values(['account_id', 'year', 'month', 'day'])

# Cash flow: sum of raw_amount for every (account_id, month, year) group.
synth_cf = (
    synth[["account_id", "month", "raw_amount", "year"]]
    .groupby(["account_id", "month", "year"], as_index=False)["raw_amount"]
    .sum()
)


In [19]:
# Univariate comparison (Wasserstein-1 among the reported distances) of the
# continuous columns and of the cash flow, real vs. synthetic.

# Names of the continuous feature columns.
CONT_FIELDS = ["amount", "td"]

# Column that holds the summed values in real_cf / synth_cf.
CF_FIELD = 'raw_amount'

# Compare the univariate distributions of the continuous columns.
comapre_unidist_cont(
    CONT_FIELDS,
    CF_FIELD,
    real,
    synth,
    real_cf,
    synth_cf,
)

{'amount': {'wasser': 4476.63843451854,
  'ks': 0.29746698424719786,
  'energy_d': 38.785523800147864},
 'td': {'wasser': 0.7408301984247204,
  'ks': 0.05333058770069676,
  'energy_d': 0.2313779133495533},
 'CF': {'wasser': 4453.496975714692,
  'ks': 0.16088804173406016,
  'energy_d': 26.02654344354358}}

In [22]:
# Jensen-Shannon divergence between the real and synthetic distributions of
# tcode 3-grams, computed on the date-ordered frames.
combo_df, result = compute_ngram_metrics(
    real_sorted,
    synth_sorted,
    'tcode',
    3,
)

result

{'jsd': 0.09166750625551134,
 'entr_r': 5.425261658301508,
 'entr_g': 5.088144115583424,
 'NED': 0.3371175427180839,
 'l1': 0.6293103940019802,
 'l2': 0.06715191875757406,
 'jac': 0.40550964187327826,
 'count_r': 1431,
 'coverage_r': 0.349365234375,
 'count_g': 1463,
 'coverage_g': 0.357177734375,
 'count_max': 4096,
 'field': 'tcode',
 'n': 3,
 'pseudo_counts': 0.0}

In [23]:
# JSD between the real and synthetic univariate distributions of each
# categorical column: tcode, day (day of month) and month.
CAT_FIELDS = ['tcode', 'day', 'month']

# One entry per field, in CAT_FIELDS order.
result_jst_cat = {
    name: comapre_unidist_cat(real, synth, name)
    for name in CAT_FIELDS
}

result_jst_cat

{'tcode': 0.02609340476367692,
 'day': 0.0140182856775013,
 'month': 0.000851132867182967}

In [24]:
# Joint distribution of two categorical columns: tcode together with day (DOM).
field1, field2 = 'tcode', 'day'

compute_2d_categorical_metrics(
    real,
    synth,
    field1,
    field2,
)

{'jsd': 0.0461736517477647,
 'entr_r': 4.314351501994386,
 'entr_g': 4.237072912474713,
 'l1': 0.3846703105119664,
 'l2': 0.04790606930485251,
 'jac': 0.48878923766816146,
 'count_r': 240.0,
 'coverage_r': 0.4838709677419355,
 'count_g': 434.0,
 'coverage_g': 0.875,
 'count_max': 496}

In [9]:
# Display the synthetic transactions frame.
# NOTE(review): the saved output below looks stale — this cell's execution count (9)
# is lower than that of the cell that loads `synth` (16), and the output lacks the
# 'type' and 'raw_amount' columns that cell adds. Re-run after a kernel restart.
synth

Unnamed: 0,amount,transaction_code,account_id,year,month,day,date,days_passed,td,tcode
0,948.05,DEBIT__CASH WITHDRAWAL__nan,0,1995,9,7,1995-09-07,0,0,DEBIT__CASH WITHDRAWAL__nan
1,1316.63,DEBIT__CASH WITHDRAWAL__nan,0,1995,9,8,1995-09-08,1,1,DEBIT__CASH WITHDRAWAL__nan
2,3142.01,CREDIT__CREDIT IN CASH__nan,0,1995,9,11,1995-09-11,3,3,CREDIT__CREDIT IN CASH__nan
3,2017.99,CREDIT__CREDIT IN CASH__nan,0,1995,9,17,1995-09-17,6,6,CREDIT__CREDIT IN CASH__nan
4,24.34,CREDIT__nan__INTEREST CREDITED,0,1995,9,30,1995-09-30,13,13,CREDIT__nan__INTEREST CREDITED
...,...,...,...,...,...,...,...,...,...,...
399995,3896.74,DEBIT__CASH WITHDRAWAL__nan,4999,1998,6,25,1998-06-25,6,6,DEBIT__CASH WITHDRAWAL__nan
399996,4403.67,DEBIT__CASH WITHDRAWAL__nan,4999,1998,6,28,1998-06-28,3,3,DEBIT__CASH WITHDRAWAL__nan
399997,162.98,CREDIT__nan__INTEREST CREDITED,4999,1998,6,30,1998-06-30,2,2,CREDIT__nan__INTEREST CREDITED
399998,16.13,DEBIT__CASH WITHDRAWAL__PAYMENT ON STATEMENT,4999,1998,6,30,1998-06-30,0,0,DEBIT__CASH WITHDRAWAL__PAYMENT ON STATEMENT
