In [3]:
import pandas as pd

# Load the data from CSV files
dim_data = pd.read_csv('dim.csv')
fact_data = pd.read_csv('fact.csv')

# Define the constants for transaction codes
PAYMENT_TYPE_CODE = 'OPTP0028'  # Code for payment transactions
SPENDING_TYPE_CODE = 'OPTP0000'  # Code for spending transactions

# Convert transaction time to datetime object for easier manipulation
fact_data['TXN_TM'] = pd.to_datetime(fact_data['TXN_TM'])

# Sort the transactions by time to ensure the order is correct for subsequent operations
fact_data_sorted = fact_data.sort_values(by='TXN_TM', ignore_index=True)

# Filter out invalid transaction
fact_data_sorted = fact_data_sorted.loc[~fact_data_sorted['TXN_SRC_ID'].isna()]

# Helper function to get the statement period based on a transaction date
def get_statement_period(txn_date):
    # Statement period starts on the 22nd of the previous month and ends on the 21st of the transaction month
    statement_start = txn_date.replace(day=22, hour=00, minute=00, second=00) - pd.DateOffset(months=1)
    statement_end = txn_date.replace(day=21, hour=00, minute=00, second=00)
    return statement_start, statement_end

# Helper function to calculate the due date of payment
def get_payment_due_date(txn_date):
    # Payment due date is the 5th of the month following the transaction month
    due_date = txn_date.replace(day=5, hour=00, minute=00, second=00) + pd.DateOffset(months=1)
    return due_date

fact_data_sorted.loc[fact_data_sorted['TML_WEB_AP_NM'] == SPENDING_TYPE_CODE, "NET_CASH_FLOW_AMT_LCY"] *= -1

# Add a column for the statement period start and end, and the payment due date to the payments dataframe
# payments = fact_data_sorted[fact_data_sorted['TML_WEB_AP_NM'] == PAYMENT_TYPE_CODE].copy()
# fact_data_sorted[['STATEMENT_START', 'STATEMENT_END']] = fact_data_sorted['TXN_TM'].apply(
#     lambda x: pd.Series(get_statement_period(x))
# )
fact_data_sorted['STATEMENT_START'], fact_data_sorted['STATEMENT_END'] = zip(
    *fact_data_sorted['TXN_TM'].apply(get_statement_period)
)
fact_data_sorted['PAYMENT_DUE_DATE'] = fact_data_sorted['TXN_TM'].apply(get_payment_due_date)


In [4]:
fact_data_sorted[fact_data_sorted['TML_WEB_AP_NM'] == PAYMENT_TYPE_CODE].head(3)

Unnamed: 0,CIF hash,CARD_NBR hash,TXN_TM,TML_WEB_AP_NM,DSC,NET_CASH_FLOW_AMT_LCY,TXN_SRC_ID,CARD_CLASSCIFICATION,STATEMENT_START,STATEMENT_END,PAYMENT_DUE_DATE
88,815e4858d422f45f27ff703fce8acfed,af446dd0dce35b7c0d687b32466726c89239b37dd8fcf9...,2022-05-31 21:23:35,OPTP0028,TT QUA TPBANK EBANKING 44 LE NGOC HAN HA NOI V...,1520565.0,272976284.0,VC,2022-04-22,2022-05-21,2022-06-05
91,73842a366de67e8d76320590e6a6ced8,5001c0d5c425bdfdcd5108671045068a43e3012dc4faf7...,2022-06-01 12:54:21,OPTP0028,TT QUA TPBANK EBANKING 44 LE NGOC HAN HA NOI V...,3500000.0,273092292.0,VC,2022-05-22,2022-06-21,2022-07-05
118,0828e14ddfd5dcfe9b2fc7a54eeba5f8,f0b80252f29a51ae72a1e4a259ca6857824f00e62cc868...,2022-06-06 15:57:12,OPTP0028,TT QUA TPBANK EBANKING 44 LE NGOC HAN HA NOI V...,143912.0,274393715.0,VC,2022-05-22,2022-06-21,2022-07-05


In [5]:
# filter type transaction
filtered_data = fact_data_sorted.loc[(fact_data_sorted['TML_WEB_AP_NM'] == PAYMENT_TYPE_CODE) | (fact_data_sorted['TML_WEB_AP_NM'] == SPENDING_TYPE_CODE)]

In [11]:
filtered_data['ACCUM_BALANCE'] = filtered_data.groupby(['CIF hash', 'CARD_NBR hash', 'STATEMENT_START', 'STATEMENT_END'])['NET_CASH_FLOW_AMT_LCY'].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['ACCUM_BALANCE'] = filtered_data.groupby(['CIF hash', 'CARD_NBR hash', 'STATEMENT_START', 'STATEMENT_END'])['NET_CASH_FLOW_AMT_LCY'].cumsum()


In [12]:
filtered_data = filtered_data.merge(dim_data, on=['CIF hash', 'CARD_NBR hash'])


In [32]:
filtered_data.groupby(['CIF hash', 'CARD_NBR hash', 'STATEMENT_START', 'STATEMENT_END']).agg({'TXN_TM': 'max', 'ACCUM_BALANCE': 'max'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TXN_TM,ACCUM_BALANCE
CIF hash,CARD_NBR hash,STATEMENT_START,STATEMENT_END,Unnamed: 4_level_1,Unnamed: 5_level_1
0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0aa7c3eff12fc6923c83,2022-08-22,2022-09-21,2022-09-27 13:53:45,-15728000.0
0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0aa7c3eff12fc6923c83,2022-10-22,2022-11-21,2022-11-22 11:16:44,16060000.0
0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0aa7c3eff12fc6923c83,2022-11-22,2022-12-21,2022-12-08 08:06:55,-44000.0
0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0aa7c3eff12fc6923c83,2022-12-22,2023-01-21,2023-01-03 15:28:38,16181000.0
0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0aa7c3eff12fc6923c83,2023-01-22,2023-02-21,2023-02-28 11:56:59,15500000.0
...,...,...,...,...,...
fabbc6f82889b39ea2529bcd3134ede2,a8ea8bc7dd239166636d3626765c69e5a056c57b0ffd1e15bd6af1fa6b4da2e2,2023-06-22,2023-07-21,2023-07-31 20:27:27,-225000.0
fabbc6f82889b39ea2529bcd3134ede2,a8ea8bc7dd239166636d3626765c69e5a056c57b0ffd1e15bd6af1fa6b4da2e2,2023-07-22,2023-08-21,2023-08-08 10:03:16,-338800.0
fb57dffb7f96a780b33ba00ba0f2d8c3,a466359e5b3742757bbbf27f9e33386f0f0c746fd6bdc8d04fbbe96c9314b853,2023-05-22,2023-06-21,2023-06-13 21:11:36,-725400.0
fb57dffb7f96a780b33ba00ba0f2d8c3,a466359e5b3742757bbbf27f9e33386f0f0c746fd6bdc8d04fbbe96c9314b853,2023-06-22,2023-07-21,2023-07-31 20:55:31,4256414.0


In [33]:
filtered_data[filtered_data['CIF hash']=='0340ab3837f34a1aa87d5b5a8a25a07e']

Unnamed: 0,CIF hash,CARD_NBR hash,TXN_TM,TML_WEB_AP_NM,DSC,NET_CASH_FLOW_AMT_LCY,TXN_SRC_ID,CARD_CLASSCIFICATION,STATEMENT_START,STATEMENT_END,PAYMENT_DUE_DATE,ACCUM_BALANCE,CREDIT_LIMIT
5462,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-09-19 21:52:59,OPTP0000,CTYVIMO*HOKINHDO 0824853333 704,-15728000.0,303376660.0,VC,2022-08-22,2022-09-21,2022-10-05,-15728000.0,17000000
5463,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-09-21 18:13:42,OPTP0000,Foody 19002042 704,-1000.0,303934425.0,VC,2022-08-22,2022-09-21,2022-10-05,-15729000.0,17000000
5464,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-09-21 18:13:45,OPTP0000,Foody 19002042 704,-1000.0,303934425.0,VC,2022-08-22,2022-09-21,2022-10-05,-15730000.0,17000000
5465,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-09-21 18:14:36,OPTP0000,Foody 19002042 704,-78000.0,303934767.0,VC,2022-08-22,2022-09-21,2022-10-05,-15808000.0,17000000
5466,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-09-27 13:53:45,OPTP0000,Foody 19002042 704,-253000.0,305578899.0,VC,2022-08-22,2022-09-21,2022-10-05,-16061000.0,17000000
5467,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-11-04 17:57:35,OPTP0028,TT QUA TPBANK EBANKING 44 LE NGOC HAN HA NOI V...,16060000.0,316780911.0,VC,2022-10-22,2022-11-21,2022-12-05,16060000.0,17000000
5468,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-11-04 17:58:46,OPTP0000,TPBANK QPAY 57 LY THUONG KIET HA NOI VNM 704 0...,-16000000.0,316781459.0,VC,2022-10-22,2022-11-21,2022-12-05,60000.0,17000000
5469,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-11-11 11:24:05,OPTP0000,Foody 19002042 704,-103000.0,318841726.0,VC,2022-10-22,2022-11-21,2022-12-05,-43000.0,17000000
5470,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-11-18 12:17:13,OPTP0000,Foody 19002042 704,-183000.0,321000870.0,VC,2022-10-22,2022-11-21,2022-12-05,-226000.0,17000000
5471,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-11-19 11:54:08,OPTP0000,Foody 19002042 704,-57900.0,321311937.0,VC,2022-10-22,2022-11-21,2022-12-05,-283900.0,17000000
