In [108]:
import pandas as pd

# Load the data from CSV files
dim_data = pd.read_csv('dim.csv')
fact_data = pd.read_csv('fact.csv')

# Define the constants for transaction codes
PAYMENT_TYPE_CODE = 'OPTP0028'  # Code for payment transactions
SPENDING_TYPE_CODE = 'OPTP0000'  # Code for spending transactions

# Convert transaction time to datetime object for easier manipulation
fact_data['TXN_TM'] = pd.to_datetime(fact_data['TXN_TM'])

# Sort the transactions by time to ensure the order is correct for subsequent operations
fact_data_sorted = fact_data.sort_values(by='TXN_TM', ignore_index=True)

# Filter out invalid transaction
fact_data_sorted = fact_data_sorted.loc[~fact_data_sorted['TXN_SRC_ID'].isna()]

# Helper function to get the statement period based on a transaction date
def get_statement_period(txn_date):
    # Statement period starts on the 22nd of the previous month and ends on the 21st of the transaction month
    statement_start = txn_date.replace(day=22, hour=00, minute=00, second=00) - pd.DateOffset(months=1)
    statement_end = txn_date.replace(day=21, hour=00, minute=00, second=00)
    return statement_start, statement_end

# Helper function to calculate the due date of payment
def get_payment_due_date(txn_date):
    # Payment due date is the 5th of the month following the transaction month
    due_date = txn_date.replace(day=5, hour=00, minute=00, second=00) + pd.DateOffset(months=1)
    return due_date

fact_data_sorted.loc[fact_data_sorted['TML_WEB_AP_NM'] == SPENDING_TYPE_CODE, "NET_CASH_FLOW_AMT_LCY"] *= -1

# Add a column for the statement period start and end, and the payment due date to the payments dataframe
# payments = fact_data_sorted[fact_data_sorted['TML_WEB_AP_NM'] == PAYMENT_TYPE_CODE].copy()
# fact_data_sorted[['STATEMENT_START', 'STATEMENT_END']] = fact_data_sorted['TXN_TM'].apply(
#     lambda x: pd.Series(get_statement_period(x))
# )
fact_data_sorted['STATEMENT_START'], fact_data_sorted['STATEMENT_END'] = zip(
    *fact_data_sorted['TXN_TM'].apply(get_statement_period)
)
fact_data_sorted['PAYMENT_DUE_DATE'] = fact_data_sorted['TXN_TM'].apply(get_payment_due_date)


In [109]:
fact_data_sorted[fact_data_sorted['TML_WEB_AP_NM'] == PAYMENT_TYPE_CODE].head(3)

Unnamed: 0,CIF hash,CARD_NBR hash,TXN_TM,TML_WEB_AP_NM,DSC,NET_CASH_FLOW_AMT_LCY,TXN_SRC_ID,CARD_CLASSCIFICATION,STATEMENT_START,STATEMENT_END,PAYMENT_DUE_DATE
88,815e4858d422f45f27ff703fce8acfed,af446dd0dce35b7c0d687b32466726c89239b37dd8fcf9...,2022-05-31 21:23:35,OPTP0028,TT QUA TPBANK EBANKING 44 LE NGOC HAN HA NOI V...,1520565.0,272976284.0,VC,2022-04-22,2022-05-21,2022-06-05
91,73842a366de67e8d76320590e6a6ced8,5001c0d5c425bdfdcd5108671045068a43e3012dc4faf7...,2022-06-01 12:54:21,OPTP0028,TT QUA TPBANK EBANKING 44 LE NGOC HAN HA NOI V...,3500000.0,273092292.0,VC,2022-05-22,2022-06-21,2022-07-05
118,0828e14ddfd5dcfe9b2fc7a54eeba5f8,f0b80252f29a51ae72a1e4a259ca6857824f00e62cc868...,2022-06-06 15:57:12,OPTP0028,TT QUA TPBANK EBANKING 44 LE NGOC HAN HA NOI V...,143912.0,274393715.0,VC,2022-05-22,2022-06-21,2022-07-05


In [110]:
# filter type transaction
filtered_data = fact_data_sorted.loc[(fact_data_sorted['TML_WEB_AP_NM'] == PAYMENT_TYPE_CODE) | (fact_data_sorted['TML_WEB_AP_NM'] == SPENDING_TYPE_CODE)]

In [97]:
statement_balances = filtered_data.groupby(['CIF hash', 'CARD_NBR hash', 'STATEMENT_START', 'STATEMENT_END'])
statement_balances = statement_balances['NET_CASH_FLOW_AMT_LCY'].sum().reset_index(name='STATEMENT_BALANCE')

In [98]:
statement_balances['ACCUM_BALANCE'] = statement_balances['STATEMENT_BALANCE'].cumsum()


In [99]:
statement_balances.head(20)

Unnamed: 0,CIF hash,CARD_NBR hash,STATEMENT_START,STATEMENT_END,STATEMENT_BALANCE,ACCUM_BALANCE
0,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-08-22,2022-09-21,-16061000.0,-16061000.0
1,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-10-22,2022-11-21,-824600.0,-16885600.0
2,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-11-22,2022-12-21,-44000.0,-16929600.0
3,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2022-12-22,2023-01-21,0.0,-16929600.0
4,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2023-01-22,2023-02-21,0.0,-16929600.0
5,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2023-03-22,2023-04-21,-4284.0,-16933884.0
6,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2023-04-22,2023-05-21,0.0,-16933884.0
7,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2023-05-22,2023-06-21,17730000.0,796116.0
8,04f221935990065c986d05b03c20a155,a6185c9450de61b47a5388cd55b356f1ad2a3cc65bdb6d...,2023-03-22,2023-04-21,-3748458.0,-2952342.0
9,04f221935990065c986d05b03c20a155,a6185c9450de61b47a5388cd55b356f1ad2a3cc65bdb6d...,2023-04-22,2023-05-21,2734098.0,-218244.0


In [101]:
statement_balances['ACCUM_BALANCE'] = statement_balances['ACCUM_BALANCE'].astype(int)


In [102]:
statement_balances[(statement_balances['CIF hash']=='0340ab3837f34a1aa87d5b5a8a25a07e') & (statement_balances['ACCUM_BALANCE']>0)]

Unnamed: 0,CIF hash,CARD_NBR hash,STATEMENT_START,STATEMENT_END,STATEMENT_BALANCE,ACCUM_BALANCE
7,0340ab3837f34a1aa87d5b5a8a25a07e,8a310933f4b7f835b19a0a8a2ab43ef865e49111d3fb0a...,2023-05-22,2023-06-21,17730000.0,796116


In [111]:
filtered_data['ACCUM_BALANCE'] = filtered_data.groupby(['CIF hash', 'CARD_NBR hash', 'STATEMENT_START', 'STATEMENT_END'])['NET_CASH_FLOW_AMT_LCY'].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['ACCUM_BALANCE'] = filtered_data.groupby(['CIF hash', 'CARD_NBR hash', 'STATEMENT_START', 'STATEMENT_END'])['NET_CASH_FLOW_AMT_LCY'].cumsum()


In [112]:
filtered_data[filtered_data['CIF hash']=='d46f9418182bc00b4d12d4eee5058c41'].head(50)

Unnamed: 0,CIF hash,CARD_NBR hash,TXN_TM,TML_WEB_AP_NM,DSC,NET_CASH_FLOW_AMT_LCY,TXN_SRC_ID,CARD_CLASSCIFICATION,STATEMENT_START,STATEMENT_END,PAYMENT_DUE_DATE,ACCUM_BALANCE
11,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-05-22 02:54:08,OPTP0000,"TKTT MOBILE BANKING 57 LY THUONG KIET, HOAN KI...",-200000.0,270573626.0,VC,2022-04-22,2022-05-21,2022-06-05,-200000.0
42,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-05-25 10:24:44,OPTP0000,SHOPEE - VIETNAM 842839691027 704,-23500.0,271336298.0,VC,2022-04-22,2022-05-21,2022-06-05,-223500.0
52,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-05-26 06:56:16,OPTP0000,Foody 19002042 704,-1000.0,271572801.0,VC,2022-04-22,2022-05-21,2022-06-05,-224500.0
53,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-05-26 06:56:19,OPTP0000,Foody 19002042 704,-1000.0,271572801.0,VC,2022-04-22,2022-05-21,2022-06-05,-225500.0
57,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-05-27 18:02:59,OPTP0000,Foody 19002042 704,-366400.0,271961259.0,VC,2022-04-22,2022-05-21,2022-06-05,-591900.0
70,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-05-29 21:04:59,OPTP0000,Foody 19002042 704,-52000.0,272474726.0,VC,2022-04-22,2022-05-21,2022-06-05,-643900.0
103,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-06-04 17:56:22,OPTP0000,Shopee - Vietnam 6562708100 704,-1044500.0,273933357.0,VC,2022-05-22,2022-06-21,2022-07-05,-1044500.0
114,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-06-06 09:34:47,OPTP0000,Foody 19002042 704,-128000.0,274300530.0,VC,2022-05-22,2022-06-21,2022-07-05,-1172500.0
227,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-06-14 09:36:13,OPTP0000,Foody 19002042 704,-364400.0,276382414.0,VC,2022-05-22,2022-06-21,2022-07-05,-1536900.0
288,d46f9418182bc00b4d12d4eee5058c41,e3ec0045c51675ba5bd927204be7a68ae0e7dbb38d9493...,2022-06-17 16:05:11,OPTP0000,Foody 19002042 704,-117000.0,277264299.0,VC,2022-05-22,2022-06-21,2022-07-05,-1653900.0
