In [1]:
!pip install "dask[complete]"
!pip install mlxtend

Collecting dask[complete]
  Downloading dask-2024.5.2-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting partd>=1.2.0 (from dask[complete])
  Downloading partd-1.4.2-py3-none-any.whl (18 kB)
Collecting importlib-metadata>=4.13.0 (from dask[complete])
  Downloading importlib_metadata-7.1.0-py3-none-any.whl (24 kB)
Collecting pyarrow-hotfix (from dask[complete])
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting locket (from partd>=1.2.0->dask[complete])
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Collecting dask-expr<1.2,>=1.1 (from dask[complete])
  Downloading dask_expr-1.1.2-py3-none-any.whl (205 kB)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from pathlib import Path
import os
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
import dask.dataframe as dd
import warnings
warnings.filterwarnings('ignore')

In [4]:
pd.set_option('display.max_colwidth', None)

In [5]:
data_dir = Path('/content/drive/MyDrive/Data_hasking_without_banker_account')
ddf = dd.read_parquet(str(data_dir / '*.parquet'))
ddf

Unnamed: 0_level_0,CUST_CUSTNO,ACC_BUSINESSTYPE,ACC_ACCNO,ACC_BUSINESSNO,ACC_CURRENCYISO,BUSINESSNO_TRANS,FK_CURRENCY,AMOUNT,AMOUNTORIG,BRANCH_OFFICE,CONTRA_ACCNO,CONTRA_ZIP,CONTRA_NAME,CSHYN,REASON1,REASON2,REASON4,TR_FLAG_01,ANALYTICAL_TRANS_CODE,TR_SP_01,TR_SP_02,TR_SP_03,TR_SP_04,TR_SP_05,TR_SP_10,VALUEDATE
npartitions=184,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
,string,string,string,string,string,string,string,float64,float64,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,float64,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [6]:
df=ddf.compute()

In [7]:
# Optimize missing value handling
df = df.dropna(thresh=len(df) * 0.5, axis=1)  # Drop columns with more than 50% missing values
df = df.fillna(df.mode().iloc[0])  # Fill missing values with mode for string columns

# Identify numeric columns
numeric_cols = df.select_dtypes(include='number').columns

# Fill missing values with median for numeric columns only
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

In [8]:
df.isnull().sum()

CUST_CUSTNO              0
ACC_BUSINESSTYPE         0
ACC_ACCNO                0
ACC_BUSINESSNO           0
ACC_CURRENCYISO          0
BUSINESSNO_TRANS         0
FK_CURRENCY              0
AMOUNT                   0
AMOUNTORIG               0
BRANCH_OFFICE            0
CONTRA_ACCNO             0
CONTRA_ZIP               0
CONTRA_NAME              0
CSHYN                    0
REASON1                  0
REASON2                  0
TR_FLAG_01               0
ANALYTICAL_TRANS_CODE    0
TR_SP_01                 0
TR_SP_02                 0
TR_SP_03                 0
TR_SP_04                 0
TR_SP_05                 0
VALUEDATE                0
dtype: int64

In [16]:

# Convert the VALUEDATE column to datetime format
df['VALUEDATE'] = dd.to_datetime(df['VALUEDATE'])
# Define the start and end dates for the desired week
start_date = '2024-01-01'
end_date = '2024-01-07'

# Filter data for the specified week
filtered_data= df[(df['VALUEDATE'] >= start_date) & (df['VALUEDATE'] <= end_date)]

In [17]:
filtered_data

Unnamed: 0,CUST_CUSTNO,ACC_BUSINESSTYPE,ACC_ACCNO,ACC_BUSINESSNO,ACC_CURRENCYISO,BUSINESSNO_TRANS,FK_CURRENCY,AMOUNT,AMOUNTORIG,BRANCH_OFFICE,...,REASON1,REASON2,TR_FLAG_01,ANALYTICAL_TRANS_CODE,TR_SP_01,TR_SP_02,TR_SP_03,TR_SP_04,TR_SP_05,VALUEDATE
907,2bdb75e6139a7717939d04a1ef7b8a7e17fe3261064a309c02aac70c7a447709,CASA,c60feae21e6d3a79d1b9dca82edf74b97b9e0bd93c461c51b092b7bdbfe8777d,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011167253,VND,-500.00,-500.00,110000,...,DOAN THI KIM OANH CHUYEN KH,OAN VU THU HIEN,N,1|1,a390b60d34d64407a224af176bb5f2fd44994e48b7a6e469edbf468b2dab4e63,MOB,HOI SO SHB,A2,VU THU HIEN,2024-01-07
1110,39f2cdbe919da465def1ccfd021faaa991024ec0be217f36709ec916b244fc28,CASA,bc971c77455ea3bfaafb3ed1720a350a0c41b558fb03af1d6693bc9e0c9a58a9,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011205083,VND,1500.00,1500.00,110000,...,Thuong thanh lap cty Mr DO,TRONG MAO,N,1|2,e7912596f20db264bbe7b42e86f6cbf9530933a500eaabff63813bdbea2cc71b,MOB,HOI SO SHB,A2,TRINH THI NGA,2024-01-05
1482,9d3d2ff5e7b1968a85094ce265620d44959f254a442d71db974d2bf8c38779c6,CASA,0214eb9bc6f571fe82ff614133a9b39dfcff04fd59b4e976f8f04868facae0fd,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011176487,VND,-247.00,-247.00,110000,...,TRAN NGUYEN HUNG CHUYEN KH,OAN VU THI HANG,N,1|1,7e85fd8173f58d5eca0b549f62b450e2cc629dd129956e4375152f2c71db61f2,MOB,HOI SO SHB,A2,VU THI HANG,2024-01-05
1716,8f9f3bd056ba38251221307d62eb4ed3afbbf1ab470125e406d979b48797f563,CASA,6c14af8e55eb35b7dd6e8ace87ad61e9c943d9a477e1982d0acabf40e21b8855,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011219515,VND,2560.52,2560.52,110000,...,XXXXXXXX-CTY TNHH LIEN CAU,VN - (CKRmNo: XXXXXXXXXXXXX,N,21|45,c2158f0937b6db07cbfa88e85c784c23c0ccf9ab42793ea9741e60699f414e13,STP,HOI SO SHB,LP,CITAD SHB,2024-01-05
1741,6fb16bee4249b2780b0693d73affb53ba8f12808982b3af2252b68c2ef904e4e,CASA,1dc5f75ff8be52bde53170a1a03da13deb2336c6cd4a2e8c2db94cd9935e399e,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011169837,VND,-10000.00,-10000.00,110000,...,QUANG MINH VAY,N,N,1|1,f2cffcbcc2699d6d4ed05b5ed35ccda52d264543c11879cd834e3000122d825c,MOB,HOI SO SHB,A2,VI VAN NHUONG,2024-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030184,b7aec93092f3ab6984109b097d6db4ecb33ff9f47f432cff895832a81b439d32,CASA,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,2024013041174361,VND,20.00,20.00,110000,...,AU VU HOANG CHUYEN KHOAN AU,VU HOANG,N,1|2,c012940028b93766a4c4088f905f17911c6ae466ce394c6266431bff06208108,MOB,HOI SO SHB,A2,AU VU HOANG,2024-01-05
1030228,34e0bbcf928bde2bdeb733f252737247ca756a99f2af30f51c53ed7f77322d0b,CASA,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,2024013041119751,VND,-15000.00,-15000.00,110000,...,Vk sam Tet nhe,N,N,1|1,2b36a9ae3bfe1ad4336600851ba3d8f12b1752a7a0b203c72e15c039664bd049,MOB,HOI SO SHB,A2,NGUYEN THI THOM,2024-01-03
1030686,61fd9631094000c5107a1922d34c9042ee8d8a8ec3b096371236bf344207548f,CASA,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,2024013041119762,VND,-20000.00,-20000.00,110000,...,VO THI THU HUYEN CHUYEN KH,OAN NGUYEN THI KIM ANH,N,1|1,d73c17d316d8faa727e8ca06cb642de6e79e41b64576718a947806bea0022db5,MOB,HOI SO SHB,A2,NGUYEN THI KIM ANH,2024-01-01
1030838,9ebbc0a429995b5a4459b8bf4e90e97a3af741e1fccc199b7839a3618785241f,CASA,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,2024013041175249,VND,7500.00,7500.00,110300,...,CTY THAN MONG DUONG BO SUNG,TIEN LUONG THANG X NAM XXX,N,1|2,19e6683a4e4917eae1be8d7d69d65076b15b33a24731447649d51c0557ef8dbe,BLK,CN QUANG NINH,A2,CASA TRUNG GIAN CHUYEN TIEN THEO LO,2024-01-02


In [18]:
# Convert categorical columns to categorical data type
categorical_cols = filtered_data.select_dtypes(include=['string']).columns
for col in categorical_cols:
    filtered_data[col] = filtered_data[col].astype('category')

In [19]:
categorical_cols

Index(['CUST_CUSTNO', 'ACC_BUSINESSTYPE', 'ACC_ACCNO', 'ACC_BUSINESSNO',
       'ACC_CURRENCYISO', 'BUSINESSNO_TRANS', 'FK_CURRENCY', 'BRANCH_OFFICE',
       'CONTRA_ACCNO', 'CONTRA_ZIP', 'CONTRA_NAME', 'CSHYN', 'REASON1',
       'REASON2', 'TR_FLAG_01', 'ANALYTICAL_TRANS_CODE', 'TR_SP_01',
       'TR_SP_02', 'TR_SP_03', 'TR_SP_04', 'TR_SP_05'],
      dtype='object')

In [27]:
# Label encode string columns
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for column in filtered_data.select_dtypes(include=['category']).columns:
    le = LabelEncoder()
    filtered_data[column] = le.fit_transform(filtered_data[column])
    label_encoders[column] = le


In [28]:
filtered_data

Unnamed: 0,CUST_CUSTNO,ACC_BUSINESSTYPE,ACC_ACCNO,ACC_BUSINESSNO,ACC_CURRENCYISO,BUSINESSNO_TRANS,FK_CURRENCY,AMOUNT,AMOUNTORIG,BRANCH_OFFICE,...,REASON1,REASON2,TR_FLAG_01,ANALYTICAL_TRANS_CODE,TR_SP_01,TR_SP_02,TR_SP_03,TR_SP_04,TR_SP_05,VALUEDATE
907,23979,0,115012,0,10,201,10,-500.00,-500.00,0,...,27470,127124,0,0,263550,7,60,0,77114,2024-01-07
1110,31553,0,109609,0,10,1523,10,1500.00,1500.00,0,...,108081,137123,0,1,373173,7,60,0,70322,2024-01-05
1482,85777,0,1186,0,10,535,10,-247.00,-247.00,0,...,98558,127005,0,0,203888,7,60,0,76581,2024-01-05
1716,78513,0,62818,0,10,2001,10,2560.52,2560.52,0,...,126146,143207,0,4,312975,11,60,2,5259,2024-01-05
1741,61121,0,17436,0,10,309,10,-10000.00,-10000.00,0,...,88719,89235,0,0,391069,7,60,0,73443,2024-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030184,100258,0,3103,0,10,419626,10,20.00,20.00,0,...,2189,27662,0,1,309692,7,60,0,183,2024-01-05
1030228,28847,0,3103,0,10,417771,10,-15000.00,-15000.00,0,...,117944,89235,0,0,69807,7,60,0,50541,2024-01-03
1030686,53541,0,3103,0,10,417773,10,-20000.00,-20000.00,0,...,112284,123174,0,0,346941,7,60,0,49367,2024-01-01
1030838,86573,0,3103,0,10,419656,10,7500.00,7500.00,19,...,15935,23988,0,1,41809,1,42,0,4092,2024-01-02


In [47]:
# Apply the FP-growth algorithm
frequent_itemsets = fpgrowth(filtered_data, min_support=0.99, use_colnames=True)

ValueError: The allowed values for a DataFrame are True, False, 0, 1. Found value 23979

**convert the data format from numeric to binary**

In [49]:
# Create a binary representation of the data
frequent_itemsets = fpgrowth(filtered_data.applymap(lambda x: 1 if x else 0), min_support=0.01, use_colnames=True)
# Adjust min_support as needed, likely much lower than 0.99 given binary representation


In [50]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,1.000000,(VALUEDATE)
1,0.999998,(TR_SP_05)
2,0.999998,(CONTRA_ACCNO)
3,0.999998,(REASON1)
4,0.999998,(TR_SP_01)
...,...,...
1179642,0.013665,"(TR_SP_03, BRANCH_OFFICE, FK_CURRENCY, VALUEDATE, REASON2, ANALYTICAL_TRANS_CODE, CONTRA_ACCNO, TR_SP_05, TR_SP_04, AMOUNTORIG, TR_SP_02, CONTRA_ZIP, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, CONTRA_NAME)"
1179643,0.013665,"(TR_SP_03, ACC_CURRENCYISO, BRANCH_OFFICE, VALUEDATE, REASON2, ANALYTICAL_TRANS_CODE, CONTRA_ACCNO, TR_SP_05, TR_SP_04, AMOUNTORIG, TR_SP_02, CONTRA_ZIP, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, CONTRA_NAME)"
1179644,0.013665,"(TR_SP_03, ACC_CURRENCYISO, BRANCH_OFFICE, FK_CURRENCY, VALUEDATE, REASON2, ANALYTICAL_TRANS_CODE, CONTRA_ACCNO, TR_SP_05, TR_SP_04, AMOUNTORIG, CONTRA_ZIP, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, CONTRA_NAME)"
1179645,0.013665,"(TR_SP_03, ACC_CURRENCYISO, BRANCH_OFFICE, FK_CURRENCY, VALUEDATE, REASON2, ANALYTICAL_TRANS_CODE, CONTRA_ACCNO, TR_SP_05, AMOUNTORIG, TR_SP_02, CONTRA_ZIP, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, CONTRA_NAME)"


In [52]:
# Create a binary representation of the data
frequent_itemsets = fpgrowth(filtered_data.applymap(lambda x: 1 if x else 0), min_support=0.99, use_colnames=True)


In [53]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,1.000000,(VALUEDATE)
1,0.999998,(TR_SP_05)
2,0.999998,(REASON1)
3,0.999998,(TR_SP_01)
4,0.999998,(BUSINESSNO_TRANS)
...,...,...
48634,0.990008,"(CONTRA_ACCNO, TR_SP_05, TR_SP_01, BUSINESSNO_TRANS, ACC_ACCNO, AMOUNTORIG, CUST_CUSTNO, REASON1, TR_SP_02, CONTRA_NAME, REASON2)"
48635,0.990010,"(CONTRA_ACCNO, VALUEDATE, TR_SP_05, TR_SP_01, ACC_ACCNO, AMOUNTORIG, CUST_CUSTNO, AMOUNT, REASON1, TR_SP_02, CONTRA_NAME, REASON2)"
48636,0.990008,"(CONTRA_ACCNO, VALUEDATE, TR_SP_05, TR_SP_01, BUSINESSNO_TRANS, ACC_ACCNO, AMOUNTORIG, CUST_CUSTNO, REASON1, TR_SP_02, CONTRA_NAME, REASON2)"
48637,0.990008,"(CONTRA_ACCNO, TR_SP_05, TR_SP_01, BUSINESSNO_TRANS, ACC_ACCNO, AMOUNTORIG, CUST_CUSTNO, AMOUNT, REASON1, TR_SP_02, CONTRA_NAME, REASON2)"




In [54]:


from mlxtend.frequent_patterns import association_rules

# Define the minimum confidence level
min_confidence = 1

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)



In [55]:

# Display the association rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(TR_SP_05),(VALUEDATE),0.999998,1.000000,0.999998,1.0,1.000000,0.000000,inf,0.000000
1,(REASON1),(VALUEDATE),0.999998,1.000000,0.999998,1.0,1.000000,0.000000,inf,0.000000
2,"(TR_SP_05, REASON1)",(VALUEDATE),0.999995,1.000000,0.999995,1.0,1.000000,0.000000,inf,0.000000
3,(TR_SP_01),(VALUEDATE),0.999998,1.000000,0.999998,1.0,1.000000,0.000000,inf,0.000000
4,"(REASON1, TR_SP_01)",(VALUEDATE),0.999995,1.000000,0.999995,1.0,1.000000,0.000000,inf,0.000000
...,...,...,...,...,...,...,...,...,...,...
107258,"(CONTRA_ACCNO, VALUEDATE, TR_SP_05, BUSINESSNO_TRANS, ACC_ACCNO, AMOUNTORIG, TR_SP_01, CUST_CUSTNO, REASON1, TR_SP_02, CONTRA_NAME, REASON2)",(AMOUNT),0.990008,0.991379,0.990008,1.0,1.008696,0.008535,inf,0.862792
107259,"(CONTRA_ACCNO, VALUEDATE, TR_SP_05, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, TR_SP_02, CONTRA_NAME, REASON2)",(AMOUNTORIG),0.990008,0.991379,0.990008,1.0,1.008696,0.008535,inf,0.862792
107260,"(CONTRA_ACCNO, TR_SP_05, BUSINESSNO_TRANS, AMOUNT, ACC_ACCNO, AMOUNTORIG, TR_SP_01, CUST_CUSTNO, REASON1, TR_SP_02, CONTRA_NAME, REASON2)",(VALUEDATE),0.990008,1.000000,0.990008,1.0,1.000000,0.000000,inf,0.000000
107261,"(CONTRA_ACCNO, TR_SP_05, BUSINESSNO_TRANS, ACC_ACCNO, AMOUNTORIG, TR_SP_01, CUST_CUSTNO, REASON1, TR_SP_02, CONTRA_NAME, REASON2)","(VALUEDATE, AMOUNT)",0.990008,0.991379,0.990008,1.0,1.008696,0.008535,inf,0.862792


In [59]:
#  filter rules by the zhangs metric >0

filtered_rules = rules[rules['zhangs_metric'] > 1]
filtered_rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
511,(ACC_CURRENCYISO),(FK_CURRENCY),0.999838,0.999838,0.999838,1.0,1.000162,0.000162,inf,1.0
512,(FK_CURRENCY),(ACC_CURRENCYISO),0.999838,0.999838,0.999838,1.0,1.000162,0.000162,inf,1.0
514,"(ACC_CURRENCYISO, VALUEDATE)",(FK_CURRENCY),0.999838,0.999838,0.999838,1.0,1.000162,0.000162,inf,1.0
516,"(VALUEDATE, FK_CURRENCY)",(ACC_CURRENCYISO),0.999838,0.999838,0.999838,1.0,1.000162,0.000162,inf,1.0
517,(ACC_CURRENCYISO),"(VALUEDATE, FK_CURRENCY)",0.999838,0.999838,0.999838,1.0,1.000162,0.000162,inf,1.0
518,(FK_CURRENCY),"(ACC_CURRENCYISO, VALUEDATE)",0.999838,0.999838,0.999838,1.0,1.000162,0.000162,inf,1.0
33535,(AMOUNT),(AMOUNTORIG),0.991379,0.991379,0.991379,1.0,1.008696,0.008547,inf,1.0
33536,(AMOUNTORIG),(AMOUNT),0.991379,0.991379,0.991379,1.0,1.008696,0.008547,inf,1.0
33538,"(AMOUNT, VALUEDATE)",(AMOUNTORIG),0.991379,0.991379,0.991379,1.0,1.008696,0.008547,inf,1.0
33540,"(VALUEDATE, AMOUNTORIG)",(AMOUNT),0.991379,0.991379,0.991379,1.0,1.008696,0.008547,inf,1.0


**Convert the data format from numeric to bolean**

In [39]:
# Convert the DataFrame to a transaction DataFrame
transaction_df = filtered_data.astype(bool)

In [40]:
transaction_df

Unnamed: 0,CUST_CUSTNO,ACC_BUSINESSTYPE,ACC_ACCNO,ACC_BUSINESSNO,ACC_CURRENCYISO,BUSINESSNO_TRANS,FK_CURRENCY,AMOUNT,AMOUNTORIG,BRANCH_OFFICE,...,REASON1,REASON2,TR_FLAG_01,ANALYTICAL_TRANS_CODE,TR_SP_01,TR_SP_02,TR_SP_03,TR_SP_04,TR_SP_05,VALUEDATE
907,True,False,True,False,True,True,True,True,True,False,...,True,True,False,False,True,True,True,False,True,True
1110,True,False,True,False,True,True,True,True,True,False,...,True,True,False,True,True,True,True,False,True,True
1482,True,False,True,False,True,True,True,True,True,False,...,True,True,False,False,True,True,True,False,True,True
1716,True,False,True,False,True,True,True,True,True,False,...,True,True,False,True,True,True,True,True,True,True
1741,True,False,True,False,True,True,True,True,True,False,...,True,True,False,False,True,True,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030184,True,False,True,False,True,True,True,True,True,False,...,True,True,False,True,True,True,True,False,True,True
1030228,True,False,True,False,True,True,True,True,True,False,...,True,True,False,False,True,True,True,False,True,True
1030686,True,False,True,False,True,True,True,True,True,False,...,True,True,False,False,True,True,True,False,True,True
1030838,True,False,True,False,True,True,True,True,True,True,...,True,True,False,True,True,True,True,False,True,True


In [41]:
# Apply the FP-growth algorithm
fpgrowth_results = fpgrowth(transaction_df, min_support=0.01, use_colnames=True)


In [42]:
fpgrowth_results

Unnamed: 0,support,itemsets
0,1.000000,(VALUEDATE)
1,0.999998,(TR_SP_05)
2,0.999998,(CONTRA_ACCNO)
3,0.999998,(REASON1)
4,0.999998,(TR_SP_01)
...,...,...
1179642,0.013665,"(TR_SP_03, BRANCH_OFFICE, FK_CURRENCY, VALUEDATE, REASON2, ANALYTICAL_TRANS_CODE, CONTRA_ACCNO, TR_SP_05, TR_SP_04, AMOUNTORIG, TR_SP_02, CONTRA_ZIP, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, CONTRA_NAME)"
1179643,0.013665,"(TR_SP_03, ACC_CURRENCYISO, BRANCH_OFFICE, VALUEDATE, REASON2, ANALYTICAL_TRANS_CODE, CONTRA_ACCNO, TR_SP_05, TR_SP_04, AMOUNTORIG, TR_SP_02, CONTRA_ZIP, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, CONTRA_NAME)"
1179644,0.013665,"(TR_SP_03, ACC_CURRENCYISO, BRANCH_OFFICE, FK_CURRENCY, VALUEDATE, REASON2, ANALYTICAL_TRANS_CODE, CONTRA_ACCNO, TR_SP_05, TR_SP_04, AMOUNTORIG, CONTRA_ZIP, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, CONTRA_NAME)"
1179645,0.013665,"(TR_SP_03, ACC_CURRENCYISO, BRANCH_OFFICE, FK_CURRENCY, VALUEDATE, REASON2, ANALYTICAL_TRANS_CODE, CONTRA_ACCNO, TR_SP_05, AMOUNTORIG, TR_SP_02, CONTRA_ZIP, BUSINESSNO_TRANS, ACC_ACCNO, CUST_CUSTNO, TR_SP_01, AMOUNT, REASON1, CONTRA_NAME)"
