In [None]:
!pip install "dask[complete]"
!pip install pyarrow pandas
!pip install mlxtend

Collecting dask[complete]
  Downloading dask-2024.7.1-py3-none-any.whl.metadata (3.8 kB)
Collecting partd>=1.4.0 (from dask[complete])
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting importlib-metadata>=4.13.0 (from dask[complete])
  Downloading importlib_metadata-8.2.0-py3-none-any.whl.metadata (4.7 kB)
Collecting pyarrow-hotfix (from dask[complete])
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting locket (from partd>=1.4.0->dask[complete])
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting distributed==2024.7.1 (from dask[complete])
  Downloading distributed-2024.7.1-py3-none-any.whl.metadata (3.4 kB)
Collecting dask-expr<1.2,>=1.1 (from dask[complete])
  Downloading dask_expr-1.1.9-py3-none-any.whl.metadata (2.5 kB)
Collecting bokeh>=2.4.2 (from dask[complete])


In [None]:
# Mount Google Drive at /content/drive so the files stored there can be read by path.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import re
import dask.dataframe as dd
import glob
from pathlib import Path
import os
import glob
import datetime
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Never truncate cell contents when rendering DataFrames (None disables the width limit).
pd.options.display.max_colwidth = None

In [None]:
# Directory on the mounted Drive containing the daily transaction parquet files
data_dir = '/content/drive/MyDrive/Data_hasking'

# How many files to analyse. File names end in a YYYYMMDD date, so a lexical
# sort is also chronological and the slice keeps the earliest N_DAYS files.
N_DAYS = 10

file_list = sorted(glob.glob(f"{data_dir}/*.parquet"))[:N_DAYS]
if not file_list:
    # Fail here with an actionable message instead of an obscure error from
    # dd.read_parquet when Drive is not mounted or the folder is wrong.
    raise FileNotFoundError(f"No parquet files found in {data_dir}")
file_list

['/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230801.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230802.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230803.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230804.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230805.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230806.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230807.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230808.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230809.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230810.parquet']

In [None]:
# Load all parquet files in file_list into one Dask DataFrame
ddf = dd.read_parquet(file_list)
# Compute the DataFrame to get an in-memory pandas DataFrame; the cells that
# follow operate on this pandas frame.
# NOTE(review): this materialises every column of every file in RAM - confirm it fits.
df = ddf.compute()

In [None]:
# Read the customer table from Drive as a Dask DataFrame (computed to pandas in the next cell)
tb_customers = dd.read_parquet('/content/drive/MyDrive/TB_CUSTOMERS.parquet')

In [None]:
# Compute the customer table into a pandas DataFrame and display it
tb_customers_df =  tb_customers.compute()
tb_customers_df

Unnamed: 0,CUST_CUSTNO,CUSTOMER_TYPE,BUSINESS_TYPE,INDUSTRY_TYPE
0,e2eaa7a94d5e3bcd15579df95cfadcc33a9ffef9832e5073426dc36a3499c069,Cá nhân,CA THE,DICH VU LUU TRU VA AN UONG
1,30882b5b40eb85071295cc78276c3b74eb0f73eaebad8f009bc0aa04796a565c,Cá nhân,CA THE,SX &PP DIEN/KHI DOT/NUOC NONG/HOI NUOC/DIEU HOA KK
2,9e70464b7a79b94c07a6dc6cf3eea62c1e4cde0872304fd1e03542c898c0a6e9,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
3,ce932f938de9bbabd34650dc0d8e07e1814d17580d97440d48905c5a7d0f0d09,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
4,9771233eb92c014fd6cc1c0312a93278b5b933fc01a64d0a3f24dac62cd92812,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
...,...,...,...,...
3179025,cc1748a291074998ab6fd12d995f4773618f75aac4f6cf33789d8ab3361b658c,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
3179026,40062bd373af28817793d4b15ca498238a103ca39213d7857a039181a9328a54,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
3179027,5c3d7ca15098f3d43a0b47f193f4a1a47f11b35ca8717277c64b165e2fa3c745,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
3179028,b806e3779fe5fcd0efe15d027a91069a07304cddf213a708c8f4318a2305e347,Cá nhân,CA THE,HOAT DONG DICH VU KHAC


In [None]:
# Keep only the rows whose CUSTOMER_TYPE is "Cá nhân" (individual customers)
tb_customers_individual_df = tb_customers_df.loc[tb_customers_df["CUSTOMER_TYPE"].eq("Cá nhân")]

In [None]:
# Distinct CUST_CUSTNO values of the individual customers, as a plain Python list
individual_customers = pd.unique(tb_customers_individual_df["CUST_CUSTNO"]).tolist()

# Rule 6: individual customers transacting with 10 or more distinct contra accounts within a 3-day window

In [None]:
# Keep only the transactions whose CUST_CUSTNO belongs to an individual customer
# (this is a row filter on df, not a join with the customer table)
df_individual = df.loc[df['CUST_CUSTNO'].isin(individual_customers)]

In [None]:
# Convert VALUEDATE to datetime (unparseable values become NaT) and set it as the
# index, which the per-customer .resample() in the next step requires.
# The frame is rebound rather than mutated in place, so nothing is written into
# a filtered slice of df. The guard makes the cell safe to re-run: once VALUEDATE
# is the index it is no longer a column, and a second in-place run would raise
# KeyError.
if df_individual.index.name != 'VALUEDATE':
    df_individual = df_individual.assign(
        VALUEDATE=pd.to_datetime(df_individual['VALUEDATE'], errors='coerce')
    ).set_index('VALUEDATE')


In [None]:
# For each customer, cut the VALUEDATE index into consecutive 3-day bins and
# count the distinct CONTRA_ACCNO values in every bin. After reset_index the
# VALUEDATE column holds the bin label, not an individual transaction date.
df_grouped = (
    df_individual
    .groupby('CUST_CUSTNO')
    .resample('3D')
    .agg({'CONTRA_ACCNO': 'nunique'})
    .reset_index()
)


In [None]:
# Flag every (customer, 3-day bin) row with at least 10 distinct contra accounts
anomalies = df_grouped.loc[df_grouped['CONTRA_ACCNO'].ge(10)]


In [None]:
# Show the flagged (customer, 3-day bin) rows with their distinct contra-account counts
anomalies

Unnamed: 0,CUST_CUSTNO,VALUEDATE,CONTRA_ACCNO
121,000975b0ee367856502b174994023821120d5730b3fcd384eda36753a7e1e2c7,2023-08-01,13
122,000975b0ee367856502b174994023821120d5730b3fcd384eda36753a7e1e2c7,2023-08-04,11
123,000975b0ee367856502b174994023821120d5730b3fcd384eda36753a7e1e2c7,2023-08-07,12
723,0032631c2d2052103e4ba885b81269361664018c1ed8aaa72bf03b7e7a755e4a,2023-08-04,11
1025,004409ce915f6fe9689ea5a0480144fa99eaeb805aa7509b2ecfa8349590b2af,2023-08-01,14
...,...,...,...
920358,feb237d2cbede4477520a82ae16e592c9bd95eef526b8a00c938c76ef764014f,2023-08-07,12
920487,febc0044001bed1df6ccdba70e19d51170c757ff9b5912beb8e784897081fa76,2023-08-07,12
920529,fec007f4a87ba34915602be31762ac4a1858886285122802e8220fe38ade0c33,2023-08-10,14
923597,ff9e7b04b983f33f20a8c0a9d95e7824a6d54d063de221de62dbf5aaa94f2d3b,2023-08-07,10


In [None]:
def _transactions_in_flagged_windows(transactions, flagged, window='3D'):
    """Return every transaction that falls inside a flagged (customer, window) pair.

    `flagged['VALUEDATE']` is the *start label* of a resample bin, not a
    transaction date. Joining it against the raw transaction VALUEDATE would
    keep only transactions dated exactly on the first day of a flagged window
    and silently drop the rest of the window. This helper rebuilds the bin
    start for each transaction and joins on that instead.

    Parameters
    ----------
    transactions : DataFrame indexed by VALUEDATE (datetime) with a
        CUST_CUSTNO column.
    flagged : DataFrame with CUST_CUSTNO and VALUEDATE (bin start) columns.
    window : bin width; must match the frequency used when `flagged` was built.

    Returns
    -------
    DataFrame with all transaction columns plus WINDOW_START (the bin each
    transaction belongs to), restricted to flagged bins.
    """
    step = pd.Timedelta(window)
    tx = transactions.reset_index()
    # groupby().resample() uses the default origin ('start_day'), which anchors
    # each customer's bins at midnight of that customer's first transaction day.
    first_day = tx.groupby('CUST_CUSTNO')['VALUEDATE'].transform('min').dt.normalize()
    elapsed = tx['VALUEDATE'] - first_day
    # Round the elapsed time down to a whole number of windows (bins are left-closed).
    tx['WINDOW_START'] = first_day + (elapsed - elapsed % step)
    keys = flagged[['CUST_CUSTNO', 'VALUEDATE']].rename(columns={'VALUEDATE': 'WINDOW_START'})
    return tx.merge(keys, on=['CUST_CUSTNO', 'WINDOW_START'], how='inner')


# Merge anomalies with the original dataframe to get detailed information
anomalous_transactions = _transactions_in_flagged_windows(df_individual, anomalies)


In [None]:
# Display the detailed transaction rows that were joined to the flagged customer/window keys
anomalous_transactions

Unnamed: 0,VALUEDATE,CUST_CUSTNO,ACC_BUSINESSTYPE,ACC_ACCNO,ACC_BUSINESSNO,ACC_CURRENCYISO,BUSINESSNO_TRANS,FK_CURRENCY,AMOUNT,AMOUNTORIG,...,REASON2,REASON4,TR_FLAG_01,ANALYTICAL_TRANS_CODE,TR_SP_01,TR_SP_02,TR_SP_03,TR_SP_04,TR_SP_05,TR_SP_10
0,2023-08-01,1370756be01708ceebd34e073b847abef0925fc70847af6eed75ee756c7c707d,CARD,5942cd1adcceb7f9c461520ca13ac98243a90dec1295d31df5e94da76d1a600e,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011836923,VND,-926.70,-926.70,...,,,Y,TP501,3571d49ba215238abbf108c2341ef60b74bd539c6e27ed0785786db6c2e43c3c,EPOS,HOI SO SHB,CM,,5453.64
1,2023-08-01,1370756be01708ceebd34e073b847abef0925fc70847af6eed75ee756c7c707d,CASA,2a41b21da840ff3b3819ebb9c3f7f73e7e362201e1a3351abba276e6373906f2,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011212570,VND,10000.00,10000.00,...,OAN NGUYEN THUY TIEN,,N,1|2,98bd1eff0d795cdcc3d33d602257d1c8b08722c02ae4dd5e0cc85a40a2b708bf,MOB,HOI SO SHB,A2,NGUYEN THUY TIEN,
2,2023-08-01,1370756be01708ceebd34e073b847abef0925fc70847af6eed75ee756c7c707d,CASA,2a41b21da840ff3b3819ebb9c3f7f73e7e362201e1a3351abba276e6373906f2,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011169715,VND,-2000.00,-2000.00,...,OAN Ms NGUYEN NGOC HUYEN,,N,1|1,5d4a741075b48d84a0e5fc1cba301c156b01f8a320078b18d56b2e8e778dc759,MOB,HOI SO SHB,A2,NGUYEN NGOC HUYEN,
3,2023-08-01,1370756be01708ceebd34e073b847abef0925fc70847af6eed75ee756c7c707d,CASA,2a41b21da840ff3b3819ebb9c3f7f73e7e362201e1a3351abba276e6373906f2,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011164691,VND,-500.00,-500.00,...,OAN NGUYEN CUU DIEU HUONG,,N,1|1,2907cdbd8c30db78ae76a0db00d1510ec1310273086d01235a8ee4888d73500c,MOB,HOI SO SHB,A2,NGUYEN CUU DIEU HUONG,
4,2023-08-01,1370756be01708ceebd34e073b847abef0925fc70847af6eed75ee756c7c707d,CASA,9e3cba65b78c89574a8e91564c02f7db0c8b28488964ad0ef4e29370046211ad,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308011185625,VND,-10000.00,-10000.00,...,OAN NGUYEN THUY TIEN,,N,1|1,98bd1eff0d795cdcc3d33d602257d1c8b08722c02ae4dd5e0cc85a40a2b708bf,MOB,HOI SO SHB,A2,NGUYEN THUY TIEN,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176033,2023-08-10,efe0aa21f8612aad245340999d471f0c760852ba2d484e5e17b6480dac3097e9,CASA,8710da6d58b0ed03933bf8db34188a39ab74186145322e717035b26d55038cdc,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308106564753,VND,-500.00,-500.00,...,HOAN NGUYEN THI KIM CHI,,N,1|1,f7e568209f20aa90d27047f46ff97e10490a4e48480f22e09366f7fe2e7c668f,MOB,HOI SO SHB,A2,NGUYEN THI KIM CHI,
176034,2023-08-10,efe0aa21f8612aad245340999d471f0c760852ba2d484e5e17b6480dac3097e9,CASA,8710da6d58b0ed03933bf8db34188a39ab74186145322e717035b26d55038cdc,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308106584275,VND,275.00,275.00,...,KHOAN NGUYEN LINH PHUONG,,N,1|2,f417f624ba4c07e57c237c2b9f2babfbcdbe3e34b961279967dfdcff7510fb64,MOB,HOI SO SHB,A2,NGUYEN THI THU NGAN,
176035,2023-08-10,efe0aa21f8612aad245340999d471f0c760852ba2d484e5e17b6480dac3097e9,CASA,8710da6d58b0ed03933bf8db34188a39ab74186145322e717035b26d55038cdc,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308106584274,VND,100.00,100.00,...,N KHOAN NGUYEN LINH PHUONG,,N,1|2,24461f40d2edacf2d5a41499c569b1f3c789a68f4660037a88f7871895859d38,MOB,HOI SO SHB,A2,TRAN THI KHANH CHUNG,
176036,2023-08-10,efe0aa21f8612aad245340999d471f0c760852ba2d484e5e17b6480dac3097e9,CASA,8710da6d58b0ed03933bf8db34188a39ab74186145322e717035b26d55038cdc,0553d33b8bbc8e3a944ff438a51a17cfb38fd0241667af98b6834685cc487ee6,VND,202308106694730,VND,6425.15,6425.15,...,ATIONAL TT TIEN LUONG TXX-,XXXX ( DOT X),N,1|1,2c9f006ad45b2bbf64ed17dfa3f855f2735184015cac732e4c13537f8f8aa8ba,IAS,CN LONG AN,FP,,
