In [1]:
!pip install "dask[complete]"
!pip install pyarrow pandas
!pip install mlxtend

Collecting dask[complete]
  Downloading dask-2024.7.1-py3-none-any.whl.metadata (3.8 kB)
Collecting partd>=1.4.0 (from dask[complete])
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting importlib-metadata>=4.13.0 (from dask[complete])
  Downloading importlib_metadata-8.2.0-py3-none-any.whl.metadata (4.7 kB)
Collecting pyarrow-hotfix (from dask[complete])
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting locket (from partd>=1.4.0->dask[complete])
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting dask-expr<1.2,>=1.1 (from dask[complete])
  Downloading dask_expr-1.1.9-py3-none-any.whl.metadata (2.5 kB)
Collecting bokeh>=2.4.2 (from dask[complete])
  Downloading bokeh-3.5.0-py3-none-any.whl.metadata (12 kB)
Collecting distributed==2024.7.1 (from dask[complete])
  Download

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from pathlib import Path
import os
import glob
import datetime
from datetime import timedelta
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import dask.dataframe as dd
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [None]:
pd.set_option('display.max_colwidth', None)

#RULE 6

In [4]:
# Directory containing the data files (assuming they are in parquet format)
data_dir = '/content/drive/MyDrive/Data_hasking'

# Get a list of all parquet files for the 10 days
file_list = sorted(glob.glob(f"{data_dir}/*.parquet"))[:10]  # Adjust the slicing if needed
file_list

['/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230801.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230802.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230803.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230804.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230805.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230806.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230807.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230808.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230809.parquet',
 '/content/drive/MyDrive/Data_hasking/DATA_TB_TRANSACTIONS_20230810.parquet']

In [5]:
tb_customers = dd.read_parquet('/content/drive/MyDrive/TB_CUSTOMERS.parquet')

In [6]:
tb_customers_df =  tb_customers.compute()
tb_customers_df

Unnamed: 0,CUST_CUSTNO,CUSTOMER_TYPE,BUSINESS_TYPE,INDUSTRY_TYPE
0,e2eaa7a94d5e3bcd15579df95cfadcc33a9ffef9832e50...,Cá nhân,CA THE,DICH VU LUU TRU VA AN UONG
1,30882b5b40eb85071295cc78276c3b74eb0f73eaebad8f...,Cá nhân,CA THE,SX &PP DIEN/KHI DOT/NUOC NONG/HOI NUOC/DIEU HO...
2,9e70464b7a79b94c07a6dc6cf3eea62c1e4cde0872304f...,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
3,ce932f938de9bbabd34650dc0d8e07e1814d17580d9744...,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
4,9771233eb92c014fd6cc1c0312a93278b5b933fc01a64d...,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
...,...,...,...,...
3179025,cc1748a291074998ab6fd12d995f4773618f75aac4f6cf...,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
3179026,40062bd373af28817793d4b15ca498238a103ca39213d7...,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
3179027,5c3d7ca15098f3d43a0b47f193f4a1a47f11b35ca87172...,Cá nhân,CA THE,HOAT DONG DICH VU KHAC
3179028,b806e3779fe5fcd0efe15d027a91069a07304cddf213a7...,Cá nhân,CA THE,HOAT DONG DICH VU KHAC


In [7]:
# Filter for individual customers
tb_customers_dfindividual_df = tb_customers_df[tb_customers_df["CUSTOMER_TYPE"] == "Cá nhân"]

In [None]:
# Function to detect high volume transactions within a single day
def detect_short_periods_and_high_volume(df, transaction_threshold=300):
    # Group by customer and date to count transactions within the same day
    grouped = df.groupby(['CUST_CUSTNO', 'VALUEDATE']).agg({
        'CONTRA_ACCNO': 'count'
    }).reset_index()

    # Filter customers with more than the threshold transactions in a single day
    high_transaction_customers = grouped[grouped['CONTRA_ACCNO'] > transaction_threshold]

    return high_transaction_customers

In [None]:

# Initialize an empty DataFrame to hold results
final_results = pd.DataFrame()

# Loop through each file, load the data, and apply the detection logic
for file in file_list:
    # Load the data for the current file using Dask
    ddf = dd.read_parquet(file)

    # Convert VALUEDATE column to datetime
    ddf['VALUEDATE'] = dd.to_datetime(ddf['VALUEDATE'], errors='coerce')

    # Compute the DataFrame to get a Pandas DataFrame
    df = ddf.compute()

    # Merge transaction data with individual customer data
    individual_df = df.merge(tb_customers_dfindividual_df[['CUST_CUSTNO']], on='CUST_CUSTNO', how='inner')

    # Apply the function to detect high volume transactions
    high_transaction_customers = detect_short_periods_and_high_volume(individual_df)

    # Append the results to the final_results DataFrame
    final_results = pd.concat([final_results, high_transaction_customers])


In [None]:

# Display final results
print("High transaction customers:")
final_results

High transaction customers:


Unnamed: 0,CUST_CUSTNO,VALUEDATE,CONTRA_ACCNO
7433,097585d1fe09d306fdd731023196eee61025c5c0011e844473ccd6766f79c28e,2023-08-01,1093
10436,0d687c176172c6450adcc1f064f30eabb27ff3b2af80490a23c42c2922e45d59,2023-08-01,634
11296,0e8f5e430f71530700dff9e1c9d3c8dbbec433de18f0d5683ddffb0309c6e677,2023-08-01,645
15696,14668983e91e5ece2984ec789a52564533cf1a8772e84a0a8563f71f8a585b17,2023-08-01,423
18652,1833f1e7251ada75d1ce72f01297fe38746e744da31ad50c8e3c147659112c00,2023-08-01,354
...,...,...,...
154073,f36a9f297b95791f15f5716d95743d80672253f9f13108d8d367e48f8b827c76,2023-08-10,516
157199,f88658460c792c72605b63e3176c6c6e42fbe7cb85cf29ad218829b659075376,2023-08-10,310
157504,f8fa376e6a86248748ff3c17be48f61bfa014bacbdf87400c4cb1dd73763e853,2023-08-10,430
159232,fbbb29a83cb2a0403e7bda342aca93c243a9f4cb2211365b7005b801310c4f9c,2023-08-10,817


In [None]:

# Extract unique customer IDs
unique_cust_custno = final_results['CUST_CUSTNO'].unique()

# Display the unique customer IDs
unique_cust_custno


<ArrowStringArray>
['097585d1fe09d306fdd731023196eee61025c5c0011e844473ccd6766f79c28e',
 '0d687c176172c6450adcc1f064f30eabb27ff3b2af80490a23c42c2922e45d59',
 '0e8f5e430f71530700dff9e1c9d3c8dbbec433de18f0d5683ddffb0309c6e677',
 '14668983e91e5ece2984ec789a52564533cf1a8772e84a0a8563f71f8a585b17',
 '1833f1e7251ada75d1ce72f01297fe38746e744da31ad50c8e3c147659112c00',
 '1860d0281f23ef94f964d0319158ca0ab52d8c049fc2116e6e1829373dac0431',
 '1b558f501cf59fee2d7ca7f3da379ce4489368ceb8e3301a2cd0c4cb552febcb',
 '23435e0473f7a743ea3b17b00e4f3d44bd63eda90d69b63e36144b4c9eab4c32',
 '24c72ddcc5e4e40afc7512bc105df6a61818ed93c091b6cbb3cb605b52ab8724',
 '267ae931b3e22eea61494d952aad984f20c6579c8432970dc0a9a886ef3908c1',
 '29b6eaac5e5a4076d01f73ef9704e53966ea4b0713fdc2c42bcb88c8c793f92b',
 '2ce16fa639c7f00389bb618408d9529bf40030d74c6046d0565738b86ab700b9',
 '3126aa0a124750dfb4825398c73f4b90225aaa2c4c7d5755efbebdea33e17372',
 '39d1ac7e58a26d9ef994ca4e7204492145885892f650de0655d3534443d5dd28',
 '3ae2cb3e24547

#RULE 25

In [None]:
# Function to detect high volume transactions within a single day
def detect_high_volume_transactions(df):
    # Sort by customer and date
    df = df.sort_values(by=['CUST_CUSTNO', 'VALUEDATE'])

    # Create a new column for the date part only
    df['DATE_ONLY'] = df['VALUEDATE'].dt.date

    # Group by customer and date to count transactions within the day
    grouped = df.groupby(['CUST_CUSTNO', 'DATE_ONLY']).size().reset_index(name='transaction_count')

    # Filter customers with more than 300 transactions within a single day
    high_transaction_customers = grouped[grouped['transaction_count'] > 300]

    return high_transaction_customers

In [None]:
# Initialize an empty DataFrame to hold results
final_results = pd.DataFrame()

# Loop through each file, load the data, and apply the detection logic
for file in file_list:
    # Load the data for the current file using Dask
    ddf = dd.read_parquet(file)

    # Convert VALUEDATE column to datetime
    ddf['VALUEDATE'] = dd.to_datetime(ddf['VALUEDATE'], errors='coerce')

    # Compute the filtered DataFrame to get a Pandas DataFrame
    df = ddf.compute()

    # Apply the function to detect high volume transactions
    high_transaction_customers = detect_high_volume_transactions(df)

    # Append the results to the final_results DataFrame
    final_results = pd.concat([final_results, high_transaction_customers])


In [None]:

# Display final results
print("High transaction customers:")
final_results

High transaction customers:


Unnamed: 0,CUST_CUSTNO,DATE_ONLY,transaction_count
51,000f99e1d656961a9b6c86837647f0b7aae4a67b2e321d...,2023-08-01,1523
7392,090b24e00336e8d1b7b46c91dd73a1af61a50f7a0bd6e7...,2023-08-01,4267
7779,097585d1fe09d306fdd731023196eee61025c5c0011e84...,2023-08-01,1093
10921,0d687c176172c6450adcc1f064f30eabb27ff3b2af8049...,2023-08-01,634
11829,0e8f5e430f71530700dff9e1c9d3c8dbbec433de18f0d5...,2023-08-01,645
...,...,...,...
155992,f36a9f297b95791f15f5716d95743d80672253f9f13108...,2023-08-10,516
159156,f88658460c792c72605b63e3176c6c6e42fbe7cb85cf29...,2023-08-10,310
159469,f8fa376e6a86248748ff3c17be48f61bfa014bacbdf874...,2023-08-10,430
161214,fbbb29a83cb2a0403e7bda342aca93c243a9f4cb221136...,2023-08-10,817


In [None]:

# Extract unique customer IDs
unique_cust_custno = final_results['CUST_CUSTNO'].unique()

# Display the unique customer IDs
print("Unique customer IDs with more than 300 transactions in a single day:")
unique_cust_custno

Unique customer IDs with more than 300 transactions in a single day:


<ArrowStringArray>
['000f99e1d656961a9b6c86837647f0b7aae4a67b2e321d8a39bd6d7a4122b5ef',
 '090b24e00336e8d1b7b46c91dd73a1af61a50f7a0bd6e7b0693592a61d99e5d8',
 '097585d1fe09d306fdd731023196eee61025c5c0011e844473ccd6766f79c28e',
 '0d687c176172c6450adcc1f064f30eabb27ff3b2af80490a23c42c2922e45d59',
 '0e8f5e430f71530700dff9e1c9d3c8dbbec433de18f0d5683ddffb0309c6e677',
 '14668983e91e5ece2984ec789a52564533cf1a8772e84a0a8563f71f8a585b17',
 '1833f1e7251ada75d1ce72f01297fe38746e744da31ad50c8e3c147659112c00',
 '1860d0281f23ef94f964d0319158ca0ab52d8c049fc2116e6e1829373dac0431',
 '1b558f501cf59fee2d7ca7f3da379ce4489368ceb8e3301a2cd0c4cb552febcb',
 '23435e0473f7a743ea3b17b00e4f3d44bd63eda90d69b63e36144b4c9eab4c32',
 ...
 '57a484c92281dfa6b95019fcdad8a1c602cfe29c79f2706f7301c31b9f22a4c2',
 'c398d24bfa1665e4ec945a621967871a64d3667b862e66f0488be484c01d31e2',
 '1db9f923814106f1a1e2e9eaa8e12e9746224d33e24bdaf191af0d439ca74bfc',
 '7faddedccb7e3ee42a5f7b3d9b71b0e0bd77b94496bc938c1d0872dfdf3e9ab4',
 'dde987a1

#RULE 15

In [13]:

# Function to detect individual customers receiving and withdrawing money in approximately equal amounts during the day
def detect_individual_withdraw_equal_deposit(df):
    # Merge transaction data with individual customer data
    individual_df = df.merge(tb_customers_dfindividual_df[['CUST_CUSTNO']], on='CUST_CUSTNO', how='inner')

    # Convert VALUEDATE column to date
    individual_df['DATE_ONLY'] = individual_df['VALUEDATE'].dt.date

    # Calculate total credits and debits for each customer and day
    credits = individual_df[individual_df['AMOUNT'] > 0].groupby(['CUST_CUSTNO', 'DATE_ONLY'])['AMOUNT'].sum().reset_index(name='total_credits')
    debits = individual_df[individual_df['AMOUNT'] < 0].groupby(['CUST_CUSTNO', 'DATE_ONLY'])['AMOUNT'].sum().reset_index(name='total_debits')

    # Merge credits and debits
    transactions = pd.merge(credits, debits, on=['CUST_CUSTNO', 'DATE_ONLY'], how='inner')

    # Calculate proportion of debits to credits
    transactions['debit_credit_ratio'] = (transactions['total_debits'].abs() / transactions['total_credits']) * 100

    # Apply the rule conditions
    filtered_transactions = transactions[(transactions['debit_credit_ratio'] >= 95) & (transactions['total_credits'] >= 6000000)]

    return filtered_transactions

In [19]:


# Loop through each file, load the data, and apply the detection logic
for file in file_list:
    # Load the data for the current file using Dask
    ddf = dd.read_parquet(file)

    # Convert VALUEDATE column to datetime
    ddf['VALUEDATE'] = dd.to_datetime(ddf['VALUEDATE'], errors='coerce')


    # Compute the filtered DataFrame to get a Pandas DataFrame
    df = ddf.compute()

    # Apply the function to detect individual customers receiving and withdrawing money in approximately equal amounts
    result = detect_individual_withdraw_equal_deposit(df)



In [20]:
result

Unnamed: 0,CUST_CUSTNO,DATE_ONLY,total_credits,total_debits,debit_credit_ratio
1350,0462ed3bc102ac5595ac9e36153cf5d1f696a58d087ada...,2023-08-10,3.422877e+08,-3.422868e+08,99.999736
1484,04cb9a89d7c4f75ed4d0caefb7282e9f50ca87b49fbfb5...,2023-08-10,9.005810e+07,-9.000000e+07,99.935486
1799,05dbe951083c322a7cb506317fd297c29bf863048dcd0a...,2023-08-10,1.441655e+07,-1.441655e+07,100.000000
2255,0737064e5008d88e0d2299c4f3c4edcbb69ad077119662...,2023-08-10,6.218192e+06,-6.109096e+06,98.245536
3160,0a1a0d65278332ce4a0b3d982b8ee9d75bf4e6f3503279...,2023-08-10,7.055321e+06,-7.046352e+06,99.872879
...,...,...,...,...,...
75018,f3f45c0e740f423b6a7ccb8864be3aced1492248881cd7...,2023-08-10,6.218192e+06,-6.109096e+06,98.245536
76237,f7e7345140d86f403288974880f439ddcaac6ea3def6c7...,2023-08-10,6.166619e+06,-9.166619e+06,148.649020
77385,fbbb29a83cb2a0403e7bda342aca93c243a9f4cb221136...,2023-08-10,7.323000e+06,-7.383708e+06,100.829009
78050,fdf8b005bbd7c710ef862839b8b233bc8befc4bc9cdaa1...,2023-08-10,8.124559e+06,-7.979210e+06,98.210994


In [22]:

# Extract unique customer IDs
unique_cust_custno = result['CUST_CUSTNO'].unique()

# Display the unique customer IDs
print("Unique customer IDs meeting the criteria:")
unique_cust_custno


Unique customer IDs meeting the criteria:


<ArrowStringArray>
['0462ed3bc102ac5595ac9e36153cf5d1f696a58d087ada16d6cf88b80446d301',
 '04cb9a89d7c4f75ed4d0caefb7282e9f50ca87b49fbfb5554fb3ab024ac13517',
 '05dbe951083c322a7cb506317fd297c29bf863048dcd0aea9199e881e12e55b3',
 '0737064e5008d88e0d2299c4f3c4edcbb69ad07711966210e517fff02fd3817a',
 '0a1a0d65278332ce4a0b3d982b8ee9d75bf4e6f350327975176afd7ced84b910',
 '0aa7c36c5638a6fad7f8a6952ca33a2b3f3928d826e9df45c25e174c5113a026',
 '0af6add8cb3d47c1b5cb4c304d95f211dd83bfef294762db001bc481858a3f98',
 '0b5023c23745b483f339dcb5e74edca09fedbe5277ff09a1088406320d03f83b',
 '0b5e2463220a5bf272315cf498fdb182ce32788f273659e62e20000df2a3f058',
 '0d7f6ce46beb610318edf56f5724fca8b4c047d64945b0731fa41514ab275335',
 ...
 'e7cece8c7e57cbed9dd7b9c156c4919ae8e9f42cc1ca302bcfc23a02d8dd97e1',
 'e81b562097258f3d77c3d8d6ea6e1cb3c7e639fef6f6463e70b3eb0387762d1d',
 'ec198f83d1c6fd222bd1047faff05075f574d2a5f4d555b3f5122f32995a15a8',
 'ec40a44d9b1cb971136271f71c09b47c9bba72b5c9d9f47ceb269a0f306ae438',
 'f0e9c449