In [1]:
import numpy as np
import pandas as pd

In [2]:
from warnings import filterwarnings
# NOTE(review): this installs a process-wide "ignore" filter with no category or
# module restriction, so every warning raised after this cell is silenced --
# including pandas diagnostics such as SettingWithCopyWarning and deprecation
# notices. Consider narrowing it (category=..., module=...) once the noisy
# source is known.
filterwarnings('ignore')

In [3]:
# Load the filtered transactions table from its Parquet file
filtered_transactions = pd.read_parquet(path='filtered_transactions.parquet')
# Text preview of the first five rows
print(filtered_transactions.head(n=5))


                                         CUST_CUSTNO  VALUEDATE  \
0  00001fd6f1d8852373810b5402eb3ae6adbd610d9d3608... 2023-08-05   
1  00001fd6f1d8852373810b5402eb3ae6adbd610d9d3608... 2023-08-06   
2  00001fd6f1d8852373810b5402eb3ae6adbd610d9d3608... 2023-08-10   
3  00001fd6f1d8852373810b5402eb3ae6adbd610d9d3608... 2023-08-11   
4  00001fd6f1d8852373810b5402eb3ae6adbd610d9d3608... 2023-08-13   

   TOTAL_RECEIVED  TOTAL_SENT  TOTAL_ABSOLUTE  
0          1500.0      -300.0          1800.0  
1             0.0     -1000.0          1000.0  
2         12717.6     -2063.0         14780.6  
3             0.0     -3439.0          3439.0  
4             0.0     -2500.0          2500.0  


In [4]:
# Rule-based anomalies: keep rows where the received total is at least
# 6,000,000 and the absolute sent total is at least 95% of the received total.
rule_15_anomalies = filtered_transactions.loc[
    lambda txn: txn['TOTAL_RECEIVED'].ge(6000000)
    & txn['TOTAL_SENT'].abs().ge(txn['TOTAL_RECEIVED'].mul(0.95))
]
rule_15_anomalies

Unnamed: 0,CUST_CUSTNO,VALUEDATE,TOTAL_RECEIVED,TOTAL_SENT,TOTAL_ABSOLUTE
702,0002270df70734f8d04c42004d9995e8b97fe3ff9de575...,2023-11-08,6925150.00,-6924000.00,13849150.00
2170,0004abe134ca2b7af66656ecc3232a24a1167e71aa2bd9...,2023-09-23,8000493.15,-8000008.22,16000501.37
4095,000975b0ee367856502b174994023821120d5730b3fcd3...,2023-08-22,10469355.99,-10051700.46,20521056.45
4102,000975b0ee367856502b174994023821120d5730b3fcd3...,2023-08-29,7179280.10,-7513252.10,14692532.20
4142,000975b0ee367856502b174994023821120d5730b3fcd3...,2023-10-10,7609562.68,-9133854.50,16743417.18
...,...,...,...,...,...
28979127,ffdc5cead1c3798acfabfbc6eb7646e4b5d9e540f35ee3...,2023-09-30,12854001.29,-12512900.03,25366901.32
28985946,ffe85c70096ca3b80d36d42577c9c12eb0ed1ad9508851...,2023-09-19,6012400.00,-6012400.00,12024800.00
28988045,ffecb0aaab47e4d92ac4176cdee0c3d9dd4617216035c8...,2023-08-31,6242683.78,-6231263.79,12473947.57
28988705,ffed5808f5090ac8cfa75f590ff0874f5787cde1784b40...,2023-12-29,9738344.66,-11274259.32,21012603.98


In [6]:
# Summary: row count of the full table, distinct customers overall, and
# distinct customers with at least one rule-based anomaly row.
print(
    f"Filtered transactions: {len(filtered_transactions)} rows",
    f"Remaining accounts: {filtered_transactions['CUST_CUSTNO'].nunique()}",
    f"Anomaly accounts: {rule_15_anomalies['CUST_CUSTNO'].nunique()}",
    sep="\n",
)


Filtered transactions: 28997435 rows
Remaining accounts: 459915
Anomaly accounts: 8580


In [7]:
# Sampling configuration, kept in this cell so it stays self-contained.
TOP_N_HIGH_ANOMALY = 2500   # accounts taken from the top of the anomaly-ratio ranking
N_STRATIFIED_OTHER = 2500   # total budget for the stratified random sample of the rest
N_VOLUME_BUCKETS = 5        # requested quantile buckets over total_transactions
SAMPLE_SEED = 42            # fixed seed so the sample (and the files saved from it) is reproducible

# Step 1: Restrict the transaction table to customers that have at least one
# anomaly row, so the per-customer totals and anomaly counts cover the same set.
relevant_customers = rule_15_anomalies['CUST_CUSTNO'].unique()
filtered_total_transactions = filtered_transactions[
    filtered_transactions['CUST_CUSTNO'].isin(relevant_customers)
]

# Rows per customer: anomaly rows and all rows.
anomaly_counts = rule_15_anomalies['CUST_CUSTNO'].value_counts()
total_counts = filtered_total_transactions['CUST_CUSTNO'].value_counts()

# One row per customer; the two Series are aligned on the customer id index.
account_stats = pd.DataFrame({
    'total_transactions': total_counts,
    'anomaly_transactions': anomaly_counts
}).fillna(0)

# Share of each customer's rows that were flagged.
account_stats['anomaly_ratio'] = (
    account_stats['anomaly_transactions'] / account_stats['total_transactions']
)

# Step 2: Rank accounts by anomaly ratio (highest first) and take the top slice.
high_anomaly_accounts = account_stats.sort_values(by='anomaly_ratio', ascending=False)

num_high_anomaly = min(TOP_N_HIGH_ANOMALY, len(high_anomaly_accounts))
top_high_anomaly = high_anomaly_accounts.head(num_high_anomaly).index

# Step 3: Stratify the accounts that were not picked above by transaction volume.
# .copy() makes this an independent frame, so adding the 'bucket' column below
# does not write into a slice of high_anomaly_accounts.
remaining_accounts = high_anomaly_accounts.iloc[num_high_anomaly:].copy()

# Default to an empty selection; this also covers the case where every anomaly
# account already fits in the top slice (pd.qcut cannot bin an empty Series).
# The name is kept for compatibility; the sample size is N_STRATIFIED_OTHER.
random_500_other = remaining_accounts.index[:0]

if not remaining_accounts.empty:
    remaining_accounts['bucket'] = pd.qcut(
        remaining_accounts['total_transactions'],
        q=N_VOLUME_BUCKETS,
        duplicates='drop',  # tied quantile edges collapse, so fewer buckets may result
    )

    # observed=True iterates only over buckets that actually contain accounts.
    bucket_groups = [
        bucket_rows
        for _, bucket_rows in remaining_accounts.groupby('bucket', observed=True)
    ]

    if bucket_groups:
        # Split the budget evenly over the buckets that exist (computed once,
        # not once per group); a bucket smaller than its quota is taken whole.
        per_bucket_quota = N_STRATIFIED_OTHER // len(bucket_groups)
        random_500_other = pd.concat([
            bucket_rows.sample(
                n=min(per_bucket_quota, len(bucket_rows)),
                replace=False,
                random_state=SAMPLE_SEED,
            )
            for bucket_rows in bucket_groups
        ]).index

# Step 4: Combine selected accounts and filter transactions
selected_accounts = np.concatenate((top_high_anomaly, random_500_other))

sample_anomalies = rule_15_anomalies[rule_15_anomalies['CUST_CUSTNO'].isin(selected_accounts)]
sample_initial_data = filtered_total_transactions[
    filtered_total_transactions['CUST_CUSTNO'].isin(selected_accounts)
]

# Summary
print(f"Total selected accounts: {len(selected_accounts)}")
print(f"Total anomalies for selected accounts: {sample_anomalies.shape[0]}")
print(f"Total transactions for selected accounts: {sample_initial_data.shape[0]}")

# Preview
print("\nSample transactions from selected accounts:")
print(sample_initial_data.head())

print("\nSample anomalies from selected accounts:")
print(sample_anomalies.head())


Total selected accounts: 5000
Total anomalies for selected accounts: 15145
Total transactions for selected accounts: 255198

Sample transactions from selected accounts:
                                           CUST_CUSTNO  VALUEDATE  \
665  0002270df70734f8d04c42004d9995e8b97fe3ff9de575... 2023-08-02   
666  0002270df70734f8d04c42004d9995e8b97fe3ff9de575... 2023-08-04   
667  0002270df70734f8d04c42004d9995e8b97fe3ff9de575... 2023-08-05   
668  0002270df70734f8d04c42004d9995e8b97fe3ff9de575... 2023-08-07   
669  0002270df70734f8d04c42004d9995e8b97fe3ff9de575... 2023-08-11   

     TOTAL_RECEIVED  TOTAL_SENT  TOTAL_ABSOLUTE  
665          8000.0         0.0          8000.0  
666           986.0     -7000.0          7986.0  
667          3595.0     -2500.0          6095.0  
668         30111.0    -10000.0         40111.0  
669         33895.0    -37700.0         71595.0  

Sample anomalies from selected accounts:
                                             CUST_CUSTNO  VALUEDATE  \
702

In [8]:
# Write both sampled tables to Parquet without the row index:
# the selected accounts' transactions first, then their anomaly rows.
for frame, target in (
    (sample_initial_data, 'data_mix_5000.parquet'),
    (sample_anomalies, 'anomaly_mix_5000.parquet'),
):
    frame.to_parquet(target, index=False)

# Confirmation message
print("Initial data and anomaly data have been saved to Parquet files.")


Initial data and anomaly data have been saved to Parquet files.
