In [106]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from datetime import datetime

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [4]:
abm = pd.read_csv("data/abm.csv")
card = pd.read_csv("data/card.csv")
cheque = pd.read_csv("data/cheque.csv")
eft = pd.read_csv("data/eft.csv")
emt = pd.read_csv("data/emt.csv")
wire = pd.read_csv("data/wire.csv")

kyc = pd.read_csv("data/kyc.csv")

1. Wire Transfers
Advantages: Ability to handle large money transfers, especially for international transactions.
Use: Due to its fast and international nature, wire transfers are a very popular method of money laundering, especially when it involves transferring funds to offshore accounts.

2. EMT (Email Transfer)
Advantages: Easy to operate, high popularity, usually used for smaller daily transactions.
Use: May be used for smaller money laundering operations, or as part of a "layering" strategy to obfuscate the flow of funds through frequent small transactions.

3. EFT (Electronic Funds Transfer)
Pros: Covers a variety of forms from direct deposit to automatic deductions and is suitable for domestic and international transactions.
Use: Similar to wire transfers, electronic funds transfers can be used for large-scale fund movements, especially when combined with multiple accounts and cross-border transactions.

4. ABM (Automated Banking Machine)
Advantages: Provides cash access services, users can anonymously conduct certain types of transactions.
Use: Although the transaction amount is limited, it can be used to deposit illegal cash and then make other forms of transfers.

5. Cheques (cheques)
Advantages: Traditional payment methods, used for both legal and illegal purposes have an extensive history.
Purpose: To launder funds through checks, especially by setting up shell companies to issue or receive checks.
Money laundering strategies:
Layering: Multiple transfers between the source and final destination of funds, using different methods and tools at each step.
Structuring: Make multiple small transactions to avoid triggering financial institutions' reporting thresholds.
Mixed use: By cross-using wire transfers, EFTs, checks, and other tools, money launderers try to make tracing more difficult.

EFT & Wire

EMT

ABM 

Cheque

# ABM


### 1. single transaction > 20k
### 2. single cash transaction > 10k

In [44]:
abm = abm.fillna('unknown')
abm.head(10)

Unnamed: 0,abm_id,customer_id,amount_cad,debit_credit,cash_indicator,country,province,city,transaction_date,transaction_time
0,ABM00000000000000006,SYNCID0000000014,25.41,credit,False,CA,SK,REGINA,2022-11-16,17:37:41
1,ABM00000000000000008,SYNCID0000000034,238.17,debit,True,CA,ON,TORONTO,2022-11-18,10:22:59
2,ABM00000000000000009,SYNCID0000000034,1655.43,credit,False,CA,unknown,other,2022-12-29,11:56:08
3,ABM00000000000000010,SYNCID0000000034,620.69,credit,True,CA,unknown,other,2023-01-22,16:48:12
4,ABM00000000000000011,SYNCID0000000034,323.7,debit,True,CA,unknown,other,2022-11-14,13:24:45
5,ABM00000000000000012,SYNCID0000000034,6876.76,credit,False,CA,ON,VAUGHAN,2022-12-11,11:55:50
6,ABM00000000000000013,SYNCID0000000038,104.55,debit,True,CA,unknown,other,2022-12-28,18:07:36
7,ABM00000000000000014,SYNCID0000000046,111.91,debit,True,CA,AB,EDMONTON,2022-11-14,22:58:02
8,ABM00000000000000015,SYNCID0000000046,193.76,debit,True,unknown,unknown,unknown,2022-11-26,10:50:41
9,ABM00000000000000016,SYNCID0000000046,217.0,debit,True,unknown,unknown,unknown,2023-01-06,10:26:31


In [43]:
# Strange Customer Example
abm[abm["customer_id"] == "SYNCID0000016882"]

Unnamed: 0,abm_id,customer_id,amount_cad,debit_credit,cash_indicator,country,province,city,transaction_date,transaction_time
20380,ABM00000000000021506,SYNCID0000016882,153.8,debit,True,,,,2023-01-17,06:14:52
20381,ABM00000000000021507,SYNCID0000016882,1937.08,credit,False,CA,ON,VAUGHAN,2022-11-21,13:01:01
20382,ABM00000000000021508,SYNCID0000016882,23375.23,credit,False,CA,ON,TORONTO,2022-12-16,17:45:51
20383,ABM00000000000021509,SYNCID0000016882,2039.37,credit,False,CA,ON,HALTONHLLS,2022-11-03,09:49:13
20384,ABM00000000000021510,SYNCID0000016882,52402.54,credit,False,other,,other,2022-11-15,12:14:52
20385,ABM00000000000021511,SYNCID0000016882,1058.62,debit,True,CA,ON,PETERBOROUGH,2022-11-18,17:21:28
20386,ABM00000000000021512,SYNCID0000016882,422.46,debit,True,CA,,other,2023-01-12,19:35:40
20387,ABM00000000000021513,SYNCID0000016882,3530.5,credit,False,CA,SK,SASKATOON,2022-12-05,08:18:41
20388,ABM00000000000021514,SYNCID0000016882,210.77,debit,True,other,,other,2022-11-22,09:23:01


In [54]:
abm[abm["customer_id"] == "SYNCID0000013414"]

Unnamed: 0,abm_id,customer_id,amount_cad,debit_credit,cash_indicator,country,province,city,transaction_date,transaction_time
15896,ABM00000000000016694,SYNCID0000013414,53.52,debit,True,unknown,unknown,unknown,2022-12-07,18:36:32
15897,ABM00000000000016695,SYNCID0000013414,66.73,debit,True,unknown,unknown,unknown,2023-01-13,12:40:26
15898,ABM00000000000016696,SYNCID0000013414,428.17,credit,False,CA,QC,STJEROME,2022-11-09,11:51:05
15899,ABM00000000000016697,SYNCID0000013414,1083.14,debit,True,CA,ON,NIAGARA FLS,2022-11-23,14:17:30
15900,ABM00000000000016698,SYNCID0000013414,860.22,debit,True,CA,NS,HALIFAX,2022-12-25,14:59:11
15901,ABM00000000000016699,SYNCID0000013414,1026.54,debit,True,CA,ON,STCATHARINES,2022-11-01,15:00:33
15902,ABM00000000000016700,SYNCID0000013414,66.65,debit,True,CA,ON,WOODBRIDGE,2022-12-04,15:14:33
15903,ABM00000000000016701,SYNCID0000013414,430.41,credit,True,CA,ON,TORONTO,2022-11-27,21:12:57
15904,ABM00000000000016702,SYNCID0000013414,1039.85,debit,True,CA,ON,NEPEAN,2023-01-24,16:43:26
15905,ABM00000000000016703,SYNCID0000013414,1070.11,debit,True,CA,BC,BURNABY,2022-12-02,11:32:11


In [149]:
abm[(abm["cash_indicator"] == True) & (abm["amount_cad"] > 10000) & (abm["debit_credit"] == "credit")]

Unnamed: 0,abm_id,customer_id,amount_cad,debit_credit,cash_indicator,country,province,city,transaction_date,transaction_time
236,ABM00000000000000246,SYNCID0000000129,11594.03,credit,True,CA,BC,RICHMOND,2023-01-12,15:02:42
1619,ABM00000000000001646,SYNCID0000001239,84313.55,credit,True,CA,NL,STJOHNS,2022-11-21,11:43:56
3133,ABM00000000000003228,SYNCID0000002454,48467.35,credit,True,CA,unknown,other,2023-01-09,18:09:40
4304,ABM00000000000004445,SYNCID0000003184,83187.94,credit,True,CA,ON,HAMILTON,2023-01-05,14:44:51
4939,ABM00000000000005124,SYNCID0000003869,19819.36,credit,True,CA,ON,BURLINGTON,2023-01-03,14:03:20
5481,ABM00000000000005702,SYNCID0000004436,10896.53,credit,True,CA,ON,KINGSTON,2023-01-10,13:31:00
6182,ABM00000000000006413,SYNCID0000004924,18490.76,credit,True,CA,NS,AMHERST,2022-11-09,11:58:43
7568,ABM00000000000007871,SYNCID0000005977,76197.14,credit,True,CA,ON,GUELPH,2023-01-05,14:44:26
7946,ABM00000000000008304,SYNCID0000006238,81588.12,credit,True,CA,NS,DARTMOUTH,2022-11-30,12:44:09
8257,ABM00000000000008619,SYNCID0000006475,52310.36,credit,True,other,unknown,other,2023-01-10,17:40:28


In [53]:
hi = abm[(abm["country"] == 'unknown') & (abm["amount_cad"] > 10000)]
hi["customer_id"].unique()

array(['SYNCID0000013414'], dtype=object)

# Card

In [34]:
card.head(10)

Unnamed: 0,card_trxn_id,customer_id,amount_cad,debit_credit,merchant_category,ecommerce_ind,country,province,city,transaction_date,transaction_time
0,CON00000000000000000,SYNCID0000000001,60.3,debit,other,False,CA,AB,CALGARY,2022-12-20,17:14:40
1,CON00000000000000001,SYNCID0000000001,1.72,debit,4121,True,,,other,2022-11-02,13:28:57
2,CON00000000000000002,SYNCID0000000001,15.97,debit,other,False,,,other,2022-11-29,10:55:08
3,CON00000000000000003,SYNCID0000000001,213.4,debit,5542,False,CA,NB,MONCTON,2022-11-21,09:15:48
4,CON00000000000000077,SYNCID0000000004,-137.42,credit,7399,False,CA,QC,VERDUN,2022-12-02,17:56:27
5,CON00000000000000078,SYNCID0000000004,182.91,debit,5541,False,,,other,2023-01-09,10:51:40
6,CON00000000000000079,SYNCID0000000004,162.5,debit,5541,False,,,,2023-01-10,16:21:44
7,CON00000000000000080,SYNCID0000000004,35.04,debit,5411,False,,,other,2022-11-22,17:06:24
8,CON00000000000000081,SYNCID0000000004,43.02,debit,5734,False,CA,AB,,2023-01-28,07:44:14
9,CON00000000000000082,SYNCID0000000004,72.91,debit,5814,False,,,other,2023-01-03,19:01:21


In [28]:
card[card["amount_cad"] > 10000 ]

KeyError: 'cash_indicator'

# cheque


### 1. An anomaly in the transaction amount > 150k
### 2. high frequency high-value transaction (> 8 times / month, amount > 10k)

In [38]:
cheque.head(10)

Unnamed: 0,cheque_id,customer_id,amount_cad,debit_credit,transaction_date
0,CHE00000000000000000,SYNCID0000000000,415.24,debit,2022-11-08
1,CHE00000000000000001,SYNCID0000000002,564.95,debit,2023-01-16
2,CHE00000000000000002,SYNCID0000000002,573.46,debit,2023-01-16
3,CHE00000000000000003,SYNCID0000000002,3771.6,debit,2022-11-25
4,CHE00000000000000004,SYNCID0000000002,1661.76,credit,2023-01-24
5,CHE00000000000000005,SYNCID0000000002,72621.39,debit,2022-12-19
6,CHE00000000000000006,SYNCID0000000002,1597.38,credit,2022-11-16
7,CHE00000000000000007,SYNCID0000000002,406.7,debit,2023-01-11
8,CHE00000000000000008,SYNCID0000000002,434.57,credit,2022-11-14
9,CHE00000000000000009,SYNCID0000000002,985.27,credit,2022-12-15


In [142]:
# cheque type 1
cheque[cheque["amount_cad"] > 150000]

Unnamed: 0,cheque_id,customer_id,amount_cad,debit_credit,transaction_date
233,CHE00000000000000233,SYNCID0000000013,522244.13,debit,2023-01-24
254,CHE00000000000000254,SYNCID0000000013,1721280.34,credit,2022-11-30
305,CHE00000000000000305,SYNCID0000000019,600016.68,credit,2023-01-01
594,CHE00000000000000705,SYNCID0000000047,688236.05,debit,2022-11-14
1098,CHE00000000000001215,SYNCID0000000075,232576.73,debit,2023-01-05
...,...,...,...,...,...
262521,CHE00000000000276256,SYNCID0000017139,5019526.59,credit,2022-11-08
262539,CHE00000000000276274,SYNCID0000017139,321633.93,debit,2023-01-04
262556,CHE00000000000276291,SYNCID0000017139,706714.17,debit,2022-12-19
262676,CHE00000000000276411,SYNCID0000017152,220663.12,debit,2023-01-05


In [82]:
# cheque type 2
large_cheque = cheque[cheque['amount_cad'] > 10000]
customer_counts = large_cheque['customer_id'].value_counts()
large_cheque['transaction_date1'] = pd.to_datetime(large_cheque['transaction_date'])
large_cheque.set_index('transaction_date1', inplace=True)
transaction_frequency = large_cheque.groupby('customer_id').resample('M').size()


transaction_frequency = transaction_frequency.reset_index()
transaction_frequency.columns = ['customer_id', 'month', 'transaction_count']
high_frequency_customers = transaction_frequency[transaction_frequency['transaction_count'] > 8]

print(high_frequency_customers)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  large_cheque['transaction_date1'] = pd.to_datetime(large_cheque['transaction_date'])
  transaction_frequency = large_cheque.groupby('customer_id').resample('M').size()


           customer_id      month  transaction_count
23    SYNCID0000000047 2022-11-30                 12
25    SYNCID0000000047 2023-01-31                 12
166   SYNCID0000000319 2022-12-31                  9
203   SYNCID0000000386 2022-11-30                 13
204   SYNCID0000000386 2022-12-31                 18
...                ...        ...                ...
9642  SYNCID0000017095 2023-01-31                 13
9653  SYNCID0000017116 2022-11-30                 10
9675  SYNCID0000017137 2022-11-30                 42
9676  SYNCID0000017137 2022-12-31                 30
9677  SYNCID0000017137 2023-01-31                 19

[450 rows x 3 columns]


# eft

### 1. An anomaly in the transaction amount > 100k
### 2. large transaction (>= 50k) record >= 10
### 3. large transaction (>= 50k) at night / weekend

In [39]:
eft.head(10)

Unnamed: 0,eft_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time
0,EFT00000000000000032,SYNCID0000000005,60607.55,debit,2023-01-19,09:40:38
1,EFT00000000000000033,SYNCID0000000005,18.59,credit,2022-11-21,16:41:50
2,EFT00000000000000034,SYNCID0000000005,85.89,credit,2022-11-07,09:40:40
3,EFT00000000000000035,SYNCID0000000005,43.22,credit,2022-12-16,16:42:43
4,EFT00000000000000036,SYNCID0000000005,82.05,credit,2023-01-20,08:19:41
5,EFT00000000000000037,SYNCID0000000005,51.94,credit,2022-12-07,09:41:37
6,EFT00000000000000038,SYNCID0000000005,45.82,debit,2022-11-04,17:00:31
7,EFT00000000000000039,SYNCID0000000005,79.99,credit,2022-11-04,22:54:10
8,EFT00000000000000040,SYNCID0000000005,218.06,credit,2022-11-21,16:41:51
9,EFT00000000000000041,SYNCID0000000005,852.5,credit,2023-01-13,01:44:37


In [104]:
# strange customer
eft[eft["customer_id"] == "SYNCID0000004105"]

Unnamed: 0,eft_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time
122917,EFT00000000000126896,SYNCID0000004105,4560.48,debit,2023-01-10,17:08:26
122918,EFT00000000000126897,SYNCID0000004105,12.53,debit,2022-12-05,10:25:03
122919,EFT00000000000126898,SYNCID0000004105,3578.35,credit,2023-01-18,21:12:32
122920,EFT00000000000126899,SYNCID0000004105,192.57,debit,2022-11-04,11:01:08
122921,EFT00000000000126900,SYNCID0000004105,53553.93,credit,2022-11-02,21:11:38
...,...,...,...,...,...,...
125201,EFT00000000000129180,SYNCID0000004105,7974.53,debit,2022-12-19,21:31:09
125202,EFT00000000000129181,SYNCID0000004105,377.10,debit,2022-11-18,12:51:05
125203,EFT00000000000129182,SYNCID0000004105,937.20,debit,2023-01-11,21:27:54
125204,EFT00000000000129183,SYNCID0000004105,1761.81,credit,2023-01-09,16:41:55


In [103]:
# eft type 1
eft[eft["amount_cad"] > 100000]

Unnamed: 0,eft_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time
148,EFT00000000000000180,SYNCID0000000015,312788.86,credit,2022-12-13,16:41:54
584,EFT00000000000000631,SYNCID0000000036,262342.21,debit,2022-11-01,09:42:45
914,EFT00000000000000961,SYNCID0000000043,112955.01,credit,2023-01-04,09:41:59
1045,EFT00000000000001092,SYNCID0000000048,342315.47,debit,2022-12-07,21:27:36
1062,EFT00000000000001109,SYNCID0000000048,123327.93,credit,2022-11-28,16:41:45
...,...,...,...,...,...,...
487630,EFT00000000000503046,SYNCID0000017151,182928.98,debit,2022-11-25,21:37:23
487796,EFT00000000000503212,SYNCID0000017161,114270.24,debit,2023-01-30,15:13:52
487973,EFT00000000000503389,SYNCID0000017165,357622.07,credit,2023-01-12,09:41:01
488372,EFT00000000000503788,SYNCID0000017179,126153.47,debit,2022-12-20,12:24:01


In [98]:
# eft type 2
eft_50k = eft[eft["amount_cad"] > 50000]

In [105]:
transaction_counts = eft_50k['customer_id'].value_counts()
high_transaction_customers = transaction_counts[transaction_counts > 10]
print(high_transaction_customers)

customer_id
SYNCID0000004105    65
SYNCID0000008526    56
SYNCID0000012928    55
SYNCID0000011351    47
SYNCID0000009036    39
                    ..
SYNCID0000007663    11
SYNCID0000008611    11
SYNCID0000004346    11
SYNCID0000016926    11
SYNCID0000015608    11
Name: count, Length: 91, dtype: int64


In [None]:
# eft type 3
eft['transaction_date1'] = pd.to_datetime(eft['transaction_date'])
eft['is_weekday'] = eft['transaction_date1'].apply(lambda x: x.weekday() < 5)

In [116]:
from datetime import datetime, time

eft["transaction_time1"] = pd.to_datetime(eft['transaction_time'], format='%H:%M:%S').dt.time
start_time = time(23, 0)  # 23:00
end_time = time(6, 0)  # 06:00 
eft['rest_time'] = eft["transaction_time1"].apply(lambda x: x >= start_time or x < end_time)

In [117]:
eft_50k = eft[eft["amount_cad"] > 50000]
eft_50k[(eft_50k["rest_time"] == True) | (eft_50k["is_weekday"] == False)]

Unnamed: 0,eft_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time,transaction_date1,is_weekday,transaction_time1,rest_time
999,EFT00000000000001046,SYNCID0000000043,96377.37,credit,2022-11-19,21:11:15,2022-11-19,False,21:11:15,False
1831,EFT00000000000001881,SYNCID0000000096,75849.71,debit,2023-01-01,21:11:47,2023-01-01,False,21:11:47,False
9760,EFT00000000000010141,SYNCID0000000357,112436.64,debit,2022-11-15,04:01:42,2022-11-15,True,04:01:42,True
10424,EFT00000000000010811,SYNCID0000000384,54756.00,debit,2023-01-27,04:13:51,2023-01-27,True,04:13:51,True
11053,EFT00000000000011440,SYNCID0000000397,53842.11,debit,2022-11-30,04:18:47,2022-11-30,True,04:18:47,True
...,...,...,...,...,...,...,...,...,...,...
483171,EFT00000000000497425,SYNCID0000016991,230740.52,credit,2023-01-01,10:30:38,2023-01-01,False,10:30:38,False
484919,EFT00000000000499314,SYNCID0000017075,50123.33,debit,2023-01-30,04:12:27,2023-01-30,True,04:12:27,True
485661,EFT00000000000501062,SYNCID0000017091,110003.63,debit,2023-01-31,04:22:13,2023-01-31,True,04:22:13,True
487894,EFT00000000000503310,SYNCID0000017163,81016.14,debit,2022-12-25,21:50:34,2022-12-25,False,21:50:34,False


# emt

### 1. large transaction (>= 10k) at night / weekend
### 2. high frequency mid-value transaction (> 30 times/month, amount > 1k)
### 3. high frequenct small-value transaction(> 100 times/month. amount < 1k)

In [40]:
emt.head(10)

Unnamed: 0,emt_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time
0,EMT00000000000000213,SYNCID0000000007,513.46,C,2022-11-15,14:14:07
1,EMT00000000000000214,SYNCID0000000019,631.11,D,2023-01-04,15:10:21
2,EMT00000000000000215,SYNCID0000000019,5281.87,D,2022-12-29,14:11:32
3,EMT00000000000000216,SYNCID0000000019,209.17,C,2022-12-11,17:13:30
4,EMT00000000000000217,SYNCID0000000019,948.98,D,2023-01-01,19:17:51
5,EMT00000000000000218,SYNCID0000000022,31.84,C,2022-12-02,22:12:33
6,EMT00000000000000219,SYNCID0000000022,167.22,D,2023-01-14,16:22:51
7,EMT00000000000000220,SYNCID0000000022,747.93,C,2022-11-09,19:11:54
8,EMT00000000000000221,SYNCID0000000022,78.03,C,2022-12-08,10:36:39
9,EMT00000000000000222,SYNCID0000000022,216.08,C,2022-12-16,01:23:28


In [None]:
# emt type 1
emt['transaction_date1'] = pd.to_datetime(emt['transaction_date'])
emt['is_weekday'] = emt['transaction_date1'].apply(lambda x: x.weekday() < 5)


emt["transaction_time1"] = pd.to_datetime(emt['transaction_time'], format='%H:%M:%S').dt.time
start_time = time(23, 0)  # 23:00
end_time = time(6, 0)  # 06:00 
emt['rest_time'] = emt["transaction_time1"].apply(lambda x: x >= start_time or x < end_time)

Unnamed: 0,emt_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time,transaction_date1,is_weekday,transaction_time1,rest_time
0,EMT00000000000000213,SYNCID0000000007,513.46,C,2022-11-15,14:14:07,2022-11-15,True,14:14:07,False
1,EMT00000000000000214,SYNCID0000000019,631.11,D,2023-01-04,15:10:21,2023-01-04,True,15:10:21,False
2,EMT00000000000000215,SYNCID0000000019,5281.87,D,2022-12-29,14:11:32,2022-12-29,True,14:11:32,False
3,EMT00000000000000216,SYNCID0000000019,209.17,C,2022-12-11,17:13:30,2022-12-11,False,17:13:30,False
4,EMT00000000000000217,SYNCID0000000019,948.98,D,2023-01-01,19:17:51,2023-01-01,False,19:17:51,False
...,...,...,...,...,...,...,...,...,...,...
161216,EMT00000000000168667,SYNCID0000017180,181.55,D,2023-01-02,21:45:21,2023-01-02,True,21:45:21,False
161217,EMT00000000000168668,SYNCID0000017180,52.08,C,2022-12-21,00:39:57,2022-12-21,True,00:39:57,True
161218,EMT00000000000168669,SYNCID0000017180,138.44,D,2022-11-22,17:14:36,2022-11-22,True,17:14:36,False
161219,EMT00000000000168670,SYNCID0000017180,335.38,C,2022-12-22,03:46:26,2022-12-22,True,03:46:26,True


In [126]:
emt_50k = emt[emt["amount_cad"] >= 10000]
emt_50k[(emt_50k["is_weekday"] == False) | (emt_50k["rest_time"] == True)]

Unnamed: 0,emt_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time,transaction_date1,is_weekday,transaction_time1,rest_time
123,EMT00000000000000336,SYNCID0000000038,10467.19,D,2023-01-22,16:33:00,2023-01-22,False,16:33:00,False
823,EMT00000000000001036,SYNCID0000000106,10150.10,D,2023-01-08,12:01:53,2023-01-08,False,12:01:53,False
1670,EMT00000000000001927,SYNCID0000000213,10640.04,C,2022-12-04,14:49:49,2022-12-04,False,14:49:49,False
2690,EMT00000000000002954,SYNCID0000000318,10386.24,D,2022-11-25,03:56:27,2022-11-25,True,03:56:27,True
3452,EMT00000000000003748,SYNCID0000000456,10615.98,C,2022-12-22,23:11:36,2022-12-22,True,23:11:36,True
...,...,...,...,...,...,...,...,...,...,...
158176,EMT00000000000165582,SYNCID0000016798,10168.82,C,2022-12-18,07:20:24,2022-12-18,False,07:20:24,False
158630,EMT00000000000166037,SYNCID0000016858,10439.44,D,2023-01-21,20:50:20,2023-01-21,False,20:50:20,False
159038,EMT00000000000166445,SYNCID0000016901,10444.09,C,2022-12-24,19:43:06,2022-12-24,False,19:43:06,False
160291,EMT00000000000167729,SYNCID0000017053,19887.01,D,2022-12-18,14:53:44,2022-12-18,False,14:53:44,False


In [137]:
# emt type 2
mid_emt = emt[emt['amount_cad'] > 1000]
customer_counts = mid_emt['customer_id'].value_counts()

mid_emt.set_index('transaction_date1', inplace=True)
transaction_frequency = mid_emt.groupby('customer_id').resample('M').size()

transaction_frequency = transaction_frequency.reset_index()
transaction_frequency.columns = ['customer_id', 'month', 'transaction_count']
high_frequency_customers = transaction_frequency[transaction_frequency['transaction_count'] > 30]

print(high_frequency_customers)

  transaction_frequency = mid_emt.groupby('customer_id').resample('M').size()


            customer_id      month  transaction_count
499    SYNCID0000000619 2022-11-30                 98
500    SYNCID0000000619 2022-12-31                118
501    SYNCID0000000619 2023-01-31                 33
524    SYNCID0000000648 2022-12-31                 35
561    SYNCID0000000692 2022-11-30                 36
...                 ...        ...                ...
12082  SYNCID0000015019 2022-11-30                 32
12083  SYNCID0000015019 2022-12-31                 34
12976  SYNCID0000016130 2022-11-30                 42
12977  SYNCID0000016130 2022-12-31                 38
13330  SYNCID0000016536 2022-11-30                 41

[76 rows x 3 columns]


In [148]:
# emt type 3
small_emt = emt[emt['amount_cad'] <= 1000]
customer_counts = small_emt['customer_id'].value_counts()

small_emt.set_index('transaction_date1', inplace=True)
transaction_frequency = small_emt.groupby('customer_id').resample('M').size()

transaction_frequency = transaction_frequency.reset_index()
transaction_frequency.columns = ['customer_id', 'month', 'transaction_count']
high_frequency_customers = transaction_frequency[transaction_frequency['transaction_count'] > 100]

print(high_frequency_customers)

  transaction_frequency = small_emt.groupby('customer_id').resample('M').size()


            customer_id      month  transaction_count
612    SYNCID0000000619 2022-11-30                201
613    SYNCID0000000619 2022-12-31                207
687    SYNCID0000000692 2022-11-30                159
688    SYNCID0000000692 2022-12-31                179
881    SYNCID0000000871 2022-11-30                102
1326   SYNCID0000001356 2022-11-30                134
1327   SYNCID0000001356 2022-12-31                117
1386   SYNCID0000001413 2022-11-30               1043
1387   SYNCID0000001413 2022-12-31               1629
1388   SYNCID0000001413 2023-01-31                139
2466   SYNCID0000002454 2022-11-30                101
3270   SYNCID0000003294 2022-11-30                487
3271   SYNCID0000003294 2022-12-31                464
3689   SYNCID0000003626 2022-11-30                132
3690   SYNCID0000003626 2022-12-31                138
3754   SYNCID0000003681 2022-11-30                144
3755   SYNCID0000003681 2022-12-31                147
3874   SYNCID0000003802 2022

# Wire

# large amount > 100k

In [37]:
wire.head(10)

Unnamed: 0,wire_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time
0,WIR00000000000000000,SYNCID0000000000,6316.04,debit,2022-11-18,00:00:00
1,WIR00000000000000001,SYNCID0000000000,0.03,credit,2022-12-31,00:00:00
2,WIR00000000000000005,SYNCID0000000038,67268.48,credit,2022-12-07,00:00:00
3,WIR00000000000000006,SYNCID0000000055,6354.06,credit,2022-12-07,00:00:00
4,WIR00000000000000007,SYNCID0000000061,7217.61,credit,2023-01-10,00:00:00
5,WIR00000000000000008,SYNCID0000000063,262017.1,debit,2022-12-23,00:00:00
6,WIR00000000000000009,SYNCID0000000079,8899.15,debit,2023-01-26,00:00:00
7,WIR00000000000000010,SYNCID0000000097,11249.32,debit,2023-01-27,00:00:00
8,WIR00000000000000011,SYNCID0000000097,986.71,debit,2023-01-26,00:00:00
9,WIR00000000000000012,SYNCID0000000097,110.06,debit,2022-11-16,00:00:00


In [140]:
wire[wire["amount_cad"] > 100000]

Unnamed: 0,wire_id,customer_id,amount_cad,debit_credit,transaction_date,transaction_time
5,WIR00000000000000008,SYNCID0000000063,262017.10,debit,2022-12-23,00:00:00
16,WIR00000000000000019,SYNCID0000000222,598947.85,debit,2023-01-30,00:00:00
17,WIR00000000000000020,SYNCID0000000227,262274.05,credit,2023-01-17,00:00:00
22,WIR00000000000000025,SYNCID0000000276,507672.80,credit,2022-12-12,00:00:00
37,WIR00000000000000040,SYNCID0000000537,103597.33,debit,2022-12-28,00:00:00
...,...,...,...,...,...,...
4697,WIR00000000000004888,SYNCID0000016860,100743.57,debit,2022-12-20,00:00:00
4722,WIR00000000000004913,SYNCID0000017055,191988.49,credit,2022-12-01,00:00:00
4738,WIR00000000000004934,SYNCID0000017130,249230.84,debit,2022-12-23,00:00:00
4750,WIR00000000000004946,SYNCID0000017137,233544.38,debit,2022-12-05,00:00:00


# Know Your Customer

In [139]:
kyc.head(10)

###### INTRO TASK - Using 10 features to perform KNN, so we will add 10 additional columns on kyc table, and use that to perform classification

# 1. Business History = onboard_date - estabilished_date

# 2. Average sales/person = sales / employee_count

# 3. (Olivia) Financial transparency -- total transaction amount / total transaction times, in all methods 

# 4. Industry Code 

# 5. Consistency between business location and place of registration
   #-- Using "kyc" LEFT JOIN "card" for example, look at the rate of DIFF(city in kyc, city in abm)
   
# (Need discussion on exaxt rules for each transaction type) 6.7.8.9.10. Unusual customer in 
#     EFT & Wire 
#     Card
#     EMT
#     ABM 
#     Cheque (1 -- Yes, 0 --- No, unknown)

# 11. (Olivia) credit / debit rate -- Too low, too high => unnormal， alternate C/D (complex)

# Question: Do you want to consider the risk of company location? 

# ==================================================================================================================================

###### TASK 3 -- Applying INTRO TASK, we identify 100 customers' property (0, 1) and offer labels, 
# leaving others unlabeled, perform self-supervised learning.


Unnamed: 0,customer_id,country,province,city,industry_code,employee_count,sales,established_date,onboard_date
0,SYNCID0000000000,CA,ON,NORTH YORK,7292.0,0.0,0.0,2019-06-22,2022-05-02
1,SYNCID0000000001,CA,ON,NORTH YORK,,,66446.0,2009-07-21,2009-08-21
2,SYNCID0000000002,CA,ON,KITCHENER,7721.0,2.0,,2022-04-11,2022-09-14
3,SYNCID0000000004,CA,,other,4565.0,1.0,328160.0,2020-11-16,2022-12-08
4,SYNCID0000000005,CA,BC,VICTORIA,7799.0,1.0,0.0,2022-01-21,2023-01-01
5,SYNCID0000000006,CA,BC,other,7761.0,0.0,0.0,2009-11-05,
6,SYNCID0000000007,CA,BC,PENTICTON,7292.0,1.0,33642.0,1987-05-29,2018-09-24
7,SYNCID0000000008,CA,,other,9659.0,0.0,0.0,1992-10-07,1992-01-08
8,SYNCID0000000009,CA,QC,MONTREAL,8653.0,1.0,73595.0,2012-07-01,2019-01-21
9,SYNCID0000000010,CA,ON,TORONTO,7511.0,0.0,,2007-08-27,2008-04-26


In [None]:
'''

CHAT GPT ANSWER

1. Silhouette Score
What it measures: This score evaluates how similar an object is to its own cluster compared to other clusters. The silhouette score ranges from -1 to 1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.
How to use: Compute the silhouette score for each sample, possibly using library functions from software like Python’s scikit-learn, and then average these values to obtain a final score.
2. Davies-Bouldin Index
What it measures: This index is intended to identify clusters that are well-separated and compact. A lower Davies-Bouldin index relates to a model with better separation between the clusters.
How to use: This index is calculable directly from features like the distance between centroids and the dispersion of points within each cluster. Libraries like scikit-learn can compute this index directly.
3. Calinski-Harabasz Index
What it measures: Also known as the Variance Ratio Criterion, this index measures the ratio of the sum of between-clusters dispersion to within-cluster dispersion. Higher values generally indicate better clustering.
How to use: This can be calculated using library functions and does not require any labels, relying instead on distances and variances that are intrinsic to the data.
4. Clustering Tendency
What it measures: Before even applying a clustering algorithm, it's important to assess whether the data tends to cluster at all.
How to use: Use the Hopkins statistic to determine the likelihood that the data has clusters rather than being uniformly distributed.
5. Visual Assessment
What it measures: Sometimes, the best way to understand the effectiveness of your clustering is by visualizing the clusters using techniques like PCA (Principal Component Analysis) or t-SNE (t-Distributed Stochastic Neighbor Embedding) for dimensionality reduction.
How to use: Plot the clusters after dimension reduction to see if they form distinct groups.
6. Stability Measures
What it measures: If you perturb the dataset slightly by adding noise or removing small subsets of data and then re-run the clustering, stable clusters should not change much.
How to use: Compare the results of clustering on the original data with those from perturbed versions to check consistency.
7. External Validation (if any external data available)
What it measures: If you have any external data that can serve as a rough proxy for cluster labels, you can use it to validate the clustering indirectly.
How to use: Apply some form of labeling or tagging based on external data, then use traditional classification metrics as a rough gauge.

'''