#**Dataset Description**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
raw_data = pd.read_csv("Data/kyc.csv")
cash_raw = pd.read_csv("Data/cash_trxns.csv")
emt_raw = pd.read_csv("Data/emt_trxns.csv")
wire_raw = pd.read_csv("Data/wire_trxns.csv")

print(raw_data.shape,cash_raw.shape,emt_raw.shape,wire_raw.shape)

(195789, 7) (212532, 4) (506451, 7) (67872, 8)


In [3]:
raw_data.head()

Unnamed: 0,Name,Gender,Occupation,Age,Tenure,cust_id,label
0,JENNIFER WELLS,female,Architect,45.0,13.0,CUST82758793,0
1,ANTHONY ADAMS,male,Musician,52.0,8.0,CUST69248708,0
2,DENISE LEWIS,female,Jewelry Dealer,43.0,11.0,CUST67222818,0
3,STEPHEN FIGUEROA,male,Optometrist,35.0,17.0,CUST33995820,0
4,KYLE EDWARDS,male,Real Estate Broker,39.0,21.0,CUST76401392,1


In [4]:
cash_raw.head()

Unnamed: 0,cust_id,amount,type,trxn_id
0,CUST69827909,2330,deposit,XFMC87396884
1,CUST78509707,4800,deposit,BFMG48785876
2,CUST71480951,6510,withdrawal,TIAX63158064
3,CUST70854140,1715,deposit,IVIV63658514
4,CUST57406487,3770,withdrawal,TCBO25660159


In [5]:
emt_raw.head()

Unnamed: 0,id sender,id receiver,name sender,name receiver,emt message,emt value,trxn_id
0,CUST26232205,CUST94681618,JASON GARRISON,RENEE LANG,for the bike u lent me,154.0,WFEZ76031047
1,EXTERNAL623153,CUST59533929,GINA WISE,BRIAN HAMILTON,,1170.5,RAUG63886259
2,CUST35533148,CUST23126187,ANTHONY ROBERSON,NICHOLAS DODSON,,518.0,XQJS86205330
3,CUST59096559,EXTERNAL470507,KEVIN PARK,FREDERICK CARPENTER,,46.0,WPXP45854083
4,CUST69049633,EXTERNAL818528,ZHU FENG LAN,ROMIL LANKA,,570.0,OIRZ70883325


In [6]:
wire_raw.head()

Unnamed: 0,id sender,id receiver,name sender,name receiver,wire value,country sender,country receiver,trxn_id
0,EXTERNAL753550,CUST30139466,CHEN MIN,DESTINY MOORE,10098.0,CN,CA,TKEV83070517
1,CUST43146787,CUST94438297,DR. BENJAMÍN PAREDES,BOBBY SERRANO,1267.0,CA,CA,LWCS42954834
2,CUST82396415,EXTERNAL842611,WHITNEY WRIGHT,VICTORIA HOGAN,8591.0,CA,US,NTTG55749308
3,EXTERNAL851271,CUST84545757,DR.TRACY MOODY,MINDY BURGESS,1480.5,CA,CA,IXVD84599097
4,EXTERNAL685849,CUST14982223,JEREMY WHITE,NITARA BARMAN,13310.0,AU,CA,TIGB41956889


In [7]:
raw_data.info()
cash_raw.info()
emt_raw.info()
wire_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195789 entries, 0 to 195788
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Name        195789 non-null  object 
 1   Gender      195789 non-null  object 
 2   Occupation  195789 non-null  object 
 3   Age         195789 non-null  float64
 4   Tenure      195789 non-null  float64
 5   cust_id     195789 non-null  object 
 6   label       195789 non-null  int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 10.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212532 entries, 0 to 212531
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   cust_id  212532 non-null  object
 1   amount   212532 non-null  int64 
 2   type     212532 non-null  object
 3   trxn_id  212532 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506451 entr

In [8]:
raw_data.describe()

Unnamed: 0,Age,Tenure,label
count,195789.0,195789.0,195789.0
mean,35.832856,6.528436,0.028163
std,10.714043,5.998545,0.165439
min,18.0,0.0,0.0
25%,28.0,2.0,0.0
50%,35.0,5.0,0.0
75%,42.0,10.0,0.0
max,92.0,49.0,1.0


In [9]:
pos_data = raw_data[raw_data['label'] == 1]
pos_data

Unnamed: 0,Name,Gender,Occupation,Age,Tenure,cust_id,label
4,KYLE EDWARDS,male,Real Estate Broker,39.0,21.0,CUST76401392,1
28,ROBERT STAFFORD,male,Offshore Trustee,51.0,9.0,CUST95743620,1
32,AARUSH SRIDHAR,female,Pawn Shop Owner,48.0,25.0,CUST37591597,1
41,FANG JIAN JUN,female,Private Banker,35.0,3.0,CUST14940080,1
69,MICHAEL JONES,male,Private Jet Broker,52.0,0.0,CUST40832249,1
...,...,...,...,...,...,...,...
195592,TAN JIAN GUO,female,Maritime or Shipping Agent,50.0,11.0,CUST39737655,1
195631,JOSEPH ESTES,male,Police Officer,25.0,7.0,CUST19561308,1
195692,AMANDA CRUZ,female,Hedge Fund Manager,49.0,10.0,CUST55221806,1
195714,DR.TINA LEVINE,female,Free Trade Zone Operator,53.0,15.0,CUST92442267,1


In [10]:
pos_data.describe()

Unnamed: 0,Age,Tenure,label
count,5514.0,5514.0,5514.0
mean,41.428908,9.859086,1.0
std,10.984981,7.400461,0.0
min,18.0,0.0,1.0
25%,34.0,4.0,1.0
50%,41.0,9.0,1.0
75%,49.0,15.0,1.0
max,80.0,41.0,1.0


#**Data Preprocessing**

In [11]:
# Cash Table
cash_result = cash_raw.groupby(['cust_id', 'type']).agg(
    cash_amt_sum=pd.NamedAgg(column="amount", aggfunc="sum"),
    cash_amt_ave=pd.NamedAgg(column="amount", aggfunc="mean"),
    cash_amt_max=pd.NamedAgg(column="amount", aggfunc="max"),
    cash_amt_min=pd.NamedAgg(column="amount", aggfunc="min"),
    cash_cnt=pd.NamedAgg(column="amount", aggfunc="count")
)
cash_result = cash_result.reset_index()
cash_result_pivot = cash_result.pivot_table(index='cust_id', columns=['type'], fill_value=0.0, dropna=False)
level_0 = cash_result_pivot.columns.get_level_values(0).astype(str)
level_1 = cash_result_pivot.columns.get_level_values(1).astype(str)
cash_result_pivot.columns = level_0 + '_' + level_1
cash_result_pivot

Unnamed: 0_level_0,cash_amt_ave_deposit,cash_amt_ave_withdrawal,cash_amt_max_deposit,cash_amt_max_withdrawal,cash_amt_min_deposit,cash_amt_min_withdrawal,cash_amt_sum_deposit,cash_amt_sum_withdrawal,cash_cnt_deposit,cash_cnt_withdrawal
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CUST10001032,0.000000,380.0,0,380,0,380,0,380,0,1
CUST10002122,0.000000,505.0,0,505,0,505,0,505,0,1
CUST10002913,0.000000,455.0,0,455,0,455,0,455,0,1
CUST10003195,0.000000,1265.0,0,1485,0,1045,0,2530,0,2
CUST10007063,415.000000,0.0,415,0,415,0,415,0,1,0
...,...,...,...,...,...,...,...,...,...,...
CUST99995000,740.000000,0.0,740,0,740,0,740,0,1,0
CUST99996815,0.000000,330.0,0,330,0,330,0,330,0,1
CUST99997704,8737.222222,8245.0,11180,8245,5640,8245,78635,8245,9,1
CUST99998660,750.000000,0.0,750,0,750,0,750,0,1,0


In [12]:
# EMT Table - sender
emt_send = emt_raw.groupby('id sender').agg(
    emt_amt_sum_send=pd.NamedAgg(column="emt value", aggfunc="sum"),
    emt_amt_ave_send=pd.NamedAgg(column="emt value", aggfunc="mean"),
    emt_amt_max_send=pd.NamedAgg(column="emt value", aggfunc="max"),
    emt_amt_min_send=pd.NamedAgg(column="emt value", aggfunc="min"),
    emt_cnt_send=pd.NamedAgg(column="emt value", aggfunc="count")
).reset_index()
emt_send_CUST = emt_send[emt_send['id sender'].str.contains('CUST')]
emt_send_CUST = emt_send_CUST.rename(columns={'id sender': 'cust_id'}).set_index('cust_id')
emt_send_CUST

Unnamed: 0_level_0,emt_amt_sum_send,emt_amt_ave_send,emt_amt_max_send,emt_amt_min_send,emt_cnt_send
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CUST10000513,988.0,247.000000,372.0,70.0,4
CUST10001522,334.0,167.000000,175.0,159.0,2
CUST10001820,0.0,0.000000,0.0,0.0,1
CUST10002122,51.5,51.500000,51.5,51.5,1
CUST10002761,5027.5,718.214286,1493.0,239.0,7
...,...,...,...,...,...
CUST99995000,14.0,14.000000,14.0,14.0,1
CUST99996213,50.0,25.000000,30.0,20.0,2
CUST99997704,1308.0,218.000000,302.0,72.0,6
CUST99998750,145.0,72.500000,78.0,67.0,2


In [13]:
# EMT Table - receive
emt_receive = emt_raw.groupby('id receiver').agg(
    emt_amt_sum_recieve=pd.NamedAgg(column="emt value", aggfunc="sum"),
    emt_amt_ave_recieve=pd.NamedAgg(column="emt value", aggfunc="mean"),
    emt_amt_max_recieve=pd.NamedAgg(column="emt value", aggfunc="max"),
    emt_amt_min_recieve=pd.NamedAgg(column="emt value", aggfunc="min"),
    emt_cnt_recieve=pd.NamedAgg(column="emt value", aggfunc="count")
).reset_index()
emt_recieve_CUST = emt_receive[emt_receive['id receiver'].str.contains('CUST')]
emt_recieve_CUST = emt_recieve_CUST.rename(columns={'id receiver': 'cust_id'}).set_index('cust_id')
emt_recieve_CUST

Unnamed: 0_level_0,emt_amt_sum_recieve,emt_amt_ave_recieve,emt_amt_max_recieve,emt_amt_min_recieve,emt_cnt_recieve
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CUST10000513,906.0,453.000000,460.0,446.0,2
CUST10001032,59.0,59.000000,59.0,59.0,1
CUST10001820,10.0,10.000000,10.0,10.0,1
CUST10002122,69.0,69.000000,69.0,69.0,1
CUST10002761,4477.0,746.166667,3604.5,43.0,6
...,...,...,...,...,...
CUST99995000,337.0,168.500000,216.0,121.0,2
CUST99996815,982.0,491.000000,903.0,79.0,2
CUST99997704,1880.5,235.062500,346.5,62.5,8
CUST99998756,5665.0,809.285714,2196.0,30.0,7


In [14]:
# Wire Table - send
wire_raw['wire type'] = wire_raw.apply(lambda row: 'local' if row['country sender'] == row['country receiver'] else 'aboard', axis=1)
wire_send = wire_raw.groupby(['id sender', 'wire type']).agg(
    wire_amt_sum_send=pd.NamedAgg(column="wire value", aggfunc="sum"),
    wire_amt_ave_send=pd.NamedAgg(column="wire value", aggfunc="mean"),
    wire_amt_max_send=pd.NamedAgg(column="wire value", aggfunc="max"),
    wire_amt_min_send=pd.NamedAgg(column="wire value", aggfunc="min"),
    wire_cnt_send=pd.NamedAgg(column="wire value", aggfunc="count")
).reset_index()
wire_send_CUST = wire_send[wire_send['id sender'].str.contains('CUST')]
wire_send_CUST = wire_send_CUST.rename(columns={'id sender': 'cust_id'}).set_index('cust_id')
wire_send_CUST_pivot = wire_send_CUST.pivot_table(index='cust_id', columns=['wire type'], fill_value=0.0, dropna=False)
level_0 = wire_send_CUST_pivot.columns.get_level_values(0).astype(str)
level_1 = wire_send_CUST_pivot.columns.get_level_values(1).astype(str)
wire_send_CUST_pivot.columns = level_0 + '_' + level_1
wire_send_CUST_pivot

Unnamed: 0_level_0,wire_amt_ave_send_aboard,wire_amt_ave_send_local,wire_amt_max_send_aboard,wire_amt_max_send_local,wire_amt_min_send_aboard,wire_amt_min_send_local,wire_amt_sum_send_aboard,wire_amt_sum_send_local,wire_cnt_send_aboard,wire_cnt_send_local
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CUST10002761,690.0,2880.166667,690.0,3510.0,690.0,1920.0,690.0,8640.5,1,3
CUST10005774,0.0,5046.000000,0.0,5046.0,0.0,5046.0,0.0,5046.0,0,1
CUST10006702,0.0,1890.000000,0.0,1890.0,0.0,1890.0,0.0,1890.0,0,1
CUST10010521,0.0,1420.500000,0.0,1420.5,0.0,1420.5,0.0,1420.5,0,1
CUST10012509,0.0,7251.500000,0.0,7251.5,0.0,7251.5,0.0,7251.5,0,1
...,...,...,...,...,...,...,...,...,...,...
CUST99972470,0.0,5803.750000,0.0,6541.5,0.0,5066.0,0.0,11607.5,0,2
CUST99975662,0.0,49468.500000,0.0,49468.5,0.0,49468.5,0.0,49468.5,0,1
CUST99981149,0.0,7062.000000,0.0,7062.0,0.0,7062.0,0.0,7062.0,0,1
CUST99990244,4417.0,2933.166667,4417.0,3761.0,4417.0,1950.5,4417.0,8799.5,1,3


In [15]:
# Wire Table - receive
wire_receive = wire_raw.groupby(['id receiver', 'wire type']).agg(
    wire_amt_sum_receive=pd.NamedAgg(column="wire value", aggfunc="sum"),
    wire_amt_ave_receive=pd.NamedAgg(column="wire value", aggfunc="mean"),
    wire_amt_max_receive=pd.NamedAgg(column="wire value", aggfunc="max"),
    wire_amt_min_receive=pd.NamedAgg(column="wire value", aggfunc="min"),
    wire_cnt_receive=pd.NamedAgg(column="wire value", aggfunc="count")
).reset_index()
wire_receive_CUST = wire_receive[wire_receive['id receiver'].str.contains('CUST')]
wire_receive_CUST = wire_receive_CUST.rename(columns={'id receiver': 'cust_id'}).set_index('cust_id')
wire_receive_CUST_pivot = wire_receive_CUST.pivot_table(index='cust_id', columns=['wire type'], fill_value=0.0, dropna=False)
level_0 = wire_receive_CUST_pivot.columns.get_level_values(0).astype(str)
level_1 = wire_receive_CUST_pivot.columns.get_level_values(1).astype(str)
wire_receive_CUST_pivot.columns = level_0 + '_' + level_1
wire_receive_CUST_pivot

Unnamed: 0_level_0,wire_amt_ave_receive_aboard,wire_amt_ave_receive_local,wire_amt_max_receive_aboard,wire_amt_max_receive_local,wire_amt_min_receive_aboard,wire_amt_min_receive_local,wire_amt_sum_receive_aboard,wire_amt_sum_receive_local,wire_cnt_receive_aboard,wire_cnt_receive_local
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CUST10001032,0.00,1526.000000,0.0,1526.0,0.0,1526.0,0.0,1526.0,0,1
CUST10002761,0.00,3623.000000,0.0,6055.0,0.0,1240.0,0.0,10869.0,0,3
CUST10005575,0.00,6547.000000,0.0,10850.0,0.0,1565.0,0.0,19641.0,0,3
CUST10005774,0.00,5787.500000,0.0,7795.0,0.0,3780.0,0.0,11575.0,0,2
CUST10012509,0.00,5642.833333,0.0,6450.0,0.0,4684.5,0.0,16928.5,0,3
...,...,...,...,...,...,...,...,...,...,...
CUST99981149,0.00,3215.750000,0.0,7141.0,0.0,1428.0,0.0,12863.0,0,4
CUST99985375,1762.00,0.000000,1762.0,0.0,1762.0,0.0,1762.0,0.0,1,0
CUST99990244,0.00,1197.500000,0.0,1197.5,0.0,1197.5,0.0,1197.5,0,1
CUST99996213,1643.00,0.000000,1643.0,0.0,1643.0,0.0,1643.0,0.0,1,0


In [16]:
result_df = (
    pd.merge(raw_data, cash_result_pivot, on='cust_id', how='outer')
    .merge(emt_send_CUST, on='cust_id', how='outer')
    .merge(emt_recieve_CUST, on='cust_id', how='outer')
    .merge(wire_send_CUST_pivot, on='cust_id', how='outer')
    .merge(wire_receive_CUST_pivot, on='cust_id', how='outer')
    .fillna(0)
    .drop_duplicates(subset='cust_id')
)

result_df

Unnamed: 0,Name,Gender,Occupation,Age,Tenure,cust_id,label,cash_amt_ave_deposit,cash_amt_ave_withdrawal,cash_amt_max_deposit,...,wire_amt_ave_receive_aboard,wire_amt_ave_receive_local,wire_amt_max_receive_aboard,wire_amt_max_receive_local,wire_amt_min_receive_aboard,wire_amt_min_receive_local,wire_amt_sum_receive_aboard,wire_amt_sum_receive_local,wire_cnt_receive_aboard,wire_cnt_receive_local
0,JENNIFER WELLS,female,Architect,45.0,13.0,CUST82758793,0,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ANTHONY ADAMS,male,Musician,52.0,8.0,CUST69248708,0,1210.000000,0.0,1210.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DENISE LEWIS,female,Jewelry Dealer,43.0,11.0,CUST67222818,0,1917.500000,0.0,2075.0,...,8914.0,2400.750000,16110.0,3340.0,1718.0,1461.5,17828.0,4801.5,2.0,2.0
3,STEPHEN FIGUEROA,male,Optometrist,35.0,17.0,CUST33995820,0,0.000000,325.0,0.0,...,27385.0,0.000000,27385.0,0.0,27385.0,0.0,27385.0,0.0,1.0,0.0
4,KYLE EDWARDS,male,Real Estate Broker,39.0,21.0,CUST76401392,1,7568.333333,9792.5,14360.0,...,0.0,7965.166667,0.0,11010.5,0.0,2974.0,0.0,23895.5,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195784,THOMAS YOUNG,male,Software Developer,46.0,3.0,CUST23014082,0,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
195785,CASEY JONES,male,Miner,31.0,10.0,CUST17691251,0,0.000000,0.0,0.0,...,0.0,10903.000000,0.0,10903.0,0.0,10903.0,0.0,10903.0,0.0,1.0
195786,NICOLE-CÉCILE LEBLANC,female,Unknown,21.0,3.0,CUST26444112,0,0.000000,0.0,0.0,...,0.0,1466.000000,0.0,1466.0,0.0,1466.0,0.0,1466.0,0.0,1.0
195787,CATHERINE ARMSTRONG,female,Antiques Dealer,58.0,14.0,CUST96567835,0,3132.500000,8465.0,3695.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#**Deal with imbalance issue**

In [17]:
total_counts = result_df['label'].value_counts()
Class_1 = (result_df['label'].value_counts()[1] / len(result_df)) * 100
Class_0 = (result_df['label'].value_counts()[0] / len(result_df)) * 100

print("Percentage of risky (1):", Class_1)
print("Percentage of not risky (0):", Class_0)

Percentage of risky (1): 2.816297136202749
Percentage of not risky (0): 97.18370286379725


In [18]:
data_df = result_df.drop(['Name','cust_id'],axis = 1)
data_df['Gender'] = result_df['Gender'].map({'male': 0, 'female': 1, 'other': 3})
occupation_dict = {occupation: index for index, occupation in enumerate(result_df['Occupation'].unique())}
data_df['Occupation'] = result_df['Occupation'].map(occupation_dict)

X = data_df.drop('label', axis=1)
y = data_df['label']

In [19]:
data_df

Unnamed: 0,Gender,Occupation,Age,Tenure,label,cash_amt_ave_deposit,cash_amt_ave_withdrawal,cash_amt_max_deposit,cash_amt_max_withdrawal,cash_amt_min_deposit,...,wire_amt_ave_receive_aboard,wire_amt_ave_receive_local,wire_amt_max_receive_aboard,wire_amt_max_receive_local,wire_amt_min_receive_aboard,wire_amt_min_receive_local,wire_amt_sum_receive_aboard,wire_amt_sum_receive_local,wire_cnt_receive_aboard,wire_cnt_receive_local
0,1,0,45.0,13.0,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,52.0,8.0,0,1210.000000,0.0,1210.0,0.0,1210.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2,43.0,11.0,0,1917.500000,0.0,2075.0,0.0,1760.0,...,8914.0,2400.750000,16110.0,3340.0,1718.0,1461.5,17828.0,4801.5,2.0,2.0
3,0,3,35.0,17.0,0,0.000000,325.0,0.0,445.0,0.0,...,27385.0,0.000000,27385.0,0.0,27385.0,0.0,27385.0,0.0,1.0,0.0
4,0,4,39.0,21.0,1,7568.333333,9792.5,14360.0,10075.0,4655.0,...,0.0,7965.166667,0.0,11010.5,0.0,2974.0,0.0,23895.5,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195784,0,16,46.0,3.0,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
195785,0,226,31.0,10.0,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,10903.000000,0.0,10903.0,0.0,10903.0,0.0,10903.0,0.0,1.0
195786,1,109,21.0,3.0,0,0.000000,0.0,0.0,0.0,0.0,...,0.0,1466.000000,0.0,1466.0,0.0,1466.0,0.0,1466.0,0.0,1.0
195787,1,7,58.0,14.0,0,3132.500000,8465.0,3695.0,8465.0,2570.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Split the dataset into train and test sets (7:3 ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=35)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(137052, 44) (58737, 44) (137052,) (58737,)


In [21]:
pos_train = (len(y_train[y_train == 1]) / len(y_train)) * 100
pos_test = (len(y_test[y_test == 1]) / len(y_test)) * 100

print("Percentage of postive in train:", pos_train)
print("Percentage of positive in test:", pos_test)

Percentage of postive in train: 2.816449230948837
Percentage of positive in test: 2.8159422510512964


In [22]:
#over-sampling train dataset with SMOTE
sm = SMOTE(random_state=35)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

pos_train_resampled = (len(y_train_resampled[y_train_resampled == 1]) / len(y_train_resampled)) * 100
print("Percentage of postive in resampled train:", pos_train_resampled)

Percentage of postive in resampled train: 50.0


In [23]:
# Standardize the original data for better performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Standardize the resampled data
scaler = StandardScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_resampled_scaled = scaler.transform(X_test)

#**Model Selection**

In [None]:
# 1. Logistic Regression with original data
log_reg = LogisticRegression(random_state=35,solver='saga',class_weight='balanced')
lr_parameters = {'C':[0.0005,0.001,0.005], 'max_iter':[200,300,400]}
lr_clf = GridSearchCV(log_reg, lr_parameters, scoring='balanced_accuracy')
lr_clf.fit(X_train_scaled, y_train)
lr_clf.best_params_



{'C': 0.001, 'max_iter': 200}

In [None]:
# 1.1 Logistic Regression with resampled data
log_reg_res = LogisticRegression(random_state=35,solver='saga')
lr_res_parameters = {'C':[0.001,0.01], 'max_iter':[200,300,400]}
lr_clf_res = GridSearchCV(log_reg_res, lr_res_parameters, scoring='balanced_accuracy')
lr_clf_res.fit(X_train_resampled_scaled, y_train_resampled)
lr_clf_res.best_params_



{'C': 0.001, 'max_iter': 200}

In [None]:
# 2. Random Forest with original data
rf = RandomForestClassifier(random_state=35)
rf_parameters = {'criterion':['gini','entropy','log_loss'],'max_depth':[4,6,8],}
rf_clf = GridSearchCV(rf, rf_parameters)
rf_clf.fit(X_train_scaled, y_train)
rf_clf.best_params_

{'criterion': 'gini', 'max_depth': 8}

In [None]:
# 2.1 Random Forest with resampled data
rf = RandomForestClassifier(random_state=35)
rf_res_parameters = {'criterion':['gini','entropy','log_loss'],'max_depth':[2,4,6]}
rf_clf_res = GridSearchCV(rf, rf_res_parameters)
rf_clf_res.fit(X_train_resampled_scaled, y_train_resampled)
rf_clf_res.best_params_

{'criterion': 'entropy', 'max_depth': 6}

In [24]:
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

In [39]:
# Standardize the data for better performance in neural networks
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)  # Assuming y is a single column (reshape if needed)

X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.FloatTensor(y_test.values).view(-1, 1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Instantiate the model
input_size = X_train_scaled.shape[1]
ann_model = ANN(input_size)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(ann_model.parameters(), lr=0.001)

#**Evaluation**

In [26]:
# Logistic Reg with original data with default estimator
log_reg = LogisticRegression(random_state=35,solver='saga',class_weight='balanced', max_iter=1000)
log_reg.fit(X_train_scaled, y_train)
logreg_predictions = log_reg.predict(X_test_scaled)
logreg_accuracy = balanced_accuracy_score(y_test, logreg_predictions)
print(f"Logistic Regression accuracy: {accuracy_score(y_test, logreg_predictions)}")
print(f"Logistic Regression balanced accuracy: {logreg_accuracy}")
print(f"Logistic Regression roc_auc_score: {roc_auc_score(y_test, logreg_predictions)}")
print(f"Logistic Regression F1 score: {f1_score(y_test, logreg_predictions)}")
print(f"Logistic Regression precison: {precision_score(y_test, logreg_predictions)}")
print(f"Logistic Regression recall score: {recall_score(y_test, logreg_predictions)}")



Logistic Regression accuracy: 0.9174285373784837
Logistic Regression balanced accuracy: 0.8709242058928554
Logistic Regression roc_auc_score: 0.8709242058928554
Logistic Regression F1 score: 0.35914376321353064
Logistic Regression precison: 0.22979370984105513
Logistic Regression recall score: 0.8216444981862152


In [27]:
# Logistic Reg with original data with best estimator
log_reg = LogisticRegression(random_state=35,solver='saga',class_weight='balanced', C=0.001, max_iter=200)
log_reg.fit(X_train_scaled, y_train)
logreg_predictions = log_reg.predict(X_test_scaled)
logreg_accuracy = balanced_accuracy_score(y_test, logreg_predictions)
print(f"Logistic Regression accuracy: {accuracy_score(y_test, logreg_predictions)}")
print(f"Logistic Regression balanced accuracy: {logreg_accuracy}")
print(f"Logistic Regression roc_auc_score: {roc_auc_score(y_test, logreg_predictions)}")
print(f"Logistic Regression F1 score: {f1_score(y_test, logreg_predictions)}")
print(f"Logistic Regression precison: {precision_score(y_test, logreg_predictions)}")
print(f"Logistic Regression recall score: {recall_score(y_test, logreg_predictions)}")

Logistic Regression accuracy: 0.9168496858879411
Logistic Regression balanced accuracy: 0.8703328556493641
Logistic Regression roc_auc_score: 0.870332855649364
Logistic Regression F1 score: 0.3573684210526316
Logistic Regression precison: 0.22838883282879247
Logistic Regression recall score: 0.8210399032648126




In [28]:
# Logistic Reg with resampled data with default estimator
log_reg = LogisticRegression(random_state=35,solver='saga')
log_reg.fit(X_train_resampled_scaled, y_train_resampled)
logreg_predictions = log_reg.predict(X_test_resampled_scaled)
logreg_accuracy = balanced_accuracy_score(y_test, logreg_predictions)
print(f"Logistic Regression accuracy: {accuracy_score(y_test, logreg_predictions)}")
print(f"Logistic Regression balanced accuracy: {logreg_accuracy}")
print(f"Logistic Regression roc_auc_score: {roc_auc_score(y_test, logreg_predictions)}")
print(f"Logistic Regression F1 score: {f1_score(y_test, logreg_predictions)}")
print(f"Logistic Regression precison: {precision_score(y_test, logreg_predictions)}")
print(f"Logistic Regression recall score: {recall_score(y_test, logreg_predictions)}")



Logistic Regression accuracy: 0.9194374925515433
Logistic Regression balanced accuracy: 0.8719577885707104
Logistic Regression roc_auc_score: 0.8719577885707104
Logistic Regression F1 score: 0.3648322147651007
Logistic Regression precison: 0.234472049689441
Logistic Regression recall score: 0.8216444981862152


In [29]:
# Logistic Reg with resampled data with best estimator
log_reg = LogisticRegression(random_state=35,solver='saga',class_weight='balanced', C=0.001, max_iter=200)
log_reg.fit(X_train_resampled_scaled, y_train_resampled)
logreg_predictions = log_reg.predict(X_test_resampled_scaled)
logreg_accuracy = balanced_accuracy_score(y_test, logreg_predictions)
print(f"Logistic Regression accuracy: {accuracy_score(y_test, logreg_predictions)}")
print(f"Logistic Regression balanced accuracy: {logreg_accuracy}")
print(f"Logistic Regression roc_auc_score: {roc_auc_score(y_test, logreg_predictions)}")
print(f"Logistic Regression F1 score: {f1_score(y_test, logreg_predictions)}")
print(f"Logistic Regression precison: {precision_score(y_test, logreg_predictions)}")
print(f"Logistic Regression recall score: {recall_score(y_test, logreg_predictions)}")

Logistic Regression accuracy: 0.9166964604933858
Logistic Regression balanced accuracy: 0.8702540230722395
Logistic Regression roc_auc_score: 0.8702540230722395
Logistic Regression F1 score: 0.3569457221711132
Logistic Regression precison: 0.22804366078925273
Logistic Regression recall score: 0.8210399032648126


In [31]:
# Random Forest with original data with default estimator
rf = RandomForestClassifier(random_state=35)
rf.fit(X_train_scaled, y_train)
rf_predictions = rf.predict(X_test_scaled)
rf_accuracy = balanced_accuracy_score(y_test, rf_predictions)
print(f"Random Forest accuracy: {accuracy_score(y_test, rf_predictions)}")
print(f"Random Forest balanced ccuracy: {rf_accuracy}")
print(f"Random Forest roc_auc_score: {roc_auc_score(y_test, rf_predictions)}")
print(f"Random Forest F1 score: {f1_score(y_test, rf_predictions)}")
print(f"Random Forest precison: {precision_score(y_test, rf_predictions)}")
print(f"Random Forest recall score: {recall_score(y_test, rf_predictions)}")

Random Forest accuracy: 0.9772204913427652
Random Forest balanced ccuracy: 0.6756619495136391
Random Forest roc_auc_score: 0.6756619495136392
Random Forest F1 score: 0.46820349761526237
Random Forest precison: 0.6832946635730859
Random Forest recall score: 0.3561064087061669


In [32]:
# Random Forest with original data with best estimator
rf = RandomForestClassifier(random_state=35,criterion='gini', max_depth=8)
rf.fit(X_train_scaled, y_train)
rf_predictions = rf.predict(X_test_scaled)
print(f"Random Forest accuracy: {accuracy_score(y_test, rf_predictions)}")
print(f"Random Forest balanced ccuracy: {rf_accuracy}")
print(f"Random Forest roc_auc_score: {roc_auc_score(y_test, rf_predictions)}")
print(f"Random Forest F1 score: {f1_score(y_test, rf_predictions)}")
print(f"Random Forest precison: {precision_score(y_test, rf_predictions)}")
print(f"Random Forest recall score: {recall_score(y_test, rf_predictions)}")

Random Forest accuracy: 0.9765735396768647
Random Forest balanced ccuracy: 0.6756619495136391
Random Forest roc_auc_score: 0.6406915831697669
Random Forest F1 score: 0.4063848144952545
Random Forest precison: 0.7093373493975904
Random Forest recall score: 0.28476420798065294


In [35]:
# Random Forest with resampled data with defalut estimator
rf = RandomForestClassifier(random_state=35)
rf.fit(X_train_resampled_scaled, y_train_resampled)
rf_predictions = rf.predict(X_test_resampled_scaled)
rf_accuracy = balanced_accuracy_score(y_test, rf_predictions)
print(f"Random Forest accuracy: {accuracy_score(y_test, rf_predictions)}")
print(f"Random Forest balanced ccuracy: {rf_accuracy}")
print(f"Random Forest roc_auc_score: {roc_auc_score(y_test, rf_predictions)}")
print(f"Random Forest F1 score: {f1_score(y_test, rf_predictions)}")
print(f"Random Forest precison: {precision_score(y_test, rf_predictions)}")
print(f"Random Forest recall score: {recall_score(y_test, rf_predictions)}")

Random Forest accuracy: 0.9693378960450824
Random Forest balanced ccuracy: 0.774344851292188
Random Forest roc_auc_score: 0.7743448512921881
Random Forest F1 score: 0.5104648002174504
Random Forest precison: 0.46370370370370373
Random Forest recall score: 0.567714631197098


In [37]:
# Random Forest with resampled data with best estimator
rf = RandomForestClassifier(random_state=35,criterion='entropy', max_depth=6)
rf.fit(X_train_resampled_scaled, y_train_resampled)
rf_predictions = rf.predict(X_test_resampled_scaled)
rf_accuracy = balanced_accuracy_score(y_test, rf_predictions)
print(f"Random Forest accuracy: {accuracy_score(y_test, rf_predictions)}")
print(f"Random Forest balanced ccuracy: {rf_accuracy}")
print(f"Random Forest roc_auc_score: {roc_auc_score(y_test, rf_predictions)}")
print(f"Random Forest F1 score: {f1_score(y_test, rf_predictions)}")
print(f"Random Forest precison: {precision_score(y_test, rf_predictions)}")
print(f"Random Forest recall score: {recall_score(y_test, rf_predictions)}")

Random Forest accuracy: 0.9174796125100022
Random Forest balanced ccuracy: 0.8706569451330983
Random Forest roc_auc_score: 0.8706569451330983
Random Forest F1 score: 0.3591167526113976
Random Forest precison: 0.2298189202910814
Random Forest recall score: 0.8210399032648126


In [40]:
# Training the model with original data
num_epochs = 30
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = ann_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model on the test set
with torch.no_grad():
    test_outputs = ann_model(X_test_tensor)
    predicted_labels = (test_outputs >= 0.5).float()

ann_prediction = predicted_labels.numpy()
y_truth = y_test_tensor.numpy()
print(f"Accuracy: {accuracy_score(y_truth, ann_prediction)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_truth, ann_prediction)}")
print(f"ROC AUC: {roc_auc_score(y_truth, ann_prediction)}")
print(f"F1 score: {f1_score(y_truth, ann_prediction)}")
print(f"Precison: {precision_score(y_truth, ann_prediction)}")
print(f"Recall score: {recall_score(y_truth, ann_prediction)}")

Accuracy: 0.976828915334457
Balanced accuracy: 0.7324069158634722
ROC AUC: 0.7324069158634722
F1 score: 0.5350187905705499
Precison: 0.615082482325216
Recall score: 0.47339782345828296


In [42]:
# Standardize the data for better performance in neural networks
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Convert data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.FloatTensor(y_train_resampled.values).view(-1, 1)  # Assuming y is a single column (reshape if needed)

X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.FloatTensor(y_test.values).view(-1, 1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Instantiate the model
input_size = X_train_scaled.shape[1]
ann_model = ANN(input_size)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(ann_model.parameters(), lr=0.001)

In [43]:
# Training the model with original data
num_epochs = 30
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = ann_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model on the test set
with torch.no_grad():
    test_outputs = ann_model(X_test_tensor)
    predicted_labels = (test_outputs >= 0.5).float()

ann_prediction = predicted_labels.numpy()
y_truth = y_test_tensor.numpy()
print(f"Accuracy: {accuracy_score(y_truth, ann_prediction)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_truth, ann_prediction)}")
print(f"ROC AUC: {roc_auc_score(y_truth, ann_prediction)}")
print(f"F1 score: {f1_score(y_truth, ann_prediction)}")
print(f"Precison: {precision_score(y_truth, ann_prediction)}")
print(f"Recall score: {recall_score(y_truth, ann_prediction)}")

Accuracy: 0.9343684559987742
Balanced accuracy: 0.8297380767236389
ROC AUC: 0.829738076723639
F1 score: 0.38151772822076047
Precison: 0.25966368202664336
Recall score: 0.718863361547763


In [None]:
class ANN2(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

In [47]:
# Standardize the data for better performance in neural networks
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)  # Assuming y is a single column (reshape if needed)

X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.FloatTensor(y_test.values).view(-1, 1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

# Instantiate the model
input_size = X_train_scaled.shape[1]
ann_model = ANN2(input_size)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(ann_model.parameters(), lr=0.001)

In [48]:
# Training the model with original data
num_epochs = 20
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = ann_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model on the test set
with torch.no_grad():
    test_outputs = ann_model(X_test_tensor)
    predicted_labels = (test_outputs >= 0.5).float()

ann_prediction = predicted_labels.numpy()
y_truth = y_test_tensor.numpy()
print(f"Accuracy: {accuracy_score(y_truth, ann_prediction)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_truth, ann_prediction)}")
print(f"ROC AUC: {roc_auc_score(y_truth, ann_prediction)}")
print(f"F1 score: {f1_score(y_truth, ann_prediction)}")
print(f"Precison: {precision_score(y_truth, ann_prediction)}")
print(f"Recall score: {recall_score(y_truth, ann_prediction)}")

Accuracy: 0.977560992219555
Balanced accuracy: 0.7357189432532755
ROC AUC: 0.7357189432532755
F1 score: 0.5461432506887053
Precison: 0.6344
Recall score: 0.47944377267230953
