In [33]:
#Base

import pandas as pd
import os

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFE

import statsmodels.api as sm

from tqdm import tqdm


dataset_folder = os.path.join(os.getcwd(), 'dataset')

def df_from_csv(filename):
    f = os.path.join(dataset_folder, filename)
    return pd.read_csv(f, delimiter='\t|\n|,', engine='python')

x_train = df_from_csv('ori_trainx.csv')
y_train = df_from_csv('ori_trainy.csv')
alert_date = df_from_csv('ori_alert_date.csv')
alert_cust = df_from_csv('ori_custinfo.csv')
xdp = df_from_csv('ori_xdp.csv')

smote = SMOTE(random_state=0)
tqdm.pandas(desc='Progress Bar')

def SMOTE_oversampling(input_x, input_y):
    x_train, x_test, y_train, y_test = train_test_split(input_x, input_y, test_size=0.3)
    x_cols = x_train.columns
    y_cols = y_train.columns

    sm_data_x, sm_data_y = smote.fit_resample(x_train, y_train)
    sm_data_x = pd.DataFrame(data=sm_data_x, columns=x_cols)
    sm_data_y = pd.DataFrame(data=sm_data_y, columns=y_cols)

    print("length of oversampled data is ",len(sm_data_x))
    print("Number of non-sar in oversampled data",len(sm_data_y[sm_data_y['sar_flag']==0]))
    print("Number of sar",len(sm_data_y[sm_data_y['sar_flag']==1]))
    print("Proportion of no subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==0])/len(sm_data_x))
    print("Proportion of subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==1])/len(sm_data_x))

    return sm_data_x, sm_data_y

def RFE_analysis(input_x, input_y, model):
    rfe = RFE(model, step=300)
    rfe.fit(input_x, input_y.values.ravel())
    print(input_x.columns)
    print(rfe.support_)
    print(rfe.ranking_)
    return rfe.get_support(1)

def LR_analysis(input_x, input_y):
    sm_LR = sm.Logit(input_y, input_x)
    result = sm_LR.fit()
    print(result.summary2())

def LR_training(input_x, input_y, LR_model):
    x_train, x_test, y_train, y_test = train_test_split(input_x, input_y, test_size=0.25)
    LR_model.fit(x_train, y_train)
    y_train_predict = LR_model.predict(x_train)
    y_test_predict = LR_model.predict(x_test)
    train_acc = accuracy_score(y_train, y_train_predict)
    test_acc = accuracy_score(y_test, y_test_predict)
    print(f'{train_acc}  {test_acc}')
    cm = confusion_matrix(y_true=y_test, y_pred=y_test_predict)
    print(cm)
    return LR_model

def LR_test(input_x, input_y, LR_model):
    y_predict = LR_model.predict(input_x)
    test_acc = accuracy_score(input_y, y_predict)
    print(f'{test_acc}')
    cm = confusion_matrix(y_true=input_y, y_pred=y_predict)
    print(cm)
    print(classification_report(y_true=input_y, y_pred=y_predict))

def calculate_tx(row):
    if not pd.isna(row['risk_rank']):
        temp_frame = xdp[(xdp['tx_date'] <= row['date']) & (xdp['cust_id'] == row['cust_id'])]
        total_tx = (temp_frame['tx_amt'] * temp_frame['exchg_rate'])
        row['total_tx'] = total_tx.sum()
        row['avg_tx'] = total_tx.mean()
        row['count_tx'] = total_tx.count()
        row['std_tx'] = total_tx.std()
        row['tx_per_day'] = row['count_tx']/row['date']+1
    return row


In [35]:
# Join data for training

ccba = pd.read_pickle('aggregated_ccba_cust.pkl')
cdtx = pd.read_pickle('cdtx_agg.pkl')
xdp = pd.read_pickle('xdp_train_test1.pkl')
remit = pd.read_pickle('cust_remit_raw.pkl')

In [36]:
xdp.head()

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,AGE,total_tx,avg_tx,count_tx,std_tx,tx_per_day
0,171189,0,0,972ee157e63316e8a50dd489bc93730a3ee8a8959d5c6b...,1,4,1515.0,505.0,3,450.223278,0.0
1,171202,0,0,a10ab33f90926fb18d7bb5e78034d7f04a1fbed95b7951...,3,2,209202.0,69734.0,3,75670.655263,0.0
2,171599,0,0,3433ecc068ed1c9e2f5037cab5d42d7b901e9bd624c1fb...,1,4,310593.0,77648.25,4,134725.210913,0.0
3,171737,0,0,a0861608615a9365d90f4ba0a813c0ea0471987f925c8b...,3,4,43125.0,4312.5,10,5920.365665,0.0
4,171142,0,0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,3,296344.0,4939.066667,60,6547.864806,0.0


In [3]:
train = df_from_csv('ori_trainx.csv')
train = train.merge(df_from_csv('ori_trainy.csv'), on='alert_key', how='inner')
train = train.merge(alert_cust, on='alert_key', how='left')
train = train[['alert_key', 'cust_id']]
train = train.merge(ccba, on='cust_id', how='left')
train = train.merge(cdtx, on='cust_id', how='left')
train = train.merge(xdp, on='alert_key', how='left')
train = train.merge(remit, on='alert_key', how='left')
train = train.reindex(sorted(train.columns), axis=1)
train

Unnamed: 0,AGE,alert_key,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,...,total_amt_tw_ntd,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day
0,4,171189,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,...,688721.0,375576.0,4.886652e+05,27.0,45.0,,496.0,1.515000e+03,,0.000000
1,2,171202,,,,,,,,,...,,2717416.0,2.621840e+06,,,,,2.092020e+05,,0.000000
2,4,171599,,,,,,,,,...,,326517.0,3.265170e+05,,,,,3.105930e+05,,0.000000
3,4,171737,,,,1893.333333,,,,1.500000,...,5680.0,1014759.0,1.275499e+05,,,,3.0,4.312500e+04,,0.000000
4,3,171142,,,,7048.368421,,,,1.055556,...,133919.0,241719.0,2.212473e+05,,,,19.0,2.963440e+05,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,352132,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,...,2583949.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,1.532000e+08,364459.0,0.390110
23902,3,352125,,,,,,,,,...,,928963.0,5.041325e+05,,,,,4.615283e+07,22127.0,0.054945
23903,6,352080,,,,,,,,,...,,69080.0,7.500171e+04,,,,,1.555772e+08,43676.0,0.192308
23904,6,352075,,,,,,,,,...,,262604.0,3.333440e+05,,,,,1.365559e+08,,1.181319


In [4]:
train['bad_value'] = train.isnull().sum(axis=1) + (train == 0).astype(int).sum(axis=1)

In [5]:
train = train.drop(['cust_id_x', 'cust_id_y', 'date'], axis=1)
train.to_pickle('aggregate.pkl')

In [6]:
train = train.drop(['alert_key'], axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,5.050000e+02,...,375576.0,4.886652e+05,27.0,45.0,,496.0,1.515000e+03,,0.000000,25
1,2,,,,,,,,,6.973400e+04,...,2717416.0,2.621840e+06,,,,,2.092020e+05,,0.000000,43
2,4,,,,,,,,,7.764825e+04,...,326517.0,3.265170e+05,,,,,3.105930e+05,,0.000000,43
3,4,,,,1893.333333,,,,1.500000,4.312500e+03,...,1014759.0,1.275499e+05,,,,3.0,4.312500e+04,,0.000000,37
4,3,,,,7048.368421,,,,1.055556,4.939067e+03,...,241719.0,2.212473e+05,,,,19.0,2.963440e+05,,0.000000,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,1.078873e+06,...,3218731.0,2.361632e+06,65.0,578.0,,2614.0,1.532000e+08,364459.0,0.390110,22
23902,3,,,,,,,,,2.307641e+06,...,928963.0,5.041325e+05,,,,,4.615283e+07,22127.0,0.054945,40
23903,6,,,,,,,,,2.222531e+06,...,69080.0,7.500171e+04,,,,,1.555772e+08,43676.0,0.192308,40
23904,6,,,,,,,,,3.175717e+05,...,262604.0,3.333440e+05,,,,,1.365559e+08,,1.181319,41


In [7]:
train['bad_value'] = train.isnull().sum(axis=1) + (train == 0).astype(int).sum(axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,5.050000e+02,...,375576.0,4.886652e+05,27.0,45.0,,496.0,1.515000e+03,,0.000000,24
1,2,,,,,,,,,6.973400e+04,...,2717416.0,2.621840e+06,,,,,2.092020e+05,,0.000000,42
2,4,,,,,,,,,7.764825e+04,...,326517.0,3.265170e+05,,,,,3.105930e+05,,0.000000,42
3,4,,,,1893.333333,,,,1.500000,4.312500e+03,...,1014759.0,1.275499e+05,,,,3.0,4.312500e+04,,0.000000,36
4,3,,,,7048.368421,,,,1.055556,4.939067e+03,...,241719.0,2.212473e+05,,,,19.0,2.963440e+05,,0.000000,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,1.078873e+06,...,3218731.0,2.361632e+06,65.0,578.0,,2614.0,1.532000e+08,364459.0,0.390110,22
23902,3,,,,,,,,,2.307641e+06,...,928963.0,5.041325e+05,,,,,4.615283e+07,22127.0,0.054945,40
23903,6,,,,,,,,,2.222531e+06,...,69080.0,7.500171e+04,,,,,1.555772e+08,43676.0,0.192308,40
23904,6,,,,,,,,,3.175717e+05,...,262604.0,3.333440e+05,,,,,1.365559e+08,,1.181319,41


In [8]:
ori_train = train.copy()
# train = train.loc[train['bad_value']<41]
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,5.050000e+02,...,375576.0,4.886652e+05,27.0,45.0,,496.0,1.515000e+03,,0.000000,24
1,2,,,,,,,,,6.973400e+04,...,2717416.0,2.621840e+06,,,,,2.092020e+05,,0.000000,42
2,4,,,,,,,,,7.764825e+04,...,326517.0,3.265170e+05,,,,,3.105930e+05,,0.000000,42
3,4,,,,1893.333333,,,,1.500000,4.312500e+03,...,1014759.0,1.275499e+05,,,,3.0,4.312500e+04,,0.000000,36
4,3,,,,7048.368421,,,,1.055556,4.939067e+03,...,241719.0,2.212473e+05,,,,19.0,2.963440e+05,,0.000000,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,1.078873e+06,...,3218731.0,2.361632e+06,65.0,578.0,,2614.0,1.532000e+08,364459.0,0.390110,22
23902,3,,,,,,,,,2.307641e+06,...,928963.0,5.041325e+05,,,,,4.615283e+07,22127.0,0.054945,40
23903,6,,,,,,,,,2.222531e+06,...,69080.0,7.500171e+04,,,,,1.555772e+08,43676.0,0.192308,40
23904,6,,,,,,,,,3.175717e+05,...,262604.0,3.333440e+05,,,,,1.365559e+08,,1.181319,41


In [9]:
ori_train.iloc[:, 1:] = ori_train.iloc[:, 1:].fillna(0)
train.iloc[:, 1:] = train.iloc[:, 1:].fillna(0)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day,bad_value
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,5.050000e+02,...,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,1.515000e+03,0.0,0.000000,24
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,6.973400e+04,...,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,2.092020e+05,0.0,0.000000,42
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,7.764825e+04,...,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,3.105930e+05,0.0,0.000000,42
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,4.312500e+03,...,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,4.312500e+04,0.0,0.000000,36
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,4.939067e+03,...,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,2.963440e+05,0.0,0.000000,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,1.078873e+06,...,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,1.532000e+08,364459.0,0.390110,22
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,2.307641e+06,...,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,4.615283e+07,22127.0,0.054945,40
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,2.222531e+06,...,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,1.555772e+08,43676.0,0.192308,40
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,3.175717e+05,...,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,1.365559e+08,0.0,1.181319,41


In [10]:
ori_train = ori_train.drop(['bad_value'], axis=1)
train = train.drop(['bad_value'], axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_tw_ntd,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,5.050000e+02,...,688721.0,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,1.515000e+03,0.0,0.000000
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,6.973400e+04,...,0.0,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,2.092020e+05,0.0,0.000000
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,7.764825e+04,...,0.0,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,3.105930e+05,0.0,0.000000
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,4.312500e+03,...,5680.0,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,4.312500e+04,0.0,0.000000
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,4.939067e+03,...,133919.0,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,2.963440e+05,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,1.078873e+06,...,2583949.0,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,1.532000e+08,364459.0,0.390110
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,2.307641e+06,...,0.0,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,4.615283e+07,22127.0,0.054945
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,2.222531e+06,...,0.0,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,1.555772e+08,43676.0,0.192308
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,3.175717e+05,...,0.0,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,1.365559e+08,0.0,1.181319


In [11]:
# Training
ori_train_x = ori_train.loc[:, train.columns != 'sar_flag']
ori_train_y = ori_train.loc[:, train.columns == 'sar_flag']
train_x = train.loc[:, train.columns != 'sar_flag']
train_y = train.loc[:, train.columns == 'sar_flag']
train_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_tw_ntd,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,5.050000e+02,...,688721.0,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,1.515000e+03,0.0,0.000000
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,6.973400e+04,...,0.0,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,2.092020e+05,0.0,0.000000
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,7.764825e+04,...,0.0,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,3.105930e+05,0.0,0.000000
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,4.312500e+03,...,5680.0,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,4.312500e+04,0.0,0.000000
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,4.939067e+03,...,133919.0,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,2.963440e+05,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,1.078873e+06,...,2583949.0,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,1.532000e+08,364459.0,0.390110
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,2.307641e+06,...,0.0,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,4.615283e+07,22127.0,0.054945
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,2.222531e+06,...,0.0,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,1.555772e+08,43676.0,0.192308
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,3.175717e+05,...,0.0,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,1.365559e+08,0.0,1.181319


In [12]:
#normalize
from sklearn import preprocessing

def normalize_mms(input_df):
    scaler = preprocessing.MinMaxScaler()
    result = scaler.fit_transform(input_df)
    return pd.DataFrame(result, index=input_df.index, columns=input_df.columns)

In [13]:
train_x = normalize_mms(train_x)
train_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_tw_ntd,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day
0,0.4,0.005648,0.018591,0.0,0.000110,0.004122,0.055273,0.0,0.031604,0.000008,...,0.001897,0.005085,0.009321,0.000229,0.011081,0.0,0.023780,2.083666e-08,0.000000,0.000000
1,0.2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.001071,...,0.000000,0.036790,0.050010,0.000000,0.000000,0.0,0.000000,2.877274e-06,0.000000,0.000000
2,0.4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.001193,...,0.000000,0.004421,0.006228,0.000000,0.000000,0.0,0.000000,4.271763e-06,0.000000,0.000000
3,0.4,0.000000,0.000000,0.0,0.000151,0.000000,0.000000,0.0,0.022078,0.000066,...,0.000016,0.013738,0.002433,0.000000,0.000000,0.0,0.000144,5.931227e-07,0.000000,0.000000
4,0.3,0.000000,0.000000,0.0,0.000560,0.000000,0.000000,0.0,0.015536,0.000076,...,0.000369,0.003273,0.004220,0.000000,0.000000,0.0,0.000911,4.075788e-06,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,0.3,0.002705,0.007501,0.0,0.000079,0.003544,0.126556,0.0,0.098149,0.016576,...,0.007115,0.043577,0.045047,0.000551,0.142329,0.0,0.125324,2.107046e-03,0.008875,0.001085
23902,0.3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.035455,...,0.000000,0.012577,0.009616,0.000000,0.000000,0.0,0.000000,6.347662e-04,0.000539,0.000153
23903,0.6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.034148,...,0.000000,0.000935,0.001431,0.000000,0.000000,0.0,0.000000,2.139742e-03,0.001064,0.000535
23904,0.6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.004879,...,0.000000,0.003555,0.006358,0.000000,0.000000,0.0,0.000000,1.878131e-03,0.000000,0.003285


In [14]:
sm_x, sm_y = SMOTE_oversampling(train_x, train_y)

length of oversampled data is  33126
Number of non-sar in oversampled data 16563
Number of sar 16563
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [15]:
RF = RandomForestClassifier()
rfe_columns = RFE_analysis(sm_x, sm_y, RF)

Index(['AGE', 'avg_amt_nontw_nonntd', 'avg_amt_nontw_ntd', 'avg_amt_tw_nonntd',
       'avg_amt_tw_ntd', 'avg_freq_nontw_nonntd', 'avg_freq_nontw_ntd',
       'avg_freq_tw_nonntd', 'avg_freq_tw_ntd', 'avg_tx', 'count_tx',
       'cucah_quarter1_mean', 'cucah_quarter1_std', 'cucah_quarter1_sum',
       'cucah_quarter2_mean', 'cucah_quarter2_std', 'cucah_quarter2_sum',
       'cucah_quarter3_mean', 'cucah_quarter3_std', 'cucah_quarter3_sum',
       'cucah_quarter4_mean', 'cucah_quarter4_std', 'cucah_quarter4_sum',
       'cucah_total_mean', 'cucah_total_std', 'cucah_total_sum', 'risk_rank',
       'sd_amt_nontw_nonntd', 'sd_amt_nontw_ntd', 'sd_amt_tw_nonntd',
       'sd_amt_tw_ntd', 'sd_freq_nontw_nonntd', 'sd_freq_nontw_ntd',
       'sd_freq_tw_nonntd', 'sd_freq_tw_ntd', 'std_tx', 'sum_assets',
       'total_amt_nontw_nonntd', 'total_amt_nontw_ntd', 'total_amt_tw_nonntd',
       'total_amt_tw_ntd', 'total_asset_x', 'total_asset_y',
       'total_freq_nontw_nonntd', 'total_freq_nontw_ntd

In [16]:
# sm_x = sm_x[sm_x.columns[rfe_columns]]
sm_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_tw_ntd,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day
0,0.5,0.0,0.000000,0.0,0.000327,0.0,0.000000,0.0,0.028601,0.001230,...,0.001939,0.000411,0.003365,0.0,0.000000,0.0,0.008198,0.000118,0.000000,0.001328
1,0.3,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.040908,...,0.000000,0.000394,0.000644,0.0,0.000000,0.0,0.000000,0.005310,0.027290,0.002253
2,0.2,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000003,...,0.000000,0.003446,0.004579,0.0,0.000000,0.0,0.000000,0.000002,0.022337,0.017048
3,0.3,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000053,...,0.000000,0.008622,0.007377,0.0,0.000000,0.0,0.000000,0.000004,0.000000,0.005833
4,0.4,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000001,0.000002,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33121,0.3,0.0,0.000000,0.0,0.000017,0.0,0.000000,0.0,0.003955,0.003920,...,0.000021,0.000587,0.000826,0.0,0.000000,0.0,0.000258,0.000201,0.056286,0.000678
33122,0.3,0.0,0.000000,0.0,0.004243,0.0,0.000000,0.0,0.015965,0.001492,...,0.000434,0.002577,0.003631,0.0,0.000000,0.0,0.000830,0.001064,0.000000,0.004937
33123,0.3,0.0,0.000234,0.0,0.000017,0.0,0.010992,0.0,0.005341,0.019265,...,0.000079,0.001915,0.002099,0.0,0.000269,0.0,0.001392,0.001135,0.643624,0.005959
33124,0.4,0.0,0.002975,0.0,0.000022,0.0,0.006861,0.0,0.002435,0.000111,...,0.000013,0.000576,0.000811,0.0,0.000034,0.0,0.000111,0.000035,0.140464,0.003982


In [17]:
RF = LR_training(train_x, train_y, RF)

  LR_model.fit(x_train, y_train)


0.9999442244408501  0.9896269031286599
[[5912    2]
 [  60    3]]


In [18]:
RF = LR_training(sm_x, sm_y, RF)

  LR_model.fit(x_train, y_train)


0.9999597488327161  0.9905819850277711
[[4030   68]
 [  10 4174]]


In [19]:
# Training Evaluation: DONT USE RFE IN RF
# train_x_rfe = train_x[train_x.columns[rfe_columns]]
LR_test(train_x, train_y, RF)


0.9905881368694052
[[23492   180]
 [   45   189]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     23672
           1       0.51      0.81      0.63       234

    accuracy                           0.99     23906
   macro avg       0.76      0.90      0.81     23906
weighted avg       0.99      0.99      0.99     23906



In [20]:
# Fine-tuning test

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RF_search = RandomForestClassifier()
RF_random = RandomizedSearchCV(estimator = RF_search, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RF_random.fit(sm_x, sm_y)
RF_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': False}

In [21]:
# Test
# train_x_rfe = train_x[train_x.columns[rfe_columns]]


# {'n_estimators': 1000,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 'auto',
#  'max_depth': 50,
#  'bootstrap': False}


# {'n_estimators': 400,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 'sqrt',
#  'max_depth': None,
#  'bootstrap': False}

#best param
# RF_random = RandomForestClassifier(n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=50, bootstrap=False)
# RF_random.fit(sm_x, sm_y)
ori_train_x_normal = normalize_mms(ori_train_x)
LR_test(ori_train_x_normal, ori_train_y, RF_random)

0.9951894921776959
[[23600    72]
 [   43   191]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23672
           1       0.73      0.82      0.77       234

    accuracy                           1.00     23906
   macro avg       0.86      0.91      0.88     23906
weighted avg       1.00      1.00      1.00     23906



In [22]:
# Test
xdp = df_from_csv('ori_xdp.csv')

test_xdp = df_from_csv('submit_format.csv')
test_xdp = test_xdp.drop(['probability'], axis=1)
test_xdp = test_xdp.merge(alert_date, on='alert_key', how='left')
test_xdp = test_xdp.merge(alert_cust, on='alert_key', how='left')
test_xdp = test_xdp.progress_apply(calculate_tx, axis=1)
test_xdp

Progress Bar: 100%|██████████| 3850/3850 [02:08<00:00, 29.86it/s]


Unnamed: 0,AGE,alert_key,avg_tx,count_tx,cust_id,date,occupation_code,risk_rank,std_tx,total_asset,total_tx,tx_per_day
0,3.0,357307,3.715794e+04,280.0,1d69b4daf9c5c8b8d68b1776193b6e80bf2e84c999d038...,370.0,4.0,1.0,1.134937e+05,2588452.0,1.040422e+07,1.756757
1,,376329,,,,,,,,,,
2,,373644,,,,,,,,,,
3,5.0,357668,3.953497e+05,356.0,8b51184740375f7ccdd68484aeeaca44c5892818eda908...,370.0,19.0,1.0,3.312853e+06,1964540.0,1.407445e+08,1.962162
4,3.0,354443,1.391112e+06,243.0,c6def618ad861703c025be4f41bdf7569310228ae93aef...,372.0,2.0,1.0,5.941655e+06,444392.0,3.380401e+08,1.653226
...,...,...,...,...,...,...,...,...,...,...,...,...
3845,2.0,364485,2.116042e+04,339.0,f41c0860cf0cb5e7b72b0ced16ab427a06fc0ad3fc0f71...,393.0,17.0,1.0,4.365511e+04,644169.0,7.173381e+06,1.862595
3846,2.0,363155,1.021493e+06,530.0,823fc5ce48cd827628ce0d1c574e6c3582cb772cf6173a...,392.0,17.0,1.0,4.010326e+06,114439.0,5.413915e+08,2.352041
3847,,368710,,,,,,,,,,
3848,3.0,358067,2.265351e+06,309.0,33ff49ea8a07c6d1b7cc203dcc3638ebde62dfb960a169...,382.0,9.0,1.0,7.909179e+06,367478.0,6.999934e+08,1.808901


In [23]:
# Test
test = df_from_csv('submit_format.csv')
test = test.drop(['probability'], axis=1)
test = test.merge(test_xdp, on='alert_key', how='left')
test = test.merge(ccba, on='cust_id', how='left')
test = test.merge(cdtx, on='cust_id', how='left')
test = test.merge(remit, on='alert_key', how='left')
test = test.drop(['alert_key', 'cust_id', 'occupation_code', 'total_asset', 'date'], axis=1)
test['bad_value'] = test.isnull().sum(axis=1) + (test == 0).astype(int).sum(axis=1)
test = test.loc[test['bad_value'] < 50]
test.iloc[:, :] = test.iloc[:, :].fillna(0)
test = test.drop(['bad_value'], axis=1)
test = test.reindex(sorted(test.columns), axis=1)
test = normalize_mms(test)
test

#test[test.columns.difference(train_x.columns.tolist(), sort=False)]

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_tw_ntd,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,trade_sum,tx_per_day
0,0.250,0.000000,0.000000,0.0,0.009252,0.000000,0.000000,0.0,0.018408,0.002079,...,0.098068,0.133775,0.150170,0.000000,0.00000,0.0,0.007458,0.001114,0.000000,0.002202
3,0.500,0.000000,0.000000,0.0,0.000204,0.000000,0.000000,0.0,0.016324,0.022120,...,0.000327,0.101530,0.113974,0.000000,0.00000,0.0,0.001126,0.015066,0.000131,0.002800
4,0.250,0.000000,0.004095,0.0,0.000036,0.000000,0.112200,0.0,0.028396,0.077833,...,0.001485,0.022967,0.021281,0.000000,0.00123,0.0,0.029130,0.036186,0.063476,0.001901
5,0.375,0.000000,0.004175,0.0,0.000037,0.000000,0.112200,0.0,0.042043,0.008877,...,0.002010,0.017026,0.018172,0.000000,0.00369,0.0,0.038418,0.004807,0.054310,0.002117
6,0.250,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.012880,...,0.000000,0.016757,0.018810,0.000000,0.00000,0.0,0.000000,0.002366,0.019646,0.000711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3842,0.375,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000
3844,0.625,0.000000,0.000000,0.0,0.000443,0.000000,0.000000,0.0,0.018089,0.002789,...,0.003633,0.020827,0.030088,0.000000,0.00000,0.0,0.005770,0.000459,0.185010,0.000648
3845,0.125,0.001118,0.009599,0.0,0.000108,0.054348,0.115600,0.0,0.033196,0.001184,...,0.013083,0.033291,0.037372,0.000138,0.02091,0.0,0.085280,0.000768,0.000000,0.002510
3846,0.125,0.000000,0.007782,0.0,0.000155,0.000000,0.115233,0.0,0.022280,0.057153,...,0.003137,0.005914,0.010011,0.000000,0.02337,0.0,0.014213,0.057954,0.085655,0.003935


In [24]:
y_test = RF_random.predict_proba(test)
y_test = pd.DataFrame(y_test, columns=['probability0','probability'])
y_test = y_test.drop(['probability0'], axis=1)
# y_test['probability'] = round(y_test['probability'], 6)
y_test.index = test.index
y_test

Unnamed: 0,probability
0,0.193571
3,0.062143
4,0.092143
5,0.098571
6,0.007857
...,...
3842,0.001183
3844,0.046429
3845,0.095714
3846,0.090714


In [28]:
y_test.sort_values(by='probability', ascending=True)

Unnamed: 0,probability
1719,0.000000
2212,0.000000
1090,0.000000
3161,0.000000
86,0.000005
...,...
2617,0.605371
1467,0.606831
2548,0.633711
541,0.664775


In [31]:
y_test.loc[y_test['probability'] == 0, 'probability'] = 0.000001
y_test.sort_values(by='probability', ascending=True)

Unnamed: 0,probability
1719,0.000001
2212,0.000001
1090,0.000001
3161,0.000001
86,0.000005
...,...
2617,0.605371
1467,0.606831
2548,0.633711
541,0.664775


In [25]:
xy_test = pd.concat([test, y_test], axis=1)
submit = df_from_csv('submit_format.csv')
submit = submit.drop(columns=['probability'])
submit = pd.concat([submit, xy_test], axis=1)
submit = submit[['alert_key','probability']]
submit['probability'] = submit['probability'].fillna(0)
submit = submit.sort_values(by='probability', ascending=False)
submit.to_csv('final_submit.csv')
submit

Unnamed: 0,alert_key,probability
3472,364978,0.714335
541,364667,0.664775
2548,354178,0.633711
1467,364467,0.606831
2617,361996,0.605371
...,...,...
1648,371071,0.000000
1650,369630,0.000000
1651,373443,0.000000
1652,374325,0.000000


In [26]:
submit.isnull().any()

alert_key      False
probability    False
dtype: bool

In [27]:
submit.loc[submit.duplicated(keep=False)]

Unnamed: 0,alert_key,probability
