In [1]:
#Base

import pandas as pd
import os

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFE

import statsmodels.api as sm

from tqdm import tqdm


dataset_folder = os.path.join(os.getcwd(), 'dataset')

def df_from_csv(filename):
    f = os.path.join(dataset_folder, filename)
    return pd.read_csv(f, delimiter='\t|\n|,', engine='python')

x_train = df_from_csv('ori_trainx.csv')
y_train = df_from_csv('ori_trainy.csv')
alert_date = df_from_csv('ori_alert_date.csv')
alert_cust = df_from_csv('ori_custinfo.csv')
xdp = df_from_csv('ori_xdp.csv')

smote = SMOTE(random_state=0)
tqdm.pandas(desc='Progress Bar')

def SMOTE_oversampling(input_x, input_y):
    x_train, x_test, y_train, y_test = train_test_split(input_x, input_y, test_size=0.3)
    x_cols = x_train.columns
    y_cols = y_train.columns

    sm_data_x, sm_data_y = smote.fit_resample(x_train, y_train)
    sm_data_x = pd.DataFrame(data=sm_data_x, columns=x_cols)
    sm_data_y = pd.DataFrame(data=sm_data_y, columns=y_cols)

    print("length of oversampled data is ",len(sm_data_x))
    print("Number of non-sar in oversampled data",len(sm_data_y[sm_data_y['sar_flag']==0]))
    print("Number of sar",len(sm_data_y[sm_data_y['sar_flag']==1]))
    print("Proportion of no subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==0])/len(sm_data_x))
    print("Proportion of subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==1])/len(sm_data_x))

    return sm_data_x, sm_data_y

def RFE_analysis(input_x, input_y, model):
    rfe = RFE(model, step=300)
    rfe.fit(input_x, input_y.values.ravel())
    print(input_x.columns)
    print(rfe.support_)
    print(rfe.ranking_)
    return rfe.get_support(1)

def LR_analysis(input_x, input_y):
    sm_LR = sm.Logit(input_y, input_x)
    result = sm_LR.fit()
    print(result.summary2())

def LR_training(input_x, input_y, LR_model):
    x_train, x_test, y_train, y_test = train_test_split(input_x, input_y, test_size=0.25)
    LR_model.fit(x_train, y_train)
    y_train_predict = LR_model.predict(x_train)
    y_test_predict = LR_model.predict(x_test)
    train_acc = accuracy_score(y_train, y_train_predict)
    test_acc = accuracy_score(y_test, y_test_predict)
    print(f'{train_acc}  {test_acc}')
    cm = confusion_matrix(y_true=y_test, y_pred=y_test_predict)
    print(cm)
    return LR_model

def LR_test(input_x, input_y, LR_model):
    y_predict = LR_model.predict(input_x)
    test_acc = accuracy_score(input_y, y_predict)
    print(f'{test_acc}')
    cm = confusion_matrix(y_true=input_y, y_pred=y_predict)
    print(cm)
    print(classification_report(y_true=input_y, y_pred=y_predict))

def calculate_tx(row):
    if not pd.isna(row['risk_rank']):
        temp_frame = xdp[(xdp['tx_date'] <= row['date']) & (xdp['cust_id'] == row['cust_id'])]
        total_tx = (temp_frame['tx_amt'] * temp_frame['exchg_rate'])
        row['total_tx'] = total_tx.sum()
        row['avg_tx'] = total_tx.mean()
        row['count_tx'] = total_tx.count()
        row['std_tx'] = total_tx.std()
        row['tx_per_day'] = row['count_tx']/row['date']+1
    return row


  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [2]:
# Join data for training

ccba = pd.read_pickle('aggregated_ccba_cust.pkl')
cdtx = pd.read_pickle('cdtx_agg.pkl')
xdp = pd.read_pickle('xdp_x_train.pkl')
remit = pd.read_pickle('cust_remit_raw.pkl')

In [3]:
xdp.head()

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,occupation_code,total_asset,AGE,cr_1_total_tx,cr_1_avg_tx,...,db_2_total_tx,db_2_avg_tx,db_2_count_tx,db_2_std_tx,db_2_tx_per_day,db_3_total_tx,db_3_avg_tx,db_3_count_tx,db_3_std_tx,db_3_tx_per_day
0,171189,0,0,972ee157e63316e8a50dd489bc93730a3ee8a8959d5c6b...,1,17.0,375576.0,4,0.0,,...,0.0,,0,,0.0,0.0,,0,,0.0
1,171202,0,0,a10ab33f90926fb18d7bb5e78034d7f04a1fbed95b7951...,3,12.0,2717416.0,2,0.0,,...,0.0,,0,,0.0,156902.0,156902.0,1,,1.0
2,171599,0,0,3433ecc068ed1c9e2f5037cab5d42d7b901e9bd624c1fb...,1,12.0,326517.0,4,0.0,,...,0.0,,0,,0.0,0.0,,0,,0.0
3,171737,0,0,a0861608615a9365d90f4ba0a813c0ea0471987f925c8b...,3,14.0,1014759.0,4,0.0,,...,0.0,,0,,0.0,0.0,,0,,0.0
4,171142,0,0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,12.0,241719.0,3,0.0,,...,0.0,,0,,0.0,0.0,,0,,0.0


In [4]:
train = df_from_csv('ori_trainx.csv')
train = train.merge(df_from_csv('ori_trainy.csv'), on='alert_key', how='inner')
train = train.merge(alert_cust, on='alert_key', how='left')
train = train[['alert_key', 'cust_id']]
train = train.merge(ccba, on='cust_id', how='left')
train = train.merge(cdtx, on='cust_id', how='left')
train = train.merge(xdp, on='alert_key', how='left')
train = train.merge(remit, on='alert_key', how='left')
train = train.reindex(sorted(train.columns), axis=1)
train

Unnamed: 0,AGE,alert_key,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,171189,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,...,,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,,496.0,
1,2,171202,,,,,,,,,...,,,2717416.0,2717416.0,2.621840e+06,,,,,
2,4,171599,,,,,,,,,...,,,326517.0,326517.0,3.265170e+05,,,,,
3,4,171737,,,,1893.333333,,,,1.500000,...,,5680.0,1014759.0,1014759.0,1.275499e+05,,,,3.0,
4,3,171142,,,,7048.368421,,,,1.055556,...,,133919.0,241719.0,241719.0,2.212473e+05,,,,19.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,352132,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,...,,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,364459.0
23902,3,352125,,,,,,,,,...,,,928963.0,928963.0,5.041325e+05,,,,,22127.0
23903,6,352080,,,,,,,,,...,,,69080.0,69080.0,7.500171e+04,,,,,43676.0
23904,6,352075,,,,,,,,,...,,,262604.0,262604.0,3.333440e+05,,,,,


In [5]:
train['bad_value'] = train.isnull().sum(axis=1) + (train == 0).astype(int).sum(axis=1)

In [6]:
train = train.drop(['cust_id_x', 'cust_id_y', 'date'], axis=1)
train.to_pickle('aggregate.pkl')

In [7]:
train = train.drop(['alert_key'], axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,,...,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,,496.0,,54
1,2,,,,,,,,,,...,,2717416.0,2717416.0,2.621840e+06,,,,,,60
2,4,,,,,,,,,,...,,326517.0,326517.0,3.265170e+05,,,,,,72
3,4,,,,1893.333333,,,,1.500000,,...,5680.0,1014759.0,1014759.0,1.275499e+05,,,,3.0,,62
4,3,,,,7048.368421,,,,1.055556,,...,133919.0,241719.0,241719.0,2.212473e+05,,,,19.0,,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,932.000000,...,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,364459.0,43
23902,3,,,,,,,,,,...,,928963.0,928963.0,5.041325e+05,,,,,22127.0,61
23903,6,,,,,,,,,360378.928571,...,,69080.0,69080.0,7.500171e+04,,,,,43676.0,60
23904,6,,,,,,,,,369905.885463,...,,262604.0,262604.0,3.333440e+05,,,,,,56


In [8]:
train['bad_value'] = train.isnull().sum(axis=1) + (train == 0).astype(int).sum(axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,,...,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,,496.0,,53
1,2,,,,,,,,,,...,,2717416.0,2717416.0,2.621840e+06,,,,,,59
2,4,,,,,,,,,,...,,326517.0,326517.0,3.265170e+05,,,,,,71
3,4,,,,1893.333333,,,,1.500000,,...,5680.0,1014759.0,1014759.0,1.275499e+05,,,,3.0,,61
4,3,,,,7048.368421,,,,1.055556,,...,133919.0,241719.0,241719.0,2.212473e+05,,,,19.0,,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,932.000000,...,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,364459.0,43
23902,3,,,,,,,,,,...,,928963.0,928963.0,5.041325e+05,,,,,22127.0,61
23903,6,,,,,,,,,360378.928571,...,,69080.0,69080.0,7.500171e+04,,,,,43676.0,60
23904,6,,,,,,,,,369905.885463,...,,262604.0,262604.0,3.333440e+05,,,,,,56


In [9]:
ori_train = train.copy()
# train = train.loc[train['bad_value']<41]
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,,...,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,,496.0,,53
1,2,,,,,,,,,,...,,2717416.0,2717416.0,2.621840e+06,,,,,,59
2,4,,,,,,,,,,...,,326517.0,326517.0,3.265170e+05,,,,,,71
3,4,,,,1893.333333,,,,1.500000,,...,5680.0,1014759.0,1014759.0,1.275499e+05,,,,3.0,,61
4,3,,,,7048.368421,,,,1.055556,,...,133919.0,241719.0,241719.0,2.212473e+05,,,,19.0,,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,932.000000,...,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,,2614.0,364459.0,43
23902,3,,,,,,,,,,...,,928963.0,928963.0,5.041325e+05,,,,,22127.0,61
23903,6,,,,,,,,,360378.928571,...,,69080.0,69080.0,7.500171e+04,,,,,43676.0,60
23904,6,,,,,,,,,369905.885463,...,,262604.0,262604.0,3.333440e+05,,,,,,56


In [10]:
ori_train.iloc[:, 1:] = ori_train.iloc[:, 1:].fillna(0)
train.iloc[:, 1:] = train.iloc[:, 1:].fillna(0)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,0.000000,...,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,0.0,53
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,2717416.0,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,0.0,59
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,326517.0,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,0.0,71
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,0.000000,...,5680.0,1014759.0,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,0.0,61
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,0.000000,...,133919.0,241719.0,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,0.0,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,932.000000,...,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,364459.0,43
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,928963.0,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,22127.0,61
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,360378.928571,...,0.0,69080.0,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,43676.0,60
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,369905.885463,...,0.0,262604.0,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,0.0,56


In [11]:
ori_train = ori_train.drop(['bad_value'], axis=1)
train = train.drop(['bad_value'], axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,0.000000,...,0.0,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,0.0
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,2717416.0,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,0.0
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,326517.0,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,0.0
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,0.000000,...,0.0,5680.0,1014759.0,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,0.0
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,0.000000,...,0.0,133919.0,241719.0,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,932.000000,...,0.0,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,364459.0
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,928963.0,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,22127.0
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,360378.928571,...,0.0,0.0,69080.0,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,43676.0
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,369905.885463,...,0.0,0.0,262604.0,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,0.0


In [12]:
# Training
ori_train_x = ori_train.loc[:, train.columns != 'sar_flag']
ori_train_y = ori_train.loc[:, train.columns == 'sar_flag']
train_x = train.loc[:, train.columns != 'sar_flag']
train_y = train.loc[:, train.columns == 'sar_flag']
train_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,9044.740741,3033.355556,0.0,1388.550403,1.350000,1.097561,0.0,2.147186,0.000000,...,0.0,688721.0,375576.0,375576.0,4.886652e+05,27.0,45.0,0.0,496.0,0.0
1,2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,2717416.0,2717416.0,2.621840e+06,0.0,0.0,0.0,0.0,0.0
2,4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,326517.0,326517.0,3.265170e+05,0.0,0.0,0.0,0.0,0.0
3,4,0.000000,0.000000,0.0,1893.333333,0.000000,0.000000,0.0,1.500000,0.000000,...,0.0,5680.0,1014759.0,1014759.0,1.275499e+05,0.0,0.0,0.0,3.0,0.0
4,3,0.000000,0.000000,0.0,7048.368421,0.000000,0.000000,0.0,1.055556,0.000000,...,0.0,133919.0,241719.0,241719.0,2.212473e+05,0.0,0.0,0.0,19.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,988.503826,1.160714,2.513043,0.0,6.668367,932.000000,...,0.0,2583949.0,3218731.0,3218731.0,2.361632e+06,65.0,578.0,0.0,2614.0,364459.0
23902,3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,928963.0,928963.0,5.041325e+05,0.0,0.0,0.0,0.0,22127.0
23903,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,360378.928571,...,0.0,0.0,69080.0,69080.0,7.500171e+04,0.0,0.0,0.0,0.0,43676.0
23904,6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,369905.885463,...,0.0,0.0,262604.0,262604.0,3.333440e+05,0.0,0.0,0.0,0.0,0.0


In [13]:
#normalize
from sklearn import preprocessing

def normalize_mms(input_df):
    scaler = preprocessing.MinMaxScaler()
    result = scaler.fit_transform(input_df)
    return pd.DataFrame(result, index=input_df.index, columns=input_df.columns)

In [14]:
train_x = normalize_mms(train_x)
train_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.4,0.005648,0.018591,0.0,0.000110,0.004122,0.055273,0.0,0.031604,0.000308,...,0.0,0.001897,0.005085,0.005085,0.009321,0.000229,0.011081,0.0,0.023780,0.000000
1,0.2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000308,...,0.0,0.000000,0.036790,0.036790,0.050010,0.000000,0.000000,0.0,0.000000,0.000000
2,0.4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000308,...,0.0,0.000000,0.004421,0.004421,0.006228,0.000000,0.000000,0.0,0.000000,0.000000
3,0.4,0.000000,0.000000,0.0,0.000151,0.000000,0.000000,0.0,0.022078,0.000308,...,0.0,0.000016,0.013738,0.013738,0.002433,0.000000,0.000000,0.0,0.000144,0.000000
4,0.3,0.000000,0.000000,0.0,0.000560,0.000000,0.000000,0.0,0.015536,0.000308,...,0.0,0.000369,0.003273,0.003273,0.004220,0.000000,0.000000,0.0,0.000911,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,0.3,0.002705,0.007501,0.0,0.000079,0.003544,0.126556,0.0,0.098149,0.000354,...,0.0,0.007115,0.043577,0.043577,0.045047,0.000551,0.142329,0.0,0.125324,0.008875
23902,0.3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000308,...,0.0,0.000000,0.012577,0.012577,0.009616,0.000000,0.000000,0.0,0.000000,0.000539
23903,0.6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.018332,...,0.0,0.000000,0.000935,0.000935,0.001431,0.000000,0.000000,0.0,0.000000,0.001064
23904,0.6,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.018809,...,0.0,0.000000,0.003555,0.003555,0.006358,0.000000,0.000000,0.0,0.000000,0.000000


In [15]:
sm_x, sm_y = SMOTE_oversampling(train_x, train_y)

length of oversampled data is  33170
Number of non-sar in oversampled data 16585
Number of sar 16585
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [16]:
RF = RandomForestClassifier()
rfe_columns = RFE_analysis(sm_x, sm_y, RF)

Index(['AGE', 'avg_amt_nontw_nonntd', 'avg_amt_nontw_ntd', 'avg_amt_tw_nonntd',
       'avg_amt_tw_ntd', 'avg_freq_nontw_nonntd', 'avg_freq_nontw_ntd',
       'avg_freq_tw_nonntd', 'avg_freq_tw_ntd', 'cr_1_avg_tx', 'cr_1_count_tx',
       'cr_1_std_tx', 'cr_1_total_tx', 'cr_1_tx_per_day', 'cr_2_avg_tx',
       'cr_2_count_tx', 'cr_2_std_tx', 'cr_2_total_tx', 'cr_2_tx_per_day',
       'cr_3_avg_tx', 'cr_3_count_tx', 'cr_3_std_tx', 'cr_3_total_tx',
       'cr_3_tx_per_day', 'cucah_quarter1_mean', 'cucah_quarter1_std',
       'cucah_quarter1_sum', 'cucah_quarter2_mean', 'cucah_quarter2_std',
       'cucah_quarter2_sum', 'cucah_quarter3_mean', 'cucah_quarter3_std',
       'cucah_quarter3_sum', 'cucah_quarter4_mean', 'cucah_quarter4_std',
       'cucah_quarter4_sum', 'cucah_total_mean', 'cucah_total_std',
       'cucah_total_sum', 'db_1_avg_tx', 'db_1_count_tx', 'db_1_std_tx',
       'db_1_total_tx', 'db_1_tx_per_day', 'db_2_avg_tx', 'db_2_count_tx',
       'db_2_std_tx', 'db_2_total_tx', '

In [17]:
# sm_x = sm_x[sm_x.columns[rfe_columns]]
sm_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.400000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.003402,...,0.0,0.000000e+00,0.004325,0.004325,0.009470,0.000000,0.000000,0.0,0.000000,0.107272
1,0.400000,0.000365,0.004162,0.0,0.000139,0.003054,0.050360,0.0,0.019391,0.000308,...,0.0,8.010966e-04,0.000401,0.000401,0.000540,0.000025,0.000246,0.0,0.007959,0.000000
2,0.300000,0.000000,0.000000,0.0,0.000002,0.000000,0.000000,0.0,0.014719,0.000467,...,0.0,8.261141e-08,0.001434,0.001434,0.009925,0.000000,0.000000,0.0,0.000048,0.000000
3,0.500000,0.000000,0.021041,0.0,0.003924,0.000000,0.050360,0.0,0.024885,0.000308,...,0.0,2.228357e-02,0.007716,0.007716,0.008875,0.000000,0.000246,0.0,0.007863,0.000000
4,0.200000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.019010,...,0.0,0.000000e+00,0.007708,0.007708,0.008692,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33165,0.291320,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.002643,...,0.0,0.000000e+00,0.007653,0.007653,0.010783,0.000000,0.000000,0.0,0.000000,0.000000
33166,0.408901,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000312,...,0.0,0.000000e+00,0.000152,0.000152,0.000214,0.000000,0.000000,0.0,0.000000,0.000000
33167,0.200000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000405,...,0.0,0.000000e+00,0.000107,0.000107,0.000151,0.000000,0.000000,0.0,0.000000,0.054262
33168,0.431741,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000425,...,0.0,0.000000e+00,0.000913,0.000913,0.001286,0.000000,0.000000,0.0,0.000000,0.143436


In [18]:
RF = LR_training(train_x, train_y, RF)

  LR_model.fit(x_train, y_train)


1.0  0.9886230550443367
[[5907    5]
 [  63    2]]


In [19]:
RF = LR_training(sm_x, sm_y, RF)

  LR_model.fit(x_train, y_train)


0.9999598022269566  0.9955384058844808
[[4166   22]
 [  15 4090]]


In [20]:
# Training Evaluation: DONT USE RFE IN RF
# train_x_rfe = train_x[train_x.columns[rfe_columns]]
LR_test(train_x, train_y, RF)


0.9939764075964193
[[23611    61]
 [   83   151]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23672
           1       0.71      0.65      0.68       234

    accuracy                           0.99     23906
   macro avg       0.85      0.82      0.84     23906
weighted avg       0.99      0.99      0.99     23906



In [21]:
# Fine-tuning test

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RF_search = RandomForestClassifier()
RF_random = RandomizedSearchCV(estimator = RF_search, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RF_random.fit(sm_x, sm_y)
RF_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'n_estimators': 600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 60,
 'bootstrap': False}

In [22]:
# Test
# train_x_rfe = train_x[train_x.columns[rfe_columns]]


# {'n_estimators': 1000,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 'auto',
#  'max_depth': 50,
#  'bootstrap': False}


# {'n_estimators': 400,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 'sqrt',
#  'max_depth': None,
#  'bootstrap': False}

#best param
# RF_random = RandomForestClassifier(n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=50, bootstrap=False)
# RF_random.fit(sm_x, sm_y)
ori_train_x_normal = normalize_mms(ori_train_x)
LR_test(ori_train_x_normal, ori_train_y, RF_random)

0.995775119216933
[[23648    24]
 [   77   157]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23672
           1       0.87      0.67      0.76       234

    accuracy                           1.00     23906
   macro avg       0.93      0.83      0.88     23906
weighted avg       1.00      1.00      1.00     23906



In [23]:
# Test
# xdp = df_from_csv('ori_xdp.csv')
#
# test_xdp = df_from_csv('submit_format.csv')
# test_xdp = test_xdp.drop(['probability'], axis=1)
# test_xdp = test_xdp.merge(alert_date, on='alert_key', how='left')
# test_xdp = test_xdp.merge(alert_cust, on='alert_key', how='left')
# test_xdp = test_xdp.progress_apply(calculate_tx, axis=1)
# test_xdp

In [24]:
test_xdp = pd.read_pickle('xdp_x_test.pkl')

In [31]:
# Test
test = df_from_csv('submit_format.csv')
test = test.drop(['probability'], axis=1)
test = test.merge(test_xdp, on='alert_key', how='left')
test = test.merge(ccba, on='cust_id', how='left')
test = test.merge(cdtx, on='cust_id', how='left')
test = test.merge(remit, on='alert_key', how='left')
test = test.drop(['alert_key', 'cust_id', 'date'], axis=1)
test['bad_value'] = test.isnull().sum(axis=1) + (test == 0).astype(int).sum(axis=1)
test = test.loc[test['bad_value'] < 77]
test.iloc[:, :] = test.iloc[:, :].fillna(0)
test = test.drop(['bad_value'], axis=1)
test = test.reindex(sorted(test.columns), axis=1)
test = normalize_mms(test)
test

#test[test.columns.difference(train_x.columns.tolist(), sort=False)]

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,cr_1_avg_tx,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.250,0.000000,0.000000,0.0,0.009252,0.000000,0.000000,0.0,0.018408,0.005766,...,0.0,0.098068,0.133775,0.133775,0.150170,0.000000,0.00000,0.0,0.007458,0.000000
3,0.500,0.000000,0.000000,0.0,0.000204,0.000000,0.000000,0.0,0.016324,0.000000,...,0.0,0.000327,0.101530,0.101530,0.113974,0.000000,0.00000,0.0,0.001126,0.000131
4,0.250,0.000000,0.004095,0.0,0.000036,0.000000,0.112200,0.0,0.028396,0.000000,...,0.0,0.001485,0.022967,0.022967,0.021281,0.000000,0.00123,0.0,0.029130,0.063476
5,0.375,0.000000,0.004175,0.0,0.000037,0.000000,0.112200,0.0,0.042043,0.002113,...,0.0,0.002010,0.017026,0.017026,0.018172,0.000000,0.00369,0.0,0.038418,0.054310
6,0.250,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000448,...,0.0,0.000000,0.016757,0.016757,0.018810,0.000000,0.00000,0.0,0.000000,0.019646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3842,0.375,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000
3844,0.625,0.000000,0.000000,0.0,0.000443,0.000000,0.000000,0.0,0.018089,0.000000,...,0.0,0.003633,0.020827,0.020827,0.030088,0.000000,0.00000,0.0,0.005770,0.185010
3845,0.125,0.001118,0.009599,0.0,0.000108,0.054348,0.115600,0.0,0.033196,0.015637,...,0.0,0.013083,0.033291,0.033291,0.037372,0.000138,0.02091,0.0,0.085280,0.000000
3846,0.125,0.000000,0.007782,0.0,0.000155,0.000000,0.115233,0.0,0.022280,0.000001,...,0.0,0.003137,0.005914,0.005914,0.010011,0.000000,0.02337,0.0,0.014213,0.085655


In [33]:
y_test = RF_random.predict_proba(test)
y_test = pd.DataFrame(y_test, columns=['probability0','probability'])
y_test = y_test.drop(['probability0'], axis=1)
# y_test['probability'] = round(y_test['probability'], 6)
y_test.index = test.index
y_test

Unnamed: 0,probability
0,0.212222
3,0.024444
4,0.140694
5,0.260000
6,0.080000
...,...
3842,0.035000
3844,0.065139
3845,0.239028
3846,0.167639


In [34]:
y_test.sort_values(by='probability', ascending=True)

Unnamed: 0,probability
2019,0.000000
1569,0.000000
1530,0.000000
2735,0.000000
1057,0.000000
...,...
2681,0.477222
3537,0.480278
1816,0.487180
3472,0.539722


In [None]:
y_test.loc[y_test['probability'] == 0, 'probability'] = 0.000001
y_test.sort_values(by='probability', ascending=True)

In [35]:
xy_test = pd.concat([test, y_test], axis=1)
submit = df_from_csv('submit_format.csv')
submit = submit.drop(columns=['probability'])
submit = pd.concat([submit, xy_test], axis=1)
submit = submit[['alert_key','probability']]
submit['probability'] = submit['probability'].fillna(0)
submit = submit.sort_values(by='probability', ascending=False)
submit.to_csv('final_submit.csv')
submit

Unnamed: 0,alert_key,probability
3745,364031,0.569444
3472,364978,0.539722
1816,355737,0.487180
3537,353075,0.480278
2681,358192,0.477222
...,...,...
1646,367567,0.000000
1647,373667,0.000000
1648,371071,0.000000
1650,369630,0.000000


In [None]:
submit.isnull().any()

In [None]:
submit.loc[submit.duplicated(keep=False)]