In [83]:
#Base

import pandas as pd
import os

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

from sklearn.feature_selection import RFE

import statsmodels.api as sm

from tqdm import tqdm


dataset_folder = os.path.join(os.getcwd(), 'dataset')

def df_from_csv(filename):
    f = os.path.join(dataset_folder, filename)
    return pd.read_csv(f, delimiter='\t|\n|,', engine='python')

x_train = df_from_csv('ori_trainx.csv')
y_train = df_from_csv('ori_trainy.csv')
alert_date = df_from_csv('ori_alert_date.csv')
alert_cust = df_from_csv('ori_custinfo.csv')
xdp = df_from_csv('ori_xdp.csv')

smote = SMOTE(random_state=0)
tqdm.pandas(desc='Progress Bar')

def SMOTE_oversampling(input_x, input_y):
    x_train, x_test, y_train, y_test = train_test_split(input_x, input_y, test_size=0.3)
    x_cols = x_train.columns
    y_cols = y_train.columns

    sm_data_x, sm_data_y = smote.fit_resample(x_train, y_train)
    sm_data_x = pd.DataFrame(data=sm_data_x, columns=x_cols)
    sm_data_y = pd.DataFrame(data=sm_data_y, columns=y_cols)

    print("length of oversampled data is ",len(sm_data_x))
    print("Number of non-sar in oversampled data",len(sm_data_y[sm_data_y['sar_flag']==0]))
    print("Number of sar",len(sm_data_y[sm_data_y['sar_flag']==1]))
    print("Proportion of no subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==0])/len(sm_data_x))
    print("Proportion of subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==1])/len(sm_data_x))

    return sm_data_x, sm_data_y

def RFE_analysis(input_x, input_y, model):
    rfe = RFE(model, step=300)
    rfe.fit(input_x, input_y.values.ravel())
    print(input_x.columns)
    print(rfe.support_)
    print(rfe.ranking_)
    return rfe.get_support(1)

def LR_analysis(input_x, input_y):
    sm_LR = sm.Logit(input_y, input_x)
    result = sm_LR.fit()
    print(result.summary2())

def LR_training(input_x, input_y, LR_model):
    x_train, x_test, y_train, y_test = train_test_split(input_x, input_y, test_size=0.25)
    LR_model.fit(x_train, y_train)
    y_train_predict = LR_model.predict(x_train)
    y_test_predict = LR_model.predict(x_test)
    train_acc = accuracy_score(y_train, y_train_predict)
    test_acc = accuracy_score(y_test, y_test_predict)
    print(f'{train_acc}  {test_acc}')
    cm = confusion_matrix(y_true=y_test, y_pred=y_test_predict)
    print(cm)
    return LR_model

def LR_test(input_x, input_y, LR_model):
    y_predict = LR_model.predict(input_x)
    test_acc = accuracy_score(input_y, y_predict)
    print(f'{test_acc}')
    cm = confusion_matrix(y_true=input_y, y_pred=y_predict)
    print(cm)

def calculate_tx(row):
    if not pd.isna(row['risk_rank']):
        temp_frame = xdp[(xdp['tx_date'] <= row['date']) & (xdp['cust_id'] == row['cust_id'])]
        total_tx = (temp_frame['tx_amt'] * temp_frame['exchg_rate'])
        row['total_tx'] = total_tx.sum()
        row['avg_tx'] = total_tx.mean()
        row['count_tx'] = total_tx.count()
        row['std_tx'] = total_tx.std()
        row['tx_per_day'] = row['count_tx']/row['date']+1
    return row


In [2]:
# Join data for training

ccba = pd.read_pickle('aggregated_ccba_cust.pkl')
cdtx = pd.read_pickle('cdtx_agg.pkl')
xdp = pd.read_pickle('xdp_train_test1.pkl')

In [3]:
train = df_from_csv('ori_trainx.csv')
train = train.merge(df_from_csv('ori_trainy.csv'), on='alert_key', how='inner')
train = train.merge(alert_cust, on='alert_key', how='left')
train = train[['alert_key', 'cust_id']]
train = train.merge(ccba, on='cust_id', how='left')
train = train.merge(cdtx, on='cust_id', how='left')
train = train.merge(xdp, on='alert_key', how='left')
train = train.reindex(sorted(train.columns), axis=1)
train

Unnamed: 0,AGE,alert_key,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,...,total_amt_nontw_nonntd,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day
0,4,171189,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,...,244208.0,136501.0,,688721.0,27.0,45.0,,496.0,1.515000e+03,0.000000
1,2,171202,,,,,,,,,...,,,,,,,,,2.092020e+05,0.000000
2,4,171599,,,,,,,,,...,,,,,,,,,3.105930e+05,0.000000
3,4,171737,,,,1893.333333,,,,1.500000,...,,,,5680.0,,,,3.0,4.312500e+04,0.000000
4,3,171142,,,,7048.368421,,,,1.055556,...,,,,133919.0,,,,19.0,2.963440e+05,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,352132,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,...,281614.0,707437.0,,2583949.0,65.0,578.0,,2614.0,1.532000e+08,0.390110
23902,3,352125,,,,,,,,,...,,,,,,,,,4.615283e+07,0.054945
23903,6,352080,,,,,,,,,...,,,,,,,,,1.555772e+08,0.192308
23904,6,352075,,,,,,,,,...,,,,,,,,,1.365559e+08,1.181319


In [4]:
train['bad_value'] = train.isnull().sum(axis=1) + (train == 0).astype(int).sum(axis=1)

In [5]:
train = train.drop(['cust_id_x', 'cust_id_y'], axis=1)
train.to_pickle('aggregate.pkl')

In [6]:
train = train.drop(['alert_key'], axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,5.050000e+02,...,136501.0,,688721.0,27.0,45.0,,496.0,1.515000e+03,0.000000,24
1,2,,,,,,,,,6.973400e+04,...,,,,,,,,2.092020e+05,0.000000,42
2,4,,,,,,,,,7.764825e+04,...,,,,,,,,3.105930e+05,0.000000,42
3,4,,,,1893.333333,,,,1.500000,4.312500e+03,...,,,5680.0,,,,3.0,4.312500e+04,0.000000,36
4,3,,,,7048.368421,,,,1.055556,4.939067e+03,...,,,133919.0,,,,19.0,2.963440e+05,0.000000,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,1.078873e+06,...,707437.0,,2583949.0,65.0,578.0,,2614.0,1.532000e+08,0.390110,22
23902,3,,,,,,,,,2.307641e+06,...,,,,,,,,4.615283e+07,0.054945,40
23903,6,,,,,,,,,2.222531e+06,...,,,,,,,,1.555772e+08,0.192308,40
23904,6,,,,,,,,,3.175717e+05,...,,,,,,,,1.365559e+08,1.181319,40


In [7]:
train['bad_value'] = train.isnull().sum(axis=1) + (train == 0).astype(int).sum(axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day,bad_value
0,4,9044.740741,3033.355556,,1388.550403,1.350000,1.097561,,2.147186,5.050000e+02,...,136501.0,,688721.0,27.0,45.0,,496.0,1.515000e+03,0.000000,24
1,2,,,,,,,,,6.973400e+04,...,,,,,,,,2.092020e+05,0.000000,42
2,4,,,,,,,,,7.764825e+04,...,,,,,,,,3.105930e+05,0.000000,42
3,4,,,,1893.333333,,,,1.500000,4.312500e+03,...,,,5680.0,,,,3.0,4.312500e+04,0.000000,36
4,3,,,,7048.368421,,,,1.055556,4.939067e+03,...,,,133919.0,,,,19.0,2.963440e+05,0.000000,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,988.503826,1.160714,2.513043,,6.668367,1.078873e+06,...,707437.0,,2583949.0,65.0,578.0,,2614.0,1.532000e+08,0.390110,22
23902,3,,,,,,,,,2.307641e+06,...,,,,,,,,4.615283e+07,0.054945,40
23903,6,,,,,,,,,2.222531e+06,...,,,,,,,,1.555772e+08,0.192308,40
23904,6,,,,,,,,,3.175717e+05,...,,,,,,,,1.365559e+08,1.181319,40


In [8]:
train = train.loc[train['bad_value']<41]
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day,bad_value
0,4,9044.740741,3033.355556,,1.388550e+03,1.350000,1.097561,,2.147186,5.050000e+02,...,136501.0,,688721.0,27.0,45.0,,496.0,1.515000e+03,0.000000,24
3,4,,,,1.893333e+03,,,,1.500000,4.312500e+03,...,,,5680.0,,,,3.0,4.312500e+04,0.000000,36
4,3,,,,7.048368e+03,,,,1.055556,4.939067e+03,...,,,133919.0,,,,19.0,2.963440e+05,0.000000,36
5,6,,,,1.112640e+06,,,,2.000000,1.063333e+03,...,,,11126405.0,,,,10.0,3.190000e+03,0.000000,36
7,3,,,,2.710122e+04,,,,1.500000,1.916968e+05,...,,,243911.0,,,,9.0,1.725271e+06,0.000000,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,,9.885038e+02,1.160714,2.513043,,6.668367,1.078873e+06,...,707437.0,,2583949.0,65.0,578.0,,2614.0,1.532000e+08,0.390110,22
23902,3,,,,,,,,,2.307641e+06,...,,,,,,,,4.615283e+07,0.054945,40
23903,6,,,,,,,,,2.222531e+06,...,,,,,,,,1.555772e+08,0.192308,40
23904,6,,,,,,,,,3.175717e+05,...,,,,,,,,1.365559e+08,1.181319,40


In [9]:
train.iloc[:, 1:] = train.iloc[:, 1:].fillna(0)
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.iloc[:, 1:] = train.iloc[:, 1:].fillna(0)


Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day,bad_value
0,4,9044.740741,3033.355556,0.0,1.388550e+03,1.350000,1.097561,0.0,2.147186,5.050000e+02,...,136501.0,0.0,688721.0,27.0,45.0,0.0,496.0,1.515000e+03,0.000000,24
3,4,0.000000,0.000000,0.0,1.893333e+03,0.000000,0.000000,0.0,1.500000,4.312500e+03,...,0.0,0.0,5680.0,0.0,0.0,0.0,3.0,4.312500e+04,0.000000,36
4,3,0.000000,0.000000,0.0,7.048368e+03,0.000000,0.000000,0.0,1.055556,4.939067e+03,...,0.0,0.0,133919.0,0.0,0.0,0.0,19.0,2.963440e+05,0.000000,36
5,6,0.000000,0.000000,0.0,1.112640e+06,0.000000,0.000000,0.0,2.000000,1.063333e+03,...,0.0,0.0,11126405.0,0.0,0.0,0.0,10.0,3.190000e+03,0.000000,36
7,3,0.000000,0.000000,0.0,2.710122e+04,0.000000,0.000000,0.0,1.500000,1.916968e+05,...,0.0,0.0,243911.0,0.0,0.0,0.0,9.0,1.725271e+06,0.000000,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,9.885038e+02,1.160714,2.513043,0.0,6.668367,1.078873e+06,...,707437.0,0.0,2583949.0,65.0,578.0,0.0,2614.0,1.532000e+08,0.390110,22
23902,3,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,2.307641e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.615283e+07,0.054945,40
23903,6,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,2.222531e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.555772e+08,0.192308,40
23904,6,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,3.175717e+05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.365559e+08,1.181319,40


In [10]:
train = train.drop(['bad_value'], axis=1)
train

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_nontw_nonntd,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day
0,4,9044.740741,3033.355556,0.0,1.388550e+03,1.350000,1.097561,0.0,2.147186,5.050000e+02,...,244208.0,136501.0,0.0,688721.0,27.0,45.0,0.0,496.0,1.515000e+03,0.000000
3,4,0.000000,0.000000,0.0,1.893333e+03,0.000000,0.000000,0.0,1.500000,4.312500e+03,...,0.0,0.0,0.0,5680.0,0.0,0.0,0.0,3.0,4.312500e+04,0.000000
4,3,0.000000,0.000000,0.0,7.048368e+03,0.000000,0.000000,0.0,1.055556,4.939067e+03,...,0.0,0.0,0.0,133919.0,0.0,0.0,0.0,19.0,2.963440e+05,0.000000
5,6,0.000000,0.000000,0.0,1.112640e+06,0.000000,0.000000,0.0,2.000000,1.063333e+03,...,0.0,0.0,0.0,11126405.0,0.0,0.0,0.0,10.0,3.190000e+03,0.000000
7,3,0.000000,0.000000,0.0,2.710122e+04,0.000000,0.000000,0.0,1.500000,1.916968e+05,...,0.0,0.0,0.0,243911.0,0.0,0.0,0.0,9.0,1.725271e+06,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,9.885038e+02,1.160714,2.513043,0.0,6.668367,1.078873e+06,...,281614.0,707437.0,0.0,2583949.0,65.0,578.0,0.0,2614.0,1.532000e+08,0.390110
23902,3,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,2.307641e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.615283e+07,0.054945
23903,6,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,2.222531e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.555772e+08,0.192308
23904,6,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,3.175717e+05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.365559e+08,1.181319


In [11]:
# Training
train_x = train.loc[:, train.columns != 'sar_flag']
train_y = train.loc[:, train.columns == 'sar_flag']
train_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_nontw_nonntd,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day
0,4,9044.740741,3033.355556,0.0,1.388550e+03,1.350000,1.097561,0.0,2.147186,5.050000e+02,...,244208.0,136501.0,0.0,688721.0,27.0,45.0,0.0,496.0,1.515000e+03,0.000000
3,4,0.000000,0.000000,0.0,1.893333e+03,0.000000,0.000000,0.0,1.500000,4.312500e+03,...,0.0,0.0,0.0,5680.0,0.0,0.0,0.0,3.0,4.312500e+04,0.000000
4,3,0.000000,0.000000,0.0,7.048368e+03,0.000000,0.000000,0.0,1.055556,4.939067e+03,...,0.0,0.0,0.0,133919.0,0.0,0.0,0.0,19.0,2.963440e+05,0.000000
5,6,0.000000,0.000000,0.0,1.112640e+06,0.000000,0.000000,0.0,2.000000,1.063333e+03,...,0.0,0.0,0.0,11126405.0,0.0,0.0,0.0,10.0,3.190000e+03,0.000000
7,3,0.000000,0.000000,0.0,2.710122e+04,0.000000,0.000000,0.0,1.500000,1.916968e+05,...,0.0,0.0,0.0,243911.0,0.0,0.0,0.0,9.0,1.725271e+06,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,4332.523077,1223.939446,0.0,9.885038e+02,1.160714,2.513043,0.0,6.668367,1.078873e+06,...,281614.0,707437.0,0.0,2583949.0,65.0,578.0,0.0,2614.0,1.532000e+08,0.390110
23902,3,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,2.307641e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.615283e+07,0.054945
23903,6,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,2.222531e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.555772e+08,0.192308
23904,6,0.000000,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.0,0.000000,3.175717e+05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.365559e+08,1.181319


In [106]:
#normalize
from sklearn import preprocessing

def normalize_mms(input_df):
    scaler = preprocessing.MinMaxScaler()
    result = scaler.fit_transform(input_df)
    return pd.DataFrame(result, index=input_df.index, columns=input_df.columns)

In [13]:
train_x = normalize_mms(train_x)
train_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_nontw_nonntd,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day
0,0.375,0.005648,0.018591,0.0,0.000110,0.004122,0.055273,0.0,0.031604,0.000008,...,0.002336,0.004388,0.0,0.001897,0.000229,0.011081,0.0,0.023780,2.083666e-08,0.000000
1,0.375,0.000000,0.000000,0.0,0.000151,0.000000,0.000000,0.0,0.022078,0.000066,...,0.000000,0.000000,0.0,0.000016,0.000000,0.000000,0.0,0.000144,5.931227e-07,0.000000
2,0.250,0.000000,0.000000,0.0,0.000560,0.000000,0.000000,0.0,0.015536,0.000076,...,0.000000,0.000000,0.0,0.000369,0.000000,0.000000,0.0,0.000911,4.075788e-06,0.000000
3,0.625,0.000000,0.000000,0.0,0.088478,0.000000,0.000000,0.0,0.029437,0.000016,...,0.000000,0.000000,0.0,0.030639,0.000000,0.000000,0.0,0.000479,4.387389e-08,0.000000
4,0.250,0.000000,0.000000,0.0,0.002155,0.000000,0.000000,0.0,0.022078,0.002945,...,0.000000,0.000000,0.0,0.000672,0.000000,0.000000,0.0,0.000431,2.372864e-05,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21776,0.250,0.002705,0.007501,0.0,0.000079,0.003544,0.126556,0.0,0.098149,0.016576,...,0.002694,0.022742,0.0,0.007115,0.000551,0.142329,0.0,0.125324,2.107046e-03,0.001085
21777,0.250,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.035455,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,6.347662e-04,0.000153
21778,0.625,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.034148,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,2.139742e-03,0.000535
21779,0.625,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.004879,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,1.878131e-03,0.003285


In [14]:
sm_x, sm_y = SMOTE_oversampling(train_x, train_y)

length of oversampled data is  30168
Number of non-sar in oversampled data 15084
Number of sar 15084
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [15]:
LR = LogisticRegression(max_iter=1000, n_jobs=-1)
rfe_columns = RFE_analysis(sm_x, sm_y, LR)

Index(['AGE', 'avg_amt_nontw_nonntd', 'avg_amt_nontw_ntd', 'avg_amt_tw_nonntd',
       'avg_amt_tw_ntd', 'avg_freq_nontw_nonntd', 'avg_freq_nontw_ntd',
       'avg_freq_tw_nonntd', 'avg_freq_tw_ntd', 'avg_tx', 'count_tx',
       'cucah_quarter1_mean', 'cucah_quarter1_std', 'cucah_quarter1_sum',
       'cucah_quarter2_mean', 'cucah_quarter2_std', 'cucah_quarter2_sum',
       'cucah_quarter3_mean', 'cucah_quarter3_std', 'cucah_quarter3_sum',
       'cucah_quarter4_mean', 'cucah_quarter4_std', 'cucah_quarter4_sum',
       'cucah_total_mean', 'cucah_total_std', 'cucah_total_sum', 'date',
       'risk_rank', 'sd_amt_nontw_nonntd', 'sd_amt_nontw_ntd',
       'sd_amt_tw_nonntd', 'sd_amt_tw_ntd', 'sd_freq_nontw_nonntd',
       'sd_freq_nontw_ntd', 'sd_freq_tw_nonntd', 'sd_freq_tw_ntd', 'std_tx',
       'total_amt_nontw_nonntd', 'total_amt_nontw_ntd', 'total_amt_tw_nonntd',
       'total_amt_tw_ntd', 'total_freq_nontw_nonntd', 'total_freq_nontw_ntd',
       'total_freq_tw_nonntd', 'total_freq_t

In [16]:
sm_x = sm_x[sm_x.columns[rfe_columns]]
sm_x

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_ntd,count_tx,cucah_quarter1_std,cucah_quarter2_std,...,sd_amt_tw_ntd,sd_freq_nontw_nonntd,sd_freq_nontw_ntd,sd_freq_tw_ntd,total_amt_nontw_nonntd,total_amt_nontw_ntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_tx
0,1.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.001070,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000019
1,0.375000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.002617,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000246
2,0.125000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000049,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000005
3,0.125000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000148,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000002
4,0.500000,0.0,0.001698,0.038948,0.0,0.050360,0.016241,0.000839,0.0,0.0,...,0.077861,0.0,0.0,0.003012,0.0,0.000009,0.043159,0.0,0.000246,0.000409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30163,0.125000,0.0,0.000000,0.000003,0.0,0.000000,0.000859,0.000772,0.0,0.0,...,0.000004,0.0,0.0,0.000223,0.0,0.000000,0.000010,0.0,0.000000,0.001753
30164,0.250000,0.0,0.000000,0.000080,0.0,0.000000,0.019328,0.001154,0.0,0.0,...,0.000260,0.0,0.0,0.005183,0.0,0.000000,0.000744,0.0,0.000000,0.002455
30165,0.375000,0.0,0.023222,0.000239,0.0,0.033557,0.031401,0.008880,0.0,0.0,...,0.000230,0.0,0.0,0.011976,0.0,0.000244,0.006173,0.0,0.000328,0.001776
30166,0.250000,0.0,0.000000,0.000067,0.0,0.000000,0.016289,0.001035,0.0,0.0,...,0.000219,0.0,0.0,0.004369,0.0,0.000000,0.000627,0.0,0.000000,0.002104


In [None]:
# LR_analysis(sm_x, sm_y)

In [17]:
LR = LR_training(sm_x, sm_y, LR)

  y = column_or_1d(y, warn=True)


0.7304870502961195  0.7309732166534075
[[2714 1074]
 [ 955 2799]]


In [19]:
# Training Evaluation
train_x_rfe = train_x[train_x.columns[rfe_columns]]
LR_test(train_x_rfe, train_y, LR)


0.719021165235756
[[15493  6054]
 [   66   168]]


In [None]:
# Fine-tuning test

from sklearn.model_selection import GridSearchCV

LR = LogisticRegression(max_iter=1000)
LRparam_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    # 'max_iter': list(range(100,800,100)),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}
LRparam_grid2 = {
    'C': [0.1, 1, 10, 100, 1000, 10000],
    'penalty': ['l1', 'l2'],
    'max_iter': [100,200,300,400,600,700,800,900],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

#{'C': 10000, 'max_iter': 600, 'penalty': 'l2', 'solver': 'lbfgs'}

LR_search = GridSearchCV(LR, param_grid=LRparam_grid2, refit = True, verbose = 3, cv=5)

# fitting the model for grid search
LR_search.fit(sm_x , sm_y.values.ravel())
# summarize
print('Mean Accuracy: %.3f' % LR_search.best_score_)
print('Config: %s' % LR_search.best_params_)

In [21]:
# Test
train_x_rfe = train_x[train_x.columns[rfe_columns]]
LR_test(train_x_rfe, train_y, LR_search)

0.72370414581516
[[15593  5954]
 [   64   170]]


In [84]:
# Test
test_xdp = df_from_csv('submit_format.csv')
test_xdp = test_xdp.drop(['probability'], axis=1)
test_xdp = test_xdp.merge(alert_date, on='alert_key', how='left')
test_xdp = test_xdp.merge(alert_cust, on='alert_key', how='left')
test_xdp = test_xdp.progress_apply(calculate_tx, axis=1)
test_xdp

Progress Bar: 100%|██████████| 3850/3850 [02:09<00:00, 29.73it/s]


Unnamed: 0,AGE,alert_key,avg_tx,count_tx,cust_id,date,occupation_code,risk_rank,std_tx,total_asset,total_tx,tx_per_day
0,3.0,357307,3.715794e+04,280.0,1d69b4daf9c5c8b8d68b1776193b6e80bf2e84c999d038...,370.0,4.0,1.0,1.134937e+05,2588452.0,1.040422e+07,1.756757
1,,376329,,,,,,,,,,
2,,373644,,,,,,,,,,
3,5.0,357668,3.953497e+05,356.0,8b51184740375f7ccdd68484aeeaca44c5892818eda908...,370.0,19.0,1.0,3.312853e+06,1964540.0,1.407445e+08,1.962162
4,3.0,354443,1.391112e+06,243.0,c6def618ad861703c025be4f41bdf7569310228ae93aef...,372.0,2.0,1.0,5.941655e+06,444392.0,3.380401e+08,1.653226
...,...,...,...,...,...,...,...,...,...,...,...,...
3845,2.0,364485,2.116042e+04,339.0,f41c0860cf0cb5e7b72b0ced16ab427a06fc0ad3fc0f71...,393.0,17.0,1.0,4.365511e+04,644169.0,7.173381e+06,1.862595
3846,2.0,363155,1.021493e+06,530.0,823fc5ce48cd827628ce0d1c574e6c3582cb772cf6173a...,392.0,17.0,1.0,4.010326e+06,114439.0,5.413915e+08,2.352041
3847,,368710,,,,,,,,,,
3848,3.0,358067,2.265351e+06,309.0,33ff49ea8a07c6d1b7cc203dcc3638ebde62dfb960a169...,382.0,9.0,1.0,7.909179e+06,367478.0,6.999934e+08,1.808901


In [107]:
# Test
test = df_from_csv('submit_format.csv')
test = test.drop(['probability'], axis=1)
test = test.merge(test_xdp, on='alert_key', how='left')
test = test.merge(ccba, on='cust_id', how='left')
test = test.merge(cdtx, on='cust_id', how='left')
test = test.drop(['alert_key', 'cust_id', 'occupation_code', 'total_asset'], axis=1)
test['bad_value'] = test.isnull().sum(axis=1) + (test == 0).astype(int).sum(axis=1)
test = test.loc[test['bad_value'] < 47]
test.iloc[:, :] = test.iloc[:, :].fillna(0)
test = test.drop(['bad_value'], axis=1)
test = test.reindex(sorted(test.columns), axis=1)
test = normalize_mms(test)
test

#test[test.columns.difference(train_x.columns.tolist(), sort=False)]

Unnamed: 0,AGE,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,avg_freq_tw_ntd,avg_tx,...,total_amt_nontw_nonntd,total_amt_nontw_ntd,total_amt_tw_nonntd,total_amt_tw_ntd,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,total_tx,tx_per_day
0,0.250,0.000000,0.000000,0.0,0.009252,0.000000,0.000000,0.0,0.018408,0.002079,...,0.000000,0.000000,0.0,0.098068,0.000000,0.00000,0.0,0.007458,0.001114,0.002202
3,0.500,0.000000,0.000000,0.0,0.000204,0.000000,0.000000,0.0,0.016324,0.022120,...,0.000000,0.000000,0.0,0.000327,0.000000,0.00000,0.0,0.001126,0.015066,0.002800
4,0.250,0.000000,0.004095,0.0,0.000036,0.000000,0.112200,0.0,0.028396,0.077833,...,0.000000,0.000022,0.0,0.001485,0.000000,0.00123,0.0,0.029130,0.036186,0.001901
5,0.375,0.000000,0.004175,0.0,0.000037,0.000000,0.112200,0.0,0.042043,0.008877,...,0.000000,0.000068,0.0,0.002010,0.000000,0.00369,0.0,0.038418,0.004807,0.002117
6,0.250,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.012880,...,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.002366,0.000711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3842,0.375,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000
3844,0.625,0.000000,0.000000,0.0,0.000443,0.000000,0.000000,0.0,0.018089,0.002789,...,0.000000,0.000000,0.0,0.003633,0.000000,0.00000,0.0,0.005770,0.000459,0.000648
3845,0.125,0.001118,0.009599,0.0,0.000108,0.054348,0.115600,0.0,0.033196,0.001184,...,0.000157,0.000892,0.0,0.013083,0.000138,0.02091,0.0,0.085280,0.000768,0.002510
3846,0.125,0.000000,0.007782,0.0,0.000155,0.000000,0.115233,0.0,0.022280,0.057153,...,0.000000,0.000808,0.0,0.003137,0.000000,0.02337,0.0,0.014213,0.057954,0.003935


In [113]:
y_test = LR_search.predict_proba(test[test.columns[rfe_columns]])
y_test = pd.DataFrame(y_test, columns=['probability0','probability'])
y_test = y_test.drop(['probability0'], axis=1)
y_test['probability'] = round(y_test['probability'], 6)
y_test.index = test.index
y_test

Unnamed: 0,probability
0,0.396870
3,0.617891
4,0.546751
5,0.367926
6,0.062164
...,...
3842,0.835441
3844,0.258380
3845,0.005943
3846,0.059511


In [116]:
xy_test = pd.concat([test, y_test], axis=1)
submit = df_from_csv('submit_format.csv')
submit = submit.drop(columns=['probability'])
submit = pd.concat([submit, xy_test], axis=1)
submit = submit[['alert_key','probability']]
submit['probability'] = submit['probability'].fillna(0.01)
submit = submit.sort_values(by='probability', ascending=False)
submit.to_csv('final_submit.csv')
submit

Unnamed: 0,alert_key,probability
528,364191,1.000000
98,356306,1.000000
3286,358265,1.000000
1347,353568,0.999941
2438,361277,0.999576
...,...,...
3611,359626,0.000000
2165,354668,0.000000
1753,353879,0.000000
1496,354013,0.000000


In [117]:
submit.isnull().any()

alert_key      False
probability    False
dtype: bool

In [118]:
submit.loc[submit.duplicated(keep=False)]

Unnamed: 0,alert_key,probability


# DNN

In [None]:
# DNN
# https://towardsdatascience.com/deep-learning-with-python-neural-networks-complete-tutorial-6b53c0b06af0
# https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33

from keras import models, layers, utils, backend as K
from keras.callbacks import ModelCheckpoint
import tensorflow as tf

input_x = sm_x
input_y = sm_y
n_features = len(input_x.columns)
epoch = 50

inputs = layers.Input(name='input', shape=(n_features,))

h1 = layers.Dense(name='h1', units=int(round((n_features+1)/2)), activation='relu')(inputs)
h1 = layers.Dropout(name='drop1', rate=0.2)(h1)

h2 = layers.Dense(name='h2', units=int(round((n_features+1)/4)), activation='relu')(h1)
h2 = layers.Dropout(name='drop2', rate=0.2)(h2)

outputs = layers.Dense(name='output', units=1, activation='sigmoid')(h2)

DNN_1 = models.Model(inputs=inputs, outputs=outputs, name='DNN_1')
DNN_1.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
DNN_1.summary()

In [None]:
# Save checkpoint in case learning stopped in the middle
# param 'mode' is based on 'monitor' param
# classification: monitor accuracy
# regression: monitor loss

checkpoint_path = "weights-improvement-{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callbacks = [checkpoint]

In [None]:
DNN_1.fit(x=input_x, y=input_y, batch_size=32, epochs=epoch, validation_split=0.3, callbacks=callbacks)

In [None]:
best_file = 'weights-improvement-01-1.00.hdf5'
DNN_1.load_weights(best_file)
DNN_1.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

In [None]:
DNN_y_pred = DNN_1.predict(input_x)

In [None]:
DNN_y_pred