In [1]:
#Base

import pandas as pd
import os

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

import statsmodels.api as sm
from tqdm import tqdm

# CSV inputs are read from a 'dataset' folder under the current working directory.
dataset_folder = os.path.join(os.getcwd(), 'dataset')

def df_from_csv(filename):
    """Load a delimited text file from ``dataset_folder`` into a DataFrame.

    Fields are split with a regex separator (tab, newline or comma), which is
    why pandas' pure-Python parsing engine is requested.

    Args:
        filename: path of the file, joined onto ``dataset_folder``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    csv_path = os.path.join(dataset_folder, filename)
    frame = pd.read_csv(csv_path, delimiter='\t|\n|,', engine='python')
    return frame

# Load the four source tables, in the listed order, with a single unpacking.
x_train, y_train, alert_date, alert_cust = (
    df_from_csv(csv_name)
    for csv_name in (
        'ori_trainx.csv',
        'ori_trainy.csv',
        'ori_alert_date.csv',
        'ori_custinfo.csv',
    )
)

# Enable tqdm's pandas integration and build a seeded SMOTE sampler.
tqdm.pandas(desc='Progress Bar')
smote = SMOTE(random_state=0)

def RFE_analysis(input_x, input_y, model):
    """Run recursive feature elimination and report which columns survive.

    Prints the column labels of ``input_x``, the boolean support mask and the
    feature ranking produced by the fitted selector.

    Args:
        input_x: feature DataFrame.
        input_y: target exposing ``.values``; flattened to 1-D before fitting.
        model: estimator handed to ``RFE``.

    Returns:
        Integer positions of the selected feature columns.
    """
    flat_target = input_y.values.ravel()
    selector = RFE(model, step=300, n_features_to_select=0.97)
    selector.fit(input_x, flat_target)
    for report in (input_x.columns, selector.support_, selector.ranking_):
        print(report)
    return selector.get_support(indices=True)

def LR_analysis(input_x, input_y):
    """Fit a statsmodels ``Logit`` of ``input_y`` on ``input_x`` and print its ``summary2`` report."""
    fitted = sm.Logit(input_y, input_x).fit()
    print(fitted.summary2())

def LR_training(input_x, input_y, LR_model):
    """Fit ``LR_model`` on an 80/20 split of the data and print train/test metrics.

    Despite the name, any estimator exposing ``fit`` and ``predict`` works.

    Args:
        input_x: feature matrix.
        input_y: target; a single-column DataFrame is accepted and flattened.
        LR_model: estimator to fit (fitted in place).

    Returns:
        The same estimator object, now fitted on the 80% training part.
    """
    import numpy as np  # local import: numpy is not imported by the notebook's first cell

    # Distinct local names so the notebook-level x_train / y_train are not shadowed.
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        input_x, input_y, test_size=0.2, random_state=42)

    # Flatten (n, 1) targets to 1-D so fit() does not warn about a column vector.
    y_fit = np.ravel(y_fit)
    y_holdout = np.ravel(y_holdout)

    LR_model.fit(x_fit, y_fit)
    y_fit_predict = LR_model.predict(x_fit)
    y_holdout_predict = LR_model.predict(x_holdout)
    train_acc = accuracy_score(y_fit, y_fit_predict)
    test_acc = accuracy_score(y_holdout, y_holdout_predict)

    # Metrics on the rows the model was fitted on.
    print(f'Train acc: {train_acc}  Test acc:{test_acc}')
    cm_train = confusion_matrix(y_true=y_fit, y_pred=y_fit_predict)
    print(cm_train)
    print(classification_report(y_true=y_fit, y_pred=y_fit_predict))

    # Metrics on the held-out 20%. The header previously labelled train_acc as "Test acc".
    print(f'\nTrain acc: {train_acc}  Test acc:{test_acc}')
    cm_test = confusion_matrix(y_true=y_holdout, y_pred=y_holdout_predict)
    print(cm_test)
    print(classification_report(y_true=y_holdout, y_pred=y_holdout_predict))

    return LR_model

def LR_test(input_x, input_y, LR_model):
    """Score an already-fitted model on the given data and print the metrics.

    Prints the accuracy, the confusion matrix and the classification report of
    ``LR_model.predict(input_x)`` against ``input_y``. Returns nothing.
    """
    predicted = LR_model.predict(input_x)
    print(f'Test acc: {accuracy_score(input_y, predicted)}')
    print(confusion_matrix(y_true=input_y, y_pred=predicted))
    print(classification_report(y_true=input_y, y_pred=predicted))

def calculate_tx(row):
    """Attach transaction aggregates for one row's customer up to the row's date.

    Reads the notebook-level ``xdp`` frame (columns used: ``tx_date``,
    ``cust_id``, ``tx_amt``, ``exchg_rate``). Rows whose ``risk_rank`` is
    missing are returned unchanged.

    Args:
        row: a Series with at least ``risk_rank``, ``date`` and ``cust_id``.

    Returns:
        The same row, extended with ``total_tx``, ``avg_tx``, ``count_tx``,
        ``std_tx`` and ``tx_per_day`` when ``risk_rank`` is present.
    """
    if pd.isna(row['risk_rank']):
        return row

    # Transactions of this customer dated on or before the row's date.
    history = xdp[(xdp['tx_date'] <= row['date']) & (xdp['cust_id'] == row['cust_id'])]
    converted = history['tx_amt'] * history['exchg_rate']
    tx_count = converted.count()

    row['total_tx'] = converted.sum()
    row['avg_tx'] = converted.mean()
    row['count_tx'] = tx_count
    row['std_tx'] = converted.std()
    # Parenthesised on purpose: the old `count / date + 1` added 1 to the rate
    # and divided by zero on day 0.
    # NOTE(review): assumes `date` is a 0-based day index - confirm.
    row['tx_per_day'] = tx_count / (row['date'] + 1)
    return row

In [2]:
# Join data for training
# Feature tables stored as pickles, read from the working directory in the listed order.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
ccba, cdtx, xdp, remit = map(pd.read_pickle, (
    'aggregated_ccba_cust.pkl',
    'alert_key_cdtx.pkl',
    'xdp_x_train.pkl',
    'cust_remit_raw_pre_20221221.pkl',
))

In [3]:
# Delete the bad column from the remit dataset:
# keep column positions 0-5 and 7, i.e. drop the column at position 6.
# Not idempotent: re-running on the already-trimmed frame fails because position 7 no longer exists.
remit = remit.iloc[:, [*range(6), 7]]
remit

Unnamed: 0,alert_key,total_asset_x,sum_assets,total_asset_y,trade_sum,count_assets,ratio
0,352249,1465816.0,33853238.0,1.471880e+06,,23,
1,352253,98177.0,4997197.0,8.923566e+04,2923745.0,56,0.030521
2,352254,2052922.0,2052922.0,2.052922e+06,,1,
3,352280,201906.0,3529876.0,6.660143e+04,164364.0,53,0.405207
4,352282,7450.0,14900.0,7.450000e+03,8356.0,2,0.891575
...,...,...,...,...,...,...,...
25746,352123,12207.0,427479.0,1.042632e+04,,41,
25747,352124,259985.0,3490135.0,1.517450e+05,,23,
25748,352125,928963.0,1008265.0,5.041325e+05,22127.0,2,22.783590
25749,352128,21647.0,902254.0,3.007513e+04,,30,


In [4]:
# Assemble the training table as one pipeline:
#   1. keep the alert keys present in both the x and y files (inner join),
#   2. attach customer info and reduce to the two join keys,
#   3. left-join each feature table on its key,
#   4. order the columns alphabetically.
train = (
    df_from_csv('ori_trainx.csv')
    .merge(df_from_csv('ori_trainy.csv'), on='alert_key', how='inner')
    .merge(alert_cust, on='alert_key', how='left')
    .loc[:, ['alert_key', 'cust_id']]
    .merge(ccba, on='cust_id', how='left')
    .merge(cdtx, on='alert_key', how='left')
    .merge(xdp, on='alert_key', how='left')
    .merge(remit, on='alert_key', how='left')
    .pipe(lambda merged: merged.reindex(sorted(merged.columns), axis=1))
)
train

Unnamed: 0,AGE,alert_key,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,171189,1.000000,1.000000,,673.00000,,,,1.000000,...,0.0,0.0,375576.0,375576.0,4.886652e+05,0,1,0,0,
1,2,171202,0.000000,0.000000,,,,,,,...,0.0,0.0,2717416.0,2717416.0,2.621840e+06,0,0,0,0,
2,4,171599,1.000000,1.000000,,,,,,,...,0.0,0.0,326517.0,326517.0,3.265170e+05,0,0,0,0,
3,4,171737,0.900000,0.969090,,,,,,,...,0.0,0.0,1014759.0,1014759.0,1.275499e+05,0,0,0,0,
4,3,171142,1.000000,1.000000,,,,,,,...,0.0,0.0,241719.0,241719.0,2.212473e+05,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,352132,0.690141,0.977016,4400.571429,1242.34238,,1013.510067,1.166667,2.371287,...,0.0,2416208.0,3218731.0,3218731.0,2.361632e+06,63,479,0,2384,364459.0
23902,3,352125,0.650000,0.608263,,,,,,,...,0.0,0.0,928963.0,928963.0,5.041325e+05,0,0,0,0,22127.0
23903,6,352080,0.600000,0.941020,,,,,,,...,0.0,0.0,69080.0,69080.0,7.500171e+04,0,0,0,0,43676.0
23904,6,352075,0.969767,0.998672,,,,,,,...,0.0,0.0,262604.0,262604.0,3.333440e+05,0,0,0,0,


In [5]:
# Per-row data-quality counter: cells that are missing or equal to 0.
# A cell cannot be both, so counting the union equals the sum of the two counts.
train['bad_value'] = (train.isna() | (train == 0)).sum(axis=1)

In [6]:
# Remove the suffixed customer-id columns and the date column, then cache the frame to disk.
train = train.drop(columns=['cust_id_x', 'cust_id_y', 'date'])
train.to_pickle('aggregate.pkl')

In [7]:
# The alert key is an identifier, not a feature; remove it before modelling.
train = train.drop(columns=['alert_key'])
train

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,1.000000,1.000000,,673.00000,,,,1.000000,,...,0.0,375576.0,375576.0,4.886652e+05,0,1,0,0,,86
1,2,0.000000,0.000000,,,,,,,,...,0.0,2717416.0,2717416.0,2.621840e+06,0,0,0,0,,85
2,4,1.000000,1.000000,,,,,,,,...,0.0,326517.0,326517.0,3.265170e+05,0,0,0,0,,90
3,4,0.900000,0.969090,,,,,,,,...,0.0,1014759.0,1014759.0,1.275499e+05,0,0,0,0,,86
4,3,1.000000,1.000000,,,,,,,,...,0.0,241719.0,241719.0,2.212473e+05,0,0,0,0,,83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,0.690141,0.977016,4400.571429,1242.34238,,1013.510067,1.166667,2.371287,,...,2416208.0,3218731.0,3218731.0,2.361632e+06,63,479,0,2384,364459.0,58
23902,3,0.650000,0.608263,,,,,,,,...,0.0,928963.0,928963.0,5.041325e+05,0,0,0,0,22127.0,78
23903,6,0.600000,0.941020,,,,,,,,...,0.0,69080.0,69080.0,7.500171e+04,0,0,0,0,43676.0,77
23904,6,0.969767,0.998672,,,,,,,,...,0.0,262604.0,262604.0,3.333440e+05,0,0,0,0,,72


In [8]:
# Recount missing/zero cells per row for the current set of columns.
# Any existing 'bad_value' column is excluded first so the counter never counts
# itself (a stale 0 would add 1) and re-running the cell gives the same result.
train['bad_value'] = (
    train.drop(columns='bad_value', errors='ignore')
    .pipe(lambda feats: feats.isnull().sum(axis=1) + (feats == 0).astype(int).sum(axis=1))
)
train

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,1.000000,1.000000,,673.00000,,,,1.000000,,...,0.0,375576.0,375576.0,4.886652e+05,0,1,0,0,,85
1,2,0.000000,0.000000,,,,,,,,...,0.0,2717416.0,2717416.0,2.621840e+06,0,0,0,0,,84
2,4,1.000000,1.000000,,,,,,,,...,0.0,326517.0,326517.0,3.265170e+05,0,0,0,0,,89
3,4,0.900000,0.969090,,,,,,,,...,0.0,1014759.0,1014759.0,1.275499e+05,0,0,0,0,,85
4,3,1.000000,1.000000,,,,,,,,...,0.0,241719.0,241719.0,2.212473e+05,0,0,0,0,,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,0.690141,0.977016,4400.571429,1242.34238,,1013.510067,1.166667,2.371287,,...,2416208.0,3218731.0,3218731.0,2.361632e+06,63,479,0,2384,364459.0,58
23902,3,0.650000,0.608263,,,,,,,,...,0.0,928963.0,928963.0,5.041325e+05,0,0,0,0,22127.0,78
23903,6,0.600000,0.941020,,,,,,,,...,0.0,69080.0,69080.0,7.500171e+04,0,0,0,0,43676.0,77
23904,6,0.969767,0.998672,,,,,,,,...,0.0,262604.0,262604.0,3.333440e+05,0,0,0,0,,72


In [9]:
# Keep an independent copy of the current frame under the name `ori_train`.
ori_train = train.copy()
train

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,1.000000,1.000000,,673.00000,,,,1.000000,,...,0.0,375576.0,375576.0,4.886652e+05,0,1,0,0,,85
1,2,0.000000,0.000000,,,,,,,,...,0.0,2717416.0,2717416.0,2.621840e+06,0,0,0,0,,84
2,4,1.000000,1.000000,,,,,,,,...,0.0,326517.0,326517.0,3.265170e+05,0,0,0,0,,89
3,4,0.900000,0.969090,,,,,,,,...,0.0,1014759.0,1014759.0,1.275499e+05,0,0,0,0,,85
4,3,1.000000,1.000000,,,,,,,,...,0.0,241719.0,241719.0,2.212473e+05,0,0,0,0,,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,0.690141,0.977016,4400.571429,1242.34238,,1013.510067,1.166667,2.371287,,...,2416208.0,3218731.0,3218731.0,2.361632e+06,63,479,0,2384,364459.0,58
23902,3,0.650000,0.608263,,,,,,,,...,0.0,928963.0,928963.0,5.041325e+05,0,0,0,0,22127.0,78
23903,6,0.600000,0.941020,,,,,,,,...,0.0,69080.0,69080.0,7.500171e+04,0,0,0,0,43676.0,77
23904,6,0.969767,0.998672,,,,,,,,...,0.0,262604.0,262604.0,3.333440e+05,0,0,0,0,,72


In [10]:
# Replace missing values with 0 in every column except the first one, for both frames.
for frame in (ori_train, train):
    frame.iloc[:, 1:] = frame.iloc[:, 1:].fillna(0)
train

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum,bad_value
0,4,1.000000,1.000000,0.000000,673.00000,0.0,0.000000,0.000000,1.000000,0.0,...,0.0,375576.0,375576.0,4.886652e+05,0,1,0,0,0.0,85
1,2,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,2717416.0,2717416.0,2.621840e+06,0,0,0,0,0.0,84
2,4,1.000000,1.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,326517.0,326517.0,3.265170e+05,0,0,0,0,0.0,89
3,4,0.900000,0.969090,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,1014759.0,1014759.0,1.275499e+05,0,0,0,0,0.0,85
4,3,1.000000,1.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,241719.0,241719.0,2.212473e+05,0,0,0,0,0.0,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,0.690141,0.977016,4400.571429,1242.34238,0.0,1013.510067,1.166667,2.371287,0.0,...,2416208.0,3218731.0,3218731.0,2.361632e+06,63,479,0,2384,364459.0,58
23902,3,0.650000,0.608263,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,928963.0,928963.0,5.041325e+05,0,0,0,0,22127.0,78
23903,6,0.600000,0.941020,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,69080.0,69080.0,7.500171e+04,0,0,0,0,43676.0,77
23904,6,0.969767,0.998672,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,262604.0,262604.0,3.333440e+05,0,0,0,0,0.0,72


In [11]:
# The row-quality counter is not used as a feature; remove it from both frames.
ori_train, train = (
    frame.drop(columns=['bad_value']) for frame in (ori_train, train)
)
train

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,1.000000,1.000000,0.000000,673.00000,0.0,0.000000,0.000000,1.000000,0.0,...,0.0,0.0,375576.0,375576.0,4.886652e+05,0,1,0,0,0.0
1,2,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,2717416.0,2717416.0,2.621840e+06,0,0,0,0,0.0
2,4,1.000000,1.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,326517.0,326517.0,3.265170e+05,0,0,0,0,0.0
3,4,0.900000,0.969090,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,1014759.0,1014759.0,1.275499e+05,0,0,0,0,0.0
4,3,1.000000,1.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,241719.0,241719.0,2.212473e+05,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,0.690141,0.977016,4400.571429,1242.34238,0.0,1013.510067,1.166667,2.371287,0.0,...,0.0,2416208.0,3218731.0,3218731.0,2.361632e+06,63,479,0,2384,364459.0
23902,3,0.650000,0.608263,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,928963.0,928963.0,5.041325e+05,0,0,0,0,22127.0
23903,6,0.600000,0.941020,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,69080.0,69080.0,7.500171e+04,0,0,0,0,43676.0
23904,6,0.969767,0.998672,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,262604.0,262604.0,3.333440e+05,0,0,0,0,0.0


In [12]:
# Training Dataset
# Split each frame into features and target. Each frame is masked with its OWN
# column labels (previously `ori_train` was masked with `train.columns`, which
# only worked while both frames happened to share the same columns).
# The target is kept as a one-column DataFrame because later code calls `.values` on it.
for _frame_name, _frame in (('ori_train', ori_train), ('train', train)):
    # Fail loudly here instead of silently producing an empty target frame.
    assert 'sar_flag' in _frame.columns, f"{_frame_name} has no 'sar_flag' column"

ori_train_x = ori_train.loc[:, ori_train.columns != 'sar_flag']
ori_train_y = ori_train.loc[:, ori_train.columns == 'sar_flag']
train_x = train.loc[:, train.columns != 'sar_flag']
train_y = train.loc[:, train.columns == 'sar_flag']
train_x

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,4,1.000000,1.000000,0.000000,673.00000,0.0,0.000000,0.000000,1.000000,0.0,...,0.0,0.0,375576.0,375576.0,4.886652e+05,0,1,0,0,0.0
1,2,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,2717416.0,2717416.0,2.621840e+06,0,0,0,0,0.0
2,4,1.000000,1.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,326517.0,326517.0,3.265170e+05,0,0,0,0,0.0
3,4,0.900000,0.969090,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,1014759.0,1014759.0,1.275499e+05,0,0,0,0,0.0
4,3,1.000000,1.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,241719.0,241719.0,2.212473e+05,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,3,0.690141,0.977016,4400.571429,1242.34238,0.0,1013.510067,1.166667,2.371287,0.0,...,0.0,2416208.0,3218731.0,3218731.0,2.361632e+06,63,479,0,2384,364459.0
23902,3,0.650000,0.608263,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,928963.0,928963.0,5.041325e+05,0,0,0,0,22127.0
23903,6,0.600000,0.941020,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,69080.0,69080.0,7.500171e+04,0,0,0,0,43676.0
23904,6,0.969767,0.998672,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,262604.0,262604.0,3.333440e+05,0,0,0,0,0.0


In [13]:
# Normalize function
from sklearn import preprocessing

def normalize_mms(input_df):
    """Min-max scale every column of ``input_df``, keeping its index and column labels.

    A new ``MinMaxScaler`` is fitted on every call, so the scaling is relative
    to the frame that is passed in.

    Returns:
        A new DataFrame with the scaled values.
    """
    scaled = preprocessing.MinMaxScaler().fit_transform(input_df)
    return pd.DataFrame(scaled, columns=input_df.columns, index=input_df.index)

In [14]:
# Min-max scale the feature columns; the scaled frame replaces `train_x`.
train_x = normalize_mms(train_x)
train_x

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.4,1.000000,1.000000,0.000000,0.004125,0.0,0.000000,0.00000,0.038462,0.0,...,0.0,0.000000,0.005085,0.005085,0.009321,0.000000,0.000402,0.0,0.00000,0.000000
1,0.2,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,...,0.0,0.000000,0.036790,0.036790,0.050010,0.000000,0.000000,0.0,0.00000,0.000000
2,0.4,1.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,...,0.0,0.000000,0.004421,0.004421,0.006228,0.000000,0.000000,0.0,0.00000,0.000000
3,0.4,0.900000,0.969090,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,...,0.0,0.000000,0.013738,0.013738,0.002433,0.000000,0.000000,0.0,0.00000,0.000000
4,0.3,1.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,...,0.0,0.000000,0.003273,0.003273,0.004220,0.000000,0.000000,0.0,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23901,0.3,0.690141,0.977016,0.001785,0.007614,0.0,0.000024,0.00334,0.091203,0.0,...,0.0,0.018918,0.043577,0.043577,0.045047,0.002654,0.192679,0.0,0.20534,0.008875
23902,0.3,0.650000,0.608263,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,...,0.0,0.000000,0.012577,0.012577,0.009616,0.000000,0.000000,0.0,0.00000,0.000539
23903,0.6,0.600000,0.941020,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,...,0.0,0.000000,0.000935,0.000935,0.001431,0.000000,0.000000,0.0,0.00000,0.001064
23904,0.6,0.969767,0.998672,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.0,...,0.0,0.000000,0.003555,0.003555,0.006358,0.000000,0.000000,0.0,0.00000,0.000000


In [15]:
# View categorical features: print every feature column with its position
# so the categorical column indices can be read off.
attribute_list = train_x.columns.tolist()
for position, column_name in enumerate(attribute_list):
    print(f'{position} {column_name}')

0 AGE
1 atm_count_rate
2 atm_tx_rate
3 avg_amt_nontw_nonntd
4 avg_amt_nontw_ntd
5 avg_amt_tw_nonntd
6 avg_amt_tw_ntd
7 avg_freq_nontw_nonntd
8 avg_freq_nontw_ntd
9 avg_freq_tw_nonntd
10 avg_freq_tw_ntd
11 cb_count_rate
12 cb_tx_rate
13 count_assets
14 cr_1_avg_tx
15 cr_1_count_tx
16 cr_1_std_tx
17 cr_1_total_tx
18 cr_1_tx_per_day
19 cr_2_avg_tx
20 cr_2_count_tx
21 cr_2_std_tx
22 cr_2_total_tx
23 cr_2_tx_per_day
24 cr_3_avg_tx
25 cr_3_count_tx
26 cr_3_std_tx
27 cr_3_total_tx
28 cr_3_tx_per_day
29 cucah_quarter1_mean
30 cucah_quarter1_std
31 cucah_quarter1_sum
32 cucah_quarter2_mean
33 cucah_quarter2_std
34 cucah_quarter2_sum
35 cucah_quarter3_mean
36 cucah_quarter3_std
37 cucah_quarter3_sum
38 cucah_quarter4_mean
39 cucah_quarter4_std
40 cucah_quarter4_sum
41 cucah_total_mean
42 cucah_total_std
43 cucah_total_sum
44 db_1_avg_tx
45 db_1_count_tx
46 db_1_std_tx
47 db_1_total_tx
48 db_1_tx_per_day
49 db_2_avg_tx
50 db_2_count_tx
51 db_2_std_tx
52 db_2_total_tx
53 db_2_tx_per_day
54 db_3_av

In [16]:
# Positions treated as categorical features: column 0 plus columns 59 through 79.
cf_index = [0, *range(59, 80)]
cf_index

[0,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79]

In [17]:
# ADASYN is the sampler actually applied; it produces the oversampled training set.
ada = ADASYN(random_state=42)
sm_x, sm_y = ada.fit_resample(train_x, train_y)

# Alternative samplers, constructed here but not applied in this cell.
smn = SMOTENC(categorical_features=cf_index)
smt = SMOTETomek(smote=smn, random_state=42)

In [18]:
# Seeded random forest; the training cells below refit this same object.
RF = RandomForestClassifier(random_state=42)

In [19]:
# Display the oversampled feature frame returned by ADASYN.
sm_x

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.40000,1.000000,1.000000,0.0,4.124798e-03,0.0,0.000000e+00,0.0,0.038462,0.0,...,0.0,0.000000e+00,0.005085,0.005085,0.009321,0.0,4.022526e-04,0.0,0.000000e+00,0.000000e+00
1,0.20000,0.000000,0.000000,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,...,0.0,0.000000e+00,0.036790,0.036790,0.050010,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00
2,0.40000,1.000000,1.000000,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,...,0.0,0.000000e+00,0.004421,0.004421,0.006228,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00
3,0.40000,0.900000,0.969090,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,...,0.0,0.000000e+00,0.013738,0.013738,0.002433,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00
4,0.30000,1.000000,1.000000,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,...,0.0,0.000000e+00,0.003273,0.003273,0.004220,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47292,0.20000,0.212815,0.361712,0.0,1.753813e-07,0.0,5.973528e-08,0.0,0.000012,0.0,...,0.0,5.177058e-07,0.001917,0.001917,0.002701,0.0,1.264891e-07,0.0,7.041990e-07,3.750706e-08
47293,0.20000,0.323894,0.400762,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,...,0.0,0.000000e+00,0.000595,0.000595,0.000839,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00
47294,0.20000,0.469377,0.637090,0.0,3.505811e-04,0.0,1.194087e-04,0.0,0.024176,0.0,...,0.0,1.034876e-03,0.013702,0.013702,0.019305,0.0,2.528473e-04,0.0,1.407669e-03,7.497529e-05
47295,0.20000,0.417370,0.581269,0.0,2.795516e-04,0.0,9.521594e-05,0.0,0.019278,0.0,...,0.0,8.252048e-04,0.011313,0.011313,0.015939,0.0,2.016192e-04,0.0,1.122468e-03,5.978493e-05


In [20]:
# Baseline: fit the forest on the non-resampled data (LR_training holds out 20% internally).
# NOTE: LR_test below scores on ALL of train_x, which includes the rows the model was
# fitted on, so its figures are not a held-out estimate.
RF = LR_training(train_x, train_y, RF)
LR_test(train_x, train_y, RF)

  LR_model.fit(x_train, y_train)


Train acc: 1.0  Test acc:0.9882894186532831
[[18948     0]
 [    0   176]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18948
           1       1.00      1.00      1.00       176

    accuracy                           1.00     19124
   macro avg       1.00      1.00      1.00     19124
weighted avg       1.00      1.00      1.00     19124


Test acc: 1.0  Test acc:0.9882894186532831
[[4722    2]
 [  54    4]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4724
           1       0.67      0.07      0.12        58

    accuracy                           0.99      4782
   macro avg       0.83      0.53      0.56      4782
weighted avg       0.98      0.99      0.98      4782

Test acc: 0.997657491843052
[[23670     2]
 [   54   180]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23672
           1       0.99      0.7

In [21]:
# Refit on the ADASYN-resampled data, then score on the original rows.
# NOTE: the 80/20 split inside LR_training happens after resampling, so its held-out
# part is drawn from the resampled set; train_x is also the input sm_x was resampled from.
RF = LR_training(sm_x, sm_y, RF)
LR_test(train_x, train_y, RF)

  LR_model.fit(x_train, y_train)


Train acc: 1.0  Test acc:0.9952431289640592
[[18957     0]
 [    0 18880]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18957
           1       1.00      1.00      1.00     18880

    accuracy                           1.00     37837
   macro avg       1.00      1.00      1.00     37837
weighted avg       1.00      1.00      1.00     37837


Test acc: 1.0  Test acc:0.9952431289640592
[[4688   27]
 [  18 4727]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4715
           1       0.99      1.00      1.00      4745

    accuracy                           1.00      9460
   macro avg       1.00      1.00      1.00      9460
weighted avg       1.00      1.00      1.00      9460

Test acc: 0.998117627373881
[[23645    27]
 [   18   216]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23672
           1       0.89      0.9

In [22]:
# Select features with RFE on the resampled data, keep only those columns, and refit the forest.
rfe_columns = RFE_analysis(sm_x, sm_y, RF)
sm_x_rfe = sm_x[sm_x.columns[rfe_columns]]
RF = LR_training(sm_x_rfe, sm_y, RF)

Index(['AGE', 'atm_count_rate', 'atm_tx_rate', 'avg_amt_nontw_nonntd',
       'avg_amt_nontw_ntd', 'avg_amt_tw_nonntd', 'avg_amt_tw_ntd',
       'avg_freq_nontw_nonntd', 'avg_freq_nontw_ntd', 'avg_freq_tw_nonntd',
       ...
       'total_amt_tw_nonntd', 'total_amt_tw_ntd', 'total_asset',
       'total_asset_x', 'total_asset_y', 'total_freq_nontw_nonntd',
       'total_freq_nontw_ntd', 'total_freq_tw_nonntd', 'total_freq_tw_ntd',
       'trade_sum'],
      dtype='object', length=103)
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True


  LR_model.fit(x_train, y_train)


Train acc: 1.0  Test acc:0.9953488372093023
[[18957     0]
 [    0 18880]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18957
           1       1.00      1.00      1.00     18880

    accuracy                           1.00     37837
   macro avg       1.00      1.00      1.00     37837
weighted avg       1.00      1.00      1.00     37837


Test acc: 1.0  Test acc:0.9953488372093023
[[4689   26]
 [  18 4727]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4715
           1       0.99      1.00      1.00      4745

    accuracy                           1.00      9460
   macro avg       1.00      1.00      1.00      9460
weighted avg       1.00      1.00      1.00      9460



In [23]:
# Training Evaluation: RFE
# Restrict the original features to the RFE-selected columns and list them by position.
train_x_rfe = train_x[train_x.columns[rfe_columns]]
attribute_list = train_x_rfe.columns.tolist()
for position, column_name in enumerate(attribute_list):
    print(f'{position} {column_name}')

# Training Evaluation: Without RFE
# LR_test(train_x, train_y, RF)

0 AGE
1 atm_count_rate
2 atm_tx_rate
3 avg_amt_nontw_nonntd
4 avg_amt_nontw_ntd
5 avg_amt_tw_nonntd
6 avg_amt_tw_ntd
7 avg_freq_nontw_nonntd
8 avg_freq_nontw_ntd
9 avg_freq_tw_nonntd
10 avg_freq_tw_ntd
11 cb_count_rate
12 cb_tx_rate
13 count_assets
14 cr_1_avg_tx
15 cr_1_count_tx
16 cr_1_std_tx
17 cr_1_total_tx
18 cr_1_tx_per_day
19 cr_2_avg_tx
20 cr_2_count_tx
21 cr_2_std_tx
22 cr_2_total_tx
23 cr_2_tx_per_day
24 cr_3_avg_tx
25 cr_3_count_tx
26 cr_3_std_tx
27 cr_3_total_tx
28 cr_3_tx_per_day
29 cucah_quarter1_mean
30 cucah_quarter1_std
31 cucah_quarter1_sum
32 cucah_quarter2_mean
33 cucah_quarter2_std
34 cucah_quarter2_sum
35 cucah_quarter3_mean
36 cucah_quarter3_std
37 cucah_quarter3_sum
38 cucah_quarter4_mean
39 cucah_quarter4_std
40 cucah_quarter4_sum
41 cucah_total_mean
42 cucah_total_std
43 cucah_total_sum
44 db_1_avg_tx
45 db_1_count_tx
46 db_1_std_tx
47 db_1_total_tx
48 db_1_tx_per_day
49 db_2_avg_tx
50 db_2_count_tx
51 db_2_std_tx
52 db_2_total_tx
53 db_2_tx_per_day
54 db_3_av

In [24]:
# Score the forest fitted on the RFE-selected columns against the original (non-resampled) rows.
LR_test(train_x_rfe, train_y, RF)

Test acc: 0.9981594578766837
[[23646    26]
 [   18   216]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23672
           1       0.89      0.92      0.91       234

    accuracy                           1.00     23906
   macro avg       0.95      0.96      0.95     23906
weighted avg       1.00      1.00      1.00     23906



In [None]:
# Fine-tuning test

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Candidate values for each hyper-parameter.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
# 'auto' was removed from this list: for classifiers it was an alias of 'sqrt'
# (so every combination was fitted twice) and newer scikit-learn rejects it.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(60, 110, num = 6)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Search space for the (currently disabled) randomized search.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Search space for the exhaustive search: 5 * 2 * 7 * 3 * 3 = 630 combinations,
# each fitted once per fold, so this cell is expensive.
param_grid = {
    'bootstrap': [True],
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators,
}

# Seeded so that repeated searches rank the candidates identically.
RF_search = RandomForestClassifier(random_state=42)
# RF_random = RandomizedSearchCV(estimator = RF_search, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RF_random = GridSearchCV(estimator = RF_search, param_grid = param_grid,
                           cv = 4, n_jobs = -1, verbose = 2)
# Flatten the one-column target so fit() receives a 1-D array instead of a column vector.
RF_random.fit(sm_x, sm_y.values.ravel())
RF_random.best_params_

In [None]:
# Best parameters reported by the grid search in the previous cell:
# {'bootstrap': True,
#  'max_depth': 90,
#  'max_features': 'sqrt',
#  'min_samples_leaf': 1,
#  'min_samples_split': 2,
#  'n_estimators': 200}

# Refit a forest with those parameters on sm_x / sm_y.
RF_random = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=90,
    bootstrap=True,
    random_state=42,  # fixed seed: without it every re-run yields a different forest
)
RF_random.fit(sm_x, sm_y)

In [None]:
# Evaluate the tuned forest on the original training data.
# normalize_mms and LR_test are defined outside this excerpt
# (normalize_mms presumably applies min-max scaling -- TODO confirm).
ori_train_x_normal = normalize_mms(ori_train_x)
LR_test(ori_train_x_normal, ori_train_y, RF_random)

In [26]:
# Load the pickled frame that is merged into the test set on 'alert_key' in the next cell.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# produced by this project, never files from an untrusted source.
test_xdp = pd.read_pickle('xdp_x_test.pkl')

In [27]:
# Test: build the feature matrix for the alerts listed in the submission template.
test = (
    df_from_csv('submit_format.csv')
    .drop(columns=['probability'])
    # Left joins keep every alert of the template, matched or not.
    .merge(test_xdp, on='alert_key', how='left')
    .merge(ccba, on='cust_id', how='left')
    .merge(cdtx, on='alert_key', how='left')
    .merge(remit, on='alert_key', how='left')
    # Identifier columns are not model features.
    .drop(columns=['alert_key', 'cust_id', 'date'])
)

# Alerts without a match in a joined table come back as NaN; encode them as 0.
# The former 'bad_value' helper column (count of NaN/zero cells per row) was
# computed and dropped again without ever being read, so it is no longer
# built; errors='ignore' still guarantees no such column survives.
test = test.fillna(0).drop(columns=['bad_value'], errors='ignore')

# Alphabetical column order, compared against the training columns in later cells.
test = test.reindex(sorted(test.columns), axis=1)

# normalize_mms is defined outside this excerpt.
# NOTE(review): confirm it applies the scaling fitted on the training data
# rather than re-fitting on the test set.
test = normalize_mms(test)
test

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.333333,0.550000,0.588727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.133775,0.133775,0.150170,0.0,0.0,0.0,0.0,0.000000
1,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
2,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
3,0.555556,0.491573,0.383901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.101530,0.101530,0.113974,0.0,0.0,0.0,0.0,0.000131
4,0.333333,0.855967,0.969150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.022967,0.022967,0.021281,0.0,0.0,0.0,0.0,0.063476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3845,0.222222,0.766962,0.752510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.033291,0.033291,0.037372,0.0,0.0,0.0,0.0,0.000000
3846,0.222222,0.781132,0.961093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.005914,0.005914,0.010011,0.0,0.0,0.0,0.0,0.085655
3847,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
3848,0.333333,0.906149,0.999761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018992,0.018992,0.023327,0.0,0.0,0.0,0.0,0.046544


In [28]:
# Print each training feature next to the test column at the same position,
# so that misaligned or missing columns can be spotted by eye.
attribute_list = train_x_rfe.columns.tolist()
for i, v in enumerate(attribute_list):
    print(f'{i} {v} {test.columns[i]}')

0 AGE AGE
1 atm_count_rate atm_count_rate
2 atm_tx_rate atm_tx_rate
3 avg_amt_nontw_nonntd avg_amt_nontw_nonntd
4 avg_amt_nontw_ntd avg_amt_nontw_ntd
5 avg_amt_tw_nonntd avg_amt_tw_nonntd
6 avg_amt_tw_ntd avg_amt_tw_ntd
7 avg_freq_nontw_nonntd avg_freq_nontw_nonntd
8 avg_freq_nontw_ntd avg_freq_nontw_ntd
9 avg_freq_tw_nonntd avg_freq_tw_nonntd
10 avg_freq_tw_ntd avg_freq_tw_ntd
11 cb_count_rate cb_count_rate
12 cb_tx_rate cb_tx_rate
13 count_assets count_assets
14 cr_1_avg_tx cr_1_avg_tx
15 cr_1_count_tx cr_1_count_tx
16 cr_1_std_tx cr_1_std_tx
17 cr_1_total_tx cr_1_total_tx
18 cr_1_tx_per_day cr_1_tx_per_day
19 cr_2_avg_tx cr_2_avg_tx
20 cr_2_count_tx cr_2_count_tx
21 cr_2_std_tx cr_2_std_tx
22 cr_2_total_tx cr_2_total_tx
23 cr_2_tx_per_day cr_2_tx_per_day
24 cr_3_avg_tx cr_3_avg_tx
25 cr_3_count_tx cr_3_count_tx
26 cr_3_std_tx cr_3_std_tx
27 cr_3_total_tx cr_3_total_tx
28 cr_3_tx_per_day cr_3_tx_per_day
29 cucah_quarter1_mean cucah_quarter1_mean
30 cucah_quarter1_std cucah_quarter1_s

In [29]:
# Remove two columns from the test frame (presumably because they are absent
# from the training features -- confirm against train_x_rfe.columns).
test = test.drop(columns=['sd_amt_tw_nonntd', 'sd_freq_tw_nonntd'])

In [30]:
# Handle missing column: add 'occupation_code_8' filled with 0, restore the
# alphabetical column order, and print the training/test column names side by
# side again.
test = test.assign(occupation_code_8=0)
test = test.reindex(columns=sorted(test.columns))
attribute_list = train_x_rfe.columns.tolist()
for i, v in enumerate(attribute_list):
    print(f'{i} {v} {test.columns[i]}')

0 AGE AGE
1 atm_count_rate atm_count_rate
2 atm_tx_rate atm_tx_rate
3 avg_amt_nontw_nonntd avg_amt_nontw_nonntd
4 avg_amt_nontw_ntd avg_amt_nontw_ntd
5 avg_amt_tw_nonntd avg_amt_tw_nonntd
6 avg_amt_tw_ntd avg_amt_tw_ntd
7 avg_freq_nontw_nonntd avg_freq_nontw_nonntd
8 avg_freq_nontw_ntd avg_freq_nontw_ntd
9 avg_freq_tw_nonntd avg_freq_tw_nonntd
10 avg_freq_tw_ntd avg_freq_tw_ntd
11 cb_count_rate cb_count_rate
12 cb_tx_rate cb_tx_rate
13 count_assets count_assets
14 cr_1_avg_tx cr_1_avg_tx
15 cr_1_count_tx cr_1_count_tx
16 cr_1_std_tx cr_1_std_tx
17 cr_1_total_tx cr_1_total_tx
18 cr_1_tx_per_day cr_1_tx_per_day
19 cr_2_avg_tx cr_2_avg_tx
20 cr_2_count_tx cr_2_count_tx
21 cr_2_std_tx cr_2_std_tx
22 cr_2_total_tx cr_2_total_tx
23 cr_2_tx_per_day cr_2_tx_per_day
24 cr_3_avg_tx cr_3_avg_tx
25 cr_3_count_tx cr_3_count_tx
26 cr_3_std_tx cr_3_std_tx
27 cr_3_total_tx cr_3_total_tx
28 cr_3_tx_per_day cr_3_tx_per_day
29 cucah_quarter1_mean cucah_quarter1_mean
30 cucah_quarter1_std cucah_quarter1_s

In [31]:
# Display the test frame after the column adjustments above.
test

Unnamed: 0,AGE,atm_count_rate,atm_tx_rate,avg_amt_nontw_nonntd,avg_amt_nontw_ntd,avg_amt_tw_nonntd,avg_amt_tw_ntd,avg_freq_nontw_nonntd,avg_freq_nontw_ntd,avg_freq_tw_nonntd,...,total_amt_tw_nonntd,total_amt_tw_ntd,total_asset,total_asset_x,total_asset_y,total_freq_nontw_nonntd,total_freq_nontw_ntd,total_freq_tw_nonntd,total_freq_tw_ntd,trade_sum
0,0.333333,0.550000,0.588727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.133775,0.133775,0.150170,0.0,0.0,0.0,0.0,0.000000
1,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
2,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
3,0.555556,0.491573,0.383901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.101530,0.101530,0.113974,0.0,0.0,0.0,0.0,0.000131
4,0.333333,0.855967,0.969150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.022967,0.022967,0.021281,0.0,0.0,0.0,0.0,0.063476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3845,0.222222,0.766962,0.752510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.033291,0.033291,0.037372,0.0,0.0,0.0,0.0,0.000000
3846,0.222222,0.781132,0.961093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.005914,0.005914,0.010011,0.0,0.0,0.0,0.0,0.085655
3847,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
3848,0.333333,0.906149,0.999761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018992,0.018992,0.023327,0.0,0.0,0.0,0.0,0.046544


In [32]:
# Score the test alerts with RF.
# The features are selected by name, in the order of train_x_rfe (the frame RF
# was evaluated on above), instead of relying on the test columns happening to
# sit in the same positions; a missing feature now fails with a KeyError.
# Alternative model: RF_random.predict_proba(...)
class_probabilities = RF.predict_proba(test[train_x_rfe.columns])

# predict_proba returns one column per class in RF.classes_ order; this
# assumes the classes are [0, 1] so the second column is the positive class
# -- TODO confirm.
y_test = pd.DataFrame(
    class_probabilities,
    columns=['probability0', 'probability'],
    index=test.index,  # keep the row labels of `test` for later alignment
)
y_test = y_test.drop(columns=['probability0'])
# Optional rounding (disabled): y_test['probability'] = round(y_test['probability'], 6)
y_test

Unnamed: 0,probability
0,0.39
1,0.00
2,0.00
3,0.06
4,0.05
...,...
3845,0.41
3846,0.13
3847,0.00
3848,0.16


In [33]:
# Inspect the predicted probabilities from lowest to highest.
y_test.sort_values('probability')

Unnamed: 0,probability
1924,0.00
2173,0.00
2175,0.00
2178,0.00
2179,0.00
...,...
942,0.57
959,0.58
432,0.61
2941,0.61


In [34]:
# The assignment that used to precede this display,
#   y_test.loc[y_test['probability'] == 0, 'probability'] = 0
# replaced zeros with zero and therefore changed nothing; it has been removed.
# NOTE(review): if zero probabilities were meant to be replaced by another
# value, set that value explicitly here.
y_test.sort_values(by='probability', ascending=True)

Unnamed: 0,probability
1924,0.00
2173,0.00
2175,0.00
2178,0.00
2179,0.00
...,...
942,0.57
959,0.58
432,0.61
2941,0.61


In [35]:
# Assemble the submission: the alert keys of the template joined (by row
# label) with the predicted probabilities.
xy_test = pd.concat([test, y_test], axis=1)
submit = df_from_csv('submit_format.csv').drop(columns=['probability'])
submit = (
    pd.concat([submit, xy_test], axis=1)[['alert_key', 'probability']]
    # Rows that received no prediction get probability 0. fillna on the frame
    # returns a new object, avoiding assignment onto a column-sliced frame.
    .fillna({'probability': 0})
    .sort_values(by='probability', ascending=False)
)
# index=False: write only alert_key and probability. The row index is an
# artefact of the sort and would otherwise be written as an extra unnamed
# first column.
submit.to_csv('final_submit.csv', index=False)
submit

Unnamed: 0,alert_key,probability
2941,362243,0.61
432,358464,0.61
712,361296,0.61
959,355724,0.58
950,362285,0.57
...,...,...
1630,375197,0.00
1631,374624,0.00
1634,374551,0.00
1635,369296,0.00


In [36]:
# Sanity check: per-column flag telling whether the submission contains any missing value.
submit.isnull().any()

alert_key      False
probability    False
dtype: bool

In [37]:
# Sanity check: list every row that is an exact duplicate of another row.
# NOTE(review): this compares whole rows (alert_key AND probability); a repeated
# alert_key with a different probability would not be reported -- consider
# duplicated(subset='alert_key', keep=False) if keys must be unique.
submit.loc[submit.duplicated(keep=False)]

Unnamed: 0,alert_key,probability
