In [1]:
#Reading Dataset

import pandas as pd
import os

dataset_folder = os.path.join(os.getcwd(), 'dataset')

def df_from_csv(filename):
    f = os.path.join(dataset_folder, filename)
    return pd.read_csv(f, delimiter='\t|\n|,', engine='python')

In [2]:
y_train = df_from_csv('ori_trainy.csv')
y_train.head()

Unnamed: 0,alert_key,sar_flag
0,171189,0
1,171202,0
2,171599,0
3,171737,0
4,171142,0


In [3]:
x_train = df_from_csv('ori_trainx.csv')
x_train.head()

Unnamed: 0,alert_key,date
0,171189,0
1,171202,0
2,171599,0
3,171737,0
4,171142,0


In [4]:
alert_cust = df_from_csv('ori_custinfo.csv')
alert_cust.head()

Unnamed: 0,alert_key,cust_id,risk_rank,occupation_code,total_asset,AGE
0,352249,82595ac69158ae08d34156784bdec0d9e2ca5b242b6d2a...,1,19.0,1465816.0,7
1,352253,b212d14cb35676926682b2cf849e295d948888f556c07e...,1,2.0,98177.0,2
2,352254,e5b0002791c7852644a2730abeaa893cdf14a072ef7812...,1,19.0,2052922.0,7
3,352280,74214c478dc6519fbefe4bc31693865bdcd698ab974b64...,3,15.0,201906.0,5
4,352282,0340e7611f0d82c3cb87e6194fa14bb2ccf8afbf1b3418...,1,12.0,7450.0,5


In [None]:
alert_date = df_from_csv('ori_alert_date.csv')
alert_date.head()

In [None]:
# Merge x and y train into one dataframe, then merge it with customer info data as whole training data

train = x_train.merge(y_train, on='alert_key', how='inner')
train = train.merge(alert_cust, on='alert_key', how='inner')
xdp_train = train.copy()
train.head()

In [None]:
# check null value on whole dataset & drop if necessary

train.isnull().any()

In [None]:
train[train.isnull().any(axis=1)]

In [None]:
train = train.dropna(how='any', axis=0)
train

In [None]:
# checking the number of records based on sar_flag

train.groupby(['sar_flag'])['sar_flag'].count()

In [None]:
# create method to analyze LR performance

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# SMOTE to fix imbalance data (upscale minor data by creating synthetic record)
# Other variant SMOTEENN

# SMOTE
from imblearn.over_sampling import SMOTE

smote_x = train[['risk_rank','occupation_code','total_asset','AGE']]
smote_y = train.loc[:, train.columns == 'sar_flag']

smote = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(smote_x, smote_y, test_size=0.3, random_state=0)
sm_cols = X_train.columns

sm_data_X,sm_data_y = smote.fit_resample(X_train, y_train)
sm_data_X = pd.DataFrame(data=sm_data_X,columns=sm_cols )
sm_data_y = pd.DataFrame(data=sm_data_y,columns=['sar_flag'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(sm_data_X))
print("Number of non-sar in oversampled data",len(sm_data_y[sm_data_y['sar_flag']==0]))
print("Number of sar",len(sm_data_y[sm_data_y['sar_flag']==1]))
print("Proportion of no subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==0])/len(sm_data_X))
print("Proportion of subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==1])/len(sm_data_X))

def SMOTE_oversampling(input_x, input_y):
    x_train, x_test, y_train, y_test = train_test_split(input_x, input_y, test_size=0.3)
    x_cols = x_train.columns
    y_cols = y_train.columns

    sm_data_x, sm_data_y = smote.fit_resample(x_train, y_train)
    sm_data_x = pd.DataFrame(data=sm_data_x, columns=x_cols)
    sm_data_y = pd.DataFrame(data=sm_data_y, columns=y_cols)

    print("length of oversampled data is ",len(sm_data_x))
    print("Number of non-sar in oversampled data",len(sm_data_y[sm_data_y['sar_flag']==0]))
    print("Number of sar",len(sm_data_y[sm_data_y['sar_flag']==1]))
    print("Proportion of no subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==0])/len(sm_data_x))
    print("Proportion of subscription data in oversampled data is ", len(sm_data_y[sm_data_y['sar_flag']==1])/len(sm_data_x))

    return sm_data_x, sm_data_y

In [None]:
# RFE feature selection > issue on selecting total asset which cause overfitting instead

from sklearn.feature_selection import RFE

LR = LogisticRegression(max_iter=2000, n_jobs=-1)
# rfe = RFE(LR, step=100)
# rfe = rfe.fit(sm_data_X, sm_data_y.values.ravel())
# print(sm_data_X.columns)
# print(rfe.support_)
# print(rfe.ranking_)

def RFE_analysis(input_x, input_y, model):
    rfe = RFE(model, step=300)
    rfe.fit(input_x, input_y.values.ravel())
    print(input_x.columns)
    print(rfe.support_)
    print(rfe.ranking_)

RFE_analysis(sm_data_X, sm_data_y, LR)

In [None]:
# assess feature performance, P>|z| should be acceptable

import statsmodels.api as sm

updated_cols = ['risk_rank', 'AGE']
new_X = sm_data_X[updated_cols]
new_Y = sm_data_y['sar_flag']
# LR2 = sm.Logit(new_Y,new_X)
# result=LR2.fit()
# print(result.summary2())

def LR_analysis(input_x, input_y):
    sm_LR = sm.Logit(input_y, input_x)
    result = sm_LR.fit()
    print(result.summary2())

LR_analysis(new_X, new_Y)

In [None]:
# Train and test

from sklearn import metrics

# x_train, x_test, y_train, y_test = train_test_split(new_X, new_Y, test_size=0.25)
# LR.fit(x_train, y_train)
# y_train_predict = LR.predict(x_train)
# y_test_predict = LR.predict(x_test)
#
# # output = pd.DataFrame(y_test_predict, columns = ['sar_flag'])
# # output.to_csv('y_test_predict.csv', index=False)
#
# train_acc = accuracy_score(y_train, y_train_predict)
# test_acc = accuracy_score(y_test, y_test_predict)
# print(f'{train_acc}  {test_acc}')
# cm = confusion_matrix(y_true=y_test, y_pred=y_test_predict)
# print(cm)

def LR_training(input_x, input_y, LR_model):
    x_train, x_test, y_train, y_test = train_test_split(input_x, input_y, test_size=0.25)
    LR_model.fit(x_train, y_train)
    y_train_predict = LR_model.predict(x_train)
    y_test_predict = LR_model.predict(x_test)
    train_acc = accuracy_score(y_train, y_train_predict)
    test_acc = accuracy_score(y_test, y_test_predict)
    print(f'{train_acc}  {test_acc}')
    cm = confusion_matrix(y_true=y_test, y_pred=y_test_predict)
    print(cm)
    return LR_model

LR = LR_training(new_X, new_Y, LR)

In [None]:
# Apply to our real train dataset

x = train[['risk_rank', 'AGE']]
y = train['sar_flag']

# y_predict = LR.predict(x)
# test_acc = accuracy_score(y, y_predict)
# print(f'{test_acc}')
# cm = confusion_matrix(y_true=y, y_pred=y_predict)
# print(cm)

def LR_test(input_x, input_y, LR_model):
    y_predict = LR_model.predict(input_x)
    test_acc = accuracy_score(input_y, y_predict)
    print(f'{test_acc}')
    cm = confusion_matrix(y_true=input_y, y_pred=y_predict)
    print(cm)

LR_test(x, y, LR)

In [None]:
y_prob = LR.predict_proba(x)
y_prob

In [None]:
submit_format = df_from_csv('submit_format.csv')
submit_format

In [None]:
submit_format = submit_format.drop(['probability'], axis=1)
submit_format

In [None]:
submit_format = submit_format.merge(alert_cust, on='alert_key', how='left')
submit_format

In [None]:
x_submit = submit_format[['risk_rank', 'AGE']]
x_submit

In [None]:
x_submit.dropna(inplace=True)
x_submit


In [None]:
y_submit = LR.predict_proba(x_submit)
y_submit = pd.DataFrame(y_submit, columns = ['probability', 'inv_probability'])
y_submit.index = x_submit.index
y_submit

In [None]:
# submit_prob = pd.merge([x_submit, y_submit], axis=1, left_index=True, right_index=False)
submit_prob = pd.concat([x_submit, y_submit], axis=1)
submit_prob

In [None]:
final_submit = pd.concat([submit_format, submit_prob], axis=1)
final_submit = final_submit[['alert_key', 'probability']]
final_submit

In [None]:
final_submit['probability'] = final_submit['probability'].fillna(0)
final_submit

In [None]:
final_submit = final_submit.sort_values(by='probability', ascending=False)
final_submit

In [None]:
final_submit.to_csv('final_submit.csv', index=False)

In [None]:
# file load too long
xdp = df_from_csv('ori_xdp.csv')

In [None]:
# xdp_train originally from train.copy()
xdp_train = xdp_train.drop(['occupation_code', 'total_asset'], axis=1)
xdp_train

In [None]:
from tqdm import tqdm

def calculate_tx(row):
    if not pd.isna(row['risk_rank']):
        temp_frame = xdp[(xdp['tx_date'] <= row['date']) & (xdp['cust_id'] == row['cust_id'])]
        total_tx = (temp_frame['tx_amt'] * temp_frame['exchg_rate'])
        row['total_tx'] = total_tx.sum()
        row['avg_tx'] = total_tx.mean()
        row['count_tx'] = total_tx.count()
        row['std_tx'] = total_tx.std()
        row['tx_per_day'] = row['count_tx']/row['date']
    return row

In [None]:
# Experiment for function in .progress_apply



In [None]:
tqdm.pandas(desc='Progress Bar')
xdp_train_test = xdp_train.copy()
xdp_train_test = xdp_train_test.progress_apply(calculate_tx, axis=1)
xdp_train_test

In [None]:
xdp_train_test.to_pickle('xdp_train_test1.pkl')

In [None]:
xdp_train_test['avg_tx'] = xdp_train_test['avg_tx'].fillna(0)
xdp_train_test['std_tx'] = xdp_train_test['std_tx'].fillna(0)

In [None]:
xdp_train_test

In [None]:
xdp_train_test.isnull().any()

In [None]:
xdp_train_test

In [None]:
xdp_train_x = xdp_train_test[['risk_rank','AGE','total_tx','avg_tx','count_tx','std_tx']]
xdp_train_y = xdp_train_test.loc[:, xdp_train_test.columns == 'sar_flag']
sm_xdp_train_x, sm_xdp_train_y = SMOTE_oversampling(xdp_train_x, xdp_train_y)

In [None]:
LR_1 = LogisticRegression(max_iter=2000, n_jobs=-1)
RFE_analysis(sm_xdp_train_x, sm_xdp_train_y, LR_1)

In [None]:
LR_analysis(sm_xdp_train_x, sm_xdp_train_y)

In [None]:
LR_1 = LR_training(sm_xdp_train_x, sm_xdp_train_y, LR_1)

In [None]:
LR_test(xdp_train_x, xdp_train_y, LR_1)

In [None]:
xdp_train_test

In [None]:
submit_format = df_from_csv('submit_format.csv')
submit_format = submit_format.drop(['probability'], axis=1)
submit_format = submit_format.merge(alert_date, on='alert_key', how='left')
submit_format = submit_format.merge(alert_cust, on='alert_key', how='left')
submit_format = submit_format.progress_apply(calculate_tx, axis=1)
x_submit = submit_format[['risk_rank', 'AGE', 'total_tx', 'avg_tx', 'count_tx', 'std_tx']]
x_submit.dropna(inplace=True)
x_submit

In [None]:
def generate_submission(input_submit, input_x, LR_model):
    y_prob = LR_model.predict_proba(input_x)
    y_prob = pd.DataFrame(y_prob, columns = ['probability', 'inv_probability'])
    y_prob.index = input_x.index

    submit_prob = pd.concat([input_x, y_prob], axis=1)
    final_submit = pd.concat([input_submit, submit_prob], axis=1)
    final_submit = final_submit[['alert_key', 'probability']]
    final_submit['probability'] = final_submit['probability'].fillna(0)
    final_submit = final_submit.sort_values(by='probability', ascending=False)
    final_submit.to_csv('final_submit.csv', index=False)
    return final_submit

lr_1_submission = generate_submission(submit_format, x_submit, LR_1)
lr_1_submission

In [None]:
lr_1_submission.to_pickle('lr_1_submission.pkl')

In [None]:
# trying SVR

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

y = sm_xdp_train_y
X = sm_xdp_train_x
SVR_1 = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
SVR_1.fit(X, y)

In [None]:
def generate_submission_SVR(input_submit, input_x, SVR_model):
    y_prob = SVR_model.predict(input_x)
    y_prob = pd.DataFrame(y_prob, columns = ['probability'])
    y_prob.index = input_x.index

    submit_prob = pd.concat([input_x, y_prob], axis=1)
    final_submit = pd.concat([input_submit, submit_prob], axis=1)
    final_submit = final_submit[['alert_key', 'probability']]
    final_submit['probability'] = final_submit['probability'].fillna(0)
    final_submit.loc[final_submit['probability'] < 0, 'probability'] = 0
    final_submit = final_submit.sort_values(by='probability', ascending=False)
    final_submit.to_csv('final_submit.csv', index=False)
    return final_submit

final_submit = generate_submission_SVR(submit_format, x_submit, SVR_1)
final_submit

In [None]:
final_submit.loc[final_submit.duplicated(keep=False)]

In [29]:
# file load too long
xdp = df_from_csv('ori_xdp.csv')

In [30]:
xdp

Unnamed: 0,cust_id,debit_credit,tx_date,tx_time,tx_type,tx_amt,exchg_rate,info_asset_code,fiscTxId,txbranch,cross_bank,ATM
0,0172056578071e83399216fcd640bdc4de4583149d0fd9...,CR,36,18,2,68265.0,1.0,16,,,0,0
1,0172056578071e83399216fcd640bdc4de4583149d0fd9...,CR,42,17,2,932058.0,1.0,16,,,0,0
2,0172056578071e83399216fcd640bdc4de4583149d0fd9...,CR,39,18,2,6089.0,1.0,16,,,0,0
3,0172056578071e83399216fcd640bdc4de4583149d0fd9...,CR,49,15,2,776715.0,1.0,16,,,0,0
4,0172056578071e83399216fcd640bdc4de4583149d0fd9...,CR,70,19,2,61630.0,1.0,16,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1969813,fff94defcc33daab190e702926d583f6d805294354addb...,CR,39,20,2,1030.0,1.0,13,4.0,313.0,1,1
1969814,fff94defcc33daab190e702926d583f6d805294354addb...,CR,73,1,2,1030.0,1.0,13,4.0,144.0,1,1
1969815,fff94defcc33daab190e702926d583f6d805294354addb...,CR,43,22,2,1648.0,1.0,13,7.0,313.0,1,1
1969816,fff94defcc33daab190e702926d583f6d805294354addb...,DB,40,18,2,1030.0,1.0,13,3.0,310.0,1,1


In [31]:
# xdp_train originally from train.copy()
xdp_train = xdp_train.drop(['occupation_code', 'total_asset'], axis=1)
xdp_train

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,AGE
0,171189,0,0,972ee157e63316e8a50dd489bc93730a3ee8a8959d5c6b...,1,4
1,171202,0,0,a10ab33f90926fb18d7bb5e78034d7f04a1fbed95b7951...,3,2
2,171599,0,0,3433ecc068ed1c9e2f5037cab5d42d7b901e9bd624c1fb...,1,4
3,171737,0,0,a0861608615a9365d90f4ba0a813c0ea0471987f925c8b...,3,4
4,171142,0,0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,3
...,...,...,...,...,...,...
23901,352132,364,0,c49b33d1fde790ec03584672903f296e486615adfdc989...,1,3
23902,352125,364,0,0c2dc5fedc3689abf5ff4be14fe8fea8d23d22068297c2...,3,3
23903,352080,364,0,1a93961c8fa830a1f32f5316b44f4964d65397f0311b11...,2,6
23904,352075,364,0,9586b80d3ad1d74c0a1efe792cae2ecd073243fb01ed33...,3,6


In [34]:
from tqdm import tqdm

def calculate_tx(row):
    if not pd.isna(row['risk_rank']):
        temp_frame = xdp[(xdp['tx_date'] <= row['date']) & (xdp['cust_id'] == row['cust_id'])]
        total_tx = (temp_frame['tx_amt'] * temp_frame['exchg_rate'])
        row['total_tx'] = total_tx.sum()
        row['avg_tx'] = total_tx.mean()
        row['count_tx'] = total_tx.count()
        row['std_tx'] = total_tx.std()
        row['tx_per_day'] = row['count_tx']/row['date']
    return row

In [None]:
# Experiment for function in .progress_apply



In [35]:
tqdm.pandas(desc='Progress Bar')
xdp_train_test = xdp_train.copy()
xdp_train_test = xdp_train_test.progress_apply(calculate_tx, axis=1)
xdp_train_test

Progress Bar: 100%|██████████| 23906/23906 [28:48<00:00, 13.83it/s]


Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,AGE,total_tx,avg_tx,count_tx,std_tx
0,171189,0,0,972ee157e63316e8a50dd489bc93730a3ee8a8959d5c6b...,1,4,1.515000e+03,5.050000e+02,3,4.502233e+02
1,171202,0,0,a10ab33f90926fb18d7bb5e78034d7f04a1fbed95b7951...,3,2,2.092020e+05,6.973400e+04,3,7.567066e+04
2,171599,0,0,3433ecc068ed1c9e2f5037cab5d42d7b901e9bd624c1fb...,1,4,3.105930e+05,7.764825e+04,4,1.347252e+05
3,171737,0,0,a0861608615a9365d90f4ba0a813c0ea0471987f925c8b...,3,4,4.312500e+04,4.312500e+03,10,5.920366e+03
4,171142,0,0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,3,2.963440e+05,4.939067e+03,60,6.547865e+03
...,...,...,...,...,...,...,...,...,...,...
23901,352132,364,0,c49b33d1fde790ec03584672903f296e486615adfdc989...,1,3,1.532000e+08,1.078873e+06,142,3.103629e+06
23902,352125,364,0,0c2dc5fedc3689abf5ff4be14fe8fea8d23d22068297c2...,3,3,4.615283e+07,2.307641e+06,20,2.605778e+06
23903,352080,364,0,1a93961c8fa830a1f32f5316b44f4964d65397f0311b11...,2,6,1.555772e+08,2.222531e+06,70,5.842826e+06
23904,352075,364,0,9586b80d3ad1d74c0a1efe792cae2ecd073243fb01ed33...,3,6,1.365559e+08,3.175717e+05,430,2.409931e+06


In [36]:
xdp_train_test.to_pickle('xdp_train_test1.pkl')

In [37]:
xdp_train_test['avg_tx'] = xdp_train_test['avg_tx'].fillna(0)
xdp_train_test['std_tx'] = xdp_train_test['std_tx'].fillna(0)

In [38]:
xdp_train_test

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,AGE,total_tx,avg_tx,count_tx,std_tx
0,171189,0,0,972ee157e63316e8a50dd489bc93730a3ee8a8959d5c6b...,1,4,1.515000e+03,5.050000e+02,3,4.502233e+02
1,171202,0,0,a10ab33f90926fb18d7bb5e78034d7f04a1fbed95b7951...,3,2,2.092020e+05,6.973400e+04,3,7.567066e+04
2,171599,0,0,3433ecc068ed1c9e2f5037cab5d42d7b901e9bd624c1fb...,1,4,3.105930e+05,7.764825e+04,4,1.347252e+05
3,171737,0,0,a0861608615a9365d90f4ba0a813c0ea0471987f925c8b...,3,4,4.312500e+04,4.312500e+03,10,5.920366e+03
4,171142,0,0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,3,2.963440e+05,4.939067e+03,60,6.547865e+03
...,...,...,...,...,...,...,...,...,...,...
23901,352132,364,0,c49b33d1fde790ec03584672903f296e486615adfdc989...,1,3,1.532000e+08,1.078873e+06,142,3.103629e+06
23902,352125,364,0,0c2dc5fedc3689abf5ff4be14fe8fea8d23d22068297c2...,3,3,4.615283e+07,2.307641e+06,20,2.605778e+06
23903,352080,364,0,1a93961c8fa830a1f32f5316b44f4964d65397f0311b11...,2,6,1.555772e+08,2.222531e+06,70,5.842826e+06
23904,352075,364,0,9586b80d3ad1d74c0a1efe792cae2ecd073243fb01ed33...,3,6,1.365559e+08,3.175717e+05,430,2.409931e+06


In [39]:
xdp_train_test.isnull().any()

alert_key    False
date         False
sar_flag     False
cust_id      False
risk_rank    False
AGE          False
total_tx     False
avg_tx       False
count_tx     False
std_tx       False
dtype: bool

In [40]:
xdp_train_test

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,AGE,total_tx,avg_tx,count_tx,std_tx
0,171189,0,0,972ee157e63316e8a50dd489bc93730a3ee8a8959d5c6b...,1,4,1.515000e+03,5.050000e+02,3,4.502233e+02
1,171202,0,0,a10ab33f90926fb18d7bb5e78034d7f04a1fbed95b7951...,3,2,2.092020e+05,6.973400e+04,3,7.567066e+04
2,171599,0,0,3433ecc068ed1c9e2f5037cab5d42d7b901e9bd624c1fb...,1,4,3.105930e+05,7.764825e+04,4,1.347252e+05
3,171737,0,0,a0861608615a9365d90f4ba0a813c0ea0471987f925c8b...,3,4,4.312500e+04,4.312500e+03,10,5.920366e+03
4,171142,0,0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,3,2.963440e+05,4.939067e+03,60,6.547865e+03
...,...,...,...,...,...,...,...,...,...,...
23901,352132,364,0,c49b33d1fde790ec03584672903f296e486615adfdc989...,1,3,1.532000e+08,1.078873e+06,142,3.103629e+06
23902,352125,364,0,0c2dc5fedc3689abf5ff4be14fe8fea8d23d22068297c2...,3,3,4.615283e+07,2.307641e+06,20,2.605778e+06
23903,352080,364,0,1a93961c8fa830a1f32f5316b44f4964d65397f0311b11...,2,6,1.555772e+08,2.222531e+06,70,5.842826e+06
23904,352075,364,0,9586b80d3ad1d74c0a1efe792cae2ecd073243fb01ed33...,3,6,1.365559e+08,3.175717e+05,430,2.409931e+06


In [41]:
xdp_train_x = xdp_train_test[['risk_rank','AGE','total_tx','avg_tx','count_tx','std_tx']]
xdp_train_y = xdp_train_test.loc[:, xdp_train_test.columns == 'sar_flag']
sm_xdp_train_x, sm_xdp_train_y = SMOTE_oversampling(xdp_train_x, xdp_train_y)

length of oversampled data is  33168
Number of non-sar in oversampled data 16584
Number of sar 16584
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [42]:
LR_1 = LogisticRegression(max_iter=2000, n_jobs=-1)
RFE_analysis(sm_xdp_train_x, sm_xdp_train_y, LR_1)

Index(['risk_rank', 'AGE', 'total_tx', 'avg_tx', 'count_tx', 'std_tx'], dtype='object')
[False False  True  True False  True]
[2 2 1 1 2 1]


In [43]:
LR_analysis(sm_xdp_train_x, sm_xdp_train_y)

Optimization terminated successfully.
         Current function value: 0.651428
         Iterations 7
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.060     
Dependent Variable: sar_flag         AIC:              43225.1046
Date:               2022-12-09 19:25 BIC:              43275.5607
No. Observations:   33168            Log-Likelihood:   -21607.   
Df Model:           5                LL-Null:          -22990.   
Df Residuals:       33162            LLR p-value:      0.0000    
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     7.0000                                       
------------------------------------------------------------------
              Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
------------------------------------------------------------------
risk_rank    -0.4420    0.0152  -29.0014  0.0000  -0.4718  -0.4121
AGE           0.0761    0.0062   12.2443  0.0000   0.0639   0.

In [44]:
LR_1 = LR_training(sm_xdp_train_x, sm_xdp_train_y, LR_1)

  y = column_or_1d(y, warn=True)


0.6378437047756874  0.6433912204534491
[[1792 2347]
 [ 610 3543]]


In [45]:
LR_test(xdp_train_x, xdp_train_y, LR_1)

0.43863465238852173
[[10319 13353]
 [   67   167]]


In [46]:
xdp_train_test

Unnamed: 0,alert_key,date,sar_flag,cust_id,risk_rank,AGE,total_tx,avg_tx,count_tx,std_tx
0,171189,0,0,972ee157e63316e8a50dd489bc93730a3ee8a8959d5c6b...,1,4,1.515000e+03,5.050000e+02,3,4.502233e+02
1,171202,0,0,a10ab33f90926fb18d7bb5e78034d7f04a1fbed95b7951...,3,2,2.092020e+05,6.973400e+04,3,7.567066e+04
2,171599,0,0,3433ecc068ed1c9e2f5037cab5d42d7b901e9bd624c1fb...,1,4,3.105930e+05,7.764825e+04,4,1.347252e+05
3,171737,0,0,a0861608615a9365d90f4ba0a813c0ea0471987f925c8b...,3,4,4.312500e+04,4.312500e+03,10,5.920366e+03
4,171142,0,0,a39fea9aec90969fe66a2b2b4d1b86368a2d38e8b8d4bf...,3,3,2.963440e+05,4.939067e+03,60,6.547865e+03
...,...,...,...,...,...,...,...,...,...,...
23901,352132,364,0,c49b33d1fde790ec03584672903f296e486615adfdc989...,1,3,1.532000e+08,1.078873e+06,142,3.103629e+06
23902,352125,364,0,0c2dc5fedc3689abf5ff4be14fe8fea8d23d22068297c2...,3,3,4.615283e+07,2.307641e+06,20,2.605778e+06
23903,352080,364,0,1a93961c8fa830a1f32f5316b44f4964d65397f0311b11...,2,6,1.555772e+08,2.222531e+06,70,5.842826e+06
23904,352075,364,0,9586b80d3ad1d74c0a1efe792cae2ecd073243fb01ed33...,3,6,1.365559e+08,3.175717e+05,430,2.409931e+06


In [47]:
submit_format = df_from_csv('submit_format.csv')
submit_format = submit_format.drop(['probability'], axis=1)
submit_format = submit_format.merge(alert_date, on='alert_key', how='left')
submit_format = submit_format.merge(alert_cust, on='alert_key', how='left')
submit_format = submit_format.progress_apply(calculate_tx, axis=1)
x_submit = submit_format[['risk_rank', 'AGE', 'total_tx', 'avg_tx', 'count_tx', 'std_tx']]
x_submit.dropna(inplace=True)
x_submit

Progress Bar: 100%|██████████| 3850/3850 [02:12<00:00, 29.15it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,risk_rank,AGE,total_tx,avg_tx,count_tx,std_tx
0,1.0,3.0,1.040422e+07,3.715794e+04,280.0,1.134937e+05
3,1.0,5.0,1.407445e+08,3.953497e+05,356.0,3.312853e+06
4,1.0,3.0,3.380401e+08,1.391112e+06,243.0,5.941655e+06
5,1.0,4.0,4.490287e+07,1.586674e+05,283.0,4.286031e+05
6,3.0,3.0,2.209952e+07,2.302033e+05,96.0,1.065986e+06
...,...,...,...,...,...,...
3841,1.0,4.0,5.949962e+06,5.891051e+04,101.0,1.934595e+05
3844,1.0,6.0,4.286594e+06,4.984412e+04,86.0,9.196838e+04
3845,1.0,2.0,7.173381e+06,2.116042e+04,339.0,4.365511e+04
3846,1.0,2.0,5.413915e+08,1.021493e+06,530.0,4.010326e+06


In [48]:
def generate_submission(input_submit, input_x, LR_model):
    y_prob = LR_model.predict_proba(input_x)
    y_prob = pd.DataFrame(y_prob, columns = ['probability', 'inv_probability'])
    y_prob.index = input_x.index

    submit_prob = pd.concat([input_x, y_prob], axis=1)
    final_submit = pd.concat([input_submit, submit_prob], axis=1)
    final_submit = final_submit[['alert_key', 'probability']]
    final_submit['probability'] = final_submit['probability'].fillna(0)
    final_submit = final_submit.sort_values(by='probability', ascending=False)
    final_submit.to_csv('final_submit.csv', index=False)
    return final_submit

lr_1_submission = generate_submission(submit_format, x_submit, LR_1)
lr_1_submission

Unnamed: 0,alert_key,probability
199,364640,0.999959
1978,362891,0.999959
1154,362171,0.999958
1220,361408,0.999958
2952,360951,0.999957
...,...,...
1614,372903,0.000000
1616,372201,0.000000
1617,368902,0.000000
1618,373476,0.000000


In [49]:
lr_1_submission.to_pickle('lr_1_submission.pkl')

In [51]:
# trying SVR

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

y = sm_xdp_train_y
X = sm_xdp_train_x
SVR_1 = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
SVR_1.fit(X, y)

  y = column_or_1d(y, warn=True)


In [95]:
def generate_submission_SVR(input_submit, input_x, SVR_model):
    y_prob = SVR_model.predict(input_x)
    y_prob = pd.DataFrame(y_prob, columns = ['probability'])
    y_prob.index = input_x.index

    submit_prob = pd.concat([input_x, y_prob], axis=1)
    final_submit = pd.concat([input_submit, submit_prob], axis=1)
    final_submit = final_submit[['alert_key', 'probability']]
    final_submit['probability'] = final_submit['probability'].fillna(0)
    final_submit.loc[final_submit['probability'] < 0, 'probability'] = 0
    final_submit = final_submit.sort_values(by='probability', ascending=False)
    final_submit.to_csv('final_submit.csv', index=False)
    return final_submit

final_submit = generate_submission_SVR(submit_format, x_submit, SVR_1)
final_submit

Unnamed: 0,alert_key,probability
959,355724,0.996678
2554,362151,0.935116
558,358252,0.932623
100,354820,0.931383
1144,357885,0.929668
...,...,...
1592,367420,0.000000
1595,377601,0.000000
1596,369441,0.000000
1599,374756,0.000000


In [96]:
final_submit.loc[final_submit.duplicated(keep=False)]

Unnamed: 0,alert_key,probability
