In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torchvision import transforms, datasets
from torch.utils.data import Dataset, DataLoader
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, recall_score, confusion_matrix, matthews_corrcoef
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
import xgboost as xgb
from utils import score
import pickle

### Load data

In [2]:
# Read the labels, the customer info table, and the engineered feature table.
label = pd.read_csv('./data/train_y_answer.csv')
custinfo = pd.read_csv('./data/public_train_x_custinfo_full_hashed.csv')
ds = pd.read_csv("./result/featuresV2.csv")
custs = custinfo.cust_id.unique()

# Rows whose alert_key appears in the label file form the training set;
# every other row goes to the public set.
has_label = ds.alert_key.isin(label.alert_key)
train_ds = ds[has_label]
pub_ds = ds[~has_label]

In [3]:
# Features are every column except the last two; the target is the last column.
X = train_ds.iloc[:, :-2].to_numpy()
y = train_ds.iloc[:, -1].to_numpy()

# Standardise the features, then fit a full-rank PCA on the standardised matrix.
sc = StandardScaler()
X_std = sc.fit_transform(X)
pca = PCA()
pca.fit(X_std)

PCA()

### RFE Feature selection

Use `ExtraTreesClassifier` as the RFE estimator.

In [4]:
# Repeated recursive feature elimination (RFE) over stratified folds.
# Each (fold, repeat) run keeps N_SELECT features; feat_freq counts how often
# every feature survived, and the most frequently kept features are retained.
N_SPLITS = 5      # stratified CV folds
N_REPEATS = 5     # RFE runs per fold
N_SELECT = 100    # features kept per RFE run and in the final list
RFE_STEP = 10     # features dropped per RFE iteration
RFE_SEED = 44     # base seed so the selection is reproducible

X = train_ds.iloc[:, :-2].values
y = train_ds.iloc[:, -1].values

skf = StratifiedKFold(n_splits=N_SPLITS)
feat_freq = np.zeros(X.shape[1])

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    # The validation split is materialised but not used by the selection itself.
    X_test, y_test = X[val_idx], y[val_idx]

    # Fit the scaler on the training fold only, then apply it to both splits.
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)

    for rep in range(N_REPEATS):
        # A distinct but fixed seed per run keeps the repeats different from
        # each other while making a re-run give the same result.
        estimator = ExtraTreesClassifier(random_state=RFE_SEED + fold * N_REPEATS + rep)
        selector = RFE(estimator=estimator, step=RFE_STEP, n_features_to_select=N_SELECT)
        selector.fit(X_train, y_train)
        feat_freq += selector.support_

# Column indices ordered from most to least frequently selected, so that
# top_feats[:k] really are the k most-selected features. The stable sort makes
# tie-breaking deterministic.
top_feats = np.argsort(-feat_freq, kind='stable')[:N_SELECT]

# Persisting the selection is disabled by default.
# NOTE(review): the loading cell reads './result/top_featsV2', not './top_idx/...';
# confirm which path is intended before re-enabling.
# with open('./top_idx/top_featsV2', 'wb') as f:
#     pickle.dump(top_feats, f)

In [7]:
# Reload the previously saved feature-index selection.
# pickle.load can execute arbitrary code: only load files produced by this project.
with open('./result/top_featsV2', 'rb') as feat_file:
    top_feats = pickle.load(feat_file)

# Show the training columns indexed by the first 10 entries of top_feats.
# NOTE(review): whether these are the 10 highest-ranked features depends on the
# order stored in the pickle - confirm.
first_ten = top_feats[:10]
train_ds.iloc[:, first_ten]

Unnamed: 0,period_dp_neg_sum,day5_cdtx_n_forcur,period_remit_trans1,ccba_csamt,day10_cdtx_n_forcur,period_remit_trans3,day5_cdtx_n_cur_switch,day5_cdtx_n_cur,day10_cdtx_n_country_switch,day10_cdtx_max_country
0,-46.051702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,-46.051702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,-46.051702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,-46.051702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,-46.051702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
25746,-46.051702,0.0,0.0,-46.051702,0.0,0.0,1.0,1.0,1.0,9.0
25747,-46.051702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
25748,-46.051702,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
25749,-46.051702,0.0,0.0,-46.051702,0.0,0.0,1.0,1.0,1.0,7.0


### Model Training: XGBoost, SVM, RandomForest
Train each model on the labelled data and test its performance on the public data.

In [8]:
# Split features / target / key columns for the labelled and public sets.
# The last two columns are the keys (alert_key, sar_flag); the rest are features.
X_train = train_ds.iloc[:, :-2].values
X_pub = pub_ds.iloc[:, :-2].values
y_train = train_ds.iloc[:, -1].values
y_pub = pub_ds.iloc[:, -1].values
ak_train = train_ds.iloc[:, -2:]
ak_pub = pub_ds.iloc[:, -2:]

# Fit the scaler on this cell's own training matrix instead of a global `X`
# left over from an earlier cell (same data, no hidden-state dependency).
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_pub = sc.transform(X_pub)

# Keep only the selected feature columns.
X_train = X_train[:, top_feats]
X_pub = X_pub[:, top_feats]

print(X_train.shape)
print(X_pub.shape)

MODEL_SEED = 44

clf_xgb = xgb.XGBClassifier(n_estimators=100, learning_rate=0.10, verbosity=0)
# probability=True runs an internal shuffled cross-validation; seeding it makes
# the predicted probabilities reproducible.
clf_svc = SVC(C=1e+0, class_weight='balanced', kernel='rbf', probability=True, random_state=MODEL_SEED)
clf_rf = RandomForestClassifier(n_estimators=100, criterion='entropy', class_weight='balanced', random_state=MODEL_SEED)


def fit_and_score(clf, name):
    """Fit `clf` on the training set, score it on the public set, return the predictions.

    Parameters
    ----------
    clf : classifier exposing `fit` and `predict_proba`.
    name : str, label used as the prefix of the printed score line.

    Returns
    -------
    DataFrame with columns `alert_key` and `prob` (positive-class probability),
    re-indexed from 0 after scoring.
    """
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_pub)
    # Copy the key column so adding `prob` does not write into a slice of pub_ds.
    y_pred = ak_pub.iloc[:, 0:1].copy()  # alert_key
    y_pred['prob'] = y_prob[:, 1]
    # NOTE(review): score() appears to reorder y_pred in place (the rank tables
    # shown later rely on the post-score order) - keep the copy AFTER this call.
    print(f"{name} score: {score(ak_pub, y_pred)}")
    return y_pred.copy().reset_index(drop=True)


result_xgb = fit_and_score(clf_xgb, "XGB")
result_svm = fit_and_score(clf_svc, "SVM")
result_rf = fit_and_score(clf_rf, "RF")


(23906, 100)
(1845, 100)
XGB score: (0.01972386587771203, 10, 506)
SVM score: (0.01006036217303823, 10, 993)
RF score: (0.00999000999000999, 10, 1000)


Try a soft-voting ensemble (`VotingClassifier`) to combine these three models.

In [9]:
# Soft-voting ensemble: a weighted average of the three models' predicted
# probabilities (XGB weighted highest). VotingClassifier fits fresh clones of
# the estimators, so this cell retrains all three models.
voter = VotingClassifier(
    estimators=[('XGB', clf_xgb), ('SVC', clf_svc), ('RF', clf_rf)],
    voting='soft',
    weights=[7, 2, 1],
)
voter.fit(X_train, y_train)
y_prob = voter.predict_proba(X_pub)
# Copy the key column so adding `prob` does not write into a slice of pub_ds.
y_pred = ak_pub.iloc[:, 0:1].copy()  # alert_key
y_pred['prob'] = y_prob[:, 1]
# NOTE(review): score() appears to reorder y_pred in place; copy only afterwards.
print(f"score: {score(ak_pub, y_pred)}")
result_voter = y_pred.copy().reset_index(drop=True)

score: (0.01594896331738437, 10, 626)


### Analysis: alert key SAR probability ranks

In [10]:
from IPython.display import display

# alert_keys of the public-set rows flagged as SAR.
sar_keys = ak_pub[ak_pub.sar_flag == 1].alert_key

# For each model, show only the rows of its result table that belong to SAR alerts.
model_results = (
    ("XGB", result_xgb),
    ("SVM", result_svm),
    ("RF", result_rf),
    ("Voter", result_voter),
)
for model_name, result in model_results:
    print(f"{model_name} | alert key SAR prob rank")
    display(result[result.alert_key.isin(sar_keys)])

XGB | alert key SAR prob rank


Unnamed: 0,alert_key,prob
6,354939.0,0.263724
16,356602.0,0.179072
53,363320.0,0.086578
86,355724.0,0.057429
191,355152.0,0.017681
205,358453.0,0.01523
211,355091.0,0.014558
336,363033.0,0.007073
454,361617.0,0.004936
506,359668.0,0.004272


SVM | alert key SAR prob rank


Unnamed: 0,alert_key,prob
16,363320.0,0.117687
36,355091.0,0.065739
38,356602.0,0.064782
93,363033.0,0.041762
288,359668.0,0.020035
574,358453.0,0.010111
627,361617.0,0.00911
632,363896.0,0.008935
664,354939.0,0.008341
993,355724.0,0.005333


RF | alert key SAR prob rank


Unnamed: 0,alert_key,prob
25,356602.0,0.16
76,363320.0,0.09
126,354939.0,0.05
144,355091.0,0.04
274,355724.0,0.02
314,363033.0,0.02
325,355152.0,0.02
367,358453.0,0.01
826,363896.0,0.0
1000,361617.0,0.0


Voter | alert key SAR prob rank


Unnamed: 0,alert_key,prob
10,354939.0,0.1913
14,356602.0,0.15434
47,363320.0,0.093121
114,355724.0,0.043285
157,355091.0,0.027371
229,363033.0,0.015349
233,355152.0,0.014799
250,358453.0,0.013711
461,359668.0,0.007037
626,361617.0,0.005303
