In [18]:
import numpy as np
import pandas as pd
import xgboost as xgb



In [2]:
X_train_1 = pd.read_csv("../aml_data/train/Train-Data-Set-1.csv"); print X_train_1.shape
X_train_2 = pd.read_csv("../aml_data/train/Train-Data-Set-2.csv"); print X_train_2.shape
X_train_3 = pd.read_csv("../aml_data/train/Train-Data-Set-3.csv"); print X_train_3.shape
X_train_4 = pd.read_csv("../aml_data/train/Train-Data-Set-4.csv"); print X_train_4.shape
X_train_5 = pd.read_csv("../aml_data/train/Train-Data-Set-5.csv"); print X_train_5.shape

X_train_1 = X_train_1.drop(['Unnamed: 25'], axis=1)

X_test = pd.read_csv("../aml_data/test.csv"); print X_test.shape

(1000000, 26)
(1000000, 25)
(1000000, 25)
(1000000, 25)
(362223, 25)
(1503085, 22)


In [3]:
X_train = pd.concat([X_train_1, X_train_2, X_train_3, X_train_4, X_train_5]); print X_train.shape

(4362223, 25)


In [4]:
del X_train_1, X_train_2, X_train_3, X_train_4, X_train_5

In [5]:
X_train['Receiver_Customer'] = (X_train['CustomerName'] == X_train['ReceiverName']).astype(int)
X_train['Sender_Customer'] = (X_train['CustomerName'] == X_train['SenderName']).astype(int)

X_test['Receiver_Customer'] = (X_test['CustomerName'] == X_test['ReceiverName']).astype(int)
X_test['Sender_Customer'] = (X_test['CustomerName'] == X_test['SenderName']).astype(int)

# NOTES:
    - (Timestamp('2016-03-01 00:00:00'), Timestamp('2016-07-29 00:00:00'))
    - (Timestamp('2016-08-01 00:00:00'), Timestamp('2016-09-30 00:00:00'))
    - Unique Customers in Train:915
    - Unique Customers in Test: 833
    - # Customer ID is exactly equal to Customer Name - hence, ignore.
    - Account CloseDate is missing for 99.99% of the data - hence, ignore.

In [6]:
drop_cols = ['CustomerName', 'Account CloseDate']
X_train = X_train.drop(drop_cols, axis=1)
X_test = X_test.drop(drop_cols, axis=1)

In [7]:
X_train['Account OpenDate'] = pd.to_datetime(X_train['Account OpenDate'], format="%d.%m.%Y")
X_train['Account OpenDate_Day'] = X_train['Account OpenDate'].dt.day
X_train['Account OpenDate_Month'] = X_train['Account OpenDate'].dt.month
X_train['Account OpenDate_Year'] = X_train['Account OpenDate'].dt.year

X_train['PaymentDate'] = pd.to_datetime(X_train['PaymentDate'], format="%d.%m.%Y")
X_train['PaymentDate_Day'] = X_train['PaymentDate'].dt.day
X_train['PaymentDate_Month'] = X_train['PaymentDate'].dt.month


X_test['Account OpenDate'] = pd.to_datetime(X_test['Account OpenDate'], format="%d.%m.%Y")
X_test['Account OpenDate_Day'] = X_test['Account OpenDate'].dt.day
X_test['Account OpenDate_Month'] = X_test['Account OpenDate'].dt.month
X_test['Account OpenDate_Year'] = X_test['Account OpenDate'].dt.year

X_test['PaymentDate'] = pd.to_datetime(X_test['PaymentDate'], format="%d.%m.%Y")
X_test['PaymentDate_Day'] = X_test['PaymentDate'].dt.day
X_test['PaymentDate_Month'] = X_test['PaymentDate'].dt.month

In [8]:
X_train['Age'] = (X_train['PaymentDate'] - X_train['Account OpenDate']).dt.days
X_test['Age'] = (X_test['PaymentDate'] - X_test['Account OpenDate']).dt.days

In [9]:
X_train = X_train[X_train['AML3'].notnull()].reset_index(drop=True)

In [8]:
# %matplotlib inline
# X_train['Account OpenDate_Year'].value_counts().sort_index().plot(kind='bar', figsize=(10, 8), 
#                                                                   title="Yearly Distribution")

In [10]:
X_train.columns

Index([u'Sl.No', u'PaymentRef', u'CustomerId', u'Custmer Account Number',
       u'Account OpenDate', u'AccountCurrency', u'Sent/Received', u'Amount',
       u'Payment', u'PaymentDate', u'Type', u'SenderName', u'SenderAddress',
       u'ReceiverName', u'ReceiverAddress', u'SenderCountry',
       u'ReceiverCountry', u'SenderCorrespondent', u'ReceiverCorrespondent',
       u'ReceiverAccount', u'AML1', u'AML2', u'AML3', u'Receiver_Customer',
       u'Sender_Customer', u'Account OpenDate_Day', u'Account OpenDate_Month',
       u'Account OpenDate_Year', u'PaymentDate_Day', u'PaymentDate_Month',
       u'Age'],
      dtype='object')

In [11]:
train = X_train.copy()
test = X_test.copy()

In [12]:
train = train.drop(['PaymentDate_Month'], axis=1)
test = test.drop(['PaymentDate_Month'], axis=1)

In [13]:
features = list(set(train.columns) - set(['Sl.No', 'PaymentRef', 'Account OpenDate', 'PaymentDate', 'PaymentDate_Year', 'AML1', 'AML2', 'AML3']))

In [15]:
from sklearn.preprocessing import LabelEncoder
print("Label Encoding...")
for f in train[features].columns:
    if train[f].dtype == 'object':
        print f
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

Label Encoding...
ReceiverAccount
ReceiverAddress
Custmer Account Number
Sent/Received
Type
CustomerId
AccountCurrency
SenderAddress
SenderName
Payment
SenderCountry
ReceiverName
ReceiverCountry
SenderCorrespondent
ReceiverCorrespondent


In [16]:
from sklearn.metrics import recall_score, confusion_matrix

def aml_score(preds, dtrain):
    labels = dtrain.get_label()
    preds = (preds >= 0.5).astype(int)
    conf_matrix = confusion_matrix(labels, preds)
    #score_1 = recall_score(labels, preds) # recall
    score_1 = (conf_matrix[1, 1] * 1.) / (conf_matrix[1, 0] + conf_matrix[1, 1])
    score_2 = (conf_matrix[1, 0] * 1.) / (conf_matrix[1, 0] + conf_matrix[1, 1])
    final_score = score_1 - 0.5 * score_2
    return 'AML', max(final_score, 0)

In [20]:
tr = train[train['PaymentDate'] < '2016-07-01'].reset_index(drop=True)
val = train[train['PaymentDate'] >= '2016-07-01'].reset_index(drop=True)

dtr = xgb.DMatrix(tr[features], tr['AML1'])
dval = xgb.DMatrix(val[features], val['AML1'])

In [23]:
watchlist = [(dtr, 'tr'), (dval, 'val')]
bst = xgb.train(params, dtr, num_boost_round=50, evals=watchlist, feval=aml_score, verbose_eval=1)

[0]	tr-AML:0.988568	val-AML:0.985885
[1]	tr-AML:0.988908	val-AML:0.986027
[2]	tr-AML:0.988675	val-AML:0.985956
[3]	tr-AML:0.988192	val-AML:0.985528
[4]	tr-AML:0.988461	val-AML:0.985885
[5]	tr-AML:0.989105	val-AML:0.986669
[6]	tr-AML:0.989892	val-AML:0.986883
[7]	tr-AML:0.99025	val-AML:0.987952
[8]	tr-AML:0.989785	val-AML:0.986954
[9]	tr-AML:0.989874	val-AML:0.986954
[10]	tr-AML:0.989802	val-AML:0.986883
[11]	tr-AML:0.98982	val-AML:0.986883
[12]	tr-AML:0.989874	val-AML:0.986883
[13]	tr-AML:0.990214	val-AML:0.987097
[14]	tr-AML:0.990053	val-AML:0.986954
[15]	tr-AML:0.990053	val-AML:0.987025
[16]	tr-AML:0.990071	val-AML:0.987097
[17]	tr-AML:0.990142	val-AML:0.987097
[18]	tr-AML:0.990107	val-AML:0.987097
[19]	tr-AML:0.990232	val-AML:0.987239
[20]	tr-AML:0.990232	val-AML:0.987168
[21]	tr-AML:0.990286	val-AML:0.987168
[22]	tr-AML:0.990232	val-AML:0.987025
[23]	tr-AML:0.990232	val-AML:0.986954
[24]	tr-AML:0.990286	val-AML:0.986954
[25]	tr-AML:0.990303	val-AML:0.987097
[26]	tr-AML:0.990232	val

In [24]:
dtrain_1 = xgb.DMatrix(train[features], train['AML1'])
dtrain_2 = xgb.DMatrix(train[features], train['AML2'])
dtrain_3 = xgb.DMatrix(train[features], train['AML3'])
dtest = xgb.DMatrix(test[features])

In [38]:
params = {"objective": "binary:logistic", "booster": "gbtree", "nthread": 4, "silent": 1,
                "eta": 0.1, "max_depth": 9, "subsample": 0.9, "colsample_bytree": 0.9, "min_child_weight": 5,
                "seed": 2016, "tree_method": "exact"}
print(params)
nrounds = 1000
# xgb.cv(params, dtrain, num_boost_round=nrounds, nfold=2, feval=aml_score, maximize=True, early_stopping_rounds=100, verbose_eval=20)

{'seed': 2016, 'tree_method': 'exact', 'booster': 'gbtree', 'colsample_bytree': 0.9, 'silent': 1, 'nthread': 4, 'min_child_weight': 10, 'subsample': 0.8, 'eta': 0.1, 'objective': 'binary:logistic', 'max_depth': 9}


In [33]:
# watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain_1, num_boost_round=100, evals=watchlist, feval=aml_score, verbose_eval=10)
test_preds_1 = bst.predict(dtest)
print("Done:1")
bst = xgb.train(params, dtrain_2, num_boost_round=100, evals=watchlist, feval=aml_score, verbose_eval=10)
test_preds_2 = bst.predict(dtest)
print("Done:2")
bst = xgb.train(params, dtrain_3, num_boost_round=100, evals=watchlist, feval=aml_score, verbose_eval=10)
test_preds_3 = bst.predict(dtest)
print("Done:3")

[0]	tr-AML:0.992021	val-AML:0.991445
[10]	tr-AML:0.992218	val-AML:0.99173
[20]	tr-AML:0.992701	val-AML:0.991944
[30]	tr-AML:0.993917	val-AML:0.99337
[40]	tr-AML:0.994257	val-AML:0.993441
[50]	tr-AML:0.995617	val-AML:0.994867
[60]	tr-AML:0.996601	val-AML:0.995723
[70]	tr-AML:0.997531	val-AML:0.996935
[80]	tr-AML:0.997835	val-AML:0.99722
[90]	tr-AML:0.998139	val-AML:0.997505
Done:1
[0]	tr-AML:0	val-AML:0
[10]	tr-AML:0	val-AML:0
[20]	tr-AML:0	val-AML:0
[30]	tr-AML:0	val-AML:0
[40]	tr-AML:0	val-AML:0
[50]	tr-AML:0	val-AML:0
[60]	tr-AML:0	val-AML:0
[70]	tr-AML:0	val-AML:0
[80]	tr-AML:0	val-AML:0
[90]	tr-AML:0	val-AML:0
Done:2
[0]	tr-AML:0	val-AML:0
[10]	tr-AML:0	val-AML:0
[20]	tr-AML:0	val-AML:0
[30]	tr-AML:0	val-AML:0
[40]	tr-AML:0	val-AML:0
[50]	tr-AML:0	val-AML:0
[60]	tr-AML:0	val-AML:0
[70]	tr-AML:0	val-AML:0
[80]	tr-AML:0	val-AML:0
[90]	tr-AML:0	val-AML:0
Done:3


In [34]:
test_preds_11 = (test_preds_1 >= 0.5).astype(int)#[1 if x > 0.5 else 0 for x in test_preds_1]
test_preds_22 = (test_preds_2 >= 0.5).astype(int)#[1 if x > 0.5 else 0 for x in test_preds_2]
test_preds_33 = (test_preds_3 >= 0.5).astype(int)#[1 if x > 0.5 else 0 for x in test_preds_3]

In [35]:
submit = pd.DataFrame({'Sl.No': test['Sl.No'], 'AML1': test_preds_11, 'AML2': test_preds_22, 'AML3': test_preds_33})
submit = submit[['Sl.No', 'AML1', 'AML2', 'AML3']]
submit.head()

Unnamed: 0,Sl.No,AML1,AML2,AML3
0,4362224,0,0,0
1,4362225,0,0,0
2,4362226,0,0,0
3,4362227,0,0,0
4,4362228,0,1,0


In [36]:
print submit['AML1'].value_counts()
print submit['AML2'].value_counts()
print submit['AML3'].value_counts()

0    1473639
1      29446
Name: AML1, dtype: int64
0    1478473
1      24612
Name: AML2, dtype: int64
0    1479506
1      23579
Name: AML3, dtype: int64


In [None]:
0    1473832
1      29253
Name: AML1, dtype: int64
0    1479049
1      24036
Name: AML2, dtype: int64
0    1483643
1      19442
Name: AML3, dtype: int64

In [37]:
submit.to_csv("Submission_5.csv", index=False)

In [166]:
pd.DataFrame.from_dict(bst.get_fscore(), orient="index").reset_index().sort_values(0, ascending=False)

Unnamed: 0,index,0
10,ReceiverCountry,339
1,SenderCountry,264
2,ReceiverAccount,218
20,ReceiverCorrespondent,155
8,Age,127
13,Amount,93
15,SenderCorrespondent,80
3,ReceiverName,79
4,ReceiverAddress,65
5,Custmer Account Number,63
