In [1]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil.parser import parse
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier


In [2]:
# Name of the pre-processed feature set; selects the sub-directory of ./data to read.
feature_set = 'feature_set_4_normalised'

# Training split: feature table plus the target flattened to a 1-D array.
X_train = pd.read_csv(f'./data/{feature_set}/X_train_full.csv')
y_train = pd.read_csv(f'./data/{feature_set}/y_train.csv').to_numpy().ravel()

# Validation split, read the same way.
X_val = pd.read_csv(f'./data/{feature_set}/X_valid_full.csv')
y_val = pd.read_csv(f'./data/{feature_set}/y_valid.csv').to_numpy().ravel()

In [3]:
# Class distribution of the training labels, counted once and reused below.
counter = Counter(y_train)
print("Before Sampling: {}".format(counter))

# Counter.get() returns None for a label that never occurs, which would
# otherwise surface as an opaque TypeError in the arithmetic below, so fail
# early with a message that shows the actual distribution.
pos = counter.get(1)
neg = counter.get(0)
if not pos or not neg:
    raise ValueError(
        "y_train must contain both class 0 and class 1, got: {}".format(counter)
    )
total = neg + pos

# Balanced weights: count * weight equals total / 2 for each class, so both
# classes carry the same overall weight despite the imbalance.
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Before Sampling: Counter({0.0: 727162, 1.0: 19346})
Weight for class 0: 0.51
Weight for class 1: 19.29


In [4]:
# Boosted-tree classifier: at most 1000 trees, fixed seed, all available cores.
xgboost = XGBClassifier(
    n_estimators=1000,
    random_state=42,
    n_jobs=-1,
)

# Validation split that is scored after every boosting round.
eval_set = [(X_val, y_val)]

# Track validation AUC and stop once it has not improved for 100 rounds.
# NOTE(review): recent XGBoost releases expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than on fit() -- confirm
# against the installed version before upgrading.
# NOTE(review): the `class_weight` dict computed earlier is not passed to this
# model -- confirm whether the imbalance was meant to be handled here.
xgboost.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="auc",
    early_stopping_rounds=100,
    verbose=True,
)



[0]	validation_0-auc:0.60859
[1]	validation_0-auc:0.63701
[2]	validation_0-auc:0.63639
[3]	validation_0-auc:0.64081
[4]	validation_0-auc:0.65303
[5]	validation_0-auc:0.65376
[6]	validation_0-auc:0.66503
[7]	validation_0-auc:0.66618
[8]	validation_0-auc:0.68525
[9]	validation_0-auc:0.69232
[10]	validation_0-auc:0.71123
[11]	validation_0-auc:0.71150
[12]	validation_0-auc:0.71186
[13]	validation_0-auc:0.71766
[14]	validation_0-auc:0.72370
[15]	validation_0-auc:0.72826
[16]	validation_0-auc:0.73454
[17]	validation_0-auc:0.73663
[18]	validation_0-auc:0.73659
[19]	validation_0-auc:0.73638
[20]	validation_0-auc:0.73732
[21]	validation_0-auc:0.73853
[22]	validation_0-auc:0.73908
[23]	validation_0-auc:0.74199
[24]	validation_0-auc:0.74306
[25]	validation_0-auc:0.74530
[26]	validation_0-auc:0.74529
[27]	validation_0-auc:0.74806
[28]	validation_0-auc:0.74829
[29]	validation_0-auc:0.75028
[30]	validation_0-auc:0.75222
[31]	validation_0-auc:0.75228
[32]	validation_0-auc:0.75180
[33]	validation_0-au

In [5]:

# Score of the positive class (label 1) for every validation row.
# Column 1 of predict_proba is read directly: computing 1 - column 0 adds a
# float32 subtraction that can round very small probabilities to zero or into
# ties, which matters for a rank-based metric such as AUC.
val_res = xgboost.predict_proba(X_val)[:, 1]

In [6]:
# Validation ROC AUC of the predicted scores against the true labels.
roc_auc_score(y_true=y_val, y_score=val_res)

0.8524601219113352

In [7]:
# Test-set features from the same feature-set directory as the training data.
test_df = pd.read_csv(f'./data/{feature_set}/test_features.csv')

In [9]:
# Model inputs: everything except the transaction identifier. TX_FRAUD is also
# dropped when present because a later cell writes the predictions back into
# test_df; without this, re-running the cell would feed that column to the model.
test_features = test_df.drop(columns=['TX_ID']).drop(columns=['TX_FRAUD'], errors='ignore')

# Score of the positive class (label 1) for every test row. Column 1 is read
# directly instead of computing 1 - column 0, which adds a float32 subtraction
# that can round very small probabilities to zero or into ties.
test_res = xgboost.predict_proba(test_features)[:, 1]

In [10]:
# Show the test-set scores held in test_res (NumPy abbreviates long arrays in the repr).
test_res

array([0.00145596, 0.00751865, 0.00122434, ..., 0.00879544, 0.00192368,
       0.0002476 ], dtype=float32)

In [11]:
# Store the predicted scores next to the test rows they belong to.
test_df["TX_FRAUD"] = test_res

In [21]:
# NOTE(review): no visible cell defines `submission`, so this cell raises
# NameError on a fresh kernel. It is rebuilt here from test_df, which matches
# the TX_ID / TX_FRAUD values shown in the saved output. That output also had
# an 'Unnamed: 0' column, which suggests the frame was originally re-read from
# a CSV -- confirm that this reconstruction is the intended source.
submission = test_df[['TX_ID', 'TX_FRAUD']].copy()
submission

Unnamed: 0,TX_ID,TX_FRAUD
0,09324d812ba7915c3f791e973db293ad50db70d8,0.001456
1,365385787bc3da985acb1f5c120c6d0831dc8734,0.007519
2,0c952ffc309601a50bd11cbbbfcacd887f99c14c,0.001224
3,bfbfcd0e8b0b24dd32743390a500bbf3d815856a,0.024206
4,0052bebc8c3e21ef6a195eaecc75376fb53b49ad,0.002385
...,...,...
71133,28ae334063e4f58ffc8b47789d911b25915b72a3,0.032827
71134,05b33d08d2158971856c3cddd3a7ad161d1fa350,0.001076
71135,9ac904a006491900fb0f26e98aeea4fe24aa37a8,0.008795
71136,2f1693aeb0e7b26a3f0d2634da09b3d2238329b9,0.001924


In [26]:
# One-row frame for a single transaction id with a fraud score of 0.
# NOTE(review): presumably this id has no row in the feature set and is padded
# so the submission covers every test transaction -- confirm.
# The data is held in `missing_row` rather than `dict` so the builtin `dict`
# is not shadowed for the rest of the kernel session.
missing_row = {
    'TX_ID': ['ddaa070acea087eae360225e92c1609cea905e43'],
    'TX_FRAUD': [0],
}
df2 = pd.DataFrame(missing_row)

In [29]:
# Append the extra row to the predictions and renumber the index from 0.
# NOTE(review): re-running this cell appends the row again.
submission = pd.concat(
    objs=[submission, df2],
    ignore_index=True,
)

In [31]:
# Raw test transactions, read from the data directory.
test_trans = pd.read_csv('./data/transactions_test.csv')

In [32]:
# Keep only the transaction-id column; the name now refers to that single
# column, so this cell cannot be run twice without reloading the CSV first.
test_trans = test_trans["TX_ID"]

In [35]:
# Key the predictions by TX_ID and put them in the order given by test_trans.
# Both steps are chained into one assignment, with no in-place set_index, so a
# failure in reindex leaves `submission` untouched and the cell can be re-run.
# reindex fills NaN for any TX_ID in test_trans that has no prediction row.
submission = submission.set_index('TX_ID').reindex(index=test_trans)


In [39]:
# Output directory for the submission file, relative to the working directory.
location = 'results'

# Create the directory first so the write cannot fail on a missing folder
# after the whole notebook has run.
os.makedirs(location, exist_ok=True)

# Write the predictions with the index as the first CSV column.
submission.to_csv(F'./{location}/results_4_xgb.csv',index=True)