In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Lasso
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.pipeline import make_pipeline
from scipy import stats
import xgboost as xgb
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
#load data
train_identity = pd.read_csv('train_identity.csv')
train_transaction = pd.read_csv('train_transaction.csv')

In [3]:
def df_column_unique_values(df, top_n = 5):
    for col_name, values in df.iteritems():
        col_value_counts = values.value_counts()
        print(f"{col_name} : {len(col_value_counts)}")
        col_value_count_list = [
            "'" + str(c) + "'" + ":" + str(n) for c, n in sorted(
                col_value_counts.items(),
                key=lambda kv: kv[1],
                reverse=True
            )
        ]
        print(", ".join(col_value_count_list[:min(len(col_value_count_list), top_n)]))

In [4]:
df_column_unique_values(train_identity)

TransactionID : 144233
'3153916':1, '3548654':1, '3079669':1, '3077620':1, '3067379':1
id_01 : 77
'-5.0':82170, '0.0':19555, '-10.0':11257, '-20.0':11211, '-15.0':5674
id_02 : 115655
'1102.0':11, '696.0':10, '1116.0':9, '1117.0':9, '1120.0':9
id_03 : 24
'0.0':63903, '1.0':863, '3.0':668, '2.0':421, '5.0':109
id_04 : 15
'0.0':65739, '-5.0':132, '-6.0':98, '-8.0':64, '-4.0':51
id_05 : 93
'0.0':92743, '1.0':8293, '2.0':4937, '3.0':3854, '4.0':2941
id_06 : 101
'0.0':91325, '-1.0':4687, '-5.0':3849, '-6.0':3257, '-9.0':2634
id_07 : 84
'0.0':409, '16.0':245, '14.0':228, '12.0':208, '15.0':186
id_08 : 94
'-100.0':500, '0.0':261, '-34.0':257, '-33.0':209, '-32.0':185
id_09 : 46
'0.0':70378, '1.0':1616, '3.0':966, '2.0':773, '4.0':270
id_10 : 62
'0.0':72879, '-6.0':295, '-5.0':247, '-1.0':200, '-8.0':147
id_11 : 365
'100.0':133162, '95.08000183105469':1231, '95.16000366210938':754, '97.12000274658205':440, '96.66999816894531':333
id_12 : 2
'NotFound':123025, 'Found':21208
id_13 : 54
'52.0':5809

In [5]:
df_column_unique_values(train_transaction)

TransactionID : 590540
'3147775':1, '3089230':1, '3189951':1, '3187902':1, '3194045':1
isFraud : 2
'0':569877, '1':20663
TransactionDT : 573349
'9474817':8, '11576951':5, '7236588':5, '4397066':5, '8468062':4
TransactionAmt : 20902
'59.0':30582, '117.0':28933, '107.95':23954, '57.95':23600, '100.0':20362
ProductCD : 5
'W':439670, 'C':68519, 'R':37699, 'H':33024, 'S':11628
card1 : 13553
'7919':14932, '9500':14162, '15885':10361, '17188':10344, '15066':7945
card2 : 500
'321.0':48935, '111.0':45191, '555.0':41995, '490.0':38145, '583.0':21803
card3 : 114
'150.0':521287, '185.0':56346, '106.0':1571, '146.0':1252, '144.0':1252
card4 : 4
'visa':384767, 'mastercard':189217, 'american express':8328, 'discover':6651
card5 : 119
'226.0':296546, '224.0':81513, '166.0':57140, '102.0':29105, '117.0':25941
card6 : 4
'debit':439938, 'credit':148986, 'debit or credit':30, 'charge card':15
addr1 : 332
'299.0':46335, '325.0':42751, '204.0':42020, '264.0':39870, '330.0':26287
addr2 : 74
'87.0':520481, '6

V71 : 7
'0.0':443821, '1.0':67435, '2.0':1837, '3.0':311, '4.0':34
V72 : 11
'0.0':443611, '1.0':66364, '2.0':2802, '3.0':413, '4.0':129
V73 : 8
'0.0':444786, '1.0':65814, '2.0':2551, '3.0':235, '4.0':43
V74 : 9
'0.0':440878, '1.0':67816, '2.0':4109, '3.0':525, '4.0':86
V75 : 5
'1.0':265082, '0.0':232467, '2.0':3676, '3.0':150, '4.0':1
V76 : 7
'1.0':273293, '0.0':217817, '2.0':9571, '3.0':641, '4.0':43
V77 : 31
'1.0':474434, '2.0':18955, '3.0':3695, '4.0':1623, '5.0':742
V78 : 32
'1.0':457915, '2.0':31364, '3.0':6135, '4.0':2261, '5.0':1125
V79 : 8
'0.0':436802, '1.0':61751, '2.0':2179, '3.0':435, '7.0':109
V80 : 20
'0.0':434732, '1.0':63368, '2.0':2283, '3.0':475, '4.0':245
V81 : 20
'0.0':434653, '1.0':60829, '2.0':4033, '3.0':902, '4.0':509
V82 : 8
'1.0':402612, '0.0':88757, '2.0':9288, '3.0':626, '4.0':76
V83 : 8
'1.0':396619, '0.0':83605, '2.0':18834, '3.0':1816, '4.0':292
V84 : 8
'0.0':435246, '1.0':63848, '2.0':2035, '3.0':191, '4.0':35
V85 : 8
'0.0':435103, '1.0':59565, '2.0':532

V196 : 39
'1.0':136911, '2.0':1415, '3.0':335, '4.0':212, '5.0':118
V197 : 15
'1.0':128419, '0.0':9693, '2.0':1256, '3.0':235, '4.0':102
V198 : 22
'1.0':128276, '0.0':9579, '2.0':1318, '3.0':297, '4.0':121
V199 : 46
'1.0':125279, '2.0':9231, '3.0':2119, '4.0':871, '5.0':462
V200 : 46
'1.0':117750, '0.0':9693, '2.0':8523, '3.0':1723, '4.0':728
V201 : 56
'1.0':116412, '0.0':9579, '2.0':9285, '3.0':1922, '4.0':860
V202 : 10970
'0.0':106161, '100.0':2186, '200.0':1500, '50.0':1391, '150.0':1256
V203 : 14951
'0.0':98207, '100.0':2372, '200.0':1580, '50.0':1553, '150.0':1341
V204 : 12858
'0.0':101660, '100.0':2289, '200.0':1541, '50.0':1507, '150.0':1306
V205 : 2240
'0.0':128800, '100.0':598, '50.0':370, '150.0':218, '25.0':205
V206 : 1780
'0.0':132868, '100.0':288, '50.0':253, '150.0':171, '25.0':120
V207 : 3246
'0.0':123086, '877.0':914, '50.0':552, '100.0':483, '619.0':342
V208 : 2552
'0.0':123863, '75.0':1461, '50.0':716, '100.0':553, '25.0':360
V209 : 3451
'0.0':122714, '1961.0':921, '5

'0.0':354815, '107.9499969482422':6520, '59.0':6245, '117.0':5804, '57.95000076293945':5757
V311 : 3098
'0.0':573290, '59.0':823, '117.0':805, '107.9499969482422':611, '57.95000076293945':585
V312 : 8068
'0.0':466863, '59.0':6301, '117.0':6102, '57.95000076293945':3472, '49.0':3085
V313 : 5529
'0.0':489641, '59.0':6654, '117.0':5319, '57.95000076293945':3608, '107.9499969482422':3472
V314 : 11377
'0.0':473273, '59.0':4262, '117.0':3839, '107.9499969482422':2671, '57.95000076293945':2567
V315 : 6973
'0.0':487073, '59.0':5114, '117.0':4360, '107.9499969482422':3048, '57.95000076293945':3012
V316 : 9814
'0.0':546442, '117.0':1727, '59.0':1504, '100.0':949, '49.0':878
V317 : 15184
'0.0':519413, '117.0':2224, '59.0':1827, '226.0':1045, '100.0':1039
V318 : 12309
'0.0':535074, '117.0':1937, '59.0':1586, '100.0':991, '226.0':873
V319 : 4799
'0.0':562740, '100.0':1040, '1165.0':870, '117.0':778, '200.0':747
V320 : 6439
'0.0':542892, '117.0':1298, '59.0':1256, '100.0':1172, '57.95000076293945':9

In [6]:
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
#test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print(train.shape)
#print(test.shape)

(590540, 434)


In [7]:
del train_identity, train_transaction

In [8]:
col_with_miss_data = train.isnull().any().sum()
print("There are {} columns with missing data".format(col_with_miss_data))

There are 414 columns with missing data


In [9]:
mostly_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
print(mostly_null_cols)
train = train.drop(mostly_null_cols, axis=1)

['dist2', 'D7', 'id_07', 'id_08', 'id_18', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27']


In [10]:
categorical_features = ["ProductCD","card1","card2","card3","card4","card5","card6","addr1","addr2",
                        "P_emaildomain","R_emaildomain","M1","M2","M3","M4","M5","M6","M7","M8","M9",
                        "DeviceType","DeviceInfo",
                        "id_12","id_13","id_14","id_15","id_16","id_17","id_19","id_20","id_28","id_29","id_30","id_31","id_32","id_33",
                       "id_34","id_35","id_36","id_37","id_38"]
numerical_features = [col for col in train.columns if col not in categorical_features]

In [11]:
for feature in categorical_features:
    train.loc[:,feature] = train.loc[:,feature].fillna('NONE')
    train[feature] = train[feature].astype(str)
for feature in numerical_features:
    train.loc[:,feature] = train.loc[:,feature].fillna(train[feature].median())

In [12]:

for ft in categorical_features:
    lbl = LabelEncoder() 
    lbl.fit(list(train[ft].values)) 
    train[ft] = lbl.transform(list(train[ft].values))

In [13]:
y_train = train['isFraud'].copy()
# Drop target, fill in NaNs
X_train = train.drop('isFraud', axis=1)
#X_test = test.copy()
X_train = X_train.fillna(-999)
#X_test = X_test.fillna(-999)

In [14]:

train_num_data = train[numerical_features]
#train_num_data = train_num_data.fillna(train_num_data.median())
skewness = train_num_data.apply(lambda x: stats.skew(x))
skewness = skewness[abs(skewness) > 0.5]
skewed_features = skewness.index
train_num_data[skewed_features] = np.log1p(train_num_data[skewed_features])

for ft in skewed_features:
    X_train[ft] = train_num_data[ft]

In [15]:
X_train_id = X_train['TransactionID']
X_train.drop('TransactionID', axis=1, inplace=True)

In [16]:
X_train=pd.get_dummies(X_train)
X_train.drop('isFraud', axis=1, inplace=True)
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size = 0.2,random_state= 0)

In [17]:
print("X_train : " + str(X_train.shape))
print("X_val : " + str(X_val.shape))
print("y_train : " + str(y_train.shape))
print("y_val : " + str(y_val.shape))

X_train : (472432, 420)
X_val : (118108, 420)
y_train : (472432,)
y_val : (118108,)


In [18]:
col_with_miss_data = X_train.isnull().any().sum()
print("There are {} columns with missing data".format(col_with_miss_data))

There are 13 columns with missing data


In [19]:
X_train = X_train.fillna(-999)

In [20]:

clf = xgb.XGBClassifier(n_estimators=500,
                        n_jobs=4,
                        max_depth=9,
                        learning_rate=0.05,
                        subsample=0.9,
                        colsample_bytree=0.9,
                        missing=-999)

clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.9, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=-999, n_estimators=500, n_jobs=4,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.9, verbosity=1)

In [21]:
y_pred_valid = clf.predict(X_val).reshape(-1,)
score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')

In [22]:
print("XGBoost error on Training set :", rmse_cv_train(clf).mean())
print("XGBoost error on Test set :", rmse_cv_val(clf).mean())

KeyboardInterrupt: 