# Exploratory Visualization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('ggplot')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import Imputer

In [3]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

  from numpy.core.umath_tests import inner1d


In [4]:
%pylab inline
pd.set_option('display.max_columns', None) # Display all columns int dataframe
pd.set_option('display.max_rows', None) # Display all rows int dataframe

Populating the interactive namespace from numpy and matplotlib


In [5]:
train=pd.read_csv('./train.csv')
test=pd.read_csv('./test.csv')
print(train.shape) # Show Row-Count and Column-Count
print(test.shape) # Show Row-Count and Column-Count

(1521787, 23)
(421665, 22)


In [6]:
#feature_train, target = train, train.pop('SalePrice')
feature_train = train.copy()
#feature_train.pop('fraud_ind')
feature = pd.concat([feature_train,test], ignore_index=True)
feature.head(5)

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,6881,113261,38038,513.8,5,0,N,0,N,N,0.0,5,N,0,33,172652.0,457,59333,N,0,102,0,516056
1,0,134508,45725,465.62,5,0,N,2,N,N,0.0,0,N,0,9,105114.0,451,0,N,5817,102,0,4376
2,6881,15408,188328,513.8,5,0,N,0,N,N,0.0,5,N,0,6,152458.0,457,59333,N,0,102,0,483434
3,6716,157159,29967,1016.11,5,62,N,5,N,N,0.0,5,N,0,5,172946.0,247,50436,N,3281,102,0,1407164
4,5975,105985,81305,713.66,5,62,N,4,N,N,0.0,5,N,0,6,182129.0,263,93775,N,5817,102,0,1051004


# Data Cleaning

### Missing Data

In [7]:
feature.isnull().mean() * 100 # Show all columns null value percentage

acqic         0.000000
bacno         0.000000
cano          0.000000
conam         0.000000
contp         0.000000
csmcu         0.000000
ecfg          0.000000
etymd         0.000000
flbmk         0.838508
flg_3dsmk     0.838508
fraud_ind    21.696703
hcefg         0.000000
insfg         0.000000
iterm         0.000000
locdt         0.000000
loctm         0.000000
mcc           0.000000
mchno         0.000000
ovrlt         0.000000
scity         0.000000
stocn         0.000000
stscd         0.000000
txkey         0.000000
dtype: float64

In [8]:
# Group by flbmk values
feature.flbmk.value_counts()

N    1920761
Y       6395
Name: flbmk, dtype: int64

In [9]:
# Group by flg_3dsmk values
feature.flg_3dsmk.value_counts()

N    1842623
Y      84533
Name: flg_3dsmk, dtype: int64

In [10]:
cols1 = ["flbmk" , "flg_3dsmk"]
for col in cols1:
    feature[col].fillna("N", inplace=True)

In [11]:
feature.head(5)

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,6881,113261,38038,513.8,5,0,N,0,N,N,0.0,5,N,0,33,172652.0,457,59333,N,0,102,0,516056
1,0,134508,45725,465.62,5,0,N,2,N,N,0.0,0,N,0,9,105114.0,451,0,N,5817,102,0,4376
2,6881,15408,188328,513.8,5,0,N,0,N,N,0.0,5,N,0,6,152458.0,457,59333,N,0,102,0,483434
3,6716,157159,29967,1016.11,5,62,N,5,N,N,0.0,5,N,0,5,172946.0,247,50436,N,3281,102,0,1407164
4,5975,105985,81305,713.66,5,62,N,4,N,N,0.0,5,N,0,6,182129.0,263,93775,N,5817,102,0,1051004


In [12]:
feature = pd.get_dummies(feature, prefix=['ecfg','flbmk','flg_3dsmk','insfg','ovrlt'], columns=['ecfg','flbmk','flg_3dsmk','insfg','ovrlt'])

In [13]:
feature.head(5)

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,etymd,fraud_ind,hcefg,iterm,locdt,loctm,mcc,mchno,scity,stocn,stscd,txkey,ecfg_N,ecfg_Y,flbmk_N,flbmk_Y,flg_3dsmk_N,flg_3dsmk_Y,insfg_N,insfg_Y,ovrlt_N,ovrlt_Y
0,6881,113261,38038,513.8,5,0,0,0.0,5,0,33,172652.0,457,59333,0,102,0,516056,1,0,1,0,1,0,1,0,1,0
1,0,134508,45725,465.62,5,0,2,0.0,0,0,9,105114.0,451,0,5817,102,0,4376,1,0,1,0,1,0,1,0,1,0
2,6881,15408,188328,513.8,5,0,0,0.0,5,0,6,152458.0,457,59333,0,102,0,483434,1,0,1,0,1,0,1,0,1,0
3,6716,157159,29967,1016.11,5,62,5,0.0,5,0,5,172946.0,247,50436,3281,102,0,1407164,1,0,1,0,1,0,1,0,1,0
4,5975,105985,81305,713.66,5,62,4,0.0,5,0,6,182129.0,263,93775,5817,102,0,1051004,1,0,1,0,1,0,1,0,1,0


In [14]:
feature = feature.drop(['txkey'], axis=1)

In [15]:
#feature['loctm'] = feature['loctm'].astype(str).str.zfill(8)
#feature['loctm'] = feature['loctm'].str[:2]
#feature['loctm'] = feature['loctm'].astype(int)

In [16]:
#feature = pd.get_dummies(feature, prefix=['mcc'], columns=['mcc'])

In [17]:
feature['cano_counts'] = feature['cano'].map(feature['cano'].value_counts())

In [18]:
df_cano_sum = feature.groupby('cano', sort=False)["conam"].sum().reset_index(name ='cano_sum')

In [19]:
feature['cano_sum'] = feature['cano'].map(df_cano_sum['cano'])

In [20]:
#feature[feature.cano == 38038]

In [21]:
#feature['bacno_counts'] = feature['bacno'].map(feature['bacno'].value_counts())

In [22]:
#df_bacno_sum = feature.groupby('bacno', sort=False)["conam"].sum().reset_index(name ='bacno_sum')

In [23]:
#feature['bacno_sum'] = feature['bacno'].map(df_bacno_sum['bacno'])

In [24]:
#feature[feature.cano == 38038]

In [25]:
train_full = feature[feature['fraud_ind'].notnull()]
test_full = feature[feature['fraud_ind'].isnull()]

In [26]:
print(train_full.shape) # Show Row-Count and Column-Count
print(test_full.shape) # Show Row-Count and Column-Count

(1521787, 29)
(421665, 29)


In [27]:
# Group by flg_3dsmk values
train_full.fraud_ind.value_counts()

0.0    1501432
1.0      20355
Name: fraud_ind, dtype: int64

In [28]:
#train_full['fraud_ind'].replace(0.0,'N', inplace=True)
#train_full['fraud_ind'].replace(1.0,'Y', inplace=True)

In [29]:
# Group by flg_3dsmk values
train_full.fraud_ind.value_counts()

0.0    1501432
1.0      20355
Name: fraud_ind, dtype: int64

In [30]:
y = train_full.pop('fraud_ind')

In [31]:
y.value_counts()

0.0    1501432
1.0      20355
Name: fraud_ind, dtype: int64

In [32]:
X = train_full

In [33]:
train, valid, trainY, validY = train_test_split(X, y, test_size=0.2, stratify=y,random_state=42)

In [34]:
print(X.shape)
print(y.shape)
print(train.shape)
print(trainY.shape)
print(valid.shape)
print(validY.shape)

(1521787, 28)
(1521787,)
(1217429, 28)
(1217429,)
(304358, 28)
(304358,)


In [35]:
train.head(5)

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,etymd,hcefg,iterm,locdt,loctm,mcc,mchno,scity,stocn,stscd,ecfg_N,ecfg_Y,flbmk_N,flbmk_Y,flg_3dsmk_N,flg_3dsmk_Y,insfg_N,insfg_Y,ovrlt_N,ovrlt_Y,cano_counts,cano_sum
915719,6769,82976,127034,488.26,5,62,5,5,0,63,191826.0,251,77991,5817,102,0,1,0,1,0,1,0,1,0,1,0,27,140351.0
303418,6231,92797,80048,1004.72,5,62,4,5,0,51,85825.0,191,16615,5817,102,0,1,0,1,0,1,0,1,0,1,0,19,212805.0
609193,0,99341,55072,465.62,5,0,2,0,0,53,100943.0,451,0,5817,102,0,1,0,1,0,1,0,1,0,1,0,9,130816.0
1319797,6032,13745,198031,711.7,5,62,2,5,0,32,205803.0,247,78537,5817,102,0,0,1,1,0,1,0,1,0,1,0,26,153372.0
1337767,5720,81642,149070,0.0,5,59,8,5,0,21,42824.0,192,67459,6221,75,0,0,1,1,0,1,0,1,0,1,0,9,51019.0


In [36]:
trainY.head(5)

915719     0.0
303418     0.0
609193     0.0
1319797    0.0
1337767    0.0
Name: fraud_ind, dtype: float64

In [37]:
valid.head(5)

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,etymd,hcefg,iterm,locdt,loctm,mcc,mchno,scity,stocn,stscd,ecfg_N,ecfg_Y,flbmk_N,flbmk_Y,flg_3dsmk_N,flg_3dsmk_Y,insfg_N,insfg_Y,ovrlt_N,ovrlt_Y,cano_counts,cano_sum
55692,5975,138896,34065,775.12,5,62,4,5,0,57,151440.0,263,89735,5817,102,0,1,0,1,0,1,0,1,0,1,0,10,138608.0
206365,6716,25899,197832,776.36,5,62,5,5,0,57,115201.0,257,40120,3590,102,0,1,0,1,0,1,0,1,0,1,0,18,163241.0
1197475,6189,7817,130254,655.51,5,62,4,5,0,38,132647.0,292,91366,5817,102,0,1,0,1,0,1,0,1,0,1,0,6,107596.0
253230,6292,97100,2122,561.61,5,62,2,5,0,32,140314.0,209,38207,5817,102,0,0,1,1,0,1,0,1,0,1,0,41,17791.0
1068720,6779,142888,200913,759.87,5,62,4,5,0,30,195857.0,294,8795,5812,102,0,1,0,1,0,1,0,1,0,1,0,36,35540.0


In [38]:
validY.head(5)

55692      0.0
206365     0.0
1197475    0.0
253230     0.0
1068720    0.0
Name: fraud_ind, dtype: float64

In [39]:
testY = pd.read_csv("./submission_test.csv")
testY.drop(['txkey'], axis=1, inplace=True)

In [40]:
test = test_full
test.drop(['fraud_ind'], axis=1, inplace=True)

In [41]:
test.head(5)

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,etymd,hcefg,iterm,locdt,loctm,mcc,mchno,scity,stocn,stscd,ecfg_N,ecfg_Y,flbmk_N,flbmk_Y,flg_3dsmk_N,flg_3dsmk_Y,insfg_N,insfg_Y,ovrlt_N,ovrlt_Y,cano_counts,cano_sum
1521787,6881,163188,116799,513.8,5,0,0,5,0,102,215328.0,457,59360,0,102,0,1,0,1,0,1,0,1,0,1,0,11,9028.0
1521788,6881,163188,116799,513.8,5,0,0,5,0,102,222007.0,457,59360,0,102,0,1,0,1,0,1,0,1,0,1,0,11,9028.0
1521789,6881,163188,116799,513.8,5,0,0,5,0,100,170013.0,457,59360,0,102,0,1,0,1,0,1,0,1,0,1,0,11,9028.0
1521790,6881,163188,116799,513.8,5,0,0,5,0,100,165914.0,457,59360,0,102,0,1,0,1,0,1,0,1,0,1,0,11,9028.0
1521791,6881,163188,116799,513.8,5,0,0,5,0,102,215311.0,457,59360,0,102,0,1,0,1,0,1,0,1,0,1,0,11,9028.0


In [42]:
testY.head(5)

Unnamed: 0,fraud_ind
0,0
1,0
2,0
3,0
4,0


In [43]:
import xgboost as xgb

dtrain = xgb.DMatrix(train, label=trainY)
dvalid = xgb.DMatrix(valid, label=validY)
#dtest = xgb.DMatrix(test, label=testY)

## fixed parameters
scale_pos_weight = sum(trainY==0)/sum(trainY==1)  
num_rounds=100 # number of boosting iterations

param = {'silent':1,
         'min_child_weight':1, ## unbalanced dataset
         'objective':'binary:logistic',
         'eval_metric':'auc', 
         'scale_pos_weight':scale_pos_weight}

def do_train(param, train,train_s,trainY,valid,valid_s,validY):
    ## train with given fixed and variable parameters
    ## and report performance on validation dataset
    evallist  = [(train,train_s), (valid,valid_s)]
    model = xgb.train( param, train, num_boost_round=num_rounds, 
                      evals=evallist )    
    preds = model.predict(valid)
    labels = valid.get_label()
      
    act_pos=sum(validY==1)
    act_neg=valid.num_row()-act_pos
    true_pos=sum(1 for i in range(len(preds)) if (preds[i]>=0.5) & (labels[i]==1))
    false_pos=sum(1 for i in range(len(preds)) if (preds[i]>=0.5) & (labels[i]==0))
    false_neg=act_pos-true_pos
    true_neg=act_neg-false_pos
    
    ## precision: tp/(tp+fp) percentage of correctly classified predicted positives
    ## recall: tp/(tp+fn) percentage of positives correctly classified
    ## F-score with beta=1
    ## see Sokolova et al., 2006 "Beyond Accuracy, F-score and ROC:
    ## a Family of Discriminant Measures for Performance Evaluation"
    ## fscore <- 2*precision.neg*recall.neg/(precision.neg+recall.neg)
    
    precision = true_pos/(true_pos+false_pos)
    recall = true_pos/(true_pos+false_neg)
    f_score = 2*precision*recall/(precision+recall)  
    
    print('\nconfusion matrix')
    print('----------------')
    print( 'tn:{:6d} fp:{:6d}'.format(true_neg,false_pos))
    print( 'fn:{:6d} tp:{:6d}'.format(false_neg,true_pos))
    return(f_score)    

In [None]:
from collections import OrderedDict
from xgboost import XGBClassifier

#best_model =  XGBClassifier()
best_model = xgb

## parameters to be tuned
tune_dic = OrderedDict()

#tune_dic['max_depth']= np.array([20,25,30]) ## maximum tree 
tune_dic['max_depth']= np.array([40,50,60]) ## maximum tree depth
tune_dic['colsample_bytree']= np.linspace(0.5,1.0,6) ## subsample ratio of columns
tune_dic['eta']= np.linspace(0.3,0.6,4) ## learning rate

best_params = dict()
best_f_score = -1

import itertools
var_params = [ i for i in itertools.product(*tune_dic.values())]
search=np.random.choice(np.arange(len(var_params)),60,replace=False)

columns=[*tune_dic.keys()]+['F Score']

results = pd.DataFrame(index=range(len(search)), columns=columns) ## to check results

for i in range(len(search)): ## len(search)
    
    for (key,val) in zip(tune_dic.keys(),var_params[search[i]]):
        param[key]=val

    print()    
    #f_score = do_train(param, dtrain,'train',trainY,dvalid,'valid',validY)
    #do_train(param, train,train_s,trainY,valid,valid_s,validY):
    
    evallist  = [(dtrain,'train'), (dvalid,'valid')]
    
    model = xgb.train( param, dtrain, num_boost_round=num_rounds, 
                      evals=evallist )    
    preds = model.predict(dvalid)
    labels = dvalid.get_label()
      
    act_pos=sum(validY==1)
    act_neg=dvalid.num_row()-act_pos
    true_pos=sum(1 for i in range(len(preds)) if (preds[i]>=0.5) & (labels[i]==1))
    false_pos=sum(1 for i in range(len(preds)) if (preds[i]>=0.5) & (labels[i]==0))
    false_neg=act_pos-true_pos
    true_neg=act_neg-false_pos
    
    ## precision: tp/(tp+fp) percentage of correctly classified predicted positives
    ## recall: tp/(tp+fn) percentage of positives correctly classified
    ## F-score with beta=1
    ## see Sokolova et al., 2006 "Beyond Accuracy, F-score and ROC:
    ## a Family of Discriminant Measures for Performance Evaluation"
    ## fscore <- 2*precision.neg*recall.neg/(precision.neg+recall.neg)
    
    precision = true_pos/(true_pos+false_pos)
    recall = true_pos/(true_pos+false_neg)
    f_score = 2*precision*recall/(precision+recall)  
    
    print('\nconfusion matrix')
    print('----------------')
    print( 'tn:{:6d} fp:{:6d}'.format(true_neg,false_pos))
    print( 'fn:{:6d} tp:{:6d}'.format(false_neg,true_pos))
    #print(preds)
    
    
    results.loc[i,[*tune_dic.keys()]]=var_params[search[i]]
    results.loc[i,'F Score']=f_score
    
    if f_score > best_f_score:
        best_model = model
        best_model.save_model('./saved_models/xgb_best_f_score.model')
        best_f_score = f_score
        #print(type(model))
        #print(type(best_model))
        print('\n*** better f-score',f_score)
        for (key,val) in zip(tune_dic.keys(),var_params[search[i]]):
            best_params[key]=val        
            print(key,': ',val,' ',end='')
        print()    


[0]	train-auc:0.996191	valid-auc:0.977583
[1]	train-auc:0.998752	valid-auc:0.97813
[2]	train-auc:0.999196	valid-auc:0.981353
[3]	train-auc:0.999397	valid-auc:0.981818
[4]	train-auc:0.999497	valid-auc:0.981899
[5]	train-auc:0.999611	valid-auc:0.984696
[6]	train-auc:0.999719	valid-auc:0.985712
[7]	train-auc:0.999783	valid-auc:0.987129
[8]	train-auc:0.999839	valid-auc:0.988199
[9]	train-auc:0.999904	valid-auc:0.98869
[10]	train-auc:0.999928	valid-auc:0.989259
[11]	train-auc:0.999971	valid-auc:0.989622
[12]	train-auc:0.999976	valid-auc:0.989982
[13]	train-auc:0.999983	valid-auc:0.990673
[14]	train-auc:0.999989	valid-auc:0.990991
[15]	train-auc:0.999991	valid-auc:0.991799
[16]	train-auc:0.999994	valid-auc:0.992594
[17]	train-auc:0.999996	valid-auc:0.992722
[18]	train-auc:0.999996	valid-auc:0.992893
[19]	train-auc:0.999997	valid-auc:0.993048
[20]	train-auc:0.999999	valid-auc:0.993201
[21]	train-auc:0.999999	valid-auc:0.9933
[22]	train-auc:0.999999	valid-auc:0.993618
[23]	train-auc:1	valid-a

In [None]:
best_model = xgb.Booster(model_file='./saved_models/xgb_best_f_score.model')

In [None]:
dtest = xgb.DMatrix(test, label=testY)

In [None]:
preds = best_model.predict(dtest)

In [None]:
type(preds)

In [None]:
len(preds)

In [None]:
preds

In [None]:
for i in range(421665):
    if preds[i] > 0.2:
        preds[i] = 1
    else:
        preds[i] = 0

In [None]:
from datetime import datetime
now = datetime.now().strftime('%m%d_%H%M%S')

In [None]:
submission = pd.read_csv("./submission_test.csv")
submission.iloc[:,1] = (preds)
submission.to_csv("submission_{}.csv".format(now), index=False)