# IEEE-CIS Fraud Detection
## EDA 부터 XGB hyperopt로 모델링까지
**[참고한 커널](https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt)**

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

#standard plotly imports
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode

#import cufflinks
import plotly.figure_factory as ff

#using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)

#preprocessing, modeling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
import xgboost as xgb

#hyperparameter optimization modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc
print(os.listdir("../input"))

['sample_submission.csv', 'test_identity.csv', 'test_transaction.csv', 'train_identity.csv', 'train_transaction.csv']


In [2]:
%%time
# Load the training identity and transaction tables from ../input.
df_id = pd.read_csv('../input/train_identity.csv')
df_trans = pd.read_csv('../input/train_transaction.csv')

Wall time: 39.5 s


In [18]:
def resumetable(df):
    """Print the shape of ``df`` and return a per-column summary table.

    The returned DataFrame has one row per column of ``df`` with:

    * ``Name`` / ``dtypes`` -- column name and dtype,
    * ``Missing`` -- number of null entries,
    * ``Uniques`` -- number of distinct non-null values,
    * ``First Value`` / ``Second Value`` / ``Thrid Value`` -- the values in
      the first three rows, taken by position (the misspelled key is kept
      because callers may already index the result by it),
    * ``Entropy`` -- Shannon entropy, in bits (``base=2``), of the column's
      normalized value counts, rounded to 2 decimals. 0 means a single
      repeated value; larger means the values are spread more evenly.

    Raises:
        IndexError: if ``df`` has fewer than three rows.
    """
    print(f"Dataset Shape: {df.shape}")

    # Build the frame directly instead of reset_index() + summary['index'],
    # which breaks as soon as the columns Index carries a name.
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
    })
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Positional access: label-based df.loc[0] fails (or silently picks other
    # rows) on frames whose index is not 0, 1, 2, ... e.g. after filtering.
    summary['First Value'] = df.iloc[0].values
    summary['Second Value'] = df.iloc[1].values
    summary['Thrid Value'] = df.iloc[2].values

    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]
    return summary

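# Function to reduce the DF size -- superseded first draft, kept commented out.
# NOTE: this draft is buggy (the int32/int64 branches cast to int16, and
# np.finfo(np.float16.max) / np.finfo(np.float32.max) have misplaced
# parentheses); use the active reduce_mem_usage defined below instead.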
# def reduce_mem_usage(df, verbose=True):
#     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#     start_mem = df.memory_usage().sum() / 1024**2
#     for col in df.columns:
#         col_type = df[col].dtypes
#         if col_type in numerics: # numerics 중 해당하는 값 있다면?
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int16)
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16.max):
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32.max):
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)
    
#     end_mem = df.memory_usage().sum() / 1024**2
#     if verbose:
#         print('Mem. usage decreased to {:5.2f}Mb to {:.1f} reduction'.format(
#         end_mem, 100 * (start_mem - end_mem) / start_mem))
    
#     return df

## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    For every column whose dtype is listed in ``numerics`` the column's
    min/max are compared against the representable range of progressively
    wider dtypes, and the column is cast to the first one that contains both.
    Columns of any other dtype (e.g. object, int8) are left untouched.

    Bounds are inclusive, so a column whose extreme value sits exactly on a
    dtype limit (e.g. max == 127) still gets the narrower dtype.

    NOTE: float columns are downcast to float16 whenever their range fits.
    float16 keeps only about three significant decimal digits, so values may
    be rounded; this is kept for backward compatibility.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: when true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Narrowest integer type whose range contains [c_min, c_max];
            # if none matches (e.g. an empty column) the column is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when the range fits neither float16 nor
            # float32, or when min/max are NaN/inf (the comparisons fail).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))

    return df

def CalcOutliers(df_num):
    """Print counts of values lying more than 3 standard deviations from the mean.

    The mean and (population) standard deviation are computed with
    ``np.mean`` / ``np.std`` on ``df_num``. Values strictly below
    ``mean - 3*std`` are lower outliers, values strictly above
    ``mean + 3*std`` are upper outliers, and values strictly between the two
    limits are non-outliers. Values exactly on a limit, and NaNs, fall into
    none of these groups.

    The final line reports the outliers as a percentage of the non-missing
    observations, with four decimals.

    Args:
        df_num: 1-D numeric data (array, list or Series).

    Returns:
        None. The results are only printed.
    """
    # Mean and standard deviation of the input
    data_mean, data_std = np.mean(df_num), np.std(df_num)

    # Half-width of the accepted band, then its lower/upper limits
    cut = data_std * 3
    lower, upper = data_mean - cut, data_mean + cut

    # Vectorised masks instead of four Python-level passes over the data.
    values = np.asarray(df_num, dtype=float)
    n_lower = int(np.count_nonzero(values < lower))
    n_upper = int(np.count_nonzero(values > upper))
    n_outliers = n_lower + n_upper
    n_inside = int(np.count_nonzero((values > lower) & (values < upper)))

    # Share of outliers among the observed (non-NaN) values; guarded so an
    # empty or all-NaN input does not divide by zero.
    n_valid = int(np.count_nonzero(~np.isnan(values)))
    pct_outliers = 100.0 * n_outliers / n_valid if n_valid else 0.0

    print('Identified lower outliers: %d' % n_lower)
    print('Identified upper outliers: %d' % n_upper)
    print('Total outlier observations: %d' % n_outliers)
    print('Non-outlier observations: %d' % n_inside)
    print('Total percentual of Outliers: %.4f' % pct_outliers)

    return

In [19]:
# Reduce memory: downcast the numeric columns of both tables to smaller dtypes
df_trans = reduce_mem_usage(df_trans)
df_id = reduce_mem_usage(df_id)

Mem. usage decreased to 542.35 Mb (69.3% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)


In [20]:
resumetable(df_trans)[:25] # show only the first 25 summary rows (i.e. the first 25 columns of df_trans)

Dataset Shape: (590540, 394)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Thrid Value,Entropy
0,TransactionID,int32,0,65536,-27656,-27655,-27654,16.0
1,isFraud,int8,0,2,0,0,0,0.22
2,TransactionDT,int32,0,65525,20864,20865,20933,15.92
3,TransactionAmt,float16,0,8195,68.5,29,59,8.1
4,ProductCD,object,0,5,W,W,W,1.28
5,card1,int16,0,13553,13926,2755,4663,9.97
6,card2,float16,8933,500,,404,490,6.32
7,card3,float16,1565,114,150,150,150,0.68
8,card4,object,1577,4,discover,mastercard,visa,1.09
9,card5,float16,4259,119,142,102,166,2.66
