# Kaggle challenge - IEEE-CIS Fraud Detection

Based on [this notebook](https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt).

### import required libraries

In [15]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode


# Initialise plotly's offline notebook mode before any figure is drawn
# (presumably required for `iplot` to render inline — confirm against plotly docs).
init_notebook_mode(connected=True)

from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc
from pprint import pprint


# Show which files are present in the local `data` directory.
print('data:')
available_files = os.listdir('data')
pprint(available_files)

data:
['sample_submission.csv',
 'train_transaction.csv',
 'test_identity.csv',
 'train_identity.csv',
 'test_transaction.csv']


### import train datasets

In [16]:
# Read the two training CSV files into data frames.
identity_path = 'data/train_identity.csv'
transaction_path = 'data/train_transaction.csv'

id_df = pd.read_csv(identity_path)
transaction_df = pd.read_csv(transaction_path)

### define utility functions

In [None]:
def resume_table(data_frame):
    """
    Print the shape of a data frame and build a per-column summary of it.

    The summary holds one row per column of `data_frame` with these columns:
    'Name', 'dtypes', 'Missing' (count of null values), 'Uniques' (count of
    distinct non-null values), 'First Value' / 'Second Value' / 'Third Value'
    (the values in the first three rows, by position) and 'Entropy' (Shannon
    entropy in bits of the column's value distribution, rounded to two
    decimals).

    data_frame (pd.DataFrame): the data frame to get a summary of

    returns
    summary (pd.DataFrame): the summary of the data frame passed in
    """
    print(f'Dataset Shape: {data_frame.shape}')

    # One row per column: the column label under 'Name', its dtype under 'dtypes'.
    summary = (
        data_frame.dtypes
        .rename('dtypes')
        .rename_axis('Name')
        .reset_index()
    )

    summary['Missing'] = data_frame.isnull().sum().values
    summary['Uniques'] = data_frame.nunique().values

    # Sample rows are taken by position, not by label, so the summary also
    # works when the index is not 0..n-1 (e.g. after filtering). Frames with
    # fewer than three rows get NaN for the rows that do not exist.
    row_count = len(data_frame)
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        if position < row_count:
            summary[label] = data_frame.iloc[position].values
        else:
            summary[label] = np.nan

    # value_counts(normalize=True) yields the empirical probabilities that
    # stats.entropy expects; base=2 expresses the result in bits.
    summary['Entropy'] = [
        round(
            stats.entropy(data_frame[name].value_counts(normalize=True), base=2),
            2
        )
        for name in summary['Name']
    ]

    return summary


def reduce_mem_usage(data_frame, verbose=True):
    numeric_types = [
        'int16',
        'int32',
        'int64',
        'float16',
        'float32',
        'float64',
    ]

    start_mem = data_frame.memory_usage().sum() / 1024**2

    for column in data_frame.columns:
        column_type = data_frame[column].dtypes

        if column_type in numeric_types:
            column_min = data_frame[column].min()
            column_max = data_frame[column].max()

            if str(column_type)[:3] == 'int':
                if column_min > np.iinfo(np.int8).min and column_max < np.iinfo(np.int8).max:
                    data_frame[column] = data_frame[column].astype(np.int8)

                elif column_min > np.iinfo(np.int16).min and column_max < np.iinfo(np.int16).max:
                    data_frame[column] = data_frame[column].astype(np.int16)

                elif column_min > np.iinfo(np.int32).min and column_max < np.iinfo(np.int32).max:
                    data_frame[column] = data_frame[column].astype(np.int32)

                elif column_min > np.iinfo(np.int64).min and column_max < np.iinfo(np.int64).max:
                    data_frame[column] = data_frame[column].astype(np.int64)

            else:  # column data type is float
                if column_min > np.finfo(np.float16) and column_max < np.finfo(np.float16).max:
                    data_frame[column] = data_frame[column].astype(np.float16)

                