In [None]:
from IPython.display import HTML, display
import os
import datetime
import numpy as np
import math
import pandas as pd
from scipy import stats
from scipy.stats import ttest_ind, chisquare
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from math import cos, asin, sqrt
from time import time
# Activate seaborn's plot styling for every matplotlib figure in this notebook.
sns.set(color_codes=True)
from datetime import datetime as dt

# pandas display options for notebook output (the commented-out lines are
# alternatives that are currently disabled).
# pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 300)  # row limit before pandas truncates a printed frame
# pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_seq_items', 999)  # item limit when printing long sequences
import plotly.graph_objects as go

# Get data

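1. Credit Card Fraud Detection dataset (Kaggle): https://www.kaggle.com/mlg-ulb/creditcardfraud — download it and save it as `creditcard.csv` in the notebook's working directory; the next cell reads it from there.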

In [None]:
# Read the raw CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Remove the 'Time' column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns='Time', errors='ignore')

In [None]:
def describe_dataset(df, target_column):
    """Print a short textual summary of ``df``.

    The summary contains, in order: the frame's shape, the name of the target
    column, the relative frequency of every value in the target column
    (missing values included) and the dtype of every column.

    Args:
        df: pandas DataFrame to describe.
        target_column: Name of the label column in ``df``.

    Returns:
        None. Everything is written to stdout.
    """
    n_rows_cols = df.shape
    print(f'Dataset shape: {n_rows_cols}')
    print(f'Target column: {target_column}')

    print('Distribution of classes:')
    class_shares = df[target_column].value_counts(normalize=True, dropna=False)
    print(class_shares)

    print('Columns types:')
    column_types = df.dtypes
    print(column_types)

In [None]:
# Summarise the full dataset: shape, class balance and column dtypes.
describe_dataset(df, target_column='Class')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation.
# stratify keeps the class ratio identical in both parts, and the fixed
# random_state makes the split (and every metric computed from it)
# reproducible under Restart & Run All.
df_train, df_test = train_test_split(df, train_size=0.8, stratify=df['Class'], random_state=42)

In [None]:
# Number of rows per class in the held-out test set.
df_test.loc[:, 'Class'].value_counts()

In [None]:
def prepare_random_undersampling(df, target_column, random_state=14):
    """Balance a binary (0/1) dataset by randomly dropping majority-class rows.

    The larger class is sampled without replacement down to the size of the
    smaller class; the smaller class is kept untouched. Class sizes are
    measured per label rather than taken from the order of ``value_counts()``,
    so the result is balanced regardless of which label is the majority.

    Args:
        df: pandas DataFrame containing ``target_column``.
        target_column: Name of the label column; rows labelled 0 and 1 are used.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        DataFrame with class-0 rows followed by class-1 rows, both classes
        having the same number of rows.

    Raises:
        ValueError: If ``df`` has no rows for class 0 or no rows for class 1.
    """
    # Divide by class
    df_class_0 = df[df[target_column] == 0]
    df_class_1 = df[df[target_column] == 1]

    # Class count
    count_class_0, count_class_1 = len(df_class_0), len(df_class_1)
    if count_class_0 == 0 or count_class_1 == 0:
        raise ValueError(
            f'Both classes 0 and 1 must be present in {target_column!r} '
            f'(found {count_class_0} and {count_class_1} rows).'
        )

    # Shrink whichever class is larger; on a tie class 0 is just shuffled.
    if count_class_0 >= count_class_1:
        df_class_0 = df_class_0.sample(count_class_1, random_state=random_state)
    else:
        df_class_1 = df_class_1.sample(count_class_0, random_state=random_state)
    df_under = pd.concat([df_class_0, df_class_1], axis=0)

    print('Random under-sampling:')
    print(df_under[target_column].value_counts())

    return df_under

In [None]:
def prepare_random_oversampling(df, target_column, random_state=15):
    """Balance a binary (0/1) dataset by randomly duplicating minority-class rows.

    The smaller class is sampled with replacement up to the size of the larger
    class; the larger class is kept untouched. Class sizes are measured per
    label rather than taken from the order of ``value_counts()``, so the
    result is balanced regardless of which label is the minority.

    Args:
        df: pandas DataFrame containing ``target_column``.
        target_column: Name of the label column; rows labelled 0 and 1 are used.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        DataFrame with class-0 rows followed by class-1 rows, both classes
        having the same number of rows.

    Raises:
        ValueError: If ``df`` has no rows for class 0 or no rows for class 1.
    """
    # Divide by class
    df_class_0 = df[df[target_column] == 0]
    df_class_1 = df[df[target_column] == 1]

    # Class count
    count_class_0, count_class_1 = len(df_class_0), len(df_class_1)
    if count_class_0 == 0 or count_class_1 == 0:
        raise ValueError(
            f'Both classes 0 and 1 must be present in {target_column!r} '
            f'(found {count_class_0} and {count_class_1} rows).'
        )

    # Grow whichever class is smaller by drawing rows with replacement.
    if count_class_1 <= count_class_0:
        df_class_1 = df_class_1.sample(count_class_0, random_state=random_state, replace=True)
    else:
        df_class_0 = df_class_0.sample(count_class_1, random_state=random_state, replace=True)
    df_over = pd.concat([df_class_0, df_class_1], axis=0)

    print('Random over-sampling:')
    print(df_over[target_column].value_counts())

    return df_over

In [None]:
# Name of the label column used throughout the notebook.
target_column = 'Class'

# LightGBM training parameters shared by every experiment below.
params = dict(
    boosting_type='gbdt',
    max_depth=6,
    num_leaves=30,
    min_data_in_leaf=50,
    learning_rate=0.05,
    n_estimators=1500,
    objective='binary',
    random_state=5,
    metric='auc',
)

In [None]:
import lightgbm as lgb
# Rebinds `time` to the module (the first cell imports the `time` function
# under the same name); `time.time()` below relies on the module binding.
import time

def train_lightgbm(df, target_column, params):
    """Train a LightGBM booster on ``df`` with early stopping.

    ``df`` is split 80/20 at random; the 20% part is used only as the
    validation set that drives early stopping (patience of 50 rounds) and the
    periodic metric log (every 50 rounds).

    Args:
        df: pandas DataFrame holding the features and ``target_column``.
        target_column: Name of the label column in ``df``.
        params: LightGBM parameter dict passed to ``lgb.train``.

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    features = df.drop(target_column, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(features, df[target_column], train_size=0.8)
    lgbtrain = lgb.Dataset(X_train, label=y_train)
    lgbtest = lgb.Dataset(X_test, label=y_test)

    # LightGBM 4.0 removed the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments of `lgb.train`; the callback API is the supported equivalent.
    # `log_evaluation` only exists in newer releases, so fall back to its
    # predecessor `print_evaluation` when it is missing.
    log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [lgb.early_stopping(stopping_rounds=50), log_callback(period=50)]

    t0 = time.time()
    model = lgb.train(params, lgbtrain, valid_sets=[lgbtest], callbacks=callbacks)
    print(f'Model training time: {round((time.time() - t0) / 60, 2)} min.')

    return model

In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix

# Default probability cut-off used to turn scores into class predictions.
threshold = 0.5


def _safe_rate(numerator, denominator):
    """Return ``numerator / denominator`` rounded to 6 places, or NaN when the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return round(numerator / denominator, 6)


def get_metrics(df_test, target_column, model_lgb, decision_threshold=None):
    """Score ``model_lgb`` on ``df_test`` and print/return the main metrics.

    Prints the ROC AUC, displays the confusion matrix and prints recall,
    false-positive rate and precision computed at the decision threshold.

    Args:
        df_test: pandas DataFrame holding the features and ``target_column``.
        target_column: Name of the binary label column in ``df_test``.
        model_lgb: Fitted model whose ``predict`` returns one score per row.
        decision_threshold: Score above which a row is predicted positive.
            When None (default) the module-level ``threshold`` is used.

    Returns:
        Tuple ``(auc, fpr_fallout, tpr_recall, ppv_precision)``. A rate whose
        denominator is zero (e.g. precision when nothing is predicted
        positive) is returned as NaN.
    """
    cutoff = threshold if decision_threshold is None else decision_threshold

    true = df_test[target_column]
    predictions = model_lgb.predict(df_test.drop(target_column, axis=1))
    auc = roc_auc_score(true, predictions, average='weighted')
    print(f'AUC: {auc}')

    cm = confusion_matrix(true, predictions > cutoff)
    print('Confusion matrix:')
    # Rows of `cm` are true classes and columns are predictions; the frame is
    # transposed for display so predictions end up on the rows.
    display(pd.DataFrame(cm, columns=['pred_neg', 'pred_pos'],
                            index=['neg', 'pos']).T)
    tn, fp, fn, tp = cm.ravel()

    # Explicit zero-denominator handling avoids NumPy's divide RuntimeWarning.
    fpr_fallout = _safe_rate(fp, fp + tn)
    tpr_recall = _safe_rate(tp, tp + fn)
    ppv_precision = _safe_rate(tp, tp + fp)
    print(f'True positive rate (recall): {tpr_recall}')
    print(f'False positive rate: {fpr_fallout}')
    print(f'Precision: {ppv_precision}')
    return (auc, fpr_fallout, tpr_recall, ppv_precision)

In [None]:
# Under-sampling experiment: balance the training rows, fit on the balanced
# frame (previously `df_u` was built but the model was fitted on `df_train`),
# then evaluate on the untouched test set.
df_u = prepare_random_undersampling(df_train, 'Class')
model_lgb = train_lightgbm(df_u, 'Class', params)
auc, fpr_fallout, tpr_recall, ppv_precision = get_metrics(df_test, 'Class', model_lgb)

In [None]:
# feature_imp = pd.Series(model.feature_importance(), model.feature_name()).sort_values(ascending=False)

In [None]:
# Over-sampling experiment: duplicate minority rows of the TRAINING split only
# (over-sampling the full `df` would copy test rows into the training data),
# fit on the balanced frame, then evaluate on the untouched test set.
# NOTE: train_lightgbm splits its input again for early stopping, so duplicated
# rows can land on both sides of that internal validation split.
df_v = prepare_random_oversampling(df_train, target_column, random_state=15)
model_lgb = train_lightgbm(df_v, 'Class', params)
auc, fpr_fallout, tpr_recall, ppv_precision = get_metrics(df_test, 'Class', model_lgb)