In [142]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [143]:
# read in the data
df = pd.read_csv('cleaned_data_v2.csv')

In [144]:
# check the data
df.head()

Unnamed: 0,FRAUD_FLAG,AVAIL_CRDT,AMOUNT,CREDIT_LIMIT,CARD_NOT_PRESENT,FLAG_LX,FLAG_AUTO,FLAG_CASH,FLAG_LS,FLAG_DISCOUNT,...,"as.numeric(grepl(""iPhone"", raw_data$USER_AGENT))","as.numeric(grepl(""ipad"", raw_data$USER_AGENT))","as.numeric(grepl(""Macintosh"", raw_data$USER_AGENT))","as.numeric(grepl(""Android"", raw_data$USER_AGENT))","as.numeric(grepl(""Windows"", raw_data$USER_AGENT))","as.numeric(grepl(""Linux"", raw_data$USER_AGENT))","as.numeric(grepl(""Chrome"", raw_data$USER_AGENT))","as.numeric(grepl(""KHTML"", raw_data$USER_AGENT))",raw_data.EVENT_DAY_OF_WEEK,raw_data.EVENT_MONTH
0,1,537.1,11.7,29200,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,4,2
1,0,20371.88,96.35,30700,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,4,2
2,0,15628.17,193.72,19500,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,2
3,0,12913.98,47.15,18400,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,2
4,0,26779.35,121.88,29200,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,4,2


In [145]:
# check missing data
df.isnull().sum()

FRAUD_FLAG                                          0
AVAIL_CRDT                                          0
AMOUNT                                              0
CREDIT_LIMIT                                        0
CARD_NOT_PRESENT                                    0
                                                   ..
as.numeric(grepl("Linux", raw_data$USER_AGENT))     0
as.numeric(grepl("Chrome", raw_data$USER_AGENT))    0
as.numeric(grepl("KHTML", raw_data$USER_AGENT))     0
raw_data.EVENT_DAY_OF_WEEK                          0
raw_data.EVENT_MONTH                                0
Length: 661, dtype: int64

In [146]:
# check total missing data
df.isnull().sum().sum()

0

In [147]:
# reformat the data for training
df.rename(columns={'FRAUD_FLAG':'Target'}, inplace=True)

In [148]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.Target
y_test = df_test.Target
y_df = df.Target

del df_train['Target']
del df_test['Target']
del df['Target']

In [81]:
df_train = df_train.to_numpy()
y_train = y_train.to_numpy()
df_test = df_test.to_numpy()
y_test = y_test.to_numpy()

In [59]:
# import kfold and logistic loss
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

# define a function to perform k-fold cross validation
def k_fold_cross_validation(k, model, train_set, target):

    # create a KFold object
    kf = KFold(n_splits = k)

    # lists to store the errors for each fold
    errors = []

    X = train_set
    y = target
    
    # iterate over the folds
    for train_index, test_index in kf.split(X):
        # get the training and testing data for the current fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit the model on the training data
        model.fit(X_train, y_train)

        # make predictions on the testing data
        y_pred = model.predict(X_test)

        # calculate the logistic error for the current fold
        error = log_loss(y_test, y_pred)

        # append the error to the list of errors
        errors.append(error)

    # calculate the mean and variance of the errors
    mean_error = np.mean(errors)
    error_variance = np.var(errors)

    return error_variance, mean_error

In [60]:
# import f1 score, log loss and accuracy score
from sklearn.metrics import f1_score, log_loss, accuracy_score

def print_scores(name, model):
    # calculate the error variance and mean error
    error_variance, mean_error = k_fold_cross_validation(100, model, df_train, y_train)
    
    # print the error variance and mean error
    print(name, 'Error Variance: ', error_variance)
    print(name, 'Mean Error: ', mean_error)

    y_pred = model.predict(df_test)

    # print f1 score, log loss, and accuracy score
    print(name, 'F1 Score: ', f1_score(y_test, y_pred))
    print(name, 'Log Loss: ', log_loss(y_test, y_pred))
    print(name, 'Accuracy Score: ', accuracy_score(y_test, y_pred))

In [61]:
# import libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss

# Create a decision tree classifier
clf = DecisionTreeClassifier()
# Fit the classifier on the original data
clf.fit(df_train, y_train)

# print the scores
print_scores("Plane Decision Tree", clf)


Plane Decision Tree Error Variance:  0.04787429374168004
Plane Decision Tree Mean Error:  1.0407648875271327
Plane Decision Tree F1 Score:  0.4189044038668099
Plane Decision Tree Log Loss:  1.0470540123753254
Plane Decision Tree Accuracy Score:  0.969685083492099


In [62]:
# Create a new column with misclassification costs
cost = [0 if y_train[i] == 0 else 1 for i in range(len(y_train))]

# Define the misclassification costs
cost_matrix = {0: 1, 1: 40}

# Fit the classifier on the modified data with misclassification costs
clf = DecisionTreeClassifier(class_weight=cost_matrix)
clf.fit(df_train, cost)

# print the scores
print_scores("Penalized Decision Tree", clf)

Penalized Decision Tree Error Variance:  0.04714057349820436
Penalized Decision Tree Mean Error:  1.0310855967209607
Penalized Decision Tree F1 Score:  0.39631336405529954
Penalized Decision Tree Log Loss:  1.0141507745287104
Penalized Decision Tree Accuracy Score:  0.9706376779110165


In [63]:
# import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

# Create a random forest classifier
clf = RandomForestClassifier()
# Fit the classifier on the original data
clf.fit(df_train, y_train)

# print the scores
print_scores("Plane Random Forest", clf)

Plane Random Forest Error Variance:  0.03347101019894076
Plane Random Forest Mean Error:  0.7069007300812968
Plane Random Forest F1 Score:  0.3407407407407408
Plane Random Forest Log Loss:  0.6889959647950064
Plane Random Forest Accuracy Score:  0.9800515521685532


In [64]:
# Create a new column with misclassification costs
cost = [0 if y_train[i] == 0 else 1 for i in range(len(y_train))]

# Define the misclassification costs
cost_matrix = {0: 1, 1: 40}

# Fit the classifier on the modified data with misclassification costs
clf = RandomForestClassifier(class_weight=cost_matrix)
clf.fit(df_train, cost)

# print the scores
print_scores("Penalized Random Forest", clf)

Penalized Random Forest Error Variance:  0.0336859012509447
Penalized Random Forest Mean Error:  0.7131856976813785
Penalized Random Forest F1 Score:  0.2955854126679463
Penalized Random Forest Log Loss:  0.7102849562751304
Penalized Random Forest Accuracy Score:  0.9794351675445478


In [66]:
import xgboost as xgb

# Define the misclassification costs
cost_matrix = {0: 1, 1: 40}

# Create the XGBoost data matrix
dmatrix = xgb.DMatrix(data=df_train, label=y_train)

# Create an XGBoost classifier with the specified misclassification costs
clf = xgb.XGBClassifier(scale_pos_weight=cost_matrix[0]/cost_matrix[1])

# Fit the classifier on the data
clf.fit(df_train, y_train)

# print the scores
print_scores("Penalized XGBoost", clf)

Penalized XGBoost Error Variance:  0.0358138775968787
Penalized XGBoost Mean Error:  0.7639847843343536
Penalized XGBoost F1 Score:  0.17796610169491522
Penalized XGBoost Log Loss:  0.7509272912481835
Penalized XGBoost Accuracy Score:  0.9782584332623557


In [127]:
import lightgbm as lgb

# Define the misclassification costs
cost_matrix = {0:1, 1: 100}

# create dataset for lightgbm
lgb_train = lgb.Dataset(df_train, y_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'scale_pos_weight':cost_matrix[0]/cost_matrix[1],
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


In [128]:
# define a function to perform k-fold cross validation
def light_k_fold_cross_validation(k, model, train_set, target):

    # create a KFold object
    kf = KFold(n_splits = k)

    # lists to store the errors for each fold
    errors = []

    X = train_set
    y = target
    
    # iterate over the folds
    for train_index, test_index in kf.split(X):
        # get the training and testing data for the current fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        
        # fit the model on the training data
        gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)

        # make predictions on the testing data
        y_pred = model.predict(X_test)

        # calculate the logistic error for the current fold
        error = log_loss(y_test, y_pred)

        # append the error to the list of errors
        errors.append(error)

    # calculate the mean and variance of the errors
    mean_error = np.mean(errors)
    error_variance = np.var(errors)

    return error_variance, mean_error

In [131]:
def light_print_scores(name, model):
    # calculate the error variance and mean error
    error_variance, mean_error = light_k_fold_cross_validation(100, model, df_train, y_train)
    
    # print the error variance and mean error
    print(name, 'Error Variance: ', error_variance)
    print(name, 'Mean Error: ', mean_error)

    y_pred = model.predict(df_test)
    adjusted_pred = []
    for pred in y_pred:
        if pred > 0.1:
            adjusted_pred.append(1)
        else:
            adjusted_pred.append(0)

    print(y_pred)
    # print f1 score, log loss, and accuracy score
    print(name, 'F1 Score: ', f1_score(y_test, adjusted_pred))
    print(name, 'Log Loss: ', log_loss(y_test, adjusted_pred))
    print(name, 'Accuracy Score: ', accuracy_score(y_test, adjusted_pred))

In [132]:
light_print_scores("LightGBM", gbm)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_w