In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read later
# in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import collections
import warnings
from time import time

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# Read the two training tables and combine them on their shared TransactionID key.
DATA_DIR = "/content/drive/My Drive/Colab Notebooks/fraud-detection-data"
train_identity = pd.read_csv(DATA_DIR + "/train_identity.csv")
train_transaction = pd.read_csv(DATA_DIR + "/train_transaction.csv")

# Left join: every transaction row is kept, and identity columns are NaN where
# no identity row matches. Join on the key column only -- pandas raises a
# MergeError when `on` is combined with left_index/right_index, and both frames
# carry the default RangeIndex from read_csv, which is unrelated to TransactionID.
traindata = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

In [None]:
def prepare_inputs_and_outputs(data):
    """Split a frame into model inputs and the prediction target.

    Returns a ``(features, labels)`` pair: ``features`` is ``data`` without
    the 'isFraud' and 'TransactionID' columns, and ``labels`` is a one-column
    DataFrame holding 'isFraud'.
    """
    target_column = 'isFraud'

    # Everything except the target and the row identifier is a model input
    features = data.drop(columns=[target_column, 'TransactionID'])
    # Selecting with a list keeps the labels as a DataFrame rather than a Series
    labels = data.loc[:, [target_column]]

    return features, labels

In [None]:
def get_missing_data_percentage(data):
    """Report, per column, the percentage of rows whose value is missing.

    Returns a DataFrame indexed by column name with a 'Feature' column (the
    column name again) and a 'Percentage' column, ordered from the most to
    the least missing.
    """
    row_count = len(data)
    missing_counts = data.isnull().sum()

    report = pd.DataFrame({
        'Feature': data.columns,
        'Percentage': missing_counts * 100 / row_count,
    })

    return report.sort_values(by='Percentage', ascending=False)

In [None]:
def drop_high_missing_data_columns(mvd, data, threshold):
    """Drop every column whose missing-value percentage exceeds ``threshold``.

    Parameters
    ----------
    mvd : DataFrame
        Missing-value report indexed by column name with a 'Percentage'
        column (the shape returned by ``get_missing_data_percentage``).
    data : DataFrame
        Frame to filter. It is not modified.
    threshold : float
        Percentage on the 0-100 scale; columns strictly above it are removed.

    Returns
    -------
    DataFrame
        A new frame without the high-missing columns.

    Raises
    ------
    KeyError
        If ``mvd`` flags a column that is not present in ``data``.
    """
    # The report is indexed by column name, so the filtered index is the drop list
    high_missing_data_cols = mvd.index[mvd['Percentage'] > threshold]

    # drop() builds a new frame, leaving the caller's `data` untouched
    return data.drop(columns=high_missing_data_cols)

In [None]:
def drop_one_value_columns(data):
    """Drop columns that contain exactly one distinct value.

    NaN counts as a value, so a column that is entirely NaN is dropped while
    a column holding one value plus NaN is kept.

    Parameters
    ----------
    data : DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    DataFrame
        A new frame without the constant columns.
    """
    constant_columns = [
        column for column in data.columns
        # dropna=False keeps NaN in the count
        if data[column].nunique(dropna=False) == 1
    ]

    # drop() builds a new frame instead of editing the caller's frame in place
    return data.drop(columns=constant_columns)

In [None]:
def getCategoricalFeatures(data):
    """Return the sub-frame of ``data`` made of its object-dtype columns.

    Parameters
    ----------
    data : DataFrame

    Returns
    -------
    DataFrame
        The columns of ``data`` whose dtype is ``object``, in their original
        order (an empty-column frame when there are none).
    """
    # Compare against the builtin `object`: the `np.object` alias was removed
    # in NumPy 1.24 and raises AttributeError there.
    categorical_columns = [c for c in data.columns if data.dtypes[c] == object]
    return data[categorical_columns]

def getNumericalFeatures(data):
    """Return the sub-frame of ``data`` made of its non-object-dtype columns.

    Parameters
    ----------
    data : DataFrame

    Returns
    -------
    DataFrame
        The columns of ``data`` whose dtype is anything other than ``object``,
        in their original order.
    """
    # Compare against the builtin `object`: the `np.object` alias was removed
    # in NumPy 1.24 and raises AttributeError there.
    numerical_columns = [c for c in data.columns if data.dtypes[c] != object]
    return data[numerical_columns]

In [None]:
def drop_high_correlation_features(data, threshold):
    """Drop one column out of every highly correlated pair.

    For each pair of numeric/boolean columns whose absolute pairwise
    correlation exceeds ``threshold``, the column that appears later in
    ``data`` is removed. Columns of any other dtype are passed through.

    Parameters
    ----------
    data : DataFrame
        Frame to filter. It is not modified.
    threshold : float
        Absolute correlation above which the later column is dropped.

    Returns
    -------
    DataFrame
        A new frame without the redundant columns.
    """
    # Correlate numeric/boolean columns only: pandas >= 2.0 raises when corr()
    # meets string columns instead of silently skipping them.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_data.corr().abs()

    # Keep the strict upper triangle so each pair is inspected once and no
    # column is compared with itself. Builtin `bool` replaces the `np.bool`
    # alias that was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # NaN correlations compare as False and therefore never trigger a drop
    to_drop = upper.columns[(upper > threshold).any(axis=0)]

    return data.drop(columns=to_drop)

In [None]:
def label_encode_categorical_features(data):
    """Replace every column of ``data`` with integer codes from a LabelEncoder.

    Each column is fitted and transformed on its own; the fitted encoders are
    not returned.
    """
    def encode_column(column):
        # A fresh encoder per column, fitted on and applied to that column only
        return LabelEncoder().fit_transform(column)

    return data.apply(encode_column)

In [None]:
def split_data(features, labels, random_state=None):
    """Split the data 60% / 20% / 20% into train, validation and test sets.

    Parameters
    ----------
    features, labels : array-like
        Inputs and targets with the same number of rows.
    random_state : int, RandomState instance or None, default None
        Forwarded to both ``train_test_split`` calls. Pass an int to get the
        same split on every run; None (the default) keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test, X_validation, Y_validation)``.
    """
    # First cut: 60% for training, 40% held out
    X_train, X_holdout, Y_train, Y_holdout = train_test_split(
        features, labels, test_size=0.4, random_state=random_state
    )
    # Second cut: halve the hold-out into validation and test (20% each)
    X_validation, X_test, Y_validation, Y_test = train_test_split(
        X_holdout, Y_holdout, test_size=0.5, random_state=random_state
    )

    return X_train, Y_train, X_test, Y_test, X_validation, Y_validation

In [None]:
def selectkbestfeatures(X_train, Y_train, X_validation, X_test, numberOfFeatures):
    """Keep the ``numberOfFeatures`` best features according to ``f_classif``.

    The selector is fitted on the training split only and then applied to all
    three splits.

    Parameters
    ----------
    X_train, X_validation, X_test : DataFrame or array-like
        Feature matrices with identical columns.
    Y_train : array-like
        Training target; a one-column frame is flattened to 1-D before fitting.
    numberOfFeatures : int or "all"
        Passed to ``SelectKBest`` as ``k``.

    Returns
    -------
    tuple of DataFrame
        ``(X_train, X_validation, X_test)`` reduced to the selected features,
        each with a fresh RangeIndex. Columns carry the original feature names
        when ``X_train`` is a DataFrame, otherwise the positional indices of
        the selected features.
    """
    selector = SelectKBest(score_func=f_classif, k=numberOfFeatures)
    # Flatten the target: a column-vector y is not the 1-D shape the scorer expects
    selector.fit(X_train, np.ravel(Y_train))

    # The same features are selected for every split, so look them up once
    selected = selector.get_support(indices=True)
    if isinstance(X_train, pd.DataFrame):
        selected_columns = X_train.columns[selected]
    else:
        selected_columns = selected

    def _select(X):
        # np.asarray keeps the column labelling positional whatever transform() returns
        return pd.DataFrame(np.asarray(selector.transform(X)), columns=selected_columns)

    return _select(X_train), _select(X_validation), _select(X_test)

In [None]:
def evaluate_model(name, model, features, labels, average='micro'):
    """Print a fitted model's classification metrics and prediction latency.

    Parameters
    ----------
    name : str
        Prefix for every summary line (e.g. 'Train Set').
    model : estimator
        Fitted object exposing ``predict``.
    features : array-like
        Inputs to predict on.
    labels : array-like
        True targets for ``features``.
    average : str, default 'micro'
        Averaging mode forwarded to precision/recall/F1. With one label per
        sample, 'micro' makes all three equal to accuracy; pass 'binary' to
        report the positive class of a two-class problem instead.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Time the prediction call only
    start = time()
    pred = model.predict(features)
    end = time()

    # Print the confusion matrix
    print(metrics.confusion_matrix(labels, pred))

    # Print the precision and recall, among other metrics
    print(metrics.classification_report(labels, pred, digits=3))

    scores = [
        ("Accuracy", accuracy_score(labels, pred)),
        ("Precision", precision_score(labels, pred, average=average)),
        ("Recall", recall_score(labels, pred, average=average)),
        ("F1 Score", f1_score(labels, pred, average=average)),
    ]
    for metric_name, value in scores:
        # Scale to percent before rounding: round(x, 3) * 100 leaks float
        # noise into the output (0.57 * 100 == 56.99999999999999).
        print(name + " " + metric_name + " - " + str(round(value * 100, 1)) + "%")

    print(name+" Latency - "+str(round((end - start) * 1000, 1))+"ms \n")

In [None]:
# Split the merged training frame into model inputs and the target column
train_features, train_labels = prepare_inputs_and_outputs(data=traindata)

In [None]:
# [PREPROCESSING STAGE 1] - DATA CLEANING

# Percentage of missing values for every feature in the training data
allFeaturesMissingData = get_missing_data_percentage(train_features)

# Drop features whose missing-data percentage is above the threshold (70%)
train_features = drop_high_missing_data_columns(allFeaturesMissingData, train_features, 70)

# Drop features with a single distinct value, then one feature out of every
# pair whose absolute correlation is above 0.80
train_features = drop_one_value_columns(train_features)
train_features = drop_high_correlation_features(train_features, 0.80)

# Separate the numerical and the categorical (object-dtype) features
numericalFeatures = getNumericalFeatures(train_features)
categoricalFeatures = getCategoricalFeatures(train_features)

# Missing-data report for both groups before imputation
numericalFeaturesMissingData = get_missing_data_percentage(numericalFeatures)
categoricalFeaturesMissingData = get_missing_data_percentage(categoricalFeatures)

# Impute categorical missing values with "X" and numerical ones with the column mean.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/validation/test split done later, so held-out rows influence them.
numericalFeatures = numericalFeatures.fillna(numericalFeatures.mean())
categoricalFeatures = categoricalFeatures.fillna("X")

# Recompute the reports and verify that no missing value survived imputation
numericalFeaturesMissingData = get_missing_data_percentage(numericalFeatures)
categoricalFeaturesMissingData = get_missing_data_percentage(categoricalFeatures)
assert (numericalFeaturesMissingData['Percentage'] == 0).all(), \
    "numerical features still contain missing values"
assert (categoricalFeaturesMissingData['Percentage'] == 0).all(), \
    "categorical features still contain missing values"

In [None]:
# [PREPROCESSING STAGE 2] - DATA TRANSFORMATION

# Label-encode the categorical features so that every column is numeric
categoricalFeatures = label_encode_categorical_features(categoricalFeatures)

# Rebuild the training features from the imputed numerical columns and the
# encoded categorical columns
train_features = pd.concat([numericalFeatures, categoricalFeatures], axis="columns")

# Split the data into train, validation and test sets
X_train, Y_train, X_test, Y_test, X_validation, Y_validation = split_data(
    features=train_features, labels=train_labels
)

# Feature selection: keep the 50 best features per SelectKBest (fitted on the
# training split, applied to all three)
X_train, X_validation, X_test = selectkbestfeatures(
    X_train, Y_train, X_validation, X_test, numberOfFeatures=50
)

# Feature scaling with min-max normalisation: fit on the training split only,
# then transform all three splits with the same fitted scaler
scaler = MinMaxScaler().fit(X_train)
X_train, X_validation, X_test = (
    scaler.transform(split) for split in (X_train, X_validation, X_test)
)

In [None]:
# [PREPROCESSING STAGE 3] - DATA REDUCTION (PCA)
# TODO: compare against LDA (original note: "focus here next")

# Fit PCA on the training split only, keeping 25 components
pca = PCA(n_components=25).fit(X_train)

# Project every split onto the components learned from the training split
X_train_pca = pca.transform(X_train)
X_validation_pca = pca.transform(X_validation)
X_test_pca = pca.transform(X_test)

# NOTE(review): the X_* names are overwritten with the reduced data, so
# re-running this cell on its own feeds PCA output back into PCA.
X_train = pd.DataFrame(data = X_train_pca)
X_validation = pd.DataFrame(data = X_validation_pca)
X_test = pd.DataFrame(data = X_test_pca)

In [None]:
# [MODEL BUILDING]

# Fixed seed so the fitted trees, and therefore the selected hyper-parameters,
# are the same on every run
algorithm = GradientBoostingClassifier(random_state=42)
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    # The learning rate scales each stage's contribution. Values above 1
    # amplify every correction instead of shrinking it, so the former 10 and
    # 100 entries could not yield a usable model and cost 40 of 100 grid points.
    'learning_rate': [0.01, 0.1, 1],
}

# 5-fold grid search; the candidate fits are independent, so n_jobs=-1 spreads
# them over all available cores.
# NOTE(review): candidates are ranked with the estimator's default score
# (accuracy). If the fraud class is rare -- confirm against the data -- consider
# scoring='roc_auc' or 'f1' instead.
cv = GridSearchCV(algorithm, parameters, cv=5, n_jobs=-1)
cv.fit(X_train, Y_train.values.ravel())

In [None]:
# [MODEL EVALUATION]

# Report the tuned model's metrics on each split, from the data it was fitted
# on to the untouched test set
evaluate_model(name='Train Set', model=cv, features=X_train, labels=Y_train)
evaluate_model(name='Validation Set', model=cv, features=X_validation, labels=Y_validation)
evaluate_model(name='Test Set', model=cv, features=X_test, labels=Y_test)