In [38]:
import pandas as pd
import numpy as np
import collections
from time import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import preprocessing
from sklearn import metrics

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

#for the NN
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend as K
import tensorflow as tf

#for inbalance issues 
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from collections import Counter
from matplotlib import pyplot 
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN

In [2]:
def prepare_inputs_and_outputs(data):
    """Split the merged dataset into model inputs and the fraud label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the ``isFraud`` and ``TransactionID`` columns.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is ``data`` without the
        ``isFraud`` and ``TransactionID`` columns and ``labels`` is a
        single-column DataFrame holding ``isFraud``.
    """
    # Double brackets keep the label as a one-column DataFrame, not a Series.
    labels = data[['isFraud']]
    features = data.drop(columns=['isFraud', 'TransactionID'])

    return features, labels

In [3]:
def get_missing_data_percentage(data):
    """Report the percentage of missing values in every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (indexed by column name) with the
        columns ``Feature`` and ``Percentage``, sorted so the columns with
        the most missing data come first.
    """
    null_counts = data.isnull().sum()
    percentages = null_counts * 100 / len(data)
    report = pd.DataFrame({'Feature': data.columns, 'Percentage': percentages})

    return report.sort_values(by='Percentage', ascending=False)

In [4]:
def drop_high_missing_data_columns(mvd, data, threshold):
    """Delete every column of ``data`` whose missing percentage exceeds ``threshold``.

    Parameters
    ----------
    mvd : pandas.DataFrame
        Missing-value data: a ``Percentage`` column, indexed by the column
        names of ``data``.
    data : pandas.DataFrame
        Frame to prune. It is modified in place.
    threshold : float
        Columns with ``Percentage`` strictly greater than this are removed.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after the columns were deleted.
    """
    over_threshold = mvd['Percentage'] > threshold

    # The index labels of ``mvd`` are the names of the columns to delete.
    for column_label in mvd[over_threshold].index:
        del data[column_label]

    return data

In [5]:
def drop_one_value_columns(data):
    """Remove, in place, every column that holds a single distinct value.

    A missing value counts as a value of its own, so an all-NaN column is
    removed while a column mixing one value with NaN is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to prune. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after the constant columns were dropped.
    """
    constant_columns = [
        name for name in data.columns
        if data[name].nunique(dropna=False) == 1
    ]
    data.drop(columns=constant_columns, inplace=True)

    return data

In [6]:
def getCategoricalFeatures(data):
    """Return the columns of ``data`` whose dtype is ``object``.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The ``object``-dtype columns of ``data``, in their original order.
        Empty (no columns) when ``data`` has none.
    """
    # ``np.object`` was only an alias of the builtin ``object``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so compare against the
    # builtin, which works on every NumPy version.
    is_object = [dtype == object for dtype in data.dtypes]
    return data.loc[:, is_object]

def getNumericalFeatures(data):
    """Return the columns of ``data`` whose dtype is not ``object``.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Every column of ``data`` that is not ``object``-dtype, in the
        original order. Empty (no columns) when ``data`` has none.
    """
    # ``np.object`` was only an alias of the builtin ``object``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so compare against the
    # builtin, which works on every NumPy version.
    is_not_object = [dtype != object for dtype in data.dtypes]
    return data.loc[:, is_not_object]

In [7]:
def drop_high_correlation_features(data, threshold):
    """Drop features that are strongly correlated with an earlier feature.

    For every pair of numeric columns whose absolute Pearson correlation is
    above ``threshold``, the column that appears later in ``data`` is
    dropped. Non-numeric columns are ignored by the correlation and are
    always kept.

    Parameters
    ----------
    data : pandas.DataFrame
    threshold : float
        Absolute correlation above which the later column is removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant columns; ``data`` is not modified.
    """
    # Restrict the correlation to numeric/bool columns explicitly: pandas 2.x
    # no longer skips object columns silently and raises on them instead.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_data.corr().abs()

    # Keep only the strict upper triangle so each pair is examined once and
    # the diagonal (always 1) is ignored. ``np.bool`` was removed in
    # NumPy 1.24, so the mask uses the builtin ``bool``.
    above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(above_diagonal)

    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

    return data.drop(columns=to_drop)

In [8]:
def label_encode_categorical_features(data):
    """Replace every column of ``data`` with integer codes from a ``LabelEncoder``.

    One encoder is fitted per column name. The encoders are local to this
    call and are not returned.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Same shape as ``data``, each column label-encoded.
    """
    encoders_by_column = collections.defaultdict(LabelEncoder)

    def encode_column(column):
        # The defaultdict creates a fresh encoder the first time a name is seen.
        return encoders_by_column[column.name].fit_transform(column)

    return data.apply(encode_column)

In [9]:
def split_data(features, labels, random_state=None):
    """Split the data 60% / 20% / 20% into train, validation and test sets.

    Parameters
    ----------
    features, labels :
        Inputs and targets with the same number of rows.
    random_state : int or None, optional
        Seed forwarded to both ``train_test_split`` calls. ``None`` (the
        default) keeps the original unseeded behaviour; pass an int to make
        the split reproducible across kernel restarts.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test, X_validation, Y_validation)``.
        Note the order: the test pair comes before the validation pair.
    """
    # First carve off 40% of the rows as a hold-out ...
    X_train, X_holdout, Y_train, Y_holdout = train_test_split(
        features, labels, test_size=0.4, random_state=random_state)
    # ... then cut the hold-out in half: 20% validation, 20% test.
    X_validation, X_test, Y_validation, Y_test = train_test_split(
        X_holdout, Y_holdout, test_size=0.5, random_state=random_state)

    return X_train, Y_train, X_test, Y_test, X_validation, Y_validation

In [10]:
def selectkbestfeatures(X_train, Y_train, X_validation, X_test, numberOfFeatures):
    """Keep the ``numberOfFeatures`` best features according to ``f_classif``.

    The selector is fitted on the training split only and then applied to
    all three splits.

    Parameters
    ----------
    X_train, Y_train :
        Training inputs and targets used to score the features.
    X_validation, X_test :
        Further splits reduced with the same fitted selector.
    numberOfFeatures : int
        Passed to ``SelectKBest`` as ``k``.

    Returns
    -------
    tuple
        ``(X_train, X_validation, X_test)`` as new DataFrames. Their column
        labels are the integer positions of the selected features (from
        ``get_support(indices=True)``), not the original column names, and
        each frame gets a fresh default row index.
    """
    selector = SelectKBest(score_func=f_classif, k=numberOfFeatures)
    selector.fit(X_train, Y_train)

    # Positions of the retained features; the same for every split.
    kept_positions = selector.get_support(indices=True)

    def _select(split):
        return pd.DataFrame(selector.transform(split), columns=kept_positions)

    return _select(X_train), _select(X_validation), _select(X_test)

In [11]:
def evaluate_model(name, model, features, labels, average='binary'):
    """Print the confusion matrix, classification report and headline metrics.

    Parameters
    ----------
    name : str
        Prefix for the printed metric lines (e.g. ``'Test Set'``).
    model :
        Fitted estimator exposing ``predict``.
    features, labels :
        Inputs to predict on and the matching true labels.
    average : str, optional
        Averaging mode forwarded to ``precision_score``, ``recall_score``
        and ``f1_score``. The default ``'binary'`` scores the positive
        class (label 1). The previous hard-coded ``'micro'`` makes all
        three metrics equal to accuracy on a binary target, which hides
        how well the minority class is detected; pass ``average='micro'``
        to reproduce the old numbers.

    Returns
    -------
    None
        Everything is printed.
    """
    # Only the prediction call is timed.
    start = time()
    pred = model.predict(features)
    end = time()

    # Print the confusion matrix
    print(metrics.confusion_matrix(labels, pred))

    # Print the precision and recall, among other metrics
    print(metrics.classification_report(labels, pred, digits=3))

    scores = (
        ("Accuracy", accuracy_score(labels, pred)),
        ("Precision", precision_score(labels, pred, average=average)),
        ("Recall", recall_score(labels, pred, average=average)),
        ("F1 Score", f1_score(labels, pred, average=average)),
    )
    for metric_name, value in scores:
        # Scale first and format to one decimal: ``round(x, 3) * 100`` can
        # print float artefacts such as 97.39999999999999.
        print("{} {} - {:.1f}%".format(name, metric_name, value * 100))
    print(name+" Latency - "+str(round((end - start) * 1000, 1))+"ms \n")

The data analysis starts here; the cells below use the helper functions defined above.

In [12]:
# Read the two training tables and combine them by TransactionID.
# The files are read from the local machine instead of from Google Drive (as was done on Colab).
# Raw string: the backslashes of the Windows path are not treated as escape sequences.
DATA_DIR = r"D:\School\Fifth Year\Large Scale Data Analytics\Project\ieee-fraud-detection"
train_identity = pd.read_csv(DATA_DIR + "\\train_identity.csv")
train_transaction = pd.read_csv(DATA_DIR + "\\train_transaction.csv")

# Left join on the key column only, so every transaction is kept and identity
# data is attached where a matching TransactionID exists. Combining `on=` with
# `left_index=True, right_index=True` is rejected by current pandas (MergeError),
# and an index join would pair rows by position rather than by TransactionID.
traindata = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

In [13]:
# Separate Features & Labels
# (this call used to be at the top of the notebook and was moved here; the original note is truncated)
train_features, train_labels = prepare_inputs_and_outputs(traindata)

In [14]:
# [PREPROCESSING STAGE 1] - DATA CLEANING

# Report the percentage of missing data for every feature in the training data
allFeaturesMissingData = get_missing_data_percentage(train_features)

# Drop features whose missing-data percentage is above the threshold (70%)
train_features = drop_high_missing_data_columns(allFeaturesMissingData, train_features, 70)

# Drop features with only 1 distinct value, then features whose correlation
# with another feature is above the 0.80 threshold
train_features = drop_one_value_columns(train_features)
train_features = drop_high_correlation_features(train_features, 0.80)

# Extract the numerical & categorical features from the training features
numericalFeatures = getNumericalFeatures(train_features)
categoricalFeatures = getCategoricalFeatures(train_features)

# Percentage of missing data for both groups, before imputation
numericalFeaturesMissingData = get_missing_data_percentage(numericalFeatures)
categoricalFeaturesMissingData = get_missing_data_percentage(categoricalFeatures)


# Impute categorical missing values with "X" and numerical missing values with the column mean
# NOTE(review): the means are computed over all rows, before the train/validation/test
# split in the next stage, so hold-out rows influence the imputed values - confirm this is acceptable.
numericalFeatures = numericalFeatures.fillna(numericalFeatures.mean(), inplace=False)
categoricalFeatures = categoricalFeatures.fillna("X")

# Recompute the missing-data reports after imputation; they are not asserted on here,
# so inspect them to confirm that no missing values remain
numericalFeaturesMissingData = get_missing_data_percentage(numericalFeatures)
categoricalFeaturesMissingData = get_missing_data_percentage(categoricalFeatures)




In [15]:
# [PREPROCESSING STAGE 2] - DATA TRANSFORMATION 

# Numerically represent the categorical features using label encoding
categoricalFeatures = label_encode_categorical_features(categoricalFeatures)

# Rebuild the training features from the imputed numerical and encoded categorical columns
train_features = pd.concat([numericalFeatures, categoricalFeatures], axis=1)

# Split the data into train, validation and test sets
# (split_data returns the test pair before the validation pair)
X_train, Y_train, X_test, Y_test, X_validation, Y_validation = split_data(train_features, train_labels)

# Feature selection using SelectKBest: keep 50 features, chosen on the training split
X_train, X_validation, X_test = selectkbestfeatures(X_train, Y_train, X_validation, X_test, 50)

# Feature scaling with MinMaxScaler (min-max normalization, not standardization).
# The scaler is fitted on the training split only and then applied to the other splits.
# NOTE(review): StandardScaler is imported but unused - confirm min-max scaling is the intended choice.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_validation = scaler.transform(X_validation)
X_test = scaler.transform(X_test)

In [16]:
# [PREPROCESSING STAGE 3] - DATA REDUCTION (USING PCA or LDA) (focus here next)

from sklearn.decomposition import PCA

# Learn 25 principal components from the training split only ...
pca = PCA(n_components=25)
pca.fit(X_train)

# ... then project all three splits with that same fitted transform.
X_train_pca, X_validation_pca, X_test_pca = [
    pca.transform(split) for split in (X_train, X_validation, X_test)
]

# Replace the working splits with DataFrames of the reduced data.
# Re-running this cell would fit PCA on the already reduced splits.
X_train, X_validation, X_test = [
    pd.DataFrame(data=reduced)
    for reduced in (X_train_pca, X_validation_pca, X_test_pca)
]

At the moment this takes a very long time to run, so I will avoid it for now.

In [57]:
#SMOTE for dealing with unbalanced data
#pip install imblearn
########################################################### different methods to balance data
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

def smote(K,x,y):
    """Oversample with SMOTE and return the resampled data.

    ``K`` is forwarded as SMOTE's ``k_neighbors``. The sampler uses
    ``sampling_strategy='auto'`` and a fixed ``random_state`` of 100.

    Returns the pair ``(X_res, y_res)`` produced by ``fit_resample(x, y)``.
    """
    oversampler = SMOTE(sampling_strategy='auto', k_neighbors=K, random_state=100)
    resampled_x, resampled_y = oversampler.fit_resample(x, y)
    return resampled_x, resampled_y

def ada(x,y):
    """Oversample with ADASYN (default settings) and return the resampled data.

    No ``random_state`` is passed to the sampler.

    Returns the pair produced by ``fit_resample(x, y)``.
    """
    oversampler = ADASYN()
    resampled_x, resampled_y = oversampler.fit_resample(x, y)
    return resampled_x, resampled_y

def ENN(K, x,y):
    """Undersample with Edited Nearest Neighbours and return the resampled data.

    ``K`` is forwarded as the sampler's ``n_neighbors``.

    Returns the pair produced by ``fit_resample(x, y)``.
    """
    cleaner = EditedNearestNeighbours(n_neighbors=K)
    resampled_x, resampled_y = cleaner.fit_resample(x, y)
    return resampled_x, resampled_y

def CNN(K,x,y):
    """Undersample with Condensed Nearest Neighbour and return the resampled data.

    ``K`` is forwarded as the sampler's ``n_neighbors``.
    Author's note: relatively slow, do not use again.

    Returns the pair produced by ``fit_resample(x, y)``.
    """
    condenser = CondensedNearestNeighbour(n_neighbors=K)
    resampled_x, resampled_y = condenser.fit_resample(x, y)
    return resampled_x, resampled_y

def OSS(n,s,x,y):
    """Undersample with One-Sided Selection and return the resampled data.

    ``n`` is forwarded as ``n_neighbors`` and ``s`` as ``n_seeds_S``.
    Author's note: possibly slow because it uses CNN.

    Returns the pair produced by ``fit_resample(x, y)``.
    """
    selector = OneSidedSelection(n_neighbors=n, n_seeds_S=s)
    resampled_x, resampled_y = selector.fit_resample(x, y)
    return resampled_x, resampled_y

def NM(n,x,y):
    """Undersample with NearMiss version 3 and return the resampled data.

    ``n`` is forwarded as ``n_neighbors_ver3``.
    Author's note: DO NOT USE version 2, it took up 54G of RAM.

    Returns the pair produced by ``fit_resample(x, y)``.
    """
    near_miss = NearMiss(version=3, n_neighbors_ver3=n)
    resampled_x, resampled_y = near_miss.fit_resample(x, y)
    return resampled_x, resampled_y
def comb(x,y):
    """Resample with SMOTETomek (SMOTE plus Tomek links) and return the resampled data.

    The Tomek-link step is configured with ``sampling_strategy='majority'``.
    A SMOTEENN variant was tried earlier and replaced by this combination.

    Returns the pair produced by ``fit_resample(x, y)``.
    """
    tomek_links = TomekLinks(sampling_strategy='majority')
    combined_sampler = SMOTETomek(tomek=tomek_links)
    resampled_x, resampled_y = combined_sampler.fit_resample(x, y)
    return resampled_x, resampled_y


In [58]:
#Keras example for getting the NN to work 


def NNetwork(Tr,A,E,L1,L2,L3,L4,W):
    """Build, compile and train a fully connected binary classifier.

    The network has three ReLU hidden layers, each followed by 10% dropout,
    and a single sigmoid output unit. It is trained with binary
    cross-entropy and the Adam optimizer.

    Parameters
    ----------
    Tr :
        Training inputs; ``Tr.shape[1]`` sets the input width.
    A :
        Training targets (0/1).
    E : int
        Number of epochs.
    L1, L2, L3 : int
        Number of units in the first, second and third hidden layer.
    L4 :
        Unused. Kept in the signature so existing eight-argument calls keep
        working.
    W :
        ``1`` trains with class weights; any other value trains unweighted.

    Returns
    -------
    The trained ``Sequential`` model. Accuracy is not reported here; call
    ``NNacc`` on the returned model for that.
    """
    # Layers are added and removed here to test different implementations.
    model_NN = Sequential()
    model_NN.add(Dense(L1, input_dim=Tr.shape[1], activation='relu'))
    model_NN.add(Dropout(0.1))
    model_NN.add(Dense(L2, activation='relu'))
    model_NN.add(Dropout(0.1))
    # Author's note: this third layer seems to bring back the
    # "all predictions have the same value" issue.
    model_NN.add(Dense(L3, activation='relu'))
    model_NN.add(Dropout(0.1))
    model_NN.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))
    model_NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    if W == 1:
        # Each class is weighted by the share of the *other* class, so the
        # rarer class gets the larger weight.
        weights = {
            0: np.count_nonzero(A == 1) / A.size,
            1: np.count_nonzero(A == 0) / A.size,
        }
        model_NN.fit(Tr, A, class_weight=weights, epochs=E)
    else:
        model_NN.fit(Tr, A, epochs=E)

    # The original had an NNacc(...) call after this return; it could never
    # run and was removed.
    return model_NN

    
def NNacc(model, testin, testout):
    """Print overall and per-class accuracy of ``model`` on a data set.

    Parameters
    ----------
    model :
        Object whose ``predict(testin)`` returns one probability per row
        (for example the sigmoid output of the network).
    testin :
        Inputs to predict on.
    testout :
        True 0/1 labels, as an array-like of shape ``(n,)`` or ``(n, 1)``.

    Returns
    -------
    None
        One line is printed with the total accuracy, the share of true 1s
        predicted as 1 and the share of true 0s predicted as 0, each as a
        rounded percentage. A class that does not occur in ``testout`` is
        reported as 0 instead of raising a division-by-zero error.
    """
    # ``Sequential.predict_classes`` was removed from Keras; for a single
    # sigmoid output it was equivalent to thresholding predict() at 0.5.
    probabilities = np.asarray(model.predict(testin))
    predictions = (probabilities > 0.5).astype("int32").ravel()

    # Flatten both sides so (n, 1) and (n,) inputs compare element-wise
    # instead of broadcasting to an (n, n) matrix.
    check = np.asarray(testout).ravel()
    correct = predictions == check
    is_one = check == 1
    is_zero = check == 0

    def percentage(hits, population):
        return 100.0 * hits / population if population else 0.0

    acc = percentage(np.count_nonzero(correct), check.size)
    acc1 = percentage(np.count_nonzero(correct & is_one), np.count_nonzero(is_one))
    acc0 = percentage(np.count_nonzero(correct & is_zero), np.count_nonzero(is_zero))
    print("Total accuracy: %d    Accuracy of 1: %d     Accuracy of 0: %d"%(round(acc),round(acc1),round(acc0)))

########################################### resample the training data, train the NN and report accuracy
def report_nn_experiment(title, train_x, train_y, E, L1, L2, L3, L4, W):
    """Train the network on one training set and print its accuracy on three data sets.

    ``train_x`` / ``train_y`` are the (resampled) data to train on; the
    remaining arguments are forwarded unchanged to ``NNetwork``. Accuracy is
    printed for the data trained on, for the notebook-level
    ``X_train`` / ``Y_train`` and for ``X_test`` / ``Y_test``.

    Returns the trained model.
    """
    print(title)
    model = NNetwork(train_x, train_y, E, L1, L2, L3, L4, W)
    print("With balanced training data")
    NNacc(model, train_x, train_y)
    print("With origional training data")
    NNacc(model, X_train, Y_train)
    print("with origional testing data")
    NNacc(model, X_test, Y_test)
    return model


# Earlier experiments, kept for reference (most results are tracked in the
# "NN Testing" doc on Google Drive). Uncomment a pair of lines to re-run one.
#bal_x, bal_y = smote(3, X_train, Y_train)
#mod = report_nn_experiment("SMOTE", bal_x, bal_y, 10, 500, 1000, 280, 400, 0)
#ad_x, ad_y = ada(X_train, Y_train)
#mod = report_nn_experiment("ADASYN", ad_x, ad_y, 10, 500, 1000, 280, 400, 0)
#u_x, u_y = ENN(12, X_train, Y_train)  # takes a fair amount of time
#mod = report_nn_experiment("Edited NN", u_x, u_y, 10, 100, 100, 280, 400, 0)
#ONN_x, ONN_y = OSS(4, 200, X_train, Y_train)
#mod = report_nn_experiment("One Sided NN", ONN_x, ONN_y, 10, 100, 100, 280, 400, 0)
#NM_x, NM_y = NM(3, X_train, Y_train)  # does not take long, but does with the second type
#mod = report_nn_experiment("Near Miss", NM_x, NM_y, 10, 100, 600, 300, 400, 0)
#CNN_x, CNN_y = CNN(1, X_train, Y_train)  # relatively slow, do not use again
# "NN Added Weights": no resampling, class weights switched on instead
#mod = NNetwork(X_train, Y_train, 10, 100, 100, 280, 400, 1)

# Current experiment: combined over- and under-sampling (SMOTE + Tomek links).
cx, cy = comb(X_train, Y_train)
mod = report_nn_experiment("Combined", cx, cy, 50, 120, 212, 320, 400, 0)



Combined
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
 3218/21355 [===>..........................] - ETA: 56s - loss: 0.3577 - accuracy: 0.8226

KeyboardInterrupt: 

In [17]:
# [MODEL BUILDING]

# Exhaustive 5-fold grid search over gradient-boosting hyper-parameters.
# NOTE(review): 4 x 5 x 5 = 100 parameter combinations x 5 folds = 500 fits;
# the recorded run of this cell ended in a KeyboardInterrupt.
algorithm = GradientBoostingClassifier()
parameters = dict(
    n_estimators=[5, 50, 250, 500],
    max_depth=[1, 3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 1, 10, 100],
)
cv = GridSearchCV(estimator=algorithm, param_grid=parameters, cv=5)

# Flatten the one-column label frame into the 1-D array the estimator expects.
cv.fit(X_train, Y_train.values.ravel())

KeyboardInterrupt: 

In [None]:
# [MODEL EVALUATION]

# Score the fitted grid search on every split, in the order train, validation, test.
for split_name, split_features, split_labels in (
    ('Train Set', X_train, Y_train),
    ('Validation Set', X_validation, Y_validation),
    ('Test Set', X_test, Y_test),
):
    evaluate_model(split_name, cv, split_features, split_labels)