In [None]:
import pandas as pd
import numpy as np
import collections
from time import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import preprocessing
from sklearn import metrics

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

#for the NN
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend as K
import tensorflow as tf

In [22]:
def prepare_inputs_and_outputs(data):
    """Separate a transaction frame into model inputs and the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'isFraud' and 'TransactionID'.

    Returns
    -------
    tuple
        (features, labels): `features` is `data` without the 'isFraud' and
        'TransactionID' columns; `labels` is a one-column DataFrame holding
        'isFraud'. `data` itself is left untouched.
    """
    excluded_columns = ['isFraud', 'TransactionID']
    input_frame = data.drop(columns=excluded_columns)
    target_frame = data[['isFraud']]
    return input_frame, target_frame

In [23]:
def get_missing_data_percentage(data):
    """Report the percentage of missing values in every column.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One row per column of `data`, indexed by column name, with the
        columns 'Feature' (the column name) and 'Percentage' (share of null
        entries, 0-100), sorted from most to least missing.
    """
    null_counts = data.isnull().sum()
    percent_missing = null_counts * 100 / len(data)
    report = pd.DataFrame({'Feature': data.columns, 'Percentage': percent_missing})
    return report.sort_values(by='Percentage', ascending=False)

In [24]:
def drop_high_missing_data_columns(mvd, data, threshold):
    """Remove the columns whose missing-data percentage exceeds `threshold`.

    Parameters
    ----------
    mvd : pandas.DataFrame
        Missing-value data: indexed by the column names of `data`, with a
        'Percentage' column (the layout produced by
        `get_missing_data_percentage`).
    data : pandas.DataFrame
        Frame to filter.
    threshold : float
        Columns with a percentage strictly greater than this are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged columns. Unlike the previous
        `del`-based loop, `data` is not modified, so re-running a cell that
        calls this function on the same input gives the same result.

    Raises
    ------
    KeyError
        If a flagged label is not a column of `data`.
    """
    # The row labels of `mvd` are the column names to remove from `data`.
    high_missing_data_cols = mvd[mvd['Percentage'] > threshold].index
    return data.drop(columns=high_missing_data_cols)

In [25]:
def drop_one_value_columns(data):
    """Remove columns that hold exactly one distinct value.

    Missing values count as a value, so a column that is entirely null is
    dropped, while a column with one real value plus nulls is kept.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame without the constant columns. `data` is not modified
        (the previous version dropped columns in place while looping over
        them).
    """
    constant_columns = [
        column for column in data.columns
        if len(data[column].unique()) == 1
    ]
    return data.drop(columns=constant_columns)

In [26]:
def getCategoricalFeatures(data):
    """Return the object-dtype (categorical) columns of `data`.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The columns of `data` whose dtype is `object`, in their original
        order.
    """
    # Compare against the builtin `object`: the `np.object` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    object_columns = [name for name in data.columns if data[name].dtype == object]
    return data[object_columns]

def getNumericalFeatures(data):
    """Return the non-object-dtype (numerical) columns of `data`.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Every column of `data` whose dtype is not `object`, in the original
        order.
    """
    # Compare against the builtin `object`: the `np.object` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    non_object_columns = [name for name in data.columns if data[name].dtype != object]
    return data[non_object_columns]

In [27]:
def drop_high_correlation_features(data, threshold):
    """Drop features that are highly correlated with an earlier feature.

    A column is removed when the absolute Pearson correlation between it and
    any column to its left is strictly greater than `threshold`.

    Parameters
    ----------
    data : pandas.DataFrame
        May contain non-numeric columns; they take no part in the
        correlation and are always kept.
    threshold : float
        Absolute-correlation cut-off.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant columns.
    """
    # Correlate numeric/bool columns only. pandas >= 2.0 no longer silently
    # skips object columns in `corr()`, so select them explicitly.
    corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()

    # Keep the strict upper triangle so every pair is inspected once and the
    # diagonal (always 1.0) is ignored. The builtin `bool` replaces the
    # `np.bool` alias removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper.columns if (upper[column].abs() > threshold).any()]
    return data.drop(columns=to_drop)

In [28]:
def label_encode_categorical_features(data):
    """Label-encode every column of `data` independently.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as `data`, with each column replaced by the
        integer codes produced by a LabelEncoder fitted on that column. The
        fitted encoders are not returned.
    """
    def _encode_column(column):
        # Each column gets its own freshly fitted encoder.
        return LabelEncoder().fit_transform(column)

    return data.apply(_encode_column)

In [29]:
def split_data(features, labels, random_state=None):
    """Split the data 60% / 20% / 20% into train, validation and test sets.

    Parameters
    ----------
    features, labels : array-like or pandas objects of equal length
    random_state : int or None, optional
        Seed forwarded to both `train_test_split` calls so the split can be
        reproduced. The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        (X_train, Y_train, X_test, Y_test, X_validation, Y_validation),
        in that order.
    """
    # First cut: 60% train, 40% held out.
    X_train, X_holdout, Y_train, Y_holdout = train_test_split(
        features, labels, test_size=0.4, random_state=random_state
    )
    # Second cut: halve the hold-out into validation and test (20% each).
    X_validation, X_test, Y_validation, Y_test = train_test_split(
        X_holdout, Y_holdout, test_size=0.5, random_state=random_state
    )

    return X_train, Y_train, X_test, Y_test, X_validation, Y_validation

In [30]:
def selectkbestfeatures(X_train, Y_train, X_validation, X_test, numberOfFeatures):
    """Keep the `numberOfFeatures` best features according to the ANOVA F-test.

    The selector is fitted on the training split only and then applied to
    all three splits.

    Parameters
    ----------
    X_train, X_validation, X_test : pandas.DataFrame or array-like
        Feature matrices with identical column layout.
    Y_train : array-like
        Training labels; a single-column frame is flattened before fitting.
    numberOfFeatures : int or 'all'
        Forwarded to `SelectKBest(k=...)`.

    Returns
    -------
    tuple of pandas.DataFrame
        (X_train, X_validation, X_test) restricted to the selected features.
        For DataFrame inputs the original column names and row index are
        preserved (previously the columns were relabelled with their integer
        positions and the index was reset); for other inputs the columns are
        labelled with the selected positional indices.
    """
    # Flatten a column-vector target to 1-D, the shape scikit-learn expects.
    selector = SelectKBest(score_func=f_classif, k=numberOfFeatures)
    selector.fit(X_train, np.ravel(Y_train))

    # Positions of the retained features, identical for every split.
    kept_positions = selector.get_support(indices=True)

    def _select(split):
        reduced = selector.transform(split)
        if hasattr(split, 'columns'):
            return pd.DataFrame(reduced, columns=split.columns[kept_positions], index=split.index)
        return pd.DataFrame(reduced, columns=kept_positions)

    return _select(X_train), _select(X_validation), _select(X_test)

In [31]:
# Hand-rolled sigmoid network (24 inputs -> 50 hidden units -> 1 output), kept for reference only.
# The class is wrapped in a bare string literal, so running this cell defines nothing
# and merely echoes the string.
# As written the code would not run: note `Xm` where the parameter is `X`,
# `np,dot` / `np,exp` (commas instead of dots), `self.yaht` vs `self.yhat`, and
# `dHdW2` being assigned while `dJdW2` is returned.


""" 
class NN(object):
    def __init__(self):
        self.inputLayerSize=24
        self.hiddenLayerSize=50
        self.outputLayerSize=1

        self.weight1=np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
        self.weight2=np.random.randn(self.hiddenLayerSize,self.outputLayerSize)

    def forwardprop(self,X):
        self.z2=np.dot(Xm, self.weight1)
        self.a2=self.sigmoid(self.z2)
        self.z3=np,dot(self.a2, self.weight2)
        yhat=self.sigmoid(self.z3)
        return yhat
    
    def sigmoid(self,y):
        ans=1/(1+np.exp(-y))
        return ans

    def sigmoidPrime(self,z):
        return np,exp(-z)/((1+np.exp(z))**2)

    def costFunction(self, X, y):
        self.yhat=self.forwardprop(X)
        j=0.5*sum((y-self.yhat)**2)
        return j

    def costPrime(self,X,y):
        self.yaht=self.forwardprop(X)
        delta3=np.multiply(-(y-self.yhat), self.sigmoidPrime(self.z3))
        dHdW2=np.dot(self.a2.T, delta3)

        delta2=np.dot(delta3, self.weight2)*self.sigmoidPrime(self.z2)
        dJdW1=np.dot(X.T, delta2)
        return dJdW1, dJdW2
     """




' \nclass NN(object):\n    def __init__(self):\n        self.inputLayerSize=24\n        self.hiddenLayerSize=50\n        self.outputLayerSize=1\n\n        self.weight1=np.random.randn(self.inputLayerSize, self.hiddenLayerSize)\n        self.weight2=np.random.randn(self.hiddenLayerSize,self.outputLayerSize)\n\n    def forwardprop(self,X):\n        self.z2=np.dot(Xm, self.weight1)\n        self.a2=self.sigmoid(self.z2)\n        self.z3=np,dot(self.a2, self.weight2)\n        yhat=self.sigmoid(self.z3)\n        return yhat\n    \n    def sigmoid(self,y):\n        ans=1/(1+np.exp(-y))\n        return ans\n\n    def sigmoidPrime(self,z):\n        return np,exp(-z)/((1+np.exp(z))**2)\n\n    def costFunction(self, X, y):\n        self.yhat=self.forwardprop(X)\n        j=0.5*sum((y-self.yhat)**2)\n        return j\n\n    def costPrime(self,X,y):\n        self.yaht=self.forwardprop(X)\n        delta3=np.multiply(-(y-self.yhat), self.sigmoidPrime(self.z3))\n        dHdW2=np.dot(self.a2.T, delta3)

In [32]:
# TensorFlow 1.x-style (placeholder/session) training loop, disabled by wrapping it in a
# bare string literal: running this cell only echoes the string.
# Not runnable as written: `NNInput` is a NumPy array, which has no `next_batch` method,
# and `mnist` is not defined anywhere in the visible notebook.
""" #Tensorflow 
from keras.models import Sequential
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
learning_rate=0.5
epochs=10 
batch_size=100
layer_nodes=300
#set the input and output layer with 24 types or columns coming in and two possible outcomes
x=tf.placeholder(tf.float32,[None, 24])
y=tf.placeholder(tf.float32,[None, 2])
#set weights and biases 
#input to hidden
w1=tf.Variable(tf.random_normal([24,layer_nodes],stddev=0.03), name='w1')
b1=tf.Variable(tf.random_normal([layer_nodes]), name='b1')
#hidden to output
w2=tf.Variable(tf.random_normal([layer_nodes,2],stddev=0.03), name='w2')
b2=tf.Variable(tf.random_normal([2]), name='b2')

hidden_out=tf.add(tf.matmul(x,w1),b1)
hidden_out=tf.nn.relu(hidden_out)

y_=tf.nn.softmax(tf.add(tf.matmul(hidden_out,w2),b2))

y_clipped = tf.clip_by_value(y_, 1e-10, 0.9999999)
cross_entropy = -tf.reduce_mean(tf.reduce_sum(y * tf.log(y_clipped)+ (1 - y) * tf.log(1 - y_clipped), axis=1))

optimiser = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cross_entropy)

init_op=tf.global_variables_initializer()

correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
NNInput=np.array(X_train)
# start the session
with tf.Session() as sess:
   # initialise the variables
   sess.run(init_op)
   total_batch = int(len(NNInput) / batch_size)
   for epoch in range(epochs):
        avg_cost = 0
        for i in range(total_batch):
            batch_x, batch_y = NNInput.next_batch(batch_size=batch_size)
            _, c = sess.run([optimiser, cross_entropy], 
                         feed_dict={x: batch_x, y: batch_y})
            avg_cost += c / total_batch
        print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(avg_cost))
   print(sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})) """



' #Tensorflow \nfrom keras.models import Sequential\nimport tensorflow.compat.v1 as tf\ntf.disable_v2_behavior()\nlearning_rate=0.5\nepochs=10 \nbatch_size=100\nlayer_nodes=300\n#set the input and output layer with 24 types or columns coming in and two possible outcomes\nx=tf.placeholder(tf.float32,[None, 24])\ny=tf.placeholder(tf.float32,[None, 2])\n#set weights and biases \n#input to hidden\nw1=tf.Variable(tf.random_normal([24,layer_nodes],stddev=0.03), name=\'w1\')\nb1=tf.Variable(tf.random_normal([layer_nodes]), name=\'b1\')\n#hidden to output\nw2=tf.Variable(tf.random_normal([layer_nodes,2],stddev=0.03), name=\'w2\')\nb2=tf.Variable(tf.random_normal([2]), name=\'b2\')\n\nhidden_out=tf.add(tf.matmul(x,w1),b1)\nhidden_out=tf.nn.relu(hidden_out)\n\ny_=tf.nn.softmax(tf.add(tf.matmul(hidden_out,w2),b2))\n\ny_clipped = tf.clip_by_value(y_, 1e-10, 0.9999999)\ncross_entropy = -tf.reduce_mean(tf.reduce_sum(y * tf.log(y_clipped)+ (1 - y) * tf.log(1 - y_clipped), axis=1))\n\noptimiser = tf.t

In [33]:
# Disabled scratch code (bare string literal; running the cell only echoes it).
# It counted exact matches between `model.predict_classes(X_test)` and `Y_test`
# and printed the match count and the overall accuracy.
""" predictions=model.predict_classes(X_test)
check=np.array(Y_test)
count=0
for i in range(predictions.size):
   if(predictions[i]==check[i]):
       count=count+1

print(count)
acc=count/predictions.size
print(acc) """


' predictions=model.predict_classes(X_test)\ncheck=np.array(Y_test)\ncount=0\nfor i in range(predictions.size):\n   if(predictions[i]==check[i]):\n       count=count+1\n\nprint(count)\nacc=count/predictions.size\nprint(acc) '

In [34]:
# Disabled scratch code (bare string literal; running the cell only echoes it).
# Same match-counting loop as the previous cell, but over `NNIN` / `NNans`, and it
# additionally prints the prediction for every row whose true label is 1.
# `count1` is initialised but never used.
""" predictions=model.predict_classes(NNIN)
check=np.array(NNans)
count=0
count1=0
for i in range(predictions.size):
    if(predictions[i]==check[i]):
       count=count+1
    if(check[i]==1):
       print(predictions[i])


print(count)
acc=count/predictions.size
print(acc)
 """



' predictions=model.predict_classes(NNIN)\ncheck=np.array(NNans)\ncount=0\ncount1=0\nfor i in range(predictions.size):\n    if(predictions[i]==check[i]):\n       count=count+1\n    if(check[i]==1):\n       print(predictions[i])\n\n\nprint(count)\nacc=count/predictions.size\nprint(acc)\n '

In [35]:
def evaluate_model(name, model, features, labels, average='micro'):
    """Print classification metrics and prediction latency for one data split.

    Parameters
    ----------
    name : str
        Label prefixed to every printed metric line (e.g. 'Test Set').
    model : object
        Fitted estimator exposing `predict(features)`.
    features : array-like
        Inputs to predict on.
    labels : array-like
        True labels for `features`.
    average : str, optional
        Averaging mode passed to precision/recall/F1. The default 'micro'
        keeps the previous behaviour, but note that for single-label
        problems micro-averaged precision, recall and F1 all equal plain
        accuracy; pass 'binary' to score the positive class instead.

    Returns
    -------
    None
        Everything is printed: confusion matrix, classification report,
        the four summary scores as percentages, and latency in ms.
    """
    start = time()
    pred = model.predict(features)
    end = time()

    def _as_percent(score):
        # Scale first, then round: `round(score, 3) * 100` used to print
        # float noise such as 82.10000000000001.
        return str(round(score * 100, 1)) + "%"

    # Print the confusion matrix
    print(metrics.confusion_matrix(labels, pred))

    # Print the precision and recall, among other metrics
    print(metrics.classification_report(labels, pred, digits=3))

    print(name+" Accuracy - "+_as_percent(accuracy_score(labels, pred)))
    print(name+" Precision - "+_as_percent(precision_score(labels, pred, average=average)))
    print(name+" Recall - "+_as_percent(recall_score(labels, pred, average=average)))
    print(name+" F1 Score - "+_as_percent(f1_score(labels, pred, average=average)))
    print(name+" Latency - "+str(round((end - start) * 1000, 1))+"ms \n")

The data analysis starts here; it uses the helper functions defined above.

In [36]:
# Read the two training tables and join the identity data onto the transactions.
# The files are read from the local machine instead of Google Drive (as was done on Colab).
# NOTE: DATA_DIR is an absolute path on the author's machine; change it to run elsewhere.
# A raw string is used so the backslashes are not parsed as escape sequences.
DATA_DIR = r"D:\School\Fifth Year\Large Scale Data Analytics\Project\ieee-fraud-detection"
train_identity = pd.read_csv(DATA_DIR + "\\train_identity.csv")
train_transaction = pd.read_csv(DATA_DIR + "\\train_transaction.csv")

# Left join on the TransactionID key so every transaction is kept.
# `left_index=True, right_index=True` were removed: they cannot be combined with `on=`
# (recent pandas raises MergeError), and a join on row position would not match
# identity rows to transactions by TransactionID.
traindata = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

In [37]:
# Separate features & labels: `train_features` is everything except 'isFraud' and
# 'TransactionID'; `train_labels` is the one-column 'isFraud' frame.
# (Original author's note, cut off in the source: this call "used to be at the top" and was moved here.)
train_features, train_labels = prepare_inputs_and_outputs(traindata)

In [38]:
# [PREPROCESSING STAGE 1] - DATA CLEANING

# Percentage of missing data for every feature in the training data
allFeaturesMissingData = get_missing_data_percentage(train_features)

# Drop features whose missing-data percentage is above the threshold (70%)
train_features = drop_high_missing_data_columns(allFeaturesMissingData, train_features, 70)

# Drop features with a single distinct value, then features whose absolute
# correlation with an earlier feature exceeds 0.80
train_features = drop_one_value_columns(train_features)
train_features = drop_high_correlation_features(train_features, 0.80)

# Split into numerical (non-object dtype) and categorical (object dtype) features
numericalFeatures = getNumericalFeatures(train_features)
categoricalFeatures = getCategoricalFeatures(train_features)

# Missing-data report for both groups before imputation
numericalFeaturesMissingData = get_missing_data_percentage(numericalFeatures)
categoricalFeaturesMissingData = get_missing_data_percentage(categoricalFeatures)

# Impute categorical missing values with "X" and numerical missing values with the column mean.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/validation/test split performed in the next stage, so held-out rows
# influence the imputed values.
numericalFeatures = numericalFeatures.fillna(numericalFeatures.mean())
categoricalFeatures = categoricalFeatures.fillna("X")

# Refresh the missing-data reports and actually verify that nothing is left
numericalFeaturesMissingData = get_missing_data_percentage(numericalFeatures)
categoricalFeaturesMissingData = get_missing_data_percentage(categoricalFeatures)
assert not numericalFeatures.isnull().values.any(), "numerical features still contain missing values"
assert not categoricalFeatures.isnull().values.any(), "categorical features still contain missing values"




In [39]:
# [PREPROCESSING STAGE 2] - DATA TRANSFORMATION
# Order matters in this cell: encode -> recombine -> split -> select -> scale.

# Numerically represent the categorical features using label encoding
# (one encoder per column, fitted on all rows before the split).
categoricalFeatures = label_encode_categorical_features(categoricalFeatures)

# Rebuild the training features from the imputed numerical columns and the
# encoded categorical columns (numerical first, then categorical).
train_features = pd.concat([numericalFeatures, categoricalFeatures], axis=1)

# Split into train / validation / test sets (60% / 20% / 20%, see split_data).
X_train, Y_train, X_test, Y_test, X_validation, Y_validation = split_data(train_features, train_labels)

# Feature selection: keep the 50 best features by f_classif score, fitted on
# the training split only.
X_train, X_validation, X_test = selectkbestfeatures(X_train, Y_train, X_validation, X_test, 50)

# Feature scaling with MinMaxScaler (min-max normalisation, not standardisation).
# The scaler is fitted on the training split and re-used for the other two;
# from here on the X_* variables hold whatever the scaler returns rather than
# the DataFrames built above.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_validation = scaler.transform(X_validation)
X_test = scaler.transform(X_test)

In [40]:
# [PREPROCESSING STAGE 3] - DATA REDUCTION (USING PCA or LDA) (focus here next)
# Only PCA is applied below; LDA is not used in this cell.

from sklearn.decomposition import PCA

# Fit a 25-component PCA on the training split only.
pca = PCA(n_components=25).fit(X_train)

# Project all three splits with the projection fitted on the training split.
X_train_pca = pca.transform(X_train)
X_validation_pca = pca.transform(X_validation)
X_test_pca = pca.transform(X_test)

# Overwrite the X_* variables with DataFrames of the 25 components.
# NOTE: because X_train is overwritten, this cell is not idempotent;
# re-running it applies PCA again to the already-reduced data.
X_train = pd.DataFrame(data = X_train_pca)
X_validation = pd.DataFrame(data = X_validation_pca)
X_test = pd.DataFrame(data = X_test_pca)

At the moment this takes a very long time so I will avoid it 

In [1]:
# SMOTE for dealing with unbalanced data: oversample the training split.
# Requires the imbalanced-learn package (pip install imbalanced-learn).
# Run this cell only after the preprocessing cells that define X_train / Y_train;
# on a fresh kernel it fails with NameError.
import sklearn
from imblearn.over_sampling import SMOTE
seed=100
sm=SMOTE(sampling_strategy='auto', k_neighbors=2, random_state=seed)
X_res, y_res=sm.fit_resample(X_train,Y_train)

# Flatten before counting: if y_res comes back as a DataFrame, the builtin
# `sum(y_res==1)` iterates over column labels and raises TypeError.
y_res_flat=np.ravel(y_res)
print("# of 1 %d\n# of 0 %d"%(np.count_nonzero(y_res_flat==1),np.count_nonzero(y_res_flat==0)))

NameError: name 'X_train' is not defined

In [160]:
#Keras example for getting the NN to work 


def NNetwork(Tr,A,E,L1,L2,L3,W):
    """Build, compile and train a three-hidden-layer binary classifier.

    Parameters
    ----------
    Tr : 2-D array-like
        Training inputs; the number of columns sets the input dimension.
    A : array-like
        Training labels (the class weights below assume values 0 and 1).
    E : int
        Number of training epochs.
    L1, L2, L3 : int
        Number of units in the first, second and third hidden layer.
    W : int
        Pass 1 to train with class weights; any other value trains unweighted.

    Returns
    -------
    keras.models.Sequential
        The fitted model. Evaluation is left to the caller (e.g. via NNacc);
        the call that used to sit after the `return` statement was
        unreachable and has been removed.
    """
    NNIn=Tr
    NNans=A
    model_NN=Sequential()

    # 'elu' activations throughout: the original author noted that 'relu'
    # tended to bring back an "all same value" prediction problem.
    model_NN.add(Dense(L1, input_dim=NNIn.shape[1], activation='elu'))
    model_NN.add(Dense(L2, activation='elu'))
    model_NN.add(Dense(L3, activation='elu'))
    model_NN.add(Dense(1, activation='sigmoid'))
    model_NN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    if W==1:
        # Each class is weighted by the share of the *other* class, so the
        # rarer class gets the larger weight.
        labels_flat=np.ravel(NNans)
        weights={
            0:np.count_nonzero(labels_flat==1)/labels_flat.size,
            1:np.count_nonzero(labels_flat==0)/labels_flat.size,
        }
        # The Keras progress bar counts batches per epoch, not samples.
        model_NN.fit(NNIn,NNans,class_weight=weights,epochs=E)
    else:
        model_NN.fit(NNIn,NNans,epochs=E)

    return model_NN

    
def NNacc (model, testin,testout):
    """Print overall and per-class accuracy of a sigmoid-output binary model.

    Parameters
    ----------
    model : object
        Fitted model whose `predict(testin)` returns one probability per row.
    testin : array-like
        Inputs to evaluate.
    testout : array-like
        True 0/1 labels for `testin` (a one-column frame is flattened).

    Returns
    -------
    None
        Prints three lines: total accuracy, the share of true 1s predicted
        as 1, and the share of true 0s predicted as 0 (all in percent).
        A class that does not occur in `testout` is reported as nan.
    """
    truth=np.asarray(testout).ravel()

    # Threshold the probabilities at 0.5. This replaces `predict_classes`,
    # which no longer exists in recent Keras/TensorFlow releases.
    probabilities=np.asarray(model.predict(testin)).ravel()
    predictions=(probabilities>0.5).astype("int32")

    def _percent(hits, total):
        # Guard against a class that is absent from the evaluated labels.
        return (hits/total)*100 if total else float("nan")

    matches=predictions==truth

    acc=_percent(np.count_nonzero(matches), truth.size)
    print("Total accuracy: %.2f"%(acc))

    # Per-class denominators come from the labels being evaluated; the old
    # code read a global `NNans` that this function never received.
    acc1=_percent(np.count_nonzero(matches & (truth==1)), np.count_nonzero(truth==1))
    print("accuracy of 1: %.2f"%(acc1))
    acc0=_percent(np.count_nonzero(matches & (truth==0)), np.count_nonzero(truth==0))
    print("accuracy of 0: %.2f"%(acc0))



# Train on X_train / Y_train for 10 epochs with hidden layers of 100, 600 and 600 units;
# the final argument 1 turns class weighting on.
mod=NNetwork(X_train,Y_train,10,100,600,600,1)
# NOTE: the accuracy below is measured on the same training split the model was fitted on.
NNacc(mod, X_train,Y_train)




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Total accuracy: 82.12
accuracy of 1: 70.97
accuracy of 0: 82.52


In [17]:
# [MODEL BUILDING]
# Exhaustive grid search over a gradient boosting classifier.
# Cost: 4 * 5 * 5 = 100 parameter combinations x 5 folds = 500 fits, plus the
# final refit. The recorded run of this cell was interrupted, so the fits are
# now spread over all available cores with n_jobs=-1.
# NOTE(review): learning rates of 10 and 100 are unusually large for boosting;
# consider trimming the grid if the search is still too slow.

algorithm = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}
cv = GridSearchCV(algorithm, parameters, cv=5, n_jobs=-1)
# Flatten the one-column label frame to the 1-D shape the estimator expects.
cv.fit(X_train, Y_train.values.ravel())

KeyboardInterrupt: 

In [None]:
# [MODEL EVALUATION]
# Report the metrics of the fitted search object on each split in turn.
for split_name, split_features, split_labels in (
    ('Train Set', X_train, Y_train),
    ('Validation Set', X_validation, Y_validation),
    ('Test Set', X_test, Y_test),
):
    evaluate_model(split_name, cv, split_features, split_labels)