In [1]:
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE



In [2]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

In [3]:
# Reading in each data file
train_transaction = pd.read_csv('/content/drive/My Drive/Group #2: Detecting Fraudulent Transactions/Project Code/dataset/train_transaction.csv')
train_identity = pd.read_csv('/content/drive/My Drive/Group #2: Detecting Fraudulent Transactions/Project Code/dataset/train_identity.csv')

# Merging transactional and test data 
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)

# Freeing up memory
del train_transaction, train_identity

In [4]:
def selectkbestfeatures(X_train, Y_train, X_validation, X_test, numberOfFeatures):

    fit = SelectKBest(score_func=f_classif, k=numberOfFeatures).fit(X_train, Y_train)

    X_train = fit.transform(X_train)
    X_validation = fit.transform(X_validation)
    X_test = fit.transform(X_test)

    # Get column names from the best features
    X_train_cols = fit.get_support(indices=True)
    X_validation_cols = fit.get_support(indices=True)
    X_test_cols = fit.get_support(indices=True)

    X_train = pd.DataFrame(X_train, columns=X_train_cols)
    X_validation = pd.DataFrame(X_validation, columns=X_validation_cols)
    X_test = pd.DataFrame(X_test, columns=X_test_cols)

    # Create new dataframes with the column names
    #X_train = X_train.iloc[:,X_train_cols]
    #X_validation = X_validation.iloc[:,X_validation_cols]
    #X_test = X_test.iloc[:,X_test_cols]

    return X_train, X_validation, X_test

In [5]:
#SMOTE for dealing with unbalanced data
!pip install imblearn
########################################################### different methods to balance data
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

def smote(K,x,y):
    seed=100 
    sm=SMOTE(sampling_strategy='auto', k_neighbors=K, random_state=seed)
    #sm=SMOTE()
    X_res, y_res=sm.fit_resample(x,y)

    #print("SMOTE\n# of 1 %d\n# of 0 %d"%(np.count_nonzero(y_res==1),np.count_nonzero(y_res==0)))
    return X_res, y_res

def ada(x,y):
    adas=ADASYN()
    c_res,d_res=adas.fit_resample(x,y)
    #print("ADASYN\n# of 1 %d\n# of 0 %d"%(np.count_nonzero(d_res==1),np.count_nonzero(d_res==0)))
    return c_res, d_res

def ENN(K, x,y):
    undersample= EditedNearestNeighbours(n_neighbors=K)
    a,b=undersample.fit_resample(x,y)
    return a, b

def CNN(K,x,y):#relativly slow so do not use again 
    undersample=CondensedNearestNeighbour(n_neighbors=K)
    X,Y=undersample.fit_resample(x,y)
    return X,Y

def OSS(n,s,x,y):# possibly slow because it uses CNN
    undersample=OneSidedSelection(n_neighbors=n, n_seeds_S=s)
    X,Y=undersample.fit_resample(x,y)
    return X,Y

def NM(n,x,y):# DO NOT USE v 2 TOOK UP 54G of RAM
    undersample=NearMiss(version=3,n_neighbors_ver3=n)
    X,Y=undersample.fit_resample(x,y)
    return X,Y
def comb(x,y):
    #resample = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'))
    resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
    retx,rety=resample.fit_resample(x,y)
    return retx, rety



In [6]:
def getCategoricalFeatures(data):
    columns = list(data)
    result = []
    for c in columns:
        if data.dtypes[c] == np.object:
            result.append(c)
    return data[result]

In [7]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype 
        
        if col_type != object:
            if str(col_type)[:3] == 'int':
                if df[col].max() < 2 ** 7:
                    df[col] = df[col].astype(np.int8)
                elif df[col].max() < 2 ** 15:
                    df[col] = df[col].astype(np.int16)
                elif df[col].max() < 2 ** 31:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)  
            else:
                if df[col].max() < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif df[col].max() < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [8]:
#from sklearn.preprocessing import Imputer

def replaceMissingValues(df):
    for col in df.columns:
        df[col].fillna(-9999, inplace=True)
        # fixes missing data by taking values from other rows and taking the average
        #imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

        # this function takes the average of every column excluding the unknown values
        #imp.fit(data)

        # inserts the average into the missing spots
        #data = imp.fit_transform(data)
    
    return df

In [9]:
train = reduce_mem_usage(train)

Memory usage of dataframe is 1959.88 MB
Memory usage after optimization is: 648.22 MB
Decreased by 66.9%


In [10]:
enc=LabelEncoder()# create the label encoder
for col in train:#loop through all of the columns
    if train[col].dtype.name=="object":#if the data is in a string format we will need to convert it to numeric to find the correlation 
        enc.fit(train[col].astype(str))#fit the column to the encoder to convert to numeric 
        train[col]=enc.fit_transform(train[col].astype(str))#transform the data into numeric 

In [11]:
train = replaceMissingValues(train)

In [12]:
train_data, test_data = train_test_split(train, test_size=0.4, random_state=123)
train_data, val_data = train_test_split(train, test_size = 0.5, random_state=123)

X_train = train_data.drop(['isFraud'], axis=1)
Y_train = train_data['isFraud']

X_val = train_data.drop(['isFraud'], axis=1)
Y_val = train_data['isFraud']

X_test = test_data.drop(['isFraud'], axis=1)
Y_test = test_data['isFraud']

In [None]:
X_train, Y_train = smote(3, X_train, Y_train)

In [None]:
print(X_train, Y_train)

In [None]:
for x in range (10):
  print(X_train[x])

In [None]:
dftrainLGB = lgb.Dataset(data = X_train, label = Y_train, feature_name = list(train.drop(['isFraud'], axis = 1)))
dfvalLGB = lgb.Dataset(data = X_val, label = Y_val, feature_name = list(train.drop(['isFraud'], axis = 1)))

params = {'num_leaves': 300,
          'objective': 'binary',
          'learning_rate': 0.01,
          "boosting_type": "gbdt",
          "verbosity": -1,
          "metric": 'auc',
          'random_state': 123,
         }

clf = lgb.train(params, 
                dftrainLGB, 
                num_boost_round=100, 
                valid_sets=[dftrainLGB, dfvalLGB],
                early_stopping_rounds=10)




In [None]:
from sklearn.metrics import confusion_matrix, auc, roc_curve
import matplotlib.pyplot as plt 

def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred).ravel()[3]
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred).ravel()[0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred).ravel()[1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred).ravel()[2]

# FP / (FP + TN)
def specificity(y_true, y_pred):
    if tn(y_true, y_pred) + fp(y_true, y_pred) == 0:
        return 0
    else: 
        return float(tn(y_true, y_pred)) / float((tn(y_true, y_pred) + fp(y_true, y_pred)))

# TP / (TP + FN)
def sensitivity(y_true, y_pred):
    if tp(y_true, y_pred) + fn(y_true, y_pred) == 0:
        return 0
    else:
        return float(tp(y_true, y_pred)) / float((tp(y_true, y_pred) + fn(y_true, y_pred)))

def precision(y_true, y_pred):
    return tp(y_true, y_pred) / (tp(y_true, y_pred) + fp(y_true, y_pred))

# TP / (TP + 0.5(FP + FN))
def f1(y_true, y_pred):
    if precision(y_true, y_pred) + sensitivity(y_true, y_pred) == 0:
        return 0
    else:
        return 2.0 * float(precision(y_true, y_pred) * sensitivity(y_true, y_pred)) / (precision(y_true, y_pred) +
                                                                                sensitivity(y_true, y_pred))
# (TP + FN) / (TP + TN + FP + FN)
def accuracy(y_true, y_pred):
    return float(tp(y_true, y_pred) + tn(y_true, y_pred)) / float((tp(y_true, y_pred) + tn(y_true, y_pred) + fp(y_true, y_pred)
                                                         + fn(y_true, y_pred)))
# generate an ROC curve graph
def roc(y_true, y_pred, title, filename):
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkgreen',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='red', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.savefig('ROCs/' + filename + '.png')

In [None]:
y_pred = clf.predict(X_test)
y_pred = [round(value) for value in y_pred]

tp_score = tp(Y_test, y_pred)
tn_score = tn(Y_test, y_pred)
fp_score = fp(Y_test, y_pred)
fn_score = tn(Y_test, y_pred)

specificity_score = specificity(Y_test, y_pred)
sensitivity_score = sensitivity(Y_test, y_pred)
f1_score = f1(Y_test, y_pred)
accuracy_score = accuracy(Y_test, y_pred)

print("Accuracy: ", accuracy_score)
print("Sensitivity: ", sensitivity_score)
print("Specificity: ", specificity_score)
print("F1 Score: ", f1_score)

roc(Y_test, y_pred, " LightGBM ROC Curve", "lgbm_roc")

In [None]:
# from catboost import CatBoostClassifier

# NFOLDS = 5
# folds = GroupKFold(n_splits = NFOLDS)

# X_train, Y_train

# dftrainLGB = lgb.Dataset(data = X_train, label = Y_train, feature_name = list(train.drop(['isFraud'], axis = 1)))
# dfvalLGB = lgb.Dataset(data = X_val, label = Y_val, feature_name = list(train.drop(['isFraud'], axis = 1)))

# params = {'num_estimators': 300,
#           'learning_rate': 0.07
#           'eval_metric': 'AUC'
#           'random_seed': 123
#          }

# clf = lgb.train(params, 
#                 dftrainLGB, 
#                 num_boost_round=100, 
#                 valid_sets=[dftrainLGB, dfvalLGB],
#                 early_stopping_rounds=10)




In [None]:
from catboost import CatBoostClassifier

NFolds = 5
folds = GroupKFold(n_splits = NFolds)

Tx, Ty = X_train, Y_train
Vx, Vy = X_val, Y_val

oof = np.zeros(len(train_df))

params = {'num_estimators': 300,
          'learning_rate': 0.07
          'eval_metric': 'AUC'
          'random_seed': 123
         }

for fold_, (trn_idx, val_idx) in enumerate(folds.split(Tx, Ty, groups = split_groups)):
  print('Fold:', fold_)
  clf = CatBoostClassifier(params)
  clf.fit(
      Tx.iloc[trn_idx, :], Ty[trn_idx]
      eval_set = 

  )