<a href="https://colab.research.google.com/github/Marzieh777/Fraud-Detection-Crime-Rating/blob/master/fraud_detection_XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
#importing necessary packages
import pandas as pd
import numpy as np
import gzip
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,accuracy_score,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support, roc_auc_score, average_precision_score)
from google.colab import drive
from sklearn.model_selection import train_test_split

import xgboost as xgb
import matplotlib

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


# Load data and create a pandas dataframe

In [18]:
path_data= '/content/drive/My Drive/Colab Notebooks/fraud_prep.csv.gz'
fraud_df = pd.read_csv(path_data, compression='gzip', header=0, sep=',')
print(fraud_df.head())

   Unnamed: 0  Time  ...                                        V30      V31
0           0   0.0  ...                                        NaN  Suffolk
1           1   0.0  ...  JPMorgan Chase Bank, National Association   Nassau
2           2   1.0  ...                               TD Bank N.A.     Erie
3           3   1.0  ...                             Santander Bank   Ulster
4           4   2.0  ...                Chemung Canal Trust Company   Oswego

[5 rows x 35 columns]


##The data set has NaN values in categorical and nmerical features, the following function helps to fill NaN values

In [19]:
#NaN values for categorical and numerical:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

fraud_df_clean = DataFrameImputer().fit_transform(fraud_df)
print(len(fraud_df_clean))
print(fraud_df_clean.head())

284807
   Unnamed: 0  Time  ...                                        V30      V31
0           0   0.0  ...  JPMorgan Chase Bank, National Association  Suffolk
1           1   0.0  ...  JPMorgan Chase Bank, National Association   Nassau
2           2   1.0  ...                               TD Bank N.A.     Erie
3           3   1.0  ...                             Santander Bank   Ulster
4           4   2.0  ...                Chemung Canal Trust Company   Oswego

[5 rows x 35 columns]


#Exploring data, numerical and categorical features

In [20]:
CATEGORICAL_INPUT_FEATURES = ['V{}'.format(i) for i in range(30, 32)]
ALL_FEATURES = list(fraud_df_clean)
INPUT_FEATURES = ALL_FEATURES.copy()
INPUT_FEATURES.remove('Class') #target
INPUT_FEATURES.remove('V29') #seems not important
print(INPUT_FEATURES)
NUMERICAL_INPUT_FEATURES = [feature for feature in INPUT_FEATURES if feature not in CATEGORICAL_INPUT_FEATURES]
print(NUMERICAL_INPUT_FEATURES)
print(CATEGORICAL_INPUT_FEATURES)
# get number of frauds and non-frauds
fraud_df_clean.Class.value_counts()

['Unnamed: 0', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'V30', 'V31']
['Unnamed: 0', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
['V30', 'V31']


0    284315
1       492
Name: Class, dtype: int64

# Split data into Test and Train 

In [21]:
#set a seed to generate a fix random choice
SEED = 1333 
#splitting data into train and test:
df_train, df_test = train_test_split(fraud_df_clean, test_size=0.3, random_state=SEED)
#explore the number normal and fraud data in train and test
print('#of frauds in train : ',len(df_train[df_train.Class==1]))
print('# of normals in train: ',len(df_train[df_train.Class==0]))
print('#of frauds in test: ', len(df_test[df_test.Class==1]))
print('#of normals in test: ',len(df_test[df_test.Class==0]))

#get features
X_train= df_train.drop('Class',axis =1)
X_test = df_test.drop('Class',axis =1)

# get labels
y_train =df_train.Class
y_test =df_test.Class
print(len(y_train))
print(len(X_test))



#of frauds in train :  341
# of normals in train:  199023
#of frauds in test:  151
#of normals in test:  85292
199364
85443


#Preprocessing categorical and numerical data
* Encode categorical features V30, V31.
* Cooncatenating encoded categorical and numerical data


In [22]:
# encode categorical attributes into a binary one-hot encoded representation in test data and train data
def encode_cat_features(cat_cols, train_data,test_data):
  train_encoded = pd.get_dummies(train_data[cat_cols], prefix_sep="__", columns=cat_cols)
  encoded_columns = list(train_encoded.columns[:])
  #print(encoded_columns)
  cat_dummies = [col for col in train_encoded if "__" in col and col.split("__")[0] in cat_cols]
  test_encoded = pd.get_dummies(test_data[cat_cols], prefix_sep="__", columns=cat_cols)
  # Remove additional columns
  for col in test_encoded.columns:
    if ("__" in col) and (col.split("__")[0] in cat_cols) and col not in cat_dummies:
      print("Removing additional feature {}".format(col))
      test_encoded.drop(col, axis=1, inplace=True)
  #add missing cols
  for col in cat_dummies:
    if col not in test_encoded.columns:
      print("Adding missing feature {}".format(col))
      test_processed[col] = 0
  test_encoded = test_encoded[encoded_columns]
  return (train_encoded,test_encoded)

  
#get encoded categorical data
X_train_cat, X_test_cat = encode_cat_features(CATEGORICAL_INPUT_FEATURES,X_train, X_test)
#get numerical data
X_train_num = X_train[NUMERICAL_INPUT_FEATURES]
X_test_num = X_test[NUMERICAL_INPUT_FEATURES]
# concatenating numerical and categorical
df_train= pd.concat([X_train_cat,X_train_num, y_train], axis = 1)
df_test = pd.concat([X_test_cat, X_test_num, y_test], axis = 1)

#we can sample train and test data by fraction of data
#we choose frac = 1 for the last run
df_train_rd = df_train.sample(frac =1)
df_test_rd= df_test.sample(frac = 1)

#Get inputs and targets
X_train_rd =df_train_rd.drop('Class',axis = 1)
y_train_rd=df_train_rd.Class.values

X_test_rd =df_test_rd.drop('Class',axis = 1)
y_test_rd=df_test_rd.Class.values

print(len(df_train_rd))
print(len(df_test_rd))

199364
85443


#Feature selection option with chi2 score

In [23]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale

FEATURE_S = True

def fs(X,y,p_val):
  if FEATURE_S:
        
    X_bin = Binarizer().fit_transform(scale(X))
    selectChi2 = SelectPercentile(chi2, percentile=p_val).fit(X_bin, y)
    selectF_classif = SelectPercentile(f_classif, percentile=p_val).fit(X, y)

    chi2_selected = selectChi2.get_support()
    chi2_selected_features = [ f for i,f in enumerate(X.columns) if chi2_selected[i]]
    print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
      chi2_selected_features))
    f_classif_selected = selectF_classif.get_support()
    f_classif_selected_features = [ f for i,f in enumerate(X.columns) if f_classif_selected[i]]
    print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
      f_classif_selected_features))
    selected = chi2_selected & f_classif_selected
    print('Chi2 & F_classif selected {} features'.format(selected.sum()))
    features = [ f for f,s in zip(X.columns, selected) if s]
    print (len(features))
    return (features)

f_sel = fs(X_train_rd,y_train_rd, 50)
X_train_rd = X_train_rd[f_sel]
X_test_rd = X_test_rd[f_sel]


Chi2 selected 169 features ['V30__Access Federal Credit Union', 'V30__Adirondack Trust Company, The', 'V30__Advantage Federal C.U.', 'V30__Alma Bank', 'V30__Amalgamated Bank', 'V30__Apple Bank For Savings', 'V30__Bank of Akron', 'V30__Bank of America N.A. GA1-006-15-40', 'V30__Bank of Hope', 'V30__Bank of Millbrook', 'V30__Bay Ridge Federal Credit Union', 'V30__Berkshire Bank (Pittsfield, MA)', 'V30__Bridgehampton National Bank, The', 'V30__Buffalo Metropolitan Federal CU', 'V30__CFCU Community Credit Union', 'V30__CTBC Bank Corp. (USA)', 'V30__Canandaigua National Corporation', 'V30__Capital One, N.A.', 'V30__Carver Federal Savings Bank', 'V30__Cathay Bank', 'V30__Catskill Hudson Bank', 'V30__Cattaraugus County Bank', 'V30__Champlain National Bank', 'V30__Chemung Canal Trust Company', 'V30__Commerce Bank, N.A.', 'V30__Cooperative Federal', 'V30__Corning Federal Credit Union', 'V30__Cross County Federal Savings Bank', 'V30__Delaware National Bank of Delhi, The', 'V30__Dime Community Ba

# Deal with the imbalance data
* we use SMOTE technic to balance TRAIN data (Synthetic Minority Over-sampling Technique )

In [24]:
from imblearn.over_sampling import SMOTE
# SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_sample(X_train_rd, y_train_rd)
cols = X_test_rd.columns
# converting trai data to a data frame with the original column names
X_train_res = pd.DataFrame(X_train_res, columns= cols)



#Performing XGboost Classifier

In [14]:
#Implementing XG-boost #max_depth= 10
clf = xgb.XGBClassifier(missing=9999999999,
                max_depth = 10,
                n_estimators=1000,
                learning_rate=0.1,
                nthread=4,
                subsample=1.0,
                colsample_bytree=0.5,
                min_child_weight = 3,
                seed=SEED)

#fit classifier with eval_metric=AUC_PR, early stopping if the result does not improve after 10 consecutive iteration
clf.fit(X_train_res, y_train_res, early_stopping_rounds=10, eval_metric="aucpr",
        eval_set=[(X_test_rd, y_test_rd)])

[0]	validation_0-aucpr:0.533498
Will train until validation_0-aucpr hasn't improved in 10 rounds.
[1]	validation_0-aucpr:0.672835
[2]	validation_0-aucpr:0.717575
[3]	validation_0-aucpr:0.757179
[4]	validation_0-aucpr:0.766306
[5]	validation_0-aucpr:0.775908
[6]	validation_0-aucpr:0.77017
[7]	validation_0-aucpr:0.771051
[8]	validation_0-aucpr:0.774218
[9]	validation_0-aucpr:0.772743
[10]	validation_0-aucpr:0.773637
[11]	validation_0-aucpr:0.773418
[12]	validation_0-aucpr:0.777155
[13]	validation_0-aucpr:0.778187
[14]	validation_0-aucpr:0.779785
[15]	validation_0-aucpr:0.788069
[16]	validation_0-aucpr:0.78795
[17]	validation_0-aucpr:0.787567
[18]	validation_0-aucpr:0.788815
[19]	validation_0-aucpr:0.790264
[20]	validation_0-aucpr:0.788993
[21]	validation_0-aucpr:0.7888
[22]	validation_0-aucpr:0.789037
[23]	validation_0-aucpr:0.789124
[24]	validation_0-aucpr:0.789861
[25]	validation_0-aucpr:0.816029
[26]	validation_0-aucpr:0.815434
[27]	validation_0-aucpr:0.815871
[28]	validation_0-aucpr:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=3, missing=9999999999, n_estimators=1000,
              n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1333,
              silent=None, subsample=1.0, verbosity=1)

#Display results

In [15]:
#get AUC and AUC_PR on test data
print('Overall AUC TEST:', roc_auc_score(y_test_rd, clf.predict(X_test_rd, ntree_limit=clf.best_iteration))) 	
print('Overall AUC PR TEST:', average_precision_score(y_test_rd, clf.predict(X_test_rd, ntree_limit=clf.best_iteration))) 
# get precision and recall for each class
print (classification_report(y_test_rd, clf.predict(X_test_rd)))

Overall AUC TEST: 0.913760729405458
Overall AUC PR TEST: 0.6901497710452258
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85292
           1       0.83      0.83      0.83       151

    accuracy                           1.00     85443
   macro avg       0.92      0.92      0.92     85443
weighted avg       1.00      1.00      1.00     85443



# XGB classifier along with cross validation to avoid overfitting
* # we can use Cross validation like in crime rating prediction