In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.metrics import average_precision_score, f1_score
from sklearn.metrics import cohen_kappa_score

In [2]:
Fraud_df = pd.read_csv("fraud_final_python_input.csv")

In [3]:
Fraud_df.shape

(6362620, 11)

In [4]:
Fraud_df.head()

Unnamed: 0,step,amount,isFraud,type_CASH_OUT,type_TRANSFER,balanceOrg_Negative,balanceOrg_Positive,balanceOrg_Zero,balanceDest_Negative,balanceDest_Positive,balanceDest_Zero
0,1,9839.64,0,0,0,1,0,0,0,0,1
1,1,1864.28,0,0,0,1,0,0,0,0,1
2,1,181.0,1,0,1,1,0,0,0,0,1
3,1,181.0,1,1,0,1,0,0,1,0,0
4,1,11668.14,0,0,0,1,0,0,0,0,1


In [5]:
# Class count
count_class_0, count_class_1 = Fraud_df.isFraud.value_counts()

# Divide by class
df_class_0 = Fraud_df[Fraud_df['isFraud'] == 0]
df_class_1 = Fraud_df[Fraud_df['isFraud'] == 1]

In [6]:
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.isFraud.value_counts())

Random under-sampling:
1    8213
0    8213
Name: isFraud, dtype: int64


In [7]:
final_df = df_test_under.reset_index().drop('index', axis=1)
final_df.head()

Unnamed: 0,step,amount,isFraud,type_CASH_OUT,type_TRANSFER,balanceOrg_Negative,balanceOrg_Positive,balanceOrg_Zero,balanceDest_Negative,balanceDest_Positive,balanceDest_Zero
0,283,47177.98,0,0,0,0,1,0,1,0,0
1,130,252254.1,0,1,0,0,0,1,0,1,0
2,399,108979.68,0,0,0,0,1,0,1,0,0
3,132,226519.64,0,1,0,0,0,1,0,1,0
4,370,21358.16,0,0,0,0,1,0,1,0,0


In [8]:
final_df.shape

(16426, 11)

In [9]:
# Splitting the Data into train and test set
X = final_df.drop("isFraud", 1)
y = final_df.isFraud
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# Applying XGBoost 
clf = XGBClassifier(max_depth = 3, n_jobs = 4)
clf.fit(X_train, y_train)
y_prob=clf.predict_proba(X_test)
y_pred=clf.predict(X_test)





In [11]:
print('Confusion Matrix: ')
print(confusion_matrix(y_test, y_pred))

Confusion Matrix: 
[[2374  105]
 [ 134 2315]]


In [12]:
print("Accuracy Score:")
print(accuracy_score(y_test,y_pred))

Accuracy Score:
0.9515016233766234


In [13]:
print('F1 Score:',(f1_score(y_test,y_pred)))

F1 Score: 0.9509139453686589


In [14]:
print("XGBoost Model Classification Report:")
print(classification_report(y_test,y_pred))

XGBoost Model Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      2479
           1       0.96      0.95      0.95      2449

    accuracy                           0.95      4928
   macro avg       0.95      0.95      0.95      4928
weighted avg       0.95      0.95      0.95      4928



In [15]:
print('AUPRC:', (average_precision_score(y_test, y_prob[:, 1])))

AUPRC: 0.9915915076789218


In [16]:
cohen_kappa_score(y_test,y_pred)

0.9029927012589785

In [17]:
# Applying Logistic Regression
lr = LogisticRegression()
lr.fit(X_train ,y_train)
y_pred_lr = lr.predict(X_test)

In [18]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test ,y_pred_lr)

In [19]:
cnf_matrix

array([[1129, 1350],
       [ 488, 1961]], dtype=int64)

In [20]:
print("Logistic Regression Model Classification Report:")
print(classification_report(y_test, y_pred_lr))

Logistic Regression Model Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.46      0.55      2479
           1       0.59      0.80      0.68      2449

    accuracy                           0.63      4928
   macro avg       0.65      0.63      0.62      4928
weighted avg       0.65      0.63      0.62      4928



In [21]:
print("Accuracy Score:")
print(accuracy_score(y_test, y_pred_lr))

Accuracy Score:
0.6270292207792207


In [22]:
print('F1 Score:',(f1_score(y_test,y_pred_lr)))

F1 Score: 0.6809027777777779


In [23]:
cohen_kappa_score(y_test,y_pred_lr)

0.2556161644286383