In [3]:
import pandas as pd
import numpy as np

# Read the transaction log into a DataFrame and preview its first five rows.
data = pd.read_csv("Project-II/Fraud.csv")
print(data.head(5))

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [4]:
# Report the dimensions of the frame as it stands when this cell runs.
size, dim = data.shape
print(f"There are {size} online payments and {dim} variables.")

# Drop the 'isFlaggedFraud' column. errors="ignore" keeps the cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
data = data.drop(columns="isFlaggedFraud", errors="ignore")
print(data.head())

There are 6362620 online payments and 11 variables.
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  
0  M1979787155             0.0             0.0        0  
1  M2044282225             0.0             0.0        0  
2   C553264065             0.0             0.0        1  
3    C38997010         21182.0             0.0        1  
4  M1230701703             0.0             0.0        0  


Check for missing values

In [5]:
# Count the missing entries in every column (isna is the canonical name of isnull).
print(data.isna().sum())

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64


In [7]:
# How many transactions there are of each type
print(data["type"].value_counts())

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64


In [8]:
import plotly.express as px

# Count the transactions of each type. The result is named `type_counts`
# rather than `type` so the built-in type() is not shadowed for later cells.
type_counts = data["type"].value_counts()
transactions = type_counts.index
quantity = type_counts.values

# Donut chart of the per-type counts. The labels and sizes are passed directly
# as arrays, so the full transaction frame is not handed to plotly.
figure = px.pie(values=quantity,
                names=transactions,
                hole=0.5,
                title="Distribution of Transaction Type")
figure.show()

ModuleNotFoundError: No module named 'plotly'

In [9]:
# Integer code assigned to each transaction type.
TYPE_CODES = {"CASH_OUT": 1, "PAYMENT": 2,
              "CASH_IN": 3, "TRANSFER": 4,
              "DEBIT": 5}

# Encode only while the column is still non-numeric. Series.map turns every
# value that is not a key of the dict into NaN, so mapping an already-encoded
# column a second time would wipe it out; the guard makes re-running safe.
if not pd.api.types.is_numeric_dtype(data["type"]):
    data["type"] = data["type"].map(TYPE_CODES)
print(data.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  
0  M1979787155             0.0             0.0        0  
1  M2044282225             0.0             0.0        0  
2   C553264065             0.0             0.0        1  
3    C38997010         21182.0             0.0        1  
4  M1230701703             0.0             0.0        0  


In [10]:
# Checking correlation
# The numeric columns are selected by name rather than by position, so the
# selection stays correct if columns are added, removed or reordered upstream.
numeric_columns = ["step", "type", "amount",
                   "oldbalanceOrg", "newbalanceOrig",
                   "oldbalanceDest", "newbalanceDest",
                   "isFraud"]
correlation = data[numeric_columns].corr()

# Correlation of every numeric column with the target, strongest first.
print(correlation["isFraud"].sort_values(ascending=False))

isFraud           1.000000
amount            0.076688
step              0.031578
type              0.016171
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [11]:
# Relabel the target as "No"/"Yes". The numeric guard keeps the cell
# idempotent: Series.map sends values missing from the dict to NaN, so mapping
# the already-relabelled strings again would erase the target column.
if pd.api.types.is_numeric_dtype(data["isFraud"]):
    data["isFraud"] = data["isFraud"].map({0: "No", 1: "Yes"})

### Online payments fraud detection models

In [12]:
from sklearn.model_selection import train_test_split

# Feature matrix: transaction type code, amount, and the sender's balance
# before and after the transaction.
X = np.array(data[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])

# Target as a 1-D array of shape (n_samples,). Selecting with double brackets
# produced an (n_samples, 1) column vector, which makes several scikit-learn
# estimators emit a column-vector DataConversionWarning when fitted on it.
y = data["isFraud"].to_numpy()

# Hold out 10% of the rows for testing.
# NOTE(review): the classes are heavily imbalanced (817 fraud rows in the
# recorded test split); consider passing stratify=y - not applied here because
# it would change the split and therefore every recorded metric.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

### Decision trees

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from sklearn.tree import DecisionTreeClassifier

# Baseline: an unweighted decision tree (fit() returns the fitted estimator).
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
print("Confusion matrix:", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Sensitivity (Recall):", recall_score(y_test, y_pred, pos_label='Yes'))

Confusion matrix: [[635367     78]
 [    90    727]]
Accuracy: 0.9997359578286932
Sensitivity (Recall): 0.8898408812729498


In [14]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    recall_score,
)

# Hand-picked class weights: class 'Yes' gets weight 10, class 'No' weight 1.
class_weights = {'No': 1, 'Yes': 10}

# Other hyper-parameters worth trying: criterion='gini', max_depth=None, min_samples_split=2
# Build the weighted tree and train it in one step (fit() returns the estimator).
clf = DecisionTreeClassifier(class_weight=class_weights, random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report the metrics.
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Sensitivity (Recall):", recall_score(y_test, y_pred, pos_label='Yes'))

[[635373     72]
 [   102    715]]
Accuracy: 0.9997265277511466
Sensitivity (Recall): 0.8751529987760098


In [15]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    recall_score,
)

# Hand-picked class weights: class 'Yes' gets weight 1000, class 'No' weight 1.
class_weights = {'No': 1, 'Yes': 1000}

# Build the weighted tree and train it in one step (fit() returns the estimator).
clf = DecisionTreeClassifier(class_weight=class_weights, random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report the metrics.
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Sensitivity (Recall):", recall_score(y_test, y_pred, pos_label='Yes'))

[[635376     69]
 [    90    727]]
Accuracy: 0.9997501029450132
Sensitivity (Recall): 0.8898408812729498


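Raising the weight of the 'Yes' class from 10 to 1000 improves the sensitivity (0.8752 → 0.8898), which brings it back to the level of the unweighted tree; the accuracy changes only in the fifth decimal place, which is insignificant.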

In [16]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    recall_score,
)

# Let scikit-learn derive the class weights itself via class_weight='balanced',
# then train the tree in the same step (fit() returns the estimator).
clf = DecisionTreeClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report the metrics.
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Sensitivity (Recall):", recall_score(y_test, y_pred, pos_label='Yes'))

[[635379     66]
 [    94    723]]
Accuracy: 0.9997485312654221
Sensitivity (Recall): 0.8849449204406364


### Logistic regression

In [20]:
from sklearn.linear_model import LogisticRegression

# Create a LogisticRegression model
clf = LogisticRegression(random_state=42, max_iter=1000)

# NOTE(review): the features are passed in unscaled. Standardising them
# (e.g. StandardScaler in a pipeline) is the usual practice for this model but
# would change the recorded metrics, so it is left as a follow-up.

# Fit the model. np.ravel hands the estimator a 1-D target, which avoids the
# column-vector DataConversionWarning when y_train has shape (n_samples, 1).
clf.fit(X_train, np.ravel(y_train))

# Make predictions
y_pred = clf.predict(X_test)

# Display results
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Sensitivity (Recall):", recall_score(y_test, y_pred, pos_label='Yes'))

  y = column_or_1d(y, warn=True)


[[635143    302]
 [    13    804]]
Accuracy: 0.9995049209287997
Sensitivity (Recall): 0.9840881272949816


### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForest model. n_jobs=-1 builds the trees on all available
# cores; the recorded single-threaded run was interrupted before it finished.
# With random_state fixed, the fitted forest does not depend on n_jobs.
clf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=4, n_jobs=-1)

# Fit the model. np.ravel hands the estimator a 1-D target, which avoids the
# column-vector DataConversionWarning when y_train has shape (n_samples, 1).
clf.fit(X_train, np.ravel(y_train))

# Make predictions
y_pred = clf.predict(X_test)

# Display results
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Sensitivity (Recall):", recall_score(y_test, y_pred, pos_label='Yes'))

  return fit_method(estimator, *args, **kwargs)


KeyboardInterrupt: 

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with library defaults apart from the seed;
# fit() returns the fitted estimator, so construction and training are chained.
clf = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report the metrics.
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Sensitivity (Recall):", recall_score(y_test, y_pred, pos_label='Yes'))

### Support Vector Machines

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Create a SVM model. SVMs are not scale invariant, so the features are
# standardised first; wrapping both steps in a pipeline fits the scaler on the
# training split only and re-applies it inside predict().
clf = make_pipeline(StandardScaler(), SVC(random_state=42))

# NOTE(review): scikit-learn documents SVC's fit time as scaling at least
# quadratically with the number of samples; on the full training split this
# cell may not finish in practical time - consider fitting on a subsample.

# Fit the model. np.ravel guarantees a 1-D target even when y_train is an
# (n_samples, 1) column vector, which avoids the DataConversionWarning.
clf.fit(X_train, np.ravel(y_train))

# Make predictions
y_pred = clf.predict(X_test)

# Display results
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Sensitivity (Recall):", recall_score(y_test, y_pred, pos_label='Yes'))

### Hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import GridSearchCV


def false_negative_rate(y_true, y_pred):
    """Return the share of actual 'Yes' samples predicted as 'No' (1 - recall)."""
    return 1.0 - recall_score(y_true, y_pred, pos_label="Yes")


# A lower false-negative rate is better, so greater_is_better=False makes the
# scorer negate the value and the grid search effectively minimises it.
fnr_score = make_scorer(false_negative_rate, greater_is_better=False)

# random forests
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=1, random_state=0, n_jobs=4)

# hyperparameter space
rf_param_grid = dict(
    n_estimators=[10, 20, 50, 100, 200, 500, 1000, 2000],
    max_depth=[1, 2, 3, 4],
)

# search
clf = GridSearchCV(rf_model,
                   rf_param_grid,
                   scoring=fnr_score,
                   cv=5)

# Tune on the training split only, so the held-out test rows do not leak into
# model selection; np.ravel guarantees a 1-D target.
search = clf.fit(X_train, np.ravel(y_train))

# best hyperparameters
search.best_params_

In [24]:
from sklearn.model_selection import KFold, cross_validate

# Logistic regression
logit = LogisticRegression(penalty='l2', C=10, random_state=0, max_iter=1000)

# K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Cross-validated evaluation (no hyperparameter search happens here).
# np.ravel passes a 1-D target, which removes the column-vector
# DataConversionWarning that was otherwise emitted once per fold.
clf = cross_validate(logit, X_train, np.ravel(y_train), scoring='accuracy', return_train_score=True, cv=kf)

# Accuracy on the validation fold of each of the 5 splits
clf['test_score']

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.99950754, 0.99913732, 0.99907096, 0.99947349, 0.9995023 ])