# Importing General Libraries

In [2]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(style="darkgrid",font_scale=1.5)
pd.set_option("display.max.columns",None)
pd.set_option("display.max.rows",None)
# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)  # Suppress user warnings
warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress future warnings

# Suppress specific warnings for LGBMClassifier and CatBoostClassifier
import logging
logging.getLogger("catboost").setLevel(logging.ERROR)  # Suppress CatBoost logs
logging.getLogger("lightgbm").setLevel(logging.ERROR)  # Suppress LightGBM logs

# Importing libraries for ML and performance metrices

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import label_binarize


# Loading dataset and doing the basic EDA

In [4]:
df = pd.read_csv('Fraud.csv')

In [None]:
df.head() #For checking the data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [None]:
df.shape #For checking the shape of the data

(6362620, 11)

In [None]:
df.columns #For checking the columns of the data

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [None]:
df.info() #For checking the data types of the columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [None]:
df.isnull().sum().to_frame().rename(columns={0:"Total No. of Missing Values"}) #For checking the missing values in the data

Unnamed: 0,Total No. of Missing Values
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [None]:
print("Duplicate Values =",df.duplicated().sum()) #For checking the duplicate values in the data

Duplicate Values = 0


In [None]:
numeric_data = df.select_dtypes(include=[np.number]) #For cheking the numeric data in the data
numeric_data.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,181.0,181.0,0.0,0.0,0.0,1,0
3,1,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [None]:
categorical_data = df.select_dtypes(exclude=[np.number]) #For checking the categorical data in the data
categorical_data.head()

Unnamed: 0,type,nameOrig,nameDest
0,PAYMENT,C1231006815,M1979787155
1,PAYMENT,C1666544295,M2044282225
2,TRANSFER,C1305486145,C553264065
3,CASH_OUT,C840083671,C38997010
4,PAYMENT,C2048537720,M1230701703


In [None]:
numeric_data.corr() #For checking the correlation between the numeric data

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
step,1.0,0.022373,-0.010058,-0.010299,0.027665,0.025888,0.031578,0.003277
amount,0.022373,1.0,-0.002762,-0.007861,0.294137,0.459304,0.076688,0.012295
oldbalanceOrg,-0.010058,-0.002762,1.0,0.998803,0.066243,0.042029,0.010154,0.003835
newbalanceOrig,-0.010299,-0.007861,0.998803,1.0,0.067812,0.041837,-0.008148,0.003776
oldbalanceDest,0.027665,0.294137,0.066243,0.067812,1.0,0.976569,-0.005885,-0.000513
newbalanceDest,0.025888,0.459304,0.042029,0.041837,0.976569,1.0,0.000535,-0.000529
isFraud,0.031578,0.076688,0.010154,-0.008148,-0.005885,0.000535,1.0,0.044109
isFlaggedFraud,0.003277,0.012295,0.003835,0.003776,-0.000513,-0.000529,0.044109,1.0


In [None]:
fig = px.imshow(numeric_data.corr(),text_auto=True,aspect="auto") #For checking the correlation in graphical form
fig.show()

In [None]:
df['type'].value_counts(ascending=False) #For checking the value counts of the transaction type

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [None]:
fraud_count = df['isFraud'].value_counts() #For checking the value counts of the fraud and non-fraud transactions
fraud_count

isFraud
0    6354407
1       8213
Name: count, dtype: int64

In [None]:
fraud_percentage = df['isFraud'].value_counts(normalize=True) * 100 #For checking the percentage of fraud and non-fraud transactions
fraud_percentage

isFraud
0    99.870918
1     0.129082
Name: proportion, dtype: float64

**This is a classic case of class imbalance**

# Feature scaling and modeling

In [None]:
# Encoding the categorical variables
# Using Label Encoding for categorical variables
encoder = {}
for i in df.select_dtypes('object').columns:
    encoder[i] = LabelEncoder()
    df[i] = encoder[i].fit_transform(df[i])

In [19]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,3,9839.64,757869,170136.0,160296.36,1662094,0.0,0.0,0,0
1,1,3,1864.28,2188998,21249.0,19384.72,1733924,0.0,0.0,0,0
2,1,4,181.0,1002156,181.0,0.0,439685,0.0,0.0,1,0
3,1,1,181.0,5828262,181.0,0.0,391696,21182.0,0.0,1,0
4,1,3,11668.14,3445981,41554.0,29885.86,828919,0.0,0.0,0,0


In [20]:
x = df.drop(['isFraud'], axis = 1)
y = df['isFraud']

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42,stratify=y)

In [23]:
print("x_train - >  ",x_train.shape)
print("y_train - >  ",y_train.shape)
print("x_test  - >  ",x_test.shape)
print("y_test  - >  ",y_test.shape)

x_train - >   (5090096, 10)
y_train - >   (5090096,)
x_test  - >   (1272524, 10)
y_test  - >   (1272524,)


In [24]:
pt = PowerTransformer(method='yeo-johnson')

In [25]:
x_train_scaled = pt.fit_transform(x_train)
x_test_scaled = pt.transform(x_test)

In [26]:
train_accuracy_scores = []
train_precision_scores = []
train_recall_scores = []
train_f1_scores = []

test_accuracy_scores = []
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []

In [27]:
def evaluate_classification_performance(model, x_train, y_train, x_test, y_test, score_append=False):
    """
    Evaluates Accuracy, Precision, Recall, F1-score, AUC, and Confusion Matrix for a given classification model 
    on training and testing data using Plotly for visualizations.
    
    Parameters:
    - model: The machine learning model to evaluate
    - x_train: Training feature set
    - y_train: Training target values
    - x_test: Testing feature set
    - y_test: Testing target values
    """

    # Fit the model
    model.fit(x_train, y_train)

    # Predictions for training and testing data
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Calculate metrics for training data
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred, average='macro', zero_division=0)
    train_recall = recall_score(y_train, y_train_pred, average='macro', zero_division=0)
    train_f1 = f1_score(y_train, y_train_pred, average='macro', zero_division=0)

    # Calculate metrics for testing data
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='macro', zero_division=0)
    test_recall = recall_score(y_test, y_test_pred, average='macro', zero_division=0)
    test_f1 = f1_score(y_test, y_test_pred, average='macro', zero_division=0)

    # AUC (Area Under Curve) - for binary or multiclass
    if len(np.unique(y_train)) == 2:  # Binary Classification
        train_auc = roc_auc_score(y_train, model.predict_proba(x_train)[:, 1])
        test_auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
    else:  # Multiclass Classification
        train_auc = roc_auc_score(label_binarize(y_train, classes=np.unique(y_train)), 
                                  model.predict_proba(x_train), average='macro', multi_class='ovr')
        test_auc = roc_auc_score(label_binarize(y_test, classes=np.unique(y_test)), 
                                 model.predict_proba(x_test), average='macro', multi_class='ovr')

    # Append scores to respective lists
    if score_append == True:
        train_accuracy_scores.append(train_accuracy)
        train_precision_scores.append(train_precision)
        train_recall_scores.append(train_recall)
        train_f1_scores.append(train_f1)
        
        test_accuracy_scores.append(test_accuracy)
        test_precision_scores.append(test_precision)
        test_recall_scores.append(test_recall)
        test_f1_scores.append(test_f1)
    else:
        pass
        
    # Confusion Matrix for Training and Testing Data
    train_cm = confusion_matrix(y_train, y_train_pred)
    test_cm = confusion_matrix(y_test, y_test_pred)

    # Function to plot confusion matrix using Plotly
    def plot_confusion_matrix(cm, title):
        labels = [f"Class {i}" for i in range(len(cm))]
        fig = ff.create_annotated_heatmap(
            z=cm,
            x=labels,
            y=labels,
            colorscale="Blues",
            showscale=True,
            reversescale=False
        )
        fig.update_layout(
            title_text=title,
            xaxis=dict(title='Predicted Labels'),
            yaxis=dict(title='True Labels')
        )
        fig.show()

    print(f"{model.__class__.__name__} Performance Metrics:")
    print(f"Training Data: Accuracy = {train_accuracy:.2f}, Precision = {train_precision:.2f}, Recall = {train_recall:.2f}, F1-score = {train_f1:.2f}, AUC = {train_auc:.2f}")
    print(f"Testing Data : Accuracy = {test_accuracy:.2f}, Precision = {test_precision:.2f}, Recall = {test_recall:.2f}, F1-score = {test_f1:.2f}, AUC = {test_auc:.2f}\n")

    # Display Confusion Matrices
    plot_confusion_matrix(train_cm, title='Training Confusion Matrix')
    plot_confusion_matrix(test_cm, title='Testing Confusion Matrix')

In [28]:
evaluate_classification_performance(
    model=LogisticRegression(n_jobs=-1),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

LogisticRegression Performance Metrics:
Training Data: Accuracy = 1.00, Precision = 0.97, Recall = 0.74, F1-score = 0.82, AUC = 0.98
Testing Data : Accuracy = 1.00, Precision = 0.98, Recall = 0.75, F1-score = 0.82, AUC = 0.98



In [29]:
evaluate_classification_performance(
    model=DecisionTreeClassifier(max_depth=10),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

DecisionTreeClassifier Performance Metrics:
Training Data: Accuracy = 1.00, Precision = 0.98, Recall = 0.88, F1-score = 0.93, AUC = 0.98
Testing Data : Accuracy = 1.00, Precision = 0.97, Recall = 0.88, F1-score = 0.92, AUC = 0.98



In [30]:
evaluate_classification_performance(
    model=RandomForestClassifier(n_estimators=10,max_depth=10,n_jobs=-1),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

RandomForestClassifier Performance Metrics:
Training Data: Accuracy = 1.00, Precision = 1.00, Recall = 0.87, F1-score = 0.92, AUC = 1.00
Testing Data : Accuracy = 1.00, Precision = 1.00, Recall = 0.86, F1-score = 0.92, AUC = 1.00



In [31]:
evaluate_classification_performance(
    model=GaussianNB(),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

GaussianNB Performance Metrics:
Training Data: Accuracy = 1.00, Precision = 1.00, Recall = 0.50, F1-score = 0.50, AUC = 0.97
Testing Data : Accuracy = 1.00, Precision = 0.50, Recall = 0.50, F1-score = 0.50, AUC = 0.97



In [32]:
evaluate_classification_performance(
    model=XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    use_label_encoder=False,  # avoids warning for newer versions
    eval_metric='logloss',    # required to suppress warnings
    random_state=42
),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

XGBClassifier Performance Metrics:
Training Data: Accuracy = 1.00, Precision = 0.99, Recall = 0.86, F1-score = 0.91, AUC = 1.00
Testing Data : Accuracy = 1.00, Precision = 0.99, Recall = 0.86, F1-score = 0.92, AUC = 1.00



In [33]:
models = [
    "LogisticRegression",
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "GaussianNB",
    "XGBClassifier"
    ]

In [34]:
df_model = pd.DataFrame(
        {"Algorithms":models,
         "Training Accuracy":train_accuracy_scores,
         "Training Precision":train_precision_scores,
         "Training Recall":train_recall_scores,
         "Training F1 Score":train_f1_scores,
         
         "Testing Accuracy":test_accuracy_scores,
         "Testing Precision":test_precision_scores,
         "Testing Recall":test_recall_scores,
         "Testing F1 Score":test_f1_scores,
        })
				   
df_model_sort = df_model.sort_values(by="Testing F1 Score", ascending=False)
df_model_sort

Unnamed: 0,Algorithms,Training Accuracy,Training Precision,Training Recall,Training F1 Score,Testing Accuracy,Testing Precision,Testing Recall,Testing F1 Score
1,DecisionTreeClassifier,0.99967,0.984772,0.884231,0.928671,0.999632,0.974593,0.877637,0.920586
2,RandomForestClassifier,0.999647,0.995806,0.866054,0.921139,0.999641,0.995272,0.864268,0.919763
4,XGBClassifier,0.99962,0.991476,0.85882,0.914775,0.999621,0.989103,0.861219,0.915521
0,LogisticRegression,0.999298,0.971676,0.742523,0.820255,0.999316,0.977267,0.746485,0.824999
3,GaussianNB,0.998712,0.999356,0.501218,0.502107,0.998709,0.499354,0.5,0.499677


In [35]:
# Create DataFrame
data = {
    "Algorithms": models,
    "Training Recall": train_recall_scores,
    "Testing Recall": test_recall_scores
}
df_model_sort = pd.DataFrame(data)

# Sort by Testing F1 Score
df_model_sort = df_model_sort.sort_values(by="Testing Recall", ascending=False)

# Create the bar chart
fig = go.Figure()

# Add bars for Training F1 Score
fig.add_trace(
    go.Bar(
        x=df_model_sort["Algorithms"],
        y=df_model_sort["Training Recall"],
        name="Training Recall",
        text=df_model_sort["Training Recall"],
        texttemplate='%{text:.2f}',
        textposition='outside',
        marker_color='rgb(102,194,165)'
    )
)

# Add bars for Testing F1 Score
fig.add_trace(
    go.Bar(
        x=df_model_sort["Algorithms"],
        y=df_model_sort["Testing Recall"],
        name="Testing Recall",
        text=df_model_sort["Testing Recall"],
        texttemplate='%{text:.2f}',
        textposition='outside',
        marker_color='rgb(252,141,98)'
    )
)

# Customize the layout
fig.update_layout(
    title={
        'text': "Performance Visualization of Different Models",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis_title="Algorithms",
    yaxis_title="Scores",
    xaxis_tickangle=-90,
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1,
    font=dict(size=14),
    height=600,
    width=1000
)

# Show the plot
fig.show()


# Feature scaling and modeling after applying oversampling 

In [36]:
from imblearn.over_sampling import SMOTE

In [37]:
over_sample = SMOTE(random_state=42)

In [38]:
x_over_sample,y_over_sample = over_sample.fit_resample(x,y)

In [39]:
y_over_sample.value_counts()

isFraud
0    6354407
1    6354407
Name: count, dtype: int64

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x_over_sample, y_over_sample, test_size=0.2, random_state=42)

In [41]:
print("x_train - >  ",x_train.shape)
print("y_train - >  ",y_train.shape)
print("x_test  - >  ",x_test.shape)
print("y_test  - >  ",y_test.shape)

x_train - >   (10167051, 10)
y_train - >   (10167051,)
x_test  - >   (2541763, 10)
y_test  - >   (2541763,)


In [42]:
pt = PowerTransformer(method='yeo-johnson')
x_train_scaled = pt.fit_transform(x_train)
x_test_scaled = pt.transform(x_test)

In [43]:
train_accuracy_scores = []
train_precision_scores = []
train_recall_scores = []
train_f1_scores = []

test_accuracy_scores = []
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []

In [44]:
evaluate_classification_performance(
    model=LogisticRegression(n_jobs=-1),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

LogisticRegression Performance Metrics:
Training Data: Accuracy = 0.95, Precision = 0.95, Recall = 0.95, F1-score = 0.95, AUC = 0.99
Testing Data : Accuracy = 0.95, Precision = 0.95, Recall = 0.95, F1-score = 0.95, AUC = 0.99



In [45]:
evaluate_classification_performance(
    model=DecisionTreeClassifier(max_depth=10),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

DecisionTreeClassifier Performance Metrics:
Training Data: Accuracy = 0.99, Precision = 0.99, Recall = 0.99, F1-score = 0.99, AUC = 1.00
Testing Data : Accuracy = 0.99, Precision = 0.99, Recall = 0.99, F1-score = 0.99, AUC = 1.00



In [46]:
evaluate_classification_performance(
    model=RandomForestClassifier(n_estimators=10,max_depth=10,n_jobs=-1),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

RandomForestClassifier Performance Metrics:
Training Data: Accuracy = 0.99, Precision = 0.99, Recall = 0.99, F1-score = 0.99, AUC = 1.00
Testing Data : Accuracy = 0.99, Precision = 0.99, Recall = 0.99, F1-score = 0.99, AUC = 1.00



In [47]:
evaluate_classification_performance(
    model=GaussianNB(),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

GaussianNB Performance Metrics:
Training Data: Accuracy = 0.50, Precision = 0.75, Recall = 0.50, F1-score = 0.33, AUC = 0.97
Testing Data : Accuracy = 0.50, Precision = 0.75, Recall = 0.50, F1-score = 0.33, AUC = 0.97



In [48]:
evaluate_classification_performance(
    model=XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    use_label_encoder=False,  # avoids warning for newer versions
    eval_metric='logloss',    # required to suppress warnings
    random_state=42
),
    x_train=x_train_scaled,
    y_train=y_train,
    x_test=x_test_scaled,
    y_test=y_test,
    score_append = True
)

XGBClassifier Performance Metrics:
Training Data: Accuracy = 0.99, Precision = 0.99, Recall = 0.99, F1-score = 0.99, AUC = 1.00
Testing Data : Accuracy = 0.99, Precision = 0.99, Recall = 0.99, F1-score = 0.99, AUC = 1.00



In [49]:
models = [
    "LogisticRegression",
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "GaussianNB",
    "XGBClassifier"
    ]

In [50]:
df_model = pd.DataFrame(
        {"Algorithms":models,
         "Training Accuracy":train_accuracy_scores,
         "Training Precision":train_precision_scores,
         "Training Recall":train_recall_scores,
         "Training F1 Score":train_f1_scores,
         
         "Testing Accuracy":test_accuracy_scores,
         "Testing Precision":test_precision_scores,
         "Testing Recall":test_recall_scores,
         "Testing F1 Score":test_f1_scores,
        })
				   
df_model_sort = df_model.sort_values(by="Testing F1 Score", ascending=False)
df_model_sort

Unnamed: 0,Algorithms,Training Accuracy,Training Precision,Training Recall,Training F1 Score,Testing Accuracy,Testing Precision,Testing Recall,Testing F1 Score
1,DecisionTreeClassifier,0.994703,0.994722,0.994703,0.994703,0.994599,0.994619,0.994599,0.994599
2,RandomForestClassifier,0.991086,0.991157,0.991086,0.991085,0.991002,0.991075,0.991002,0.991002
4,XGBClassifier,0.990209,0.990295,0.990209,0.990208,0.990145,0.990233,0.990145,0.990145
0,LogisticRegression,0.951874,0.952012,0.951874,0.95187,0.951879,0.95202,0.951878,0.951875
3,GaussianNB,0.500547,0.750138,0.500543,0.33454,0.500534,0.750129,0.500551,0.334549


In [51]:
# Create DataFrame
data = {
    "Algorithms": models,
    "Training Recall": train_recall_scores,
    "Testing Recall": test_recall_scores
}
df_model_sort = pd.DataFrame(data)

# Sort by Testing F1 Score
df_model_sort = df_model_sort.sort_values(by="Testing Recall", ascending=False)

# Create the bar chart
fig = go.Figure()

# Add bars for Training F1 Score
fig.add_trace(
    go.Bar(
        x=df_model_sort["Algorithms"],
        y=df_model_sort["Training Recall"],
        name="Training Recall",
        text=df_model_sort["Training Recall"],
        texttemplate='%{text:.2f}',
        textposition='outside',
        marker_color='rgb(102,194,165)'
    )
)

# Add bars for Testing F1 Score
fig.add_trace(
    go.Bar(
        x=df_model_sort["Algorithms"],
        y=df_model_sort["Testing Recall"],
        name="Testing Recall",
        text=df_model_sort["Testing Recall"],
        texttemplate='%{text:.2f}',
        textposition='outside',
        marker_color='rgb(252,141,98)'
    )
)

# Customize the layout
fig.update_layout(
    title={
        'text': "Performance Visualization of Different Models",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis_title="Algorithms",
    yaxis_title="Scores",
    xaxis_tickangle=-90,
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1,
    font=dict(size=14),
    height=600,
    width=1000
)

# Show the plot
fig.show()


In a fraudulent transactions classification problem, **Type II errors (false negatives)** are typically more dangerous than **Type I errors (false positives)**. Here's why and the related performance metrics to focus on:

### **Type II Error (False Negative)**:
- A Type II error occurs when a fraudulent transaction is incorrectly classified as legitimate.
- This is dangerous because it allows fraudulent activity to go undetected, potentially leading to significant financial losses, reputational damage, and loss of customer trust.

### **Type I Error (False Positive)**:
- A Type I error occurs when a legitimate transaction is incorrectly classified as fraudulent.
- While this can be inconvenient for customers and lead to temporary disruptions (e.g., declined transactions), it is generally less harmful than missing actual fraud.

---

### **Performance Metrics to Focus On**:
1. **Recall (Sensitivity or True Positive Rate)**:
   - **Definition**: Proportion of actual fraudulent transactions correctly identified.
   - **Reason to focus**: A high recall ensures that most fraudulent transactions are detected, minimizing Type II errors.
   
2. **Precision**:
   - **Definition**: Proportion of transactions classified as fraudulent that are actually fraudulent.
   - **Reason to focus**: A high precision ensures that the flagged transactions are genuinely fraudulent, reducing unnecessary disruptions.
   
3. **F1 Score**:
   - **Definition**: Harmonic mean of precision and recall.
   - **Reason to focus**: The F1 score balances precision and recall, which is crucial if you want to avoid both false negatives and false positives but prioritize catching fraud.

---

### **Additional Considerations**:
- **Threshold Tuning**:
  - Adjust the decision threshold to achieve an acceptable balance between precision and recall, based on business requirements.
- **Confusion Matrix Analysis**:
  - Regularly analyze the confusion matrix to understand the trade-offs and misclassifications.
- **Domain-Specific Costs**:
  - Use a cost matrix if the financial or reputational cost of false negatives is quantifiable.

By focusing on **recall**, **F1 score**, and potentially using **cost-sensitive metrics**, we can prioritize reducing false negatives while keeping false positives at manageable levels.

Based on the performance metrics provided, here’s an analysis of the algorithms for fraud detection:

---

### **1. Decision Tree Classifier**
- **Training and Testing Metrics**: 
  - **Accuracy**: ~99.47% (Training) and ~99.46% (Testing)
  - **Precision, Recall, and F1-Score**: Consistently high (~99.46% to ~99.47%)
- **Strengths**:
  - The model fits well with very minimal difference between training and testing performance (low overfitting).
  - High precision indicates that it effectively minimizes false positives.
  - High recall shows it captures most fraudulent transactions.
- **Recommendation**: Strong contender, but evaluate computational cost and scalability compared to Random Forest.

---

### **2. Random Forest Classifier**
- **Training and Testing Metrics**:
  - **Accuracy**: ~99.02% (Training) and ~99.01% (Testing)
  - **Precision, Recall, and F1-Score**: ~99.01%
- **Strengths**:
  - High accuracy and stability across both training and testing data.
  - Typically more robust than Decision Trees due to ensemble voting.
- **Considerations**:
  - Slightly lower accuracy than Decision Tree but usually more resistant to overfitting and noise.
- **Recommendation**: Excellent choice, especially if you seek interpretability through feature importance.

---

### **3. Logistic Regression**
- **Training and Testing Metrics**:
  - **Accuracy**: ~95.18% (Training and Testing)
  - **Precision, Recall, and F1-Score**: ~95.18%
- **Strengths**:
  - Consistent performance across training and testing data.
  - Simple and interpretable model.
- **Weaknesses**:
  - Lower accuracy compared to Decision Tree and Random Forest.
  - May struggle to capture complex patterns in highly imbalanced datasets.
- **Recommendation**: Good baseline model, but likely insufficient for fraud detection unless combined with advanced balancing techniques (e.g., SMOTE).

---

### **4. Gaussian Naive Bayes**
- **Training and Testing Metrics**:
  - **Accuracy**: ~50.05% (Training) and ~50.05% (Testing)
  - **Precision**: ~75.01%, but **Recall**: ~50.05%, leading to poor F1-Score (~33.45%).
- **Strengths**:
  - High precision indicates it rarely misclassifies non-fraudulent transactions as fraudulent.
- **Weaknesses**:
  - Poor recall means it misses a large number of actual fraudulent cases.
  - Accuracy close to 50% suggests it's no better than random guessing for imbalanced datasets.
- **Recommendation**: Not suitable for fraud detection without significant improvement or feature engineering.

---

### **5. XGBClassifier**
- **Training and Testing Metrics**:
  - **Accuracy**: ~99% (Training) and ~99% (Testing)
  - **Precision, Recall, and F1-Score**: ~99%
- **Strengths**:
  - Exceptionally strong performance across all metrics on both training and testing data suggests the model is highly effective at distinguishing between fraudulent and non-fraudulent transactions.
  - High recall indicates the model is successfully identifying nearly all fraudulent cases — a critical aspect of fraud detection
  - Near-perfect AUC implies the model is excellent at ranking predictions by their likelihood of being fraudulent.
- **Weaknesses**:
  - The consistency between training and testing metrics is promising, but such high scores may still warrant a check for overfitting, especially if the test set is small or not fully representative.
  - Real-world deployment may introduce concept drift or new fraud patterns — continual monitoring is essential.
- **Recommendation**: XGBClassifier demonstrates state-of-the-art performance on this dataset and is a strong candidate for production deployment in fraud detection systems. However, ongoing validation with real-world data and periodic retraining are recommended to maintain this level of performance over time.



---

### **Comparison Summary**
| **Model**                | **Strengths**                                 | **Weaknesses**                                | **Recommendation**               |
|---------------------------|-----------------------------------------------|-----------------------------------------------|-----------------------------------|
| **Decision Tree**         | High precision and recall; low overfitting.  | Slightly more prone to overfitting compared to Random Forest. | Strong choice for balanced performance. |
| **Random Forest**         | Robust, accurate, and scalable.              | Computationally more expensive.               | Ideal for fraud detection.        |
| **Logistic Regression**   | Simple, interpretable, and consistent.       | Struggles with complex patterns in imbalanced data. | Good as a baseline model.         |
| **Gaussian Naive Bayes**  | High precision for non-fraudulent cases.     | Poor recall; ineffective for imbalanced data. | Not recommended.                  |
| **XGBClassifier**         | High precision and recall  | Real-world deployment may introduce concept drift or new fraud patterns | Strong candidate for production deployment in fraud detection systems |

---

### **Recommendation for Fraud Detection**
1. **Primary Choice**: XGBClassifier
   - High precision and recall.
   - Successfully identifying all fraudulent cases.

2. **Secondary Choice**: Random Forest Classifier 
   - Best trade-off between robustness and performance.
   - Offers feature importance insights to understand key fraud indicators.