In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import warnings

In [2]:
# Suppress specific warning
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.base')

## Data Loading and Exploration

In [3]:
# Load the dataset
df = pd.read_csv("fraud.csv")

In [4]:
# Display the first few rows of the dataset
print(df.head())

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [5]:
# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [6]:
# Explore the distribution of transaction types
print("Transaction Type Distribution:\n", df['type'].value_counts())
type_counts = df["type"].value_counts()
transactions = type_counts.index
quantity = type_counts.values

Transaction Type Distribution:
 type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64


In [7]:
# Visualize the distribution of transaction types
figure = px.pie(df, 
                values=quantity, 
                names=transactions, 
                hole=0.5, 
                title="Distribution of Transaction Type")
figure.show()

## Data Preprocessing

In [8]:
# Map transaction types to numerical values
df["type"] = df["type"].map({
    "CASH_OUT": 1, 
    "PAYMENT": 2, 
    "CASH_IN": 3, 
    "TRANSFER": 4,
    "DEBIT": 5
})

In [9]:
# Ensure only numerical columns are used for correlation analysis
numerical_df = df.select_dtypes(include=[np.number])

In [10]:
# Compute the correlation matrix for numerical columns
correlation = numerical_df.corr()
print("Correlation with 'isFraud':\n", correlation["isFraud"].sort_values(ascending=False))

Correlation with 'isFraud':
 isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
type              0.016171
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [11]:
# Display the first few rows of the modified DataFrame
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Data Splitting

In [12]:
# Split the data into features (X) and target (y)
X = np.array(df[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
y = np.array(df[["isFraud"]])

In [13]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)


## Model Building and Comparison

In [14]:
# Function to train and evaluate models
def evaluate_model(model, model_name):
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    acc = accuracy_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred)
    print(f"Accuracy: {acc:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, pred)}\n")

In [15]:
### Logistic Regression
logreg = LogisticRegression(random_state=7)
evaluate_model(logreg, "Logistic Regression")


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Accuracy: 0.9995
ROC AUC Score: 0.9918
Confusion Matrix:
[[635143    302]
 [    13    804]]



In [16]:
### Decision Tree
dtc = DecisionTreeClassifier(max_leaf_nodes=4, max_features=3, max_depth=15, random_state=7)
evaluate_model(dtc, "Decision Tree")

Accuracy: 0.9989
ROC AUC Score: 0.5747
Confusion Matrix:
[[635444      1]
 [   695    122]]



In [17]:
# Store models in a dictionary
models = {
    "Logistic Regression": logreg,
    "Decision Tree": dtc
}

In [18]:
# Compare models and select the best one based on ROC AUC score
best_model_name = max(models, key=lambda model: roc_auc_score(y_test, models[model].predict_proba(x_test)[:, 1]))
best_model = models[best_model_name]
print(f"\nThe best model based on ROC AUC Score is: {best_model_name}")


The best model based on ROC AUC Score is: Logistic Regression


## Final Fraud Analysis with the Best Model

In [19]:
# Define the mapping for predictions
label_mapping = {0: "No Fraud", 1: "Fraud"}

In [20]:
# Example features
features = np.array([[4, 9000.60, 9000.60, 0.0]])

In [21]:
# Make a prediction
final_prediction = best_model.predict(features)
predicted_label = label_mapping[final_prediction[0]]
print("\nPrediction for the given features:", predicted_label)


Prediction for the given features: Fraud


In [22]:
# Print the accuracy and ROC AUC score of the final model
final_predictions = best_model.predict(x_test)
final_accuracy = accuracy_score(y_test, final_predictions)
final_roc_auc = roc_auc_score(y_test, final_predictions)
print(f"\nFinal Fraud Analysis Results:")
print(f"Final Model: {best_model_name}")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"ROC AUC Score: {final_roc_auc:.4f}")


Final Fraud Analysis Results:
Final Model: Logistic Regression
Accuracy: 0.9995
ROC AUC Score: 0.9918
