In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.pipeline import Pipeline

In [2]:
def transform_df(df):
    # Define a function to categorize countries
    def categorize_country(country):
        if country == 'USA':
            return 'USA'
        elif country == 'CANADA':
            return 'CANADA'
        elif country == 'GERMANY':
            return 'GERMANY'
        else:
            return 'OTHERS'

    # Drop rows with null values
    df.dropna(inplace=True)

    # Apply country categorization to 'Sender_Country' and 'Bene_Country' columns
    df['Sender_Country'] = df['Sender_Country'].apply(categorize_country)
    df['Bene_Country'] = df['Bene_Country'].apply(categorize_country)

    # Extract sender type and bene type from respective IDs
    df["Sender_Type"] = df["Sender_Id"].apply(lambda sender_id: "-".join(sender_id.split("-")[:-1]) if "-" in sender_id else sender_id)
    df["Bene_Type"] = df["Bene_Id"].apply(lambda sender_id: "-".join(sender_id.split("-")[:-1]) if "-" in sender_id else sender_id)

    # Split 'Time_step' into 'Date' and 'Time', then convert 'Time' to seconds
    df['Date'] = df['Time_step'].str.split(" ").str[0]
    df['Time'] = df['Time_step'].str.split(" ").str[1]
    df['Time'] = df['Time'].apply(lambda x: int(x.split(":")[0]) * 3600 + int(x.split(":")[1]) * 60 + int(x.split(":")[2]))

    # Extract 'Year', 'Month', and 'Day' from 'Date'
    df[['Year', 'Month', 'Day']] = df['Date'].str.split('-', expand=True)

    # Drop unnecessary columns
    df.drop(['Transaction_Id','Time_step','Sender_Id','Sender_Account','Sender_lob','Bene_Id','Bene_Account','Date'], axis=1, inplace=True)

    return df

In [3]:
def scaling_df(df):

    # Initialize StandardScaler
    scaler_standard = StandardScaler()

    # Fit and transform the data
    df['Time_Scaled_Standard'] = scaler_standard.fit_transform(df[['Time']])
    with open('scaler_standard_Time.pkl', 'wb') as f:
        pickle.dump(scaler_standard, f)
    
    # Initialize MinMaxScaler
    scaler_minmax = MinMaxScaler()

    # Fit and transform the data
    df['Year_MinMax'] = scaler_minmax.fit_transform(df[['Year']])
    with open('scaler_minmax_Year.pkl', 'wb') as f:
        pickle.dump(scaler_minmax, f)

    df['Month_MinMax'] = scaler_minmax.fit_transform(df[['Month']])
    with open('scaler_minmax_Month.pkl', 'wb') as f:
        pickle.dump(scaler_minmax, f)
    
    df['Day_MinMax'] = scaler_minmax.fit_transform(df[['Day']])
    with open('scaler_minmax_Day.pkl', 'wb') as f:
        pickle.dump(scaler_minmax, f)

    df.drop(['Time','Year', 'Month', 'Day'],axis=1,inplace=True)

    return(df)

In [4]:
def encode_df(df):

    # Use one-hot encoding for categorical columns
    df = pd.get_dummies(df, columns=['Sender_Country', 'Bene_Country', 'Transaction_Type', 'Sender_Type', 'Bene_Type'], dtype=int)
    
    return df

In [5]:
def balance_df(df):
    # Assuming your data is in a DataFrame called 'data'
    X = df.drop('Label', axis=1)
    y = df['Label']

    # Using SMOTE to oversample the minority class
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Creating a new balanced DataFrame
    df = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled, columns=['Label'])], axis=1)

    return df

In [6]:
# Read the CSV file into a DataFrame, specifying the file path using a raw string literal to handle backslashes
data = pd.read_csv(r"C:\Users\gagan\Downloads\Data\Winter 23-24\Capstone\Dataset\fraud_payment_data.csv")

In [7]:
# Apply the 'transform_df' function to preprocess the data by performing various transformations (explained in the comments of the 'transform_df' function)
# Then, apply the 'scaling_df' function to scale the transformed data (assuming 'scaling_df' is a function defined elsewhere)
# Finally, apply the 'encode_df' function to perform one-hot encoding on the preprocessed and scaled data (assuming 'encode_df' is a function defined elsewhere)
data = encode_df(scaling_df(transform_df(data)))

# Unsupervised Clustering

In [9]:
# Split the data into features (X) and target variable (y)
X = data.drop('Label', axis=1)  # Features
y = data['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    'Isolation Forest': IsolationForest(random_state=42),
    'Local Outlier Factor': LocalOutlierFactor(novelty=True),
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Convert outlier predictions to 0s and 1s
    y_pred_train[y_pred_train == 1] = 0  # Inliers
    y_pred_train[y_pred_train == -1] = 1  # Outliers
    y_pred_test[y_pred_test == 1] = 0  # Inliers
    y_pred_test[y_pred_test == -1] = 1  # Outliers
    
    print(f"Model: {name}")
    print("Training Classification Report:")
    print(classification_report(y_train, y_pred_train))
    print("Testing Classification Report:")
    print(classification_report(y_test, y_pred_test))
    print("------------------------------------")

Model: Isolation Forest
Training Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.67      0.79    833455
           1       0.02      0.38      0.04     17263

    accuracy                           0.66    850718
   macro avg       0.50      0.52      0.42    850718
weighted avg       0.96      0.66      0.78    850718

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.67      0.79    208381
           1       0.02      0.40      0.05      4299

    accuracy                           0.66    212680
   macro avg       0.50      0.53      0.42    212680
weighted avg       0.96      0.66      0.78    212680

------------------------------------




Model: Local Outlier Factor
Training Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98    833455
           1       0.03      0.02      0.03     17263

    accuracy                           0.97    850718
   macro avg       0.51      0.50      0.51    850718
weighted avg       0.96      0.97      0.96    850718

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    208381
           1       0.03      0.03      0.03      4299

    accuracy                           0.96    212680
   macro avg       0.51      0.51      0.51    212680
weighted avg       0.96      0.96      0.96    212680

------------------------------------


In [10]:
# Split the data into features (X) and target variable (y)
X = data.drop('Label', axis=1)  # Features
y = data['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the autoencoder model
input_dim = X_train.shape[1]
encoding_dim = 32  # You can adjust this as needed

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
decoder = Dense(input_dim, activation='sigmoid')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder model
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=64, shuffle=True, validation_split=0.2, callbacks=[early_stopping])

# Use the trained autoencoder to reconstruct the data
X_train_pred = autoencoder.predict(X_train)
X_test_pred = autoencoder.predict(X_test)

# Calculate reconstruction errors
train_mse = tf.reduce_mean(tf.square(X_train - X_train_pred), axis=1)
test_mse = tf.reduce_mean(tf.square(X_test - X_test_pred), axis=1)

# Determine the threshold for anomaly detection (e.g., based on the 95th percentile of training errors)
threshold = np.percentile(train_mse, 95)

# Predict anomalies based on the threshold
y_pred_train = (train_mse > threshold).numpy().astype(int)
y_pred_test = (test_mse > threshold).numpy().astype(int)

# Generate classification report
print("Training Classification Report:")
print(classification_report(y_train, y_pred_train))

print("Testing Classification Report:")
print(classification_report(y_test, y_pred_test))

Epoch 1/50
[1m10634/10634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 572us/step - loss: 29910618.0000 - val_loss: 29683346.0000
Epoch 2/50
[1m10634/10634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 551us/step - loss: 29919322.0000 - val_loss: 29683346.0000
Epoch 3/50
[1m10634/10634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 532us/step - loss: 29938606.0000 - val_loss: 29683346.0000
Epoch 4/50
[1m10634/10634[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 522us/step - loss: 29822668.0000 - val_loss: 29683346.0000
[1m26585/26585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 346us/step
[1m6647/6647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 345us/step
Training Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96    833455
           1       0.02      0.05      0.03     17263

    accuracy                           0.93    850718
   macro avg       0.50      0

# Supervised Classification

In [11]:
# Apply the 'balance_df' function to balance the data (explained in the comments of the 'balance_df' function)
data = balance_df(data)

In [12]:
# Split the data into features (X) and target variable (y)
X = data.drop('Label', axis=1)  # Features
y = data['Label']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Decision Tree (Regularized)

In [13]:
# Initialize and train decision tree model with regularization
dt_model_regularized = DecisionTreeClassifier(max_depth=8, min_samples_split=5, min_samples_leaf=2, random_state=42)
dt_model_regularized.fit(X_train, y_train)
y_pred_train_regularized = dt_model_regularized.predict(X_train)
y_pred_test_regularized = dt_model_regularized.predict(X_test)

# pickle.dump(dt_model_regularized, 'model.pkl')

# Classification report and accuracy for training data with regularization
print("Training Data Classification Report (Regularized Decision Tree):")
print(classification_report(y_train, y_pred_train_regularized))
print("Training Data Accuracy Score (Regularized Decision Tree):", accuracy_score(y_train, y_pred_train_regularized))
print("------------------------------------")

# Classification report and accuracy for test data with regularization
print("Test Data Classification Report (Regularized Decision Tree):")
print(classification_report(y_test, y_pred_test_regularized))
print("Test Data Accuracy Score (Regularized Decision Tree):", accuracy_score(y_test, y_pred_test_regularized))
print("------------------------------------")


Training Data Classification Report (Regularized Decision Tree):
              precision    recall  f1-score   support

           0       0.92      0.99      0.96    729323
           1       0.99      0.92      0.95    729247

    accuracy                           0.95   1458570
   macro avg       0.96      0.95      0.95   1458570
weighted avg       0.96      0.95      0.95   1458570

Training Data Accuracy Score (Regularized Decision Tree): 0.9548256168713192
------------------------------------
Test Data Classification Report (Regularized Decision Tree):
              precision    recall  f1-score   support

           0       0.92      0.99      0.96    312513
           1       0.99      0.92      0.95    312589

    accuracy                           0.95    625102
   macro avg       0.96      0.95      0.95    625102
weighted avg       0.96      0.95      0.95    625102

Test Data Accuracy Score (Regularized Decision Tree): 0.9547657822243409
---------------------------------

# AdaBoost

In [14]:
# Initialize and train AdaBoost model
ab_model = AdaBoostClassifier(random_state=42)
ab_model.fit(X_train, y_train)
y_pred_ab = ab_model.predict(X_test)

# Classification report and accuracy for training data
print("AdaBoost Training Data Classification Report:")
print(classification_report(y_train, ab_model.predict(X_train)))
print("AdaBoost Training Data Accuracy Score:", accuracy_score(y_train, ab_model.predict(X_train)))
print("------------------------------------")

# Classification report and accuracy for test data
print("AdaBoost Test Data Classification Report:")
print(classification_report(y_test, y_pred_ab))
print("AdaBoost Test Data Accuracy Score:", accuracy_score(y_test, y_pred_ab))
print("------------------------------------")



AdaBoost Training Data Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    729323
           1       1.00      0.94      0.97    729247

    accuracy                           0.97   1458570
   macro avg       0.97      0.97      0.97   1458570
weighted avg       0.97      0.97      0.97   1458570

AdaBoost Training Data Accuracy Score: 0.970713781306348
------------------------------------
AdaBoost Test Data Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97    312513
           1       1.00      0.94      0.97    312589

    accuracy                           0.97    625102
   macro avg       0.97      0.97      0.97    625102
weighted avg       0.97      0.97      0.97    625102

AdaBoost Test Data Accuracy Score: 0.9709423422097514
------------------------------------


# Naive Bayes

In [15]:
# Initialize and train Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Classification report and accuracy for training data
print("Naive Bayes Training Data Classification Report:")
print(classification_report(y_train, nb_model.predict(X_train)))
print("Naive Bayes Training Data Accuracy Score:", accuracy_score(y_train, nb_model.predict(X_train)))
print("------------------------------------")

# Classification report and accuracy for test data
print("Naive Bayes Test Data Classification Report:")
print(classification_report(y_test, y_pred_nb))
print("Naive Bayes Test Data Accuracy Score:", accuracy_score(y_test, y_pred_nb))
print("------------------------------------")

Naive Bayes Training Data Classification Report:


              precision    recall  f1-score   support

           0       0.94      0.69      0.80    729323
           1       0.75      0.96      0.84    729247

    accuracy                           0.82   1458570
   macro avg       0.85      0.82      0.82   1458570
weighted avg       0.85      0.82      0.82   1458570

Naive Bayes Training Data Accuracy Score: 0.8232974762952755
------------------------------------
Naive Bayes Test Data Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.69      0.80    312513
           1       0.75      0.96      0.84    312589

    accuracy                           0.82    625102
   macro avg       0.85      0.82      0.82    625102
weighted avg       0.85      0.82      0.82    625102

Naive Bayes Test Data Accuracy Score: 0.8229121007451584
------------------------------------


# Gradient Boosting

In [16]:
# Initialize and train gradient boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Classification report and accuracy for training data
print("Gradient Boosting Training Data Classification Report:")
print(classification_report(y_train, gb_model.predict(X_train)))
print("Gradient Boosting Training Data Accuracy Score:", accuracy_score(y_train, gb_model.predict(X_train)))
print("------------------------------------")

# Classification report and accuracy for test data
print("Gradient Boosting Test Data Classification Report:")
print(classification_report(y_test, y_pred_gb))
print("Gradient Boosting Test Data Accuracy Score:", accuracy_score(y_test, y_pred_gb))
print("------------------------------------")

Gradient Boosting Training Data Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    729323
           1       1.00      0.94      0.97    729247

    accuracy                           0.97   1458570
   macro avg       0.97      0.97      0.97   1458570
weighted avg       0.97      0.97      0.97   1458570

Gradient Boosting Training Data Accuracy Score: 0.9685801847014541
------------------------------------
Gradient Boosting Test Data Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    312513
           1       1.00      0.94      0.97    312589

    accuracy                           0.97    625102
   macro avg       0.97      0.97      0.97    625102
weighted avg       0.97      0.97      0.97    625102

Gradient Boosting Test Data Accuracy Score: 0.9687347024965526
------------------------------------


# Random Forest

In [17]:
# Initialize and train random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Classification report and accuracy for training data
print("Random Forest Training Data Classification Report:")
print(classification_report(y_train, rf_model.predict(X_train)))
print("Random Forest Training Data Accuracy Score:", accuracy_score(y_train, rf_model.predict(X_train)))
print("------------------------------------")

# Classification report and accuracy for test data
print("Random Forest Test Data Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Test Data Accuracy Score:", accuracy_score(y_test, y_pred_rf))
print("------------------------------------")

Random Forest Training Data Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    729323
           1       1.00      1.00      1.00    729247

    accuracy                           1.00   1458570
   macro avg       1.00      1.00      1.00   1458570
weighted avg       1.00      1.00      1.00   1458570

Random Forest Training Data Accuracy Score: 0.9999835455274687
------------------------------------
Random Forest Test Data Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    312513
           1       1.00      0.98      0.99    312589

    accuracy                           0.99    625102
   macro avg       0.99      0.99      0.99    625102
weighted avg       0.99      0.99      0.99    625102

Random Forest Test Data Accuracy Score: 0.9886658497333235
------------------------------------


# Multi-Layer Perceptron

In [18]:
# Initialize and train Multi-layer Perceptron model
mlp_model = MLPClassifier(random_state=42)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)

# Classification report and accuracy for training data
print("Multi-layer Perceptron Training Data Classification Report:")
print(classification_report(y_train, mlp_model.predict(X_train)))
print("Multi-layer Perceptron Training Data Accuracy Score:", accuracy_score(y_train, mlp_model.predict(X_train)))
print("------------------------------------")

# Classification report and accuracy for test data
print("Multi-layer Perceptron Test Data Classification Report:")
print(classification_report(y_test, y_pred_mlp))
print("Multi-layer Perceptron Test Data Accuracy Score:", accuracy_score(y_test, y_pred_mlp))
print("------------------------------------")

Multi-layer Perceptron Training Data Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97    729323
           1       1.00      0.95      0.97    729247

    accuracy                           0.97   1458570
   macro avg       0.97      0.97      0.97   1458570
weighted avg       0.97      0.97      0.97   1458570

Multi-layer Perceptron Training Data Accuracy Score: 0.973478818294631
------------------------------------
Multi-layer Perceptron Test Data Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97    312513
           1       1.00      0.95      0.97    312589

    accuracy                           0.97    625102
   macro avg       0.98      0.97      0.97    625102
weighted avg       0.98      0.97      0.97    625102

Multi-layer Perceptron Test Data Accuracy Score: 0.9736938931566368
------------------------------------


# Logistic Regression

In [19]:
# Initialize and train logistic regression model
lr_model = LogisticRegression(random_state=42,max_iter=10000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Classification report and accuracy for training data
print("Logistic Regression Training Data Classification Report:")
print(classification_report(y_train, lr_model.predict(X_train)))
print("Logistic Regression Training Data Accuracy Score:", accuracy_score(y_train, lr_model.predict(X_train)))
print("------------------------------------")

# Classification report and accuracy for test data
print("Logistic Regression Test Data Classification Report:")
print(classification_report(y_test, y_pred_lr))
print("Logistic Regression Test Data Accuracy Score:", accuracy_score(y_test, y_pred_lr))
print("------------------------------------")

Logistic Regression Training Data Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98    729323
           1       1.00      0.95      0.97    729247

    accuracy                           0.97   1458570
   macro avg       0.98      0.97      0.97   1458570
weighted avg       0.98      0.97      0.97   1458570

Logistic Regression Training Data Accuracy Score: 0.974749240694653
------------------------------------
Logistic Regression Test Data Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98    312513
           1       1.00      0.95      0.97    312589

    accuracy                           0.98    625102
   macro avg       0.98      0.98      0.98    625102
weighted avg       0.98      0.98      0.98    625102

Logistic Regression Test Data Accuracy Score: 0.9750184769845561
------------------------------------
