In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [2]:
def plot_confusion_matrix(y_true, y_pred, threshold=0.5):
    """Plot a confusion-matrix heatmap and print the four cell counts.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, expected to be 0 (legitimate) or 1 (fraudulent).
    y_pred : array-like
        Predicted scores. Values strictly greater than ``threshold`` are
        counted as class 1. Lists and column vectors of shape ``(n, 1)`` are
        accepted and flattened.
    threshold : float, default 0.5
        Decision threshold applied to ``y_pred``.

    Returns
    -------
    None
        The heatmap is drawn on a new 5x5 figure and the counts are printed.
    """
    # Convert first so plain lists work, and flatten so (n, 1) model outputs
    # are treated like 1-D arrays.
    y_true_flat = np.asarray(y_true).ravel()
    y_pred_binary = (np.asarray(y_pred).ravel() > threshold).astype(int)

    # Pin the label set: without it the matrix collapses to 1x1 when only one
    # class is present and the 4-way unpack below raises ValueError.
    cm = confusion_matrix(y_true_flat, y_pred_binary, labels=[0, 1])

    # Create heatmap on an explicit Axes instead of the pyplot state machine.
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d", ax=ax)
    ax.set_title(f"Confusion matrix (threshold={threshold:.2f})")
    ax.set_ylabel("True label")
    ax.set_xlabel("Predicted label")

    # Print information on TN, FP, FN, TP (row-major order of the 2x2 matrix).
    tn, fp, fn, tp = cm.ravel()
    print("True negatives (legitimate transactions detected): ", tn)
    print("False positives (legitimate transactions incorrectly detected): ", fp)
    print("False negatives (fraudulent transactions missed): ", fn)
    print("True positives (fraudulent transactions detected): ", tp)
    # Row 1 of the matrix holds every sample whose true label is 1.
    print("Total fraudulent transactions: ", np.sum(cm[1]))

In [8]:
def load_data(url):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    url : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
        pandas accepts URLs and local paths and, by default, infers
        compression (e.g. ``.zip``) from the file extension.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    # Use the caller-supplied location; it used to be ignored in favour of a
    # hard-coded URL.
    return pd.read_csv(url)


def preprocess_data(df):
    """Build the model-input frame from a raw transactions DataFrame.

    Steps, in order:

    1. add ``distance``: planar distance, in degrees, between the
       (``lat``, ``long``) and (``merch_lat``, ``merch_long``) points;
    2. add ``age``: whole 365-day periods between ``dob`` and
       ``trans_date_trans_time``;
    3. drop identifier / free-text / date columns;
    4. one-hot encode the remaining categorical columns (``drop_first=True``);
    5. standardise every numeric column with a freshly fitted
       ``StandardScaler``.

    The input frame is not modified, so the function can be called repeatedly
    on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``lat``, ``long``, ``merch_lat``,
        ``merch_long``, ``trans_date_trans_time`` and ``dob``.

    Returns
    -------
    pandas.DataFrame
        A new, fully preprocessed frame.
    """
    # Work on a copy: dropping columns on the caller's frame in place made a
    # second call on the same input fail with KeyError.
    features = df.copy()

    # Straight-line distance in coordinate degrees (not a geodesic distance).
    # This is the same quantity as the planar point-to-point distance, without
    # building two temporary GeoDataFrames.
    features['distance'] = np.hypot(
        features['lat'] - features['merch_lat'],
        features['long'] - features['merch_long'],
    )

    # Age in (approximate) years at the time of the transaction.
    transaction_time = pd.to_datetime(features['trans_date_trans_time'])
    birth_date = pd.to_datetime(features['dob'])
    features['age'] = (transaction_time - birth_date).dt.days // 365

    # Remove columns that are not used as features. 'Unnamed: 0' is only
    # present when the CSV carries an unnamed index column, so absent columns
    # are tolerated rather than raising.
    unused_columns = [
        'trans_num', 'merchant', 'first', 'last', 'gender', 'job', 'street',
        'city', 'state', 'trans_date_trans_time', 'dob', 'Unnamed: 0',
    ]
    features = features.drop(columns=unused_columns, errors='ignore')
    features = pd.get_dummies(features, drop_first=True)

    # Scale numerical variables.
    # NOTE(review): the scaler is re-fitted on whatever frame is passed in and
    # is applied to every numeric column, including any label column that is
    # still present -- confirm this matches how the model was trained.
    numeric_columns = features.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 0:
        scaler = StandardScaler()
        features[numeric_columns] = scaler.fit_transform(features[numeric_columns])

    return features


def load_model(path):
    """Load a saved Keras model.

    Parameters
    ----------
    path : str or path-like
        Location of the saved model, passed to
        ``tf.keras.models.load_model``.

    Returns
    -------
    The object returned by ``tf.keras.models.load_model``.
    """
    # Use the caller-supplied location; it used to be ignored in favour of the
    # hard-coded 'Model' directory.
    return tf.keras.models.load_model(path)


def predict(model, X):
    """Return integer predictions for ``X`` from ``model``.

    Calls ``model.predict(X)``, rounds the result to the nearest integer and
    casts it to ``int``. The output is 0/1 provided the model emits scores in
    the range [0, 1].
    """
    raw_scores = model.predict(X)
    rounded_scores = raw_scores.round()
    return rounded_scores.astype(int)


def plot_confusion_matrix(y_true, y_pred):
    """Print the confusion matrix and summary metrics for binary labels.

    Despite the name, nothing is plotted: the matrix, accuracy, precision,
    recall and F1 score are written to stdout.

    NOTE: this definition reuses the name of the earlier
    ``plot_confusion_matrix(y_true, y_pred, threshold=0.5)`` in this notebook
    and replaces it once this cell has run.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, expected to be 0 or 1.
    y_pred : array-like
        Predicted labels, expected to be 0 or 1.

    Returns
    -------
    None
    """
    def _ratio(numerator, denominator):
        # Report 0 instead of nan (plus a RuntimeWarning) when a metric is
        # undefined, e.g. precision when nothing was predicted positive.
        return numerator / denominator if denominator else 0.0

    # Pin the label set: without it the matrix collapses to 1x1 when only one
    # class is present and the 4-way unpack below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * (precision * recall), precision + recall)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)

    print(f"Confusion matrix:\n{cm}")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 score: {f1_score:.3f}")
    
    
def predict_fraud(url, model_path):
    
    #Loads a CSV file from a given URL, preprocesses it for fraud detection, loads a pre-trained
    #fraud detection model from a given path, makes binary predictions (0 or 1) on the preprocessed
    #data using the model, and plots a confusion matrix.
    
    df = load_data('https://jrssbcrsefilesnait.blob.core.windows.net/3950data1/fraudTrain.csv.zip')
    X = preprocess_data(df)
    model = load_model('Model')
    y_pred = predict(model, X)