# Fraud Detection of Credit Card Transactions

## Project Overview:


This project focused on developing a robust machine learning model to detect fraudulent transactions. Through comprehensive data analysis, preprocessing, model training, and evaluation, the project leveraged a range of technologies to address the challenges inherent in fraud detection.

## Import

In [0]:
dbutils.library.restartPython()
# pip install azure-storage-blob pandas numpy imblearn scipy scikit-learn dash pandas plotly matplotlib seaborn sqlalchemy pyodbc 
# pip install --upgrade scipy scikit-learn threadpoolctl 

In [0]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
import pandas as pd
from io import StringIO
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, precision_recall_curve, auc, f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

## Meta

In [0]:
connect_str = "DefaultEndpointsProtocol=https;AccountName=stgfinancialanalysis;AccountKey=goADDOu0wTscv9k97Zy7/vndYBE45jGalIZ0turswP77U2+0kJBc8Mlit7dYolCZiWI1r2uaEk40+AStoAoBUg==;EndpointSuffix=core.windows.net"
sas_token = "?sv=2022-11-02&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2024-02-14T16:22:21Z&st=2024-02-14T08:22:21Z&spr=https,http&sig=vNWLF5rpGPI5s5e12iUxFhUs1P9wpQTpUkAwdwg%2B%2F%2BI%3D"

storage_account_name = "stgfinancialanalysis"
container_name = "creditcardfraud-container"
blob_name = "creditcard.csv"
mount_name = "frauddetectiondata"

In [0]:
dbutils.fs.unmount(f"/mnt/{mount_name}")

In [0]:
dbutils.fs.mount(
  source=f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/",
  mount_point=f"/mnt/{mount_name}",
  extra_configs={f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net": sas_token})


## Data

In [0]:
def read_blob_into_df(blob_name):
    blob_client = container_client.get_blob_client(blob_name)
    blob_data = blob_client.download_blob().readall()
    return pd.read_csv(StringIO(blob_data.decode('utf-8')))

In [0]:
blob_url = f"https://{storage_account_name}.blob.core.windows.net"
blob_service_client = BlobServiceClient(account_url=blob_url, credential=sas_token)
container_client = blob_service_client.get_container_client(container_name)

print("Listing blobs in container...")
for blob in container_client.list_blobs():
    print("\t" + blob.name)
    if blob.name == blob_name:
        data = read_blob_into_df(blob_name)
        print(f"Loaded {blob_name} into DataFrame.")
        break


In [0]:
# df = data.sample(n=6000, random_state=42)
df = data.copy()
df.head()

## Analysis of Data

In [0]:
print(df.isnull().sum())

Distribution of Transactions:

In [0]:
# where 'Class' == 1 for fraudulent transactions and 'Class' == 0 for non-fraudulent transactions

def plot_transaction_amount_distribution(df, save_plots=False):
    # Plot distribution of all transaction amounts
    plt.figure(figsize=(10, 6))
    sns.histplot(df['Amount'], bins=100, kde=True, color='blue')
    plt.title('Distribution of Transaction Amounts')
    plt.xlabel('Transaction Amount')
    plt.ylabel('Frequency')
    plt.xscale('log')
    plt.grid(True)
    if save_plots:
        plt.savefig('/dbfs/mnt/frauddetectiondata/transaction_amount_distribution_all.png')
    plt.show()
    
    # Plot distribution of transaction amounts by fraud status
    df['Amount_Log'] = np.log(df['Amount'] + 0.01) 
    print(df)

    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='Amount_Log', hue='Class', bins=100, kde=True, palette='viridis')
    plt.title('Transaction Amount Distribution: Fraudulent vs Non-Fraudulent')
    plt.xlabel('Transaction Amount (Log Scale)')
    plt.ylabel('Frequency')
    plt.legend(title='Transaction Type', labels=['Non-Fraudulent', 'Fraudulent'])
    plt.grid(True)

    # Save the plot if needed
    save_plots = True
    if save_plots:
        plt.savefig('/dbfs/mnt/frauddetectiondata/transaction_amount_distribution_by_class.png')

    plt.show()

plot_transaction_amount_distribution(df, save_plots=True)


Feature Correlation:

In [0]:
def plot_feature_correlation(df, save=False, filename='/dbfs/mnt/frauddetectiondata/feature_correlation_matrix.png'):
    corr_matrix = df.iloc[:, :-1].corr()  # Assuming last column is target variable

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
    plt.title('Feature Correlation Matrix')
    
    if save:
        plt.savefig(filename)
    plt.show()
    return corr_matrix


corr_matrix = plot_feature_correlation(df)

In [0]:
def filter_corr_matrix(corr_matrix, threshold=0.5):
    """
    Filters the correlation matrix to only show values with an absolute value above a certain threshold.
    """
    # Create a boolean mask for the upper triangle
    mask_upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)

    # Create a boolean mask for the lower triangle
    mask_lower = np.tril(np.ones(corr_matrix.shape), k=-1).astype(bool)
    
    # Combine masks to exclude the diagonal
    mask = mask_upper | mask_lower
    
    # Apply threshold filter based on absolute values and apply mask
    filtered_corr = corr_matrix.where(np.abs(corr_matrix) >= threshold)
    filtered_corr = filtered_corr.where(mask)

    return filtered_corr


threshold = 0.3
filtered_corr_matrix = filter_corr_matrix(corr_matrix, threshold)

# print(filtered_corr_matrix)


In [0]:
plt.figure(figsize=(10, 8))
sns.heatmap(filtered_corr_matrix, annot=True, cmap='coolwarm', center=0, fmt=".2f")
plt.title("Filtered Correlation Matrix")

plt.savefig(f"/dbfs/mnt/{mount_name}/filtered_correlated_matrix.png")
plt.show()

Analysis of PCS Components:

In [0]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

def analyze_pca_components(X):
    """
    Fits PCA on the dataset and plots the explained variance ratio of the PCA components.
    """
    pca = PCA(n_components=len(X.columns))
    pca.fit(X)
    
    plt.figure(figsize=(10, 6))
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('PCA Explained Variance')
    plt.grid(True)

    plt.savefig('/dbfs/mnt/frauddetectiondata/pca_variance.png')
    plt.show()
    
    return pca


X = df.drop(['Class'], axis=1) 
pca_model = analyze_pca_components(X)


In [0]:
from sklearn.cluster import KMeans
import seaborn as sns

def apply_clustering(X, n_clusters=5, save=False, filename='/dbfs/mnt/frauddetectiondata/clustering_output.png'):
    """
    Applies K-Means clustering to the dataset and plots the distribution of clusters.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)
    
    # Optionally add cluster labels back to the original DataFrame for further analysis
    X_clustered = X.copy()
    X_clustered['Cluster'] = clusters
    
    plt.figure(figsize=(10, 6))
    sns.countplot(x='Cluster', data=X_clustered)
    plt.title('Distribution of Clusters')
    plt.xlabel('Cluster')
    plt.ylabel('Count')
    
    if save:
        plt.savefig(filename)
    plt.show()
    
    return X_clustered, kmeans


X_clustered, kmeans_model = apply_clustering(X, n_clusters=5, save=True, filename='cluster_distribution.png')


In [0]:
# filtered_corr_matrix = filter_corr_matrix(X_clustered, threshold)
# plt.figure(figsize=(10, 8))
# sns.heatmap(filtered_corr_matrix, annot=True, cmap='coolwarm', center=0, fmt=".2f")
# plt.title("Filtered Correlation Matrix")
# plt.show()

## Feature Engineering

In [0]:
data['HourOfDay'] = np.floor(data['Time'] / 3600) % 24  # Convert seconds to hours and get the hour of the day

data['LogAmount'] = np.log(data['Amount'] + 1)  # Log transform to reduce skewness; +1 to handle transactions with Amount 0

# Cyclical Feature Engineering
data['HourSin'] = np.sin(2 * np.pi * data['HourOfDay'] / 24)
data['HourCos'] = np.cos(2 * np.pi * data['HourOfDay'] / 24)

# Rolling Window Statistics
# As an example, calculate the rolling mean of 'Amount' over a window of 10 transactions
# df['RollingMeanAmount'] = df['LogAmount'].rolling(window=10).mean()


## Feature Selection

In [0]:
def apply_scaling(X, method='standard', return_type='df'):
    if method == 'standard':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Unsupported scaling method. Choose 'standard' or 'minmax'.")
    
    X_scaled = scaler.fit_transform(X)
    if return_type == 'df':
        return pd.DataFrame(X_scaled, columns=X.columns)
    elif return_type == 'np':
        return X_scaled
    else:
        raise ValueError("Unsupported return type. Choose 'df' or 'array'.")


In [0]:
def select_features_pearson(X, y, threshold=0.1):
    corr_with_target = X.corrwith(y).abs()
    selected_features = corr_with_target[corr_with_target > threshold].index.tolist()
    return selected_features

def select_features_decision_tree(X, y, threshold=0.01):
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X, y)
    important_features = [feature for importance, feature in zip(clf.feature_importances_, X.columns) if importance > threshold]
    return important_features


def select_features_lasso(X, y, cv=5):
    lasso = LassoCV(cv=cv, random_state=42).fit(X, y)
    important_features = X.columns[lasso.coef_ != 0].tolist()
    return important_features


def select_features_chi2(X, y, k=10):
    chi_selector = SelectKBest(chi2, k=k)
    chi_selector.fit(X, y)
    selected_features = X.columns[chi_selector.get_support(indices=True)].tolist()
    return selected_features

In [0]:
def evaluate_feature_set(X, y, selected_features):
    X_selected = X[selected_features]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return classification_report(y_test, y_pred, zero_division=1)


### Evaluation of Feature Sets

In [0]:
df = data.copy()
X = df.drop(['Class'], axis=1)  
y = df['Class']

In [0]:
X_standard_df = apply_scaling(X, method='standard', return_type='df')
X_standard_df.reset_index(drop=True, inplace=True)

X_minmax_df = apply_scaling(X, method='minmax', return_type='df')
X_minmax_df.reset_index(drop=True, inplace=True)

y.reset_index(drop=True, inplace=True)

In [0]:
features_pearson = select_features_pearson(X_standard_df, y, 0.2)
print("Selected features using Pearson's Coefficient:", features_pearson)
print("\n\nPearson's Coefficient Performance:")
print(evaluate_feature_set(X_standard_df, y, features_pearson))

In [0]:
features_chi2 = select_features_chi2(X_minmax_df, y, k=10)
print("Selected features using Chi-Squared Test:", features_chi2)
print("\n\nChi-Squared Test Performance:")
print(evaluate_feature_set(X_minmax_df, y, features_chi2))

In [0]:
features_decision_tree = select_features_decision_tree(X_standard_df, y, 0.1)
print("Selected features using Decision Trees:", features_decision_tree)
print("\n\nDecision Trees Performance:")
print(evaluate_feature_set(X_standard_df, y, features_decision_tree))

In [0]:
features_lasso = select_features_lasso(X_standard_df, y)
print("Selected features using Lasso Regularization:", features_lasso)
print("\n\nLasso Regularization Performance:")
print(evaluate_feature_set(X_standard_df, y, features_lasso))


## Class Imbalancing Handling

In [0]:
def resample_dataset(X, y, strategy='smote'):
    if strategy == 'smote':
        resampler = SMOTE(random_state=42)
    elif strategy == 'under':
        resampler = RandomUnderSampler(random_state=42)
    else:
        raise ValueError("Unsupported resampling strategy. Choose 'smote' or 'under'.")
    
    X_res, y_res = resampler.fit_resample(X, y)
    return X_res, y_res


### Cross Validation of Resampling

In [0]:
def pr_auc(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)


def validate_resampling(X, y, type='smote'):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    classifier = RandomForestClassifier(random_state=42)

    if type == 'smote':
        pipeline = make_pipeline(SMOTE(random_state=42), classifier)
    elif type == "under":  
        pipeline = make_pipeline(RandomUnderSampler(random_state=42), classifier)
    else:
        print('invalid resampling type')
        return

    scoring = {'pr_auc': make_scorer(pr_auc)}

    cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring, return_train_score=True)

    return cv_results


In [0]:
df_rs = data.sample(n=3000, random_state=42)
X = df.drop(['Class'], axis=1)  
y = df['Class']

In [0]:
results = validate_resampling(X, y, type='smote')

print(f"Mean PR AUC (Train): {np.mean(results['train_pr_auc'])}")
print(f"Mean PR AUC (Test): {np.mean(results['test_pr_auc'])}")


A PR AUC of 0.86 is still considered very good, indicating that the model has a high ability to distinguish between classes even in the test set. This suggests that SMOTE has effectively addressed the class imbalance problem, improving model performance on minority class predictions.

In [0]:
results = validate_resampling(X, y, type='under')

print(f"Mean PR AUC (Train): {np.mean(results['train_pr_auc'])}")
print(f"Mean PR AUC (Test): {np.mean(results['test_pr_auc'])}")

SMOTE shows significantly better performance than Under Sampling, on both of our smaller training and test data, making it the preferred choice for dealing with class imbalance in this scenario.

## Data Preparation

In [0]:
selected_features = ['V14', 'V17', 'V10', 'V12', 'V4', 'V16', 'V11', 'V18', 'V1']
X = data[selected_features]
y = data['Class']

In [0]:
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_resampled, y_resampled = resample_dataset(X_tr, y_tr, strategy='under')
X_val, y_val = resample_dataset(X_val, y_val, strategy='under')

X_standard_np = apply_scaling(X_resampled, method='minmax', return_type='df')
X_val = apply_scaling(X_val, method='minmax', return_type='df')

y_resampled.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X_standard_np, y_resampled, test_size=0.4, random_state=42)

X_train.to_csv("/dbfs/mnt/frauddetectiondata/X_train.csv")
y_train.to_csv("/dbfs/mnt/frauddetectiondata/y_train.csv")

In [0]:
X_train.head()

In [0]:
y_train.head()

## Fraud Detection Models

In [0]:
def pr_auc_scoring(estimator, X, y_true):
    # Scoring function for PR AUC to handle model probability predictions
    probas_pred = estimator.predict_proba(X)[:, 1] 
    precision, recall, _ = precision_recall_curve(y_true, probas_pred)
    print(precision)
    print(recall)
    return auc(recall, precision)


models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(probability=True, random_state=42)
}


def evaluate_model(model, X, y):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    pr_auc_scores = cross_val_score(model, X, y, cv=cv, scoring=make_scorer(pr_auc_scoring, needs_proba=True))
    f1_scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
    
    return np.mean(pr_auc_scores), np.mean(f1_scores)


def evaluate_model_performance(model_name, model, X_train, X_test, X_val, y_train, y_test, y_val):
    print(f'Evaluating {model_name} -------------------------')
    model.fit(X_train, y_train)
    
    metrics_list = []
    
    def calculate_metrics(X, y, dataset_name):
        y_prob = model.predict_proba(X)[:, 1]
        precision_vals, recall_vals, _ = precision_recall_curve(y, y_prob)
        pr_auc = auc(recall_vals, precision_vals)
        
        y_pred = model.predict(X)
        f1 = f1_score(y, y_pred)
        accuracy = accuracy_score(y, y_pred)
        precision = precision_score(y, y_pred)
        recall = recall_score(y, y_pred)
        metrics = {
            "Model": model_name,
            "Dataset": dataset_name,
            "PR AUC": pr_auc,
            "F1 Score": f1,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "Fitted Model": model
        } 
        metrics_list.append(metrics)

        print(f"{dataset_name} Sample probabilities:", y_prob[:5])
        print(f"{dataset_name} PR AUC:", pr_auc)
        print(f"{dataset_name} F1 Score:", f1)
        print(f"{dataset_name} Accuracy:", accuracy)
        print(f"{dataset_name} Precision:", precision)
        print(f"{dataset_name} Recall:", recall)
    
    calculate_metrics(X_test, y_test, "Test")
    calculate_metrics(X_val, y_val, "Validation")
    
    return metrics_list

In [0]:
all_metrics = []
for model_name, model in models.items():
    model_metrics = evaluate_model_performance(model_name, model, X_train, X_test, X_val, y_train, y_test, y_val)
    all_metrics.extend(model_metrics)  

metrics_df = pd.DataFrame(all_metrics)

### Random Forest Results

**Test Data**
  - Test Sample probabilities: [0.99 0.03 0.23 0.14 0.16]
  - Test PR AUC: 0.9793355518356972
  - Test F1 Score: 0.9255663430420712
  - Test Accuracy: 0.9272151898734177
  - Test Precision: 0.9407894736842105
  - Test Recall: 0.910828025477707

**Validation Data**
  - Validation Sample probabilities: [0.27 0.42 0.41 0.35 0.22]
  - Validation PR AUC: 0.9792295462353696
  - Validation F1 Score: 0.9368421052631579
  - Validation Accuracy: 0.9387755102040817
  - Validation Precision: 0.967391304347826
  - Validation Recall: 0.9081632653061225


In [0]:
filtered_df = metrics_df.drop(['Fitted Model'], axis=1) 
melted_df = filtered_df.melt(id_vars=["Model", "Dataset"], var_name="Metric", value_name="Score")

melted_df['Score'] = pd.to_numeric(melted_df['Score'], errors='coerce')

if melted_df['Score'].isnull().any():
    print("NaN values found in 'Score' after conversion. Investigating:")
    print(melted_df[melted_df['Score'].isnull()])



plt.figure(figsize=(12, 8))
sns.barplot(x='Metric', y='Score', hue='Model', data=melted_df)
plt.title('Model Performance Comparison Across Datasets')
plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

plt.savefig('model_performance_comparison.png', bbox_inches='tight', dpi=300)
plt.show()

In [0]:
heatmap_data = metrics_df.pivot_table(index='Model', columns='Dataset', values=['PR AUC', 'F1 Score', 'Accuracy', 'Precision', 'Recall'], aggfunc=np.mean)

plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, fmt=".2f", cmap="YlGnBu")
plt.title('Heatmap of Model Performance Metrics')

plt.savefig('model_heatmap_comparison.png', bbox_inches='tight', dpi=300)
plt.show()

### Model Evaluation Summary:


**SVM** showed the highest PR AUC, indicating its superior capability in distinguishing between fraudulent and non-fraudulent transactions across different thresholds. High precision suggests it's very reliable when it flags a transaction as fraudulent; however, its recall is slightly lower than Random Forest, meaning it might miss some fraudulent transactions.


**Random Forest** presented a balanced performance across all metrics, with the highest F1 Score and very competitive PR AUC, Precision, and Recall. It offers a good balance between identifying fraudulent transactions (Recall) and minimizing false alarms (Precision).


**KNN** had the lowest PR AUC among the three models and slightly lower performance metrics across the board compared to Random Forest and SVM, suggesting it might not be as effective in this particular context.

### Chosen Model

Given the emphasis on better identifying fraudulent transactions:

**Random Forest** is recommended for a balanced approach, effectively identifying fraudulent transactions while maintaining a low rate of false positives. Its competitive Recall and highest F1 Score suggest it's the most versatile model for this task, making it the preferred choice if the client seeks a balance between identifying as many fraudulent transactions as possible and maintaining user trust by not flagging too many legitimate transactions as fraudulent.

In [0]:
param_grid = {
    'n_estimators': [100, 250, 500],  # Number of trees in the forest
    'max_depth': [3, 5, 8, 10, None],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'max_features': ['auto', 'sqrt']  # Number of features to consider at every split
}


In [0]:
best_score = 0
best_params = None

for n_estimators in param_grid['n_estimators']:
    print('n estimators:  ', n_estimators)

    for max_depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            
            for max_features in param_grid['max_features']:
            
                model = models['Random Forest']

                pr_auc_scores = []

                # Perform Stratified K-Fold cross-validation
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                for train_idx, test_idx in cv.split(X_train, y_train):
                    X_cv_train, X_cv_test = X_train.iloc[train_idx], X_train.iloc[test_idx]
                    y_cv_train, y_cv_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

                    model.fit(X_cv_train, y_cv_train)
                    y_prob = model.predict_proba(X_cv_test)[:, 1]

                    # Calculate PR AUC for the current fold
                    precision, recall, _ = precision_recall_curve(y_cv_test, y_prob)
                    pr_auc = auc(recall, precision)
                    pr_auc_scores.append(pr_auc)

                # Calculate the average PR AUC across all folds for the current parameter set
                mean_pr_auc = np.mean(pr_auc_scores)

                if mean_pr_auc > best_score:
                    best_score = mean_pr_auc
                    best_params = {'n_estimators': n_estimators, 
                                'max_depth': max_depth, 
                                'min_samples_split': min_samples_split,
                                'max_features': max_features}

print("Best Parameters:", best_params)
print("Best PR AUC Score:", best_score)

In [0]:
random_forest_metrics = [metrics for metrics in all_metrics if metrics['Model'] == 'Random Forest']
random_forest_model = random_forest_metrics[0]['Fitted Model']  # X_test model?

In [0]:
rf_metrics_df = pd.DataFrame(random_forest_metrics)
rf_metrics_melted = rf_metrics_df.melt(id_vars=['Dataset'], value_vars=['PR AUC', 'F1 Score', 'Accuracy', 'Precision', 'Recall'],
                                       var_name='Metric', value_name='Score')

plt.figure(figsize=(10, 6))
sns.barplot(x='Metric', y='Score', hue='Dataset', data=rf_metrics_melted)
plt.title('Random Forest Performance Metrics')
plt.xticks(rotation=45)
plt.show()


In [0]:
feature_names = X_train.columns  
importances = random_forest_model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 8))
sns.barplot(y=[feature_names[i] for i in indices], x=importances[indices])
plt.title('Feature Importances - Random Forest Model')
plt.xlabel('Relative Importance')

plt.savefig('/dbfs/mnt/frauddetectiondata/feature_importance.png', bbox_inches='tight', dpi=300)
plt.show()


### Key Performance Indicators

### Precision Recall Curve

In [0]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import numpy as np

# Assuming X_val, y_val are defined and the model is fitted
y_scores = random_forest_model.predict_proba(X_val)[:, 1]  # Get the probabilities for the positive class
precision, recall, thresholds = precision_recall_curve(y_val, y_scores)

# Plotting the Precision-Recall curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.', label='Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)

plt.savefig('/dbfs/mnt/frauddetectiondata/precision_recall_curve.png', bbox_inches='tight', dpi=300)
plt.show()


In [0]:
rf_metrics_df = pd.DataFrame(random_forest_metrics)
rf_metrics_melted = rf_metrics_df.melt(id_vars=['Dataset'], value_vars=['PR AUC', 'F1 Score', 'Accuracy', 'Precision', 'Recall'],
                                       var_name='Metric', value_name='Score')

plt.figure(figsize=(10, 6))
sns.barplot(x='Metric', y='Score', hue='Dataset', data=rf_metrics_melted)
plt.title('Random Forest Performance Metrics')
plt.xticks(rotation=45)

plt.savefig('/dbfs/mnt/frauddetectiondata/rf_performance_metrics.png', bbox_inches='tight', dpi=300)
plt.show()

In [0]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred_val = random_forest_model.predict(X_val)
cm = confusion_matrix(y_val, y_pred_val)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Random Forest on Validation Set')

plt.savefig('/dbfs/mnt/frauddetectiondata/model_confusion_matrix.png', bbox_inches='tight', dpi=300)
plt.show()


In [0]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_val, y_pred_val).ravel()
fpr = fp / (fp + tn)

# Visualizing FPR
sns.barplot(x=['False Positive Rate'], y=[fpr])
plt.title('False Positive Rate')
plt.ylabel('Rate')
plt.show()


In [0]:
print(fpr)

In [0]:
rf_metrics = rf_metrics_df.drop(['Fitted Model'], axis=1) 
rf_metrics.head()

In [0]:
print(data.info())

The Random Forest model demonstrated outstanding performance, making it a reliable choice for detecting fraudulent transactions in our dataset. Its high scores in precision and recall are particularly noteworthy because they suggest that the model can minimize false positives (incorrectly flagged legitimate transactions) while effectively identifying most fraudulent transactions. This balance is critical in fraud detection, where the cost of missing a fraudulent transaction can be very high, but so can the inconvenience and customer service implications of falsely flagging legitimate transactions as fraud.

## Conclusion

The project was executed through a series of steps, starting from data collection and preprocessing to model training and evaluation. The chosen Random Forest model demonstrated high performance across various metrics, making it an effective tool for fraud detection. 

The project underscored the importance of using a combination of data preprocessing techniques and machine learning algorithms to handle imbalanced datasets typically encountered in fraud detection scenarios.

This project illustrates the application of machine learning techniques to a critical problem, utilizing a robust stack of tools and technologies. The project's success showcases the ability to leverage data science methodologies to derive meaningful insights and solutions.

## Data Upload

In [0]:
blob_client = container_client.get_blob_client(blob_name)
filtered_df.to_csv('/dbfs/mnt/frauddetectiondata/fraudmodels.csv', index=False)

with open("/dbfs/mnt/frauddetectiondata/fraudmodels.csv", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/X_train.csv", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

# with open(f"/dbfs/mnt/{mount_name}/filtered_correlation_matrix.png", "rb") as data:
#     blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/model_performance_comparison.png", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/model_heatmap_comparison.png", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open("/dbfs/mnt/frauddetectiondata/feature_importance.png", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/precision_recall_curve.png", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/rf_performance_metrics.png", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/model_confusion_matrix.png", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/transaction_amount_distribution_by_class.png", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/transaction_amount_distribution_all.png", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

with open(f"/dbfs/mnt/{mount_name}/y_train.csv", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

In [0]:
filtered_df.to_csv('/dbfs/mnt/frauddetectiondata/fraudmodels.csv', index=False)

with open("/dbfs/mnt/frauddetectiondata/fraudmodels.csv", "rb") as data:
    blob_client.upload_blob(data, overwrite=True)

### Dashboard Visualization

In [0]:
filtered_df.head(6)

## Upload to MSSQL

In [0]:
import pyodbc
from sqlalchemy import create_engine

pyodbc.drivers()

In [0]:
server = 'DESKTOP-5EVT2A9\DEVSERVER2024'
database = 'frauddetectioncc'  # Update with your database name
username = 'sa'  # Update with your username, if using SQL Server authentication
password = 'YourPassword'  # Update with your password, if using SQL Server authentication

# For Windows Authentication
connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;"

# If using SQL Server Authentication, uncomment and use the following line
# connection_string = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}"

# Establish connection
cnxn = pyodbc.connect(connection_string)
cursor = cnxn.cursor()

# Confirm connection (optional)
print("Connected to database successfully!")


In [0]:
# Use the pyodbc connection string directly in create_engine
engine = create_engine("mssql+pyodbc://", creator=lambda: cnxn)

In [0]:
table_name = 'CreditCardTransactions'

# Upload DataFrame to SQL Server
df.to_sql(name=table_name, con=engine, if_exists='replace', index=False, chunksize=500)

print(f"Data uploaded to table {table_name}.")