### Credit Card fraud detection

#### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, roc_auc_score, RocCurveDisplay, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from category_encoders import WOEEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline


#### Reading dataset

In [None]:
# Load the raw transaction data and preview the first few rows.
csv_path = "fraud test.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# List the column names available in the loaded dataset.
print(df.columns)


#### Checking null values in data

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print the column dtypes and non-null counts of the dataset.
df.info()

In [None]:
# Summary statistics of the numeric columns in the raw data.
df.describe()


#### Distribution of fraudulent transaction

In [None]:
# Split the data by label and report how rare fraud is.
fraud = df[df['is_fraud'] == 1]
non_fraud = df[df['is_fraud'] == 0]
n_fraud, n_non_fraud = len(fraud), len(non_fraud)

# Ratio of fraudulent to non-fraudulent transactions.
outlierFraction = n_fraud / float(n_non_fraud)
print(outlierFraction)
print('Fraud Cases: {}'.format(n_fraud))
print('Non fraud Transactions: {}'.format(n_non_fraud))


In [None]:
# Count transactions per class in a fixed order (0 = non-fraud, 1 = fraud) so
# the hard-coded labels and colours below always match the right wedge.
# value_counts() alone orders by frequency, which would silently swap the
# labels if fraud were ever the majority class, and would drop a class that
# has no rows at all.
fraud_counts = df['is_fraud'].value_counts().reindex([0, 1], fill_value=0)

# Plot the pie chart
plt.figure(figsize=(5, 4))
plt.pie(fraud_counts, labels=['Non-fraud', 'Fraud'], autopct='%1.1f%%', colors=['green', 'yellow'])
plt.title('Distribution of fraud and non-fraud')
plt.show()


#### Distribution of gender versus fraudulent transactions

In [None]:
# Side-by-side view: overall gender split (left) and fraud / non-fraud
# transaction counts per gender (right).
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(8, 5))

# Gender distribution Pie chart
explode = [0.1, 0.1]
df.groupby('gender')['is_fraud'].count().plot.pie(explode=explode, autopct="%1.1f%%", ax=axs[0], colors=['skyblue', 'lightcoral'])
axs[0].set_title("Gender Distribution")

# Fraud Status by gender Count Plot
ax = sns.countplot(x="gender", hue="is_fraud", data=df, ax=axs[1], palette='Set2')

# Add values on top of each bar
for p in ax.patches:
    height = p.get_height()
    # Skip patches without a positive height (empty or NaN placeholders), so
    # no stray "0"/"nan" labels are drawn.
    if not height > 0:
        continue
    # Bar heights are counts; format them as integers rather than "123.0".
    ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Set labels and title
axs[1].set_title("Distribution of Gender with Fraud Status")
axs[1].set_xlabel("Gender")
axs[1].set_ylabel("Count")

# Show the plot
plt.tight_layout()
plt.show()


#### Category versus fraudulent and non-fraudulent activities


In [None]:
# Bar chart of transaction amount per category, split by fraud label.
sns.barplot(data=df, x="amt", y="category", hue="is_fraud")
plt.show()

In [None]:
# Save the category-vs-amount bar chart to disk.
# `results_path` is not assigned anywhere earlier in this notebook, so fall
# back to a local "results" folder instead of failing with a NameError.
if "results_path" not in globals():
    results_path = "results"
os.makedirs(results_path, exist_ok=True)

# The figure drawn in the previous cell is no longer current once plt.show()
# has returned, so the chart is drawn again here; saving without re-plotting
# would write an empty canvas.
sns.barplot(x="amt", y="category", data=df, hue="is_fraud")
plt.savefig(os.path.join(results_path, "Category verses fraudulent and non-fraudulent activities"))
plt.close()

#### Distribution of fraudulent activities by hour

In [None]:
# Parse the transaction timestamp column into pandas datetimes.
timestamp_col = 'trans_date_trans_time'
df[timestamp_col] = pd.to_datetime(df[timestamp_col])



In [None]:
# Hour of day at which each transaction took place.
df['hour'] = pd.to_datetime(df['trans_date_trans_time']).dt.hour

# One density histogram per class, sharing the y-axis so the two panels can be
# compared directly.
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)

# (label value, panel title, bar colour) for the left and right panel.
panels = [
    (0, "Not Fraud", "orange"),
    (1, "Fraud", "green"),
]
for panel_ax, (label_value, panel_title, bar_color) in zip(axes, panels):
    subset = df[df["is_fraud"] == label_value]
    sns.histplot(data=subset, x='hour', stat="density", bins=24,
                 ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Hour of Day")
    panel_ax.set_ylabel("Density")
    panel_ax.set_xticks(range(24))

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


### Transaction Amount by category and Fraud Status

In [None]:
# Individual transaction amounts per category, coloured by fraud label.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x="category", y="amt", hue="is_fraud")

plt.title('Transaction Amount by Category and Fraud Status')
plt.xlabel('Transaction Category')
plt.ylabel('Transaction Amount ($)')
plt.xticks(rotation=90)  # category names are long; rotate so they do not overlap
plt.grid(True)

plt.show()

### Counts of Fraudulent Transactions by Category

In [None]:
# Sum of the is_fraud flag within each transaction category.
fraud_counts = df.groupby('category')['is_fraud'].sum()

plt.figure(figsize=(8, 6))
fraud_counts.plot.bar(color='skyblue')

plt.title('Counts of Fraudulent Transactions by Category')
plt.xlabel('Transaction Category')
plt.ylabel('Count of Fraudulent Transactions')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

#### Checking outliers

In [None]:
# Horizontal box plot for each numeric column of interest, one panel per column.
columns = ['amt', 'lat', 'long', 'city_pop']
n_panels = len(columns)
fig, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(6 * n_panels, 6))
for panel_ax, col in zip(axes, columns):
    sns.boxplot(x=df[col], ax=panel_ax, orient='h', palette='Set2')
    panel_ax.set_title(f'Box Plot of {col}')
    panel_ax.set_xlabel(col)

plt.tight_layout()
plt.show()

### Feature Engineering

In [None]:
# Column names reused throughout this cell.
txn_time = 'trans_date_trans_time'
card = 'cc_num'

# 1. Age at transaction: whole days between date of birth and the transaction,
#    floor-divided by 365 (leap days are ignored).
df['dob'] = pd.to_datetime(df['dob'])
df[txn_time] = pd.to_datetime(df[txn_time])
elapsed_days = (df[txn_time] - df['dob']).dt.days
df['age_at_txns'] = elapsed_days // 365

# 2. Whole days since the same card's previous transaction. Rows must be in
#    chronological order per card first; each card's first row gets NaN.
df.sort_values([card, txn_time], inplace=True)
gap_to_previous = df.groupby(card)[txn_time].diff()
df['time_since_last_txn'] = gap_to_previous.dt.days

# 3. Amount relative to the card's mean amount (mean over all of the card's
#    rows in df).
df['avg_txn_amount'] = df.groupby(card)['amt'].transform('mean')
df['txn_amount_relative_to_avg'] = df['amt'] / df['avg_txn_amount']

# 4. Running total of the amount spent on each card, in the sorted order.
df['cumulative_txn_amount'] = df.groupby(card)['amt'].cumsum()


engineered_columns = ['age_at_txns', 'time_since_last_txn', 'txn_amount_relative_to_avg', 'cumulative_txn_amount']
print(df[engineered_columns])


### Dropping some columns and data transformation

In [None]:
# Remove the 'Unnamed: 0' column and the card number column.
df = df.drop(columns=['Unnamed: 0', 'cc_num'])


In [None]:
# Name, address, date and transaction-id columns that are removed before
# modelling.
unused_columns = [
    'first', 'unix_time', 'dob', 'zip', 'city',
    'street', 'state', 'trans_num', 'trans_date_trans_time', 'last',
]
df = df.drop(columns=unused_columns)

In [None]:
# Strip the 'fraud_' marker from merchant names and encode gender as 0 / 1.
df['merchant'] = df['merchant'].map(lambda name: name.replace('fraud_', ''))
gender_codes = {'F': 0, 'M': 1}
df['gender'] = df['gender'].map(gender_codes)


#### Converting categorical columns into numeric

In [None]:
encode_column = ['job', 'merchant', 'category', 'lat']
woe_encoder = WOEEncoder()

# Weight-of-evidence encoding is derived from the target, so fitting it on
# every row would leak hold-out labels into the features. Fit it on the
# training rows only. The row selection below uses the same test_size,
# random_state and stratify settings as the train/test split made later in
# this notebook, and the rows are not re-ordered in between, so both calls
# select the same rows.
fit_rows, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=1, stratify=df['is_fraud']
)
woe_encoder.fit(df.iloc[fit_rows][encode_column], df['is_fraud'].iloc[fit_rows])

# Apply the training-derived encoding to every row (train and hold-out alike).
df_trasform = woe_encoder.transform(df[encode_column])

df[encode_column] = df_trasform


In [None]:
# Preview the encoder output for the encoded columns.
df_trasform.head()

In [None]:
# Preview the full frame after the encoded columns were written back.
df.head()

In [None]:
# Drop irrelevant columns: the hour and age features, location/population
# fields, the encoded merchant, and the remaining engineered features.
columns_to_drop = [
    'hour', 'age_at_txns', 'long', 'city_pop', 'merchant', 'lat',
    'time_since_last_txn', 'cumulative_txn_amount', 'txn_amount_relative_to_avg',
]
df = df.drop(columns=columns_to_drop)


In [None]:
# Summary statistics of the columns that remain after the drops above.
df.describe()

In [None]:

scaler = StandardScaler()

# Standardise the feature columns only: the 'is_fraud' target is a class label
# and must not be scaled. The original row index is kept so df_scaled stays
# aligned with df (df was re-ordered by sort_values earlier, so a fresh
# 0..n-1 index would not line up with it).
feature_columns = df.columns.drop('is_fraud')
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)
df_scaled['is_fraud'] = df['is_fraud']
df_scaled = df_scaled[df.columns]  # restore the original column order

# Check the scaled data
print(df_scaled.head())


In [None]:
# Pairwise correlations between all columns of df.
corr_matrix = df.corr()

# Draw the matrix as an annotated heat-map (two decimals per cell).
plt.figure(figsize=(12, 10))
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix of All Columns")
plt.show()


In [None]:
# Separate the features from the target, then hold out 20% of the rows for
# testing, stratified on the target.
target_column = 'is_fraud'
X = df.drop(columns=target_column)
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1,
)

#### Balancing dataset using SMOTE

In [None]:
# Resample the training split with SMOTE (the hold-out split is left as is).
smote = SMOTE(random_state=1)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [None]:
# Class balance of the training labels after resampling.
class_counts = y_train_resampled.value_counts()

# Pie chart with one wedge per class label.
plt.figure(figsize=(6, 6))
wedge_labels = class_counts.index
plt.pie(class_counts, labels=wedge_labels, startangle=140, autopct='%1.1f%%')
plt.title('Distribution of Class in Resampled Training Data')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()



Imbalanced classification modelling with synthetic oversampling and stratified K-fold cross-validation

In [None]:
# 10-fold stratified splitter, shuffled with a fixed seed so the folds are
# reproducible between runs.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


In [None]:
# Define the classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Per-fold metric stores. `precisions` / `recalls` hold the full
# precision-recall curve arrays (used for plotting); `precision_scores` /
# `recall_scores` hold the single-number scores at the 0.5 decision threshold
# (used for the printed summary).
roc_auc_scores, fprs, tprs, precisions, recalls, f1_scores = [], [], [], [], [], []
confusion_matrices, accuracies = [], []
precision_scores, recall_scores = [], []

# Stratified cross-validation
# NOTE(review): the folds are drawn from the already SMOTE-resampled data, so
# synthetic samples can end up in the validation folds - these scores are
# probably optimistic compared with resampling inside each fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_train_resampled, y_train_resampled):
    X_train_fold, X_test_fold = X_train_resampled.iloc[train_index], X_train_resampled.iloc[test_index]
    y_train_fold, y_test_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[test_index]

    # Fit and predict
    classifier.fit(X_train_fold, y_train_fold)
    y_scores = classifier.predict_proba(X_test_fold)[:, 1]
    y_pred = (y_scores > 0.5).astype(int)  # 0/1 labels, same type as the target

    # Curve data for the plots
    fpr, tpr, _ = roc_curve(y_test_fold, y_scores)
    fprs.append(fpr)
    tprs.append(tpr)
    roc_auc_scores.append(auc(fpr, tpr))
    precision, recall, _ = precision_recall_curve(y_test_fold, y_scores)
    precisions.append(precision)
    recalls.append(recall)

    # Threshold-based metrics
    conf_mat = confusion_matrix(y_test_fold, y_pred)
    confusion_matrices.append(conf_mat)
    accuracies.append(accuracy_score(y_test_fold, y_pred))
    f1_scores.append(f1_score(y_test_fold, y_pred))
    precision_scores.append(precision_score(y_test_fold, y_pred))
    recall_scores.append(recall_score(y_test_fold, y_pred))

# Results
# Precision and recall are averaged over the per-fold scores at the 0.5
# threshold; averaging the points of the precision-recall curve (as was done
# before) does not give the classifier's precision or recall.
print(f'Average Accuracy for Random Forest: {np.mean(accuracies):.2f}')
print(f'Average F1 Score for Random Forest: {np.mean(f1_scores):.2f}')
print(f'Average ROC AUC Score for Random Forest: {np.mean(roc_auc_scores):.2f}')
print(f'Average Precision for Random Forest: {np.mean(precision_scores):.2f}')
print(f'Average Recall for Random Forest: {np.mean(recall_scores):.2f}')

def plot_roc_curves(fprs, tprs, model_name):
    """Draw one ROC curve per fold, plus the random-guess diagonal.

    fprs and tprs are sequences holding, per fold, the false-positive and
    true-positive rate arrays; model_name is inserted into the plot title.
    """
    plt.figure(figsize=(8, 6))
    for fold_idx, fold_fpr in enumerate(fprs):
        plt.plot(fold_fpr, tprs[fold_idx], lw=1, label=f'Fold {fold_idx + 1}')
    # Diagonal reference line for a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], linestyle='--', color='r', label='Random Guess')
    plt.title(f'ROC Curves for {model_name}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()

def plot_precision_recall_curves(precisions, recalls, model_name):
    """Draw one precision-recall curve per fold (recall on x, precision on y).

    precisions and recalls are sequences holding the per-fold curve arrays;
    model_name is inserted into the plot title.
    """
    plt.figure(figsize=(8, 6))
    for fold_idx, fold_precision in enumerate(precisions):
        plt.plot(recalls[fold_idx], fold_precision, lw=1, label=f'Fold {fold_idx + 1}')
    plt.title(f'Precision-Recall Curves for {model_name}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')
    plt.show()

def plot_confusion_matrix(conf_matrix, model_name):
    """Show a confusion matrix as an annotated heat-map.

    conf_matrix may contain non-integer values (for example a matrix averaged
    over folds); cells are rounded to the nearest integer for display.
    model_name is inserted into the plot title.
    """
    # Round rather than truncate: astype(int) alone would turn 9.9 into 9.
    cell_counts = np.rint(np.asarray(conf_matrix)).astype(int)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cell_counts, annot=True, fmt="d", cmap="Blues")
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.show()

# Visualisation of the cross-validation results
model_name = "Random Forest"
plot_roc_curves(fprs, tprs, model_name)
plot_precision_recall_curves(precisions, recalls, model_name)
mean_conf_matrix = np.mean(confusion_matrices, axis=0)  # element-wise mean over folds
plot_confusion_matrix(mean_conf_matrix, model_name)

# Feature Importance (read from the classifier as fitted on the last fold)
feature_importances = pd.Series(classifier.feature_importances_, index=X_train_resampled.columns)
plt.figure(figsize=(8, 6))
feature_importances.sort_values().plot.barh()
plt.title('Feature Importance - Random Forest')
plt.show()

### Decision Tree Modelling

In [None]:
# Define classifier
classifier = DecisionTreeClassifier(random_state=42)

# Per-fold metric stores. `precisions` / `recalls` hold the full
# precision-recall curve arrays (used for plotting); `precision_scores` /
# `recall_scores` hold the single-number scores at the 0.5 decision threshold
# (used for the printed summary).
roc_auc_scores, fprs, tprs, precisions, recalls, f1_scores = [], [], [], [], [], []
confusion_matrices, accuracies = [], []
precision_scores, recall_scores = [], []

# Cross-validation
# NOTE(review): the folds are drawn from the already SMOTE-resampled data, so
# synthetic samples can end up in the validation folds - these scores are
# probably optimistic compared with resampling inside each fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_train_resampled, y_train_resampled):
    X_train_fold, X_test_fold = X_train_resampled.iloc[train_index], X_train_resampled.iloc[test_index]
    y_train_fold, y_test_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[test_index]

    # Fit and predict
    classifier.fit(X_train_fold, y_train_fold)
    y_scores = classifier.predict_proba(X_test_fold)[:, 1]
    y_pred = (y_scores > 0.5).astype(int)  # 0/1 labels, same type as the target

    # Curve data for the plots
    fpr, tpr, _ = roc_curve(y_test_fold, y_scores)
    fprs.append(fpr)
    tprs.append(tpr)
    roc_auc_scores.append(auc(fpr, tpr))
    precision, recall, _ = precision_recall_curve(y_test_fold, y_scores)
    precisions.append(precision)
    recalls.append(recall)

    # Threshold-based metrics
    conf_mat = confusion_matrix(y_test_fold, y_pred)
    confusion_matrices.append(conf_mat)
    accuracies.append(accuracy_score(y_test_fold, y_pred))
    f1_scores.append(f1_score(y_test_fold, y_pred))
    precision_scores.append(precision_score(y_test_fold, y_pred))
    recall_scores.append(recall_score(y_test_fold, y_pred))

#results
# Precision and recall are averaged over the per-fold scores at the 0.5
# threshold; averaging the points of the precision-recall curve (as was done
# before) does not give the classifier's precision or recall.
print(f'Average Accuracy for Decision Tree: {np.mean(accuracies):.2f}')
print(f'Average F1 Score for Decision Tree: {np.mean(f1_scores):.2f}')
print(f'Average ROC AUC Score for Decision Tree: {np.mean(roc_auc_scores):.2f}')
print(f'Average Precision for Decision Tree: {np.mean(precision_scores):.2f}')
print(f'Average Recall for Decision Tree: {np.mean(recall_scores):.2f}')

def plot_roc_curves(fprs, tprs, model_name):
    """Plot the per-fold ROC curves for a model together with the chance line.

    fprs / tprs: per-fold arrays of false-positive and true-positive rates.
    model_name: text appended to the plot title.
    """
    plt.figure(figsize=(8, 6))
    for fold_no, fold_fpr in enumerate(fprs, start=1):
        fold_tpr = tprs[fold_no - 1]
        plt.plot(fold_fpr, fold_tpr, lw=1, label=f'Fold {fold_no}')

    # Reference diagonal: expected curve of a random classifier.
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, linestyle='--', color='r', label='Random Guess')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curves for {model_name}')
    plt.legend(loc='lower right')
    plt.show()

def plot_precision_recall_curves(precisions, recalls, model_name):
    """Plot the per-fold precision-recall curves for a model.

    precisions / recalls: per-fold curve arrays (recall is drawn on the x-axis).
    model_name: text appended to the plot title.
    """
    plt.figure(figsize=(8, 6))
    for fold_no, fold_precision in enumerate(precisions, start=1):
        fold_recall = recalls[fold_no - 1]
        plt.plot(fold_recall, fold_precision, lw=1, label=f'Fold {fold_no}')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curves for {model_name}')
    plt.legend(loc='lower left')
    plt.show()

def plot_confusion_matrix(conf_matrix, model_name):
    """Render a confusion matrix as an annotated heat-map.

    conf_matrix: 2-D array of counts; it may hold fractional values (such as
    a fold average), which are rounded to the nearest integer for display.
    model_name: text appended to the plot title.
    """
    # Round to nearest instead of truncating toward zero with a bare
    # astype(int), so an averaged cell of 9.9 is shown as 10, not 9.
    rounded = np.rint(np.asarray(conf_matrix)).astype(int)
    plt.figure(figsize=(8, 6))
    sns.heatmap(rounded, annot=True, fmt="d", cmap="Blues")
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.show()

# Visualisations of the cross-validation results
model_name = "Decision Tree"
plot_roc_curves(fprs, tprs, model_name)
plot_precision_recall_curves(precisions, recalls, model_name)
mean_conf_matrix = np.mean(confusion_matrices, axis=0)  # element-wise mean over folds
plot_confusion_matrix(mean_conf_matrix, model_name)

# Feature importance visualization (classifier as fitted on the last fold)
feature_importances = pd.Series(classifier.feature_importances_, index=X_train_resampled.columns)
plt.figure(figsize=(8, 6))
feature_importances.sort_values().plot.barh()
plt.title('Feature Importance - Decision Tree')
plt.show()

### KNN Modelling

In [None]:
# Define classifier
# KNN is distance based, so features on large scales (such as the transaction
# amount) would dominate the neighbour search. Standardise inside a pipeline
# so the scaler is fitted on each training fold only.
classifier = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
])

# Per-fold metric stores. `precisions` / `recalls` hold the full
# precision-recall curve arrays (used for plotting); `precision_scores` /
# `recall_scores` hold the single-number scores at the 0.5 decision threshold
# (used for the printed summary).
roc_auc_scores, fprs, tprs, precisions, recalls, f1_scores = [], [], [], [], [], []
confusion_matrices, accuracies = [], []
precision_scores, recall_scores = [], []

# Cross-validation
# NOTE(review): the folds are drawn from the already SMOTE-resampled data, so
# synthetic samples can end up in the validation folds - these scores are
# probably optimistic compared with resampling inside each fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_train_resampled, y_train_resampled):
    X_train_fold, X_test_fold = X_train_resampled.iloc[train_index], X_train_resampled.iloc[test_index]
    y_train_fold, y_test_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[test_index]

    # Fit and predict
    classifier.fit(X_train_fold, y_train_fold)
    y_scores = classifier.predict_proba(X_test_fold)[:, 1]
    y_pred = (y_scores > 0.5).astype(int)  # 0/1 labels, same type as the target

    # Curve data for the plots
    fpr, tpr, _ = roc_curve(y_test_fold, y_scores)
    fprs.append(fpr)
    tprs.append(tpr)
    roc_auc_scores.append(auc(fpr, tpr))
    precision, recall, _ = precision_recall_curve(y_test_fold, y_scores)
    precisions.append(precision)
    recalls.append(recall)

    # Threshold-based metrics
    conf_mat = confusion_matrix(y_test_fold, y_pred)
    confusion_matrices.append(conf_mat)
    accuracies.append(accuracy_score(y_test_fold, y_pred))
    f1_scores.append(f1_score(y_test_fold, y_pred))
    precision_scores.append(precision_score(y_test_fold, y_pred))
    recall_scores.append(recall_score(y_test_fold, y_pred))

# Results
# Precision and recall are averaged over the per-fold scores at the 0.5
# threshold; averaging the points of the precision-recall curve (as was done
# before) does not give the classifier's precision or recall.
print(f'Average Accuracy for KNN: {np.mean(accuracies):.2f}')
print(f'Average F1 Score for KNN: {np.mean(f1_scores):.2f}')
print(f'Average ROC AUC Score for KNN: {np.mean(roc_auc_scores):.2f}')
print(f'Average Precision for KNN: {np.mean(precision_scores):.2f}')
print(f'Average Recall for KNN: {np.mean(recall_scores):.2f}')

def plot_roc_curves(fprs, tprs, model_name):
    """Overlay the ROC curve of every fold and the random-guess baseline.

    fprs, tprs -- per-fold false-positive / true-positive rate arrays.
    model_name -- label used in the figure title.
    """
    n_folds = len(fprs)
    plt.figure(figsize=(8, 6))
    for k in range(n_folds):
        curve_label = 'Fold {}'.format(k + 1)
        plt.plot(fprs[k], tprs[k], label=curve_label, lw=1)
    # Dashed red diagonal marks the performance of random guessing.
    chance_style = dict(linestyle='--', color='r', label='Random Guess')
    plt.plot([0, 1], [0, 1], **chance_style)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for {}'.format(model_name))
    plt.legend(loc='lower right')
    plt.show()

def plot_precision_recall_curves(precisions, recalls, model_name):
    """Overlay the precision-recall curve of every fold.

    precisions, recalls -- per-fold curve arrays; recall goes on the x-axis.
    model_name -- label used in the figure title.
    """
    n_folds = len(precisions)
    plt.figure(figsize=(8, 6))
    for k in range(n_folds):
        curve_label = 'Fold {}'.format(k + 1)
        plt.plot(recalls[k], precisions[k], label=curve_label, lw=1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves for {}'.format(model_name))
    plt.legend(loc='lower left')
    plt.show()

def plot_confusion_matrix(conf_matrix, model_name):
    """Display a confusion matrix as an annotated heat-map.

    conf_matrix -- 2-D array of counts; fractional values (e.g. a mean over
                   folds) are rounded to the nearest integer for display.
    model_name  -- label used in the figure title.
    """
    # A plain astype(int) truncates toward zero; round first so the displayed
    # counts are the nearest integers to the averaged values.
    display_counts = np.rint(np.asarray(conf_matrix)).astype(int)
    plt.figure(figsize=(8, 6))
    sns.heatmap(display_counts, annot=True, fmt="d", cmap="Blues")
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.show()

# Visualisations of the cross-validation results
model_name = "KNN"
plot_roc_curves(fprs, tprs, model_name)
plot_precision_recall_curves(precisions, recalls, model_name)
mean_conf_matrix = np.mean(confusion_matrices, axis=0)  # element-wise mean over folds
plot_confusion_matrix(mean_conf_matrix, model_name)



### Dynamic Synthetic Oversampling with Stratified K-Fold Cross-Validation within a Pipeline Framework

In [None]:
# Pipeline: SMOTE is applied when the pipeline is fitted (i.e. to each
# training fold), followed by the random forest.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=1)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1))
])

# Cross-validation bookkeeping
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = []
cumulative_conf_matrix = np.zeros((2, 2))  # Assuming binary classification

# Metrics computed from hard predictions, in the order they are reported.
label_metrics = {
    'f1': f1_score,
    'accuracy': accuracy_score,
    'recall': recall_score,
    'precision': precision_score,
}

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 6))

# Cross-validation loop over the original (not resampled) training split
for fit_idx, val_idx in skf.split(X_train, y_train):
    X_train_fold, y_train_fold = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
    X_test_fold, y_test_fold = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Fit on the training part, predict on the held-out part of the fold
    pipeline.fit(X_train_fold, y_train_fold)
    y_pred = pipeline.predict(X_test_fold)
    y_probs = pipeline.predict_proba(X_test_fold)[:, 1]

    # Metrics collection (ROC AUC uses probabilities, the rest hard labels)
    fold_scores = {'roc_auc': roc_auc_score(y_test_fold, y_probs)}
    for metric_name, metric_fn in label_metrics.items():
        fold_scores[metric_name] = metric_fn(y_test_fold, y_pred)
    scores.append(fold_scores)

    # Running sum of the per-fold confusion matrices
    cumulative_conf_matrix += confusion_matrix(y_test_fold, y_pred)

    # One faint ROC curve and one faint precision-recall curve per fold
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    ax_roc.plot(fpr, tpr, alpha=0.3)
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    ax_pr.plot(recall, precision, alpha=0.3)

# Finish the ROC panel (with chance diagonal) and the precision-recall panel
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_title('ROC Curve')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')

ax_pr.set_title('Precision-Recall Curve')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')

plt.tight_layout()
plt.show()

# Mean of every metric across the folds
average_scores = {}
for metric in scores[0]:
    average_scores[metric] = np.mean([fold[metric] for fold in scores])
print("Average Metrics:", average_scores)

# display the cumulative confusion matrix
print("Cumulative Confusion Matrix:\n", cumulative_conf_matrix.astype(int))

# Defining the plot function for cumulative confusion matrix
def plot_cumulative_confusion_matrix(matrix):
    """Show a 2x2 confusion matrix with class names on both axes.

    matrix is a 2-D numpy array of counts (row = true class, column =
    predicted class, in the order non-fraud, fraud). Each cell is annotated
    with its value as a whole number.
    """
    class_names = ['Non-fraud', 'Fraud']
    tick_positions = list(range(len(class_names)))

    fig, ax = plt.subplots()
    cax = ax.matshow(matrix, cmap='Blues')
    plt.title('Cumulative Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    # Fix one tick at the centre of each cell before naming it. Setting tick
    # labels without fixing the tick positions depends on where matplotlib
    # happens to put its automatic ticks, so the names could be misplaced.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(class_names)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(class_names)

    # Annotate every cell; '.0f' prints whole numbers even when the matrix was
    # accumulated as floats (avoids labels such as "1234.0").
    for (i, j), value in np.ndenumerate(matrix):
        ax.text(j, i, f'{value:.0f}', ha="center", va="center", color="black")

    plt.show()

# Plot the confusion matrix accumulated over the folds
plot_cumulative_confusion_matrix(cumulative_conf_matrix)

# Feature Importance Visualisation (classifier step as fitted on the last fold)
fitted_forest = pipeline.named_steps['classifier']
feature_importances = pd.Series(fitted_forest.feature_importances_, index=X_train.columns)
plt.figure(figsize=(10, 8))
feature_importances.sort_values().plot.barh()
plt.title('Feature Importance - RandomForest')
plt.show()

# Per-fold metrics as a table, summarised by mean and standard deviation
metrics_df = pd.DataFrame(scores)
metric_summary = metrics_df.describe().transpose()
print(metric_summary[['mean', 'std']])

### Decision Tree

In [None]:
# Pipeline: SMOTE is applied when the pipeline is fitted (i.e. to each
# training fold), followed by the decision tree.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=1)),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Cross-validation bookkeeping
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = []
cumulative_conf_matrix = np.zeros((2, 2))

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 6))

# Cross-validation loop over the original (not resampled) training split
for fold_train_idx, fold_test_idx in skf.split(X_train, y_train):
    X_train_fold = X_train.iloc[fold_train_idx]
    y_train_fold = y_train.iloc[fold_train_idx]
    X_test_fold = X_train.iloc[fold_test_idx]
    y_test_fold = y_train.iloc[fold_test_idx]

    # Fit, then get hard labels and positive-class probabilities
    pipeline.fit(X_train_fold, y_train_fold)
    y_pred = pipeline.predict(X_test_fold)
    y_probs = pipeline.predict_proba(X_test_fold)[:, 1]

    # Metrics for this fold
    fold_scores = dict(
        roc_auc=roc_auc_score(y_test_fold, y_probs),
        f1=f1_score(y_test_fold, y_pred),
        accuracy=accuracy_score(y_test_fold, y_pred),
        recall=recall_score(y_test_fold, y_pred),
        precision=precision_score(y_test_fold, y_pred),
    )
    scores.append(fold_scores)

    # Add this fold's confusion matrix to the running total
    fold_conf_matrix = confusion_matrix(y_test_fold, y_pred).astype(int)
    cumulative_conf_matrix += fold_conf_matrix

    # Faint per-fold ROC curve
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    ax_roc.plot(fpr, tpr, alpha=0.3)

    # Faint per-fold precision-recall curve
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    ax_pr.plot(recall, precision, alpha=0.3)

# Finalize the precision-recall panel, then the ROC panel with its diagonal
ax_pr.set_title('Precision-Recall Curve')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')

ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_title('ROC Curve')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')

plt.tight_layout()
plt.show()

# Mean of every metric across the folds
metric_names = list(scores[0])
average_scores = {name: np.mean([fold[name] for fold in scores]) for name in metric_names}
print("Average Metrics for Decision Tree:", average_scores)


def plot_cumulative_confusion_matrix(matrix):
    """Show a 2x2 confusion matrix with class names on both axes.

    matrix is a 2-D numpy array of counts (row = true class, column =
    predicted class, in the order non-fraud, fraud). Each cell is annotated
    with its value as a whole number.
    """
    class_names = ['Non-fraud', 'Fraud']
    tick_positions = list(range(len(class_names)))

    fig, ax = plt.subplots()
    cax = ax.matshow(matrix, cmap='Blues')
    plt.title('Cumulative Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    # matshow centres the cells on 0 and 1, so the ticks belong at [0, 1].
    # Ticks at [0, 1, 2] with a leading empty label shift every name one cell
    # to the right/down: cell 0 is left unnamed and 'Fraud' lands outside the
    # matrix.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(class_names)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(class_names)

    # Annotate every cell; '.0f' prints whole numbers even when the matrix was
    # accumulated as floats (avoids labels such as "1234.0").
    for (i, j), value in np.ndenumerate(matrix):
        ax.text(j, i, f'{value:.0f}', ha="center", va="center", color="black")

    plt.show()

# Plot the confusion matrix accumulated over the folds
plot_cumulative_confusion_matrix(cumulative_conf_matrix)

# Feature importance visualization (classifier step as fitted on the last fold)
fitted_tree = pipeline.named_steps['classifier']
feature_importances = pd.Series(fitted_tree.feature_importances_, index=X_train.columns)
plt.figure(figsize=(10, 8))
feature_importances.sort_values().plot.barh()
plt.title('Feature Importance - Decision Tree')
plt.show()


### KNN

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, precision_recall_curve
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define the pipeline with imblearn's Pipeline.
# Both SMOTE (nearest-neighbour interpolation) and KNN work on distances, so
# the features are standardised first; otherwise large-scale columns such as
# the transaction amount dominate the neighbour search. Because the scaler is
# a pipeline step, it is fitted on each training fold only.
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('classifier', KNeighborsClassifier())
])

# Cross-validation setup
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = []
cumulative_conf_matrix = np.zeros((2, 2), dtype=int)

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 6))

# Cross-validation loop over the original (not resampled) training split
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit the model
    pipeline.fit(X_train_fold, y_train_fold)
    y_pred = pipeline.predict(X_test_fold)
    y_probs = pipeline.predict_proba(X_test_fold)[:, 1]

    # Metrics collection (ROC AUC uses probabilities, the rest hard labels)
    scores.append({
        'roc_auc': roc_auc_score(y_test_fold, y_probs),
        'f1': f1_score(y_test_fold, y_pred),
        'accuracy': accuracy_score(y_test_fold, y_pred),
        'recall': recall_score(y_test_fold, y_pred),
        'precision': precision_score(y_test_fold, y_pred)
    })

    # Confusion matrix update
    cumulative_conf_matrix += confusion_matrix(y_test_fold, y_pred)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    ax_roc.plot(fpr, tpr, alpha=0.3)

    # Precision-Recall curve
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    ax_pr.plot(recall, precision, alpha=0.3)

# Finalizing ROC & Precision-Recall plots
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_title('ROC Curve')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')

ax_pr.set_title('Precision-Recall Curve')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')

plt.tight_layout()
plt.show()

# Calculating average metrics
average_scores = {metric: np.mean([score[metric] for score in scores]) for metric in scores[0]}
print("Average Metrics for KNN:", average_scores)

# Display cumulative confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cumulative_conf_matrix, annot=True, fmt="d", cmap='Blues')
plt.title("Cumulative Confusion Matrix - KNN")
plt.xlabel("Predicted labels")
plt.ylabel("True labels")
plt.show()



In [None]:
# Hold-out evaluation of the most recently defined `pipeline`.
# The cross-validation loop leaves the pipeline fitted on the training part of
# its last fold only, so refit it on the whole training split before scoring
# the untouched test split.
pipeline.fit(X_train, y_train)
y_probs = pipeline.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Calculate ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
test_auc = auc(fpr, tpr)
print(f'Test ROC AUC: {test_auc:.3f}')

# `results_path` is not assigned anywhere earlier in this notebook, so fall
# back to a local "results" folder instead of failing with a NameError.
if "results_path" not in globals():
    results_path = "results"
os.makedirs(results_path, exist_ok=True)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {test_auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')  # without a legend the curve label is never shown
plt.savefig(os.path.join(results_path, "roc_curve.png"))
plt.close()