<a href="https://colab.research.google.com/github/KhinMyatNandar/Fraud-Detection/blob/main/RUS_CCFD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [None]:
CCFD= pd.read_csv('/content/new_creditcard.csv')

In [None]:
df=pd.DataFrame(CCFD)

# DATA Splitting


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#Separate the features and target variables
X= df.drop("Class", axis=1)
y= df["Class"]#target variables

In [None]:
#Split the dataset into 70% and 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y, random_state=42)

#RUS

In [None]:
rus=RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus= rus.fit_resample(X_train, y_train)

In [None]:
# Display the new class distribution
print('Original dataset shape:', Counter(y_train))
print('RUS dataset shape:', Counter(y_train_rus))

NameError: name 'Counter' is not defined

In [None]:
# Plot the class distribution after RUS
rus_distribution = Counter(y_train_rus)
plt.figure(figsize=(10, 5))
sns.barplot(x=list(rus_distribution.keys()), y=list(rus_distribution.values()))
plt.title('Class Distribution After Random UnderSampling (RUS)')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Defining the models
models={
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}
for i in range(len(list(models))):
  #model= list(models.values())[i]#youtube
    model_name=list(models.keys())[i]
    model=list(models.values())[i]
    #print(f"Training {model}...") #to ensure loop is worked
    model.fit(X_train_rus, y_train_rus)#train model


#predictions
    y_train_pred_rus= model.predict(X_train_rus)
    y_test_pred_rus= model.predict(X_test)
    y_test_proba_rus= model.predict_proba(X_test)[:,1]#needed for AUC and AUPRC


    #Training set performance
    rus_train_precision= precision_score(y_train_rus, y_train_pred_rus)
    rus_train_recall= recall_score(y_train_rus, y_train_pred_rus)
    rus_train_f1_score= f1_score(y_train_rus, y_train_pred_rus, average='weighted')
    rus_train_roc_auc= roc_auc_score(y_train_rus, y_train_pred_rus)
    rus_train_auprc= average_precision_score(y_train_rus, y_train_pred_rus)

        #Testing set performance
    rus_test_precision= precision_score(y_test, y_test_pred_rus)
    rus_test_recall= recall_score(y_test, y_test_pred_rus)
    rus_test_f1_score= f1_score(y_test, y_test_pred_rus, average='weighted')
    rus_test_roc_auc= roc_auc_score(y_test, y_test_pred_rus)
    rus_test_auprc= average_precision_score(y_test, y_test_pred_rus)

        #print(list(models.keys())[i])
    print(f"Model:{model_name}")
    print('Model performace for Training set with RUS')
    print("- Precision : {:.4f}".format(rus_train_precision))
    print("- Recall : {:.4f}".format(rus_train_recall))
    print("- F1 Score : {:.4f}".format(rus_train_f1_score))
    print("- AUC-ROC : {:.4f}".format(rus_train_roc_auc))
    print("- AUPRC : {:.4f}".format(rus_train_auprc))

    print('----------------------------------')

    print('Model performace for Testing set with RUS')
    print("- Precision : {:.4f}".format(rus_test_precision))
    print("- Recall : {:.4f}".format(rus_test_recall))
    print("- F1 Score : {:.4f}".format(rus_test_f1_score))
    print("- AUC-ROC : {:.4f}".format(rus_test_roc_auc))
    print("- AUPRC : {:.4f}".format(rus_test_auprc))

    print('='*35)
    print('\n')

Model:Logistic Regression
Model performace for Training set with RUS
- Precision : 0.9810
- Recall : 0.9366
- F1 Score : 0.9592
- AUC-ROC : 0.9592
- AUPRC : 0.9505
----------------------------------
Model performace for Testing set with RUS
- Precision : 0.0355
- Recall : 0.8803
- F1 Score : 0.9780
- AUC-ROC : 0.9202
- AUPRC : 0.0315


Model:XGBoost
Model performace for Training set with RUS
- Precision : 1.0000
- Recall : 1.0000
- F1 Score : 1.0000
- AUC-ROC : 1.0000
- AUPRC : 1.0000
----------------------------------
Model performace for Testing set with RUS
- Precision : 0.0366
- Recall : 0.8803
- F1 Score : 0.9786
- AUC-ROC : 0.9208
- AUPRC : 0.0324


Model:K-Nearest Neighbors
Model performace for Training set with RUS
- Precision : 0.9870
- Recall : 0.9184
- F1 Score : 0.9531
- AUC-ROC : 0.9532
- AUPRC : 0.9473
----------------------------------
Model performace for Testing set with RUS
- Precision : 0.0514
- Recall : 0.8592
- F1 Score : 0.9850
- AUC-ROC : 0.9163
- AUPRC : 0.0444


In [None]:
all_results=[]

In [None]:
#Save in Google Drive for comparison for different notebook
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import joblib

In [None]:
#create directory
os.makedirs(drive_path, exist_ok=True)

In [None]:
# Store the results in a dictionary and append to the list
results = {
        'Model': model_name,
        'Sampling Technique': 'RUS',
        'Train Precision': rus_train_precision,
        'Train Recall': rus_train_recall,
        'Train F1 Score': rus_train_f1_score,
        'Train AUC-ROC': rus_train_roc_auc,
        'Train AUPRC': rus_train_auprc,
        'Test Precision': rus_test_precision,
        'Test Recall': rus_test_recall,
        'Test F1 Score': rus_test_f1_score,
        'Test AUC-ROC': rus_test_roc_auc,
        'Test AUPRC': rus_test_auprc
}

all_results.append(results)


In [None]:
# Convert the list of results to a DataFrame
results_df = pd.DataFrame(all_results)

In [None]:
drive_path='/content/drive/MyDrive/CCFD'

In [None]:
# Save in Google Drive for comparison in different notebooks
drive_path = '/content/drive/MyDrive/CCFD_Results/'
os.makedirs(drive_path, exist_ok=True)

In [None]:
# Save the results DataFrame to a CSV file
results_file= os.path.join(drive_path, 'RUS_CCFD_results.csv')
results_df.to_csv(results_file, index=False)

In [None]:
print(f"Results saved to {results_file}")

Results saved to /content/drive/MyDrive/CCFD_Results/RUS_CCFD_results.csv
