In [None]:
pip install numpy pandas scikit-learn imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.cluster import Birch
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

## Step 1: Data exploration and Data preparation

In [None]:
# Load dataset
dataset_main = pd.read_csv('PS_20174392719_1491204439457_log.csv')
dataset_main.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [None]:
if dataset_main.isna().any().any():
    print("Missing Values in Dataframe!")
else:
    print(dataset_main.isna().sum())

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [None]:
dataset_main.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [None]:
dataset_main.drop('nameOrig', axis=1, inplace=True)
dataset_main.drop('nameDest', axis=1, inplace=True)
dataset_main.drop('isFlaggedFraud', axis=1, inplace=True)
dataset_main.head(10)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0
5,1,PAYMENT,7817.71,53860.0,46042.29,0.0,0.0,0
6,1,PAYMENT,7107.77,183195.0,176087.23,0.0,0.0,0
7,1,PAYMENT,7861.64,176087.23,168225.59,0.0,0.0,0
8,1,PAYMENT,4024.36,2671.0,0.0,0.0,0.0,0
9,1,DEBIT,5337.77,41720.0,36382.23,41898.0,40348.79,0


In [None]:
# Kiểm tra các hàng trùng lặp (tất cả các cột)
duplicates = dataset_main.duplicated()

# Đếm số lượng các hàng trùng lặp
num_duplicates = duplicates.sum()
print(f'Số lượng các hàng trùng lặp: {num_duplicates}')

# Hiển thị các hàng trùng lặp (nếu có)
if num_duplicates > 0:
    print('Các hàng trùng lặp:')
    print(dataset_main[duplicates])
else:
    print('Không có hàng trùng lặp nào trong dataset.')

Số lượng các hàng trùng lặp: 543
Các hàng trùng lặp:
         step      type       amount  oldbalanceOrg  newbalanceOrig  \
11104       7   PAYMENT      1849.50            0.0             0.0   
33556       8   PAYMENT      7759.31            0.0             0.0   
59969       9   PAYMENT      2388.93            0.0             0.0   
60763       9   PAYMENT     10042.85            0.0             0.0   
63096       9   PAYMENT      2783.83            0.0             0.0   
...       ...       ...          ...            ...             ...   
6281481   646  CASH_OUT  10000000.00     10000000.0             0.0   
6281485   646  CASH_OUT         0.00            0.0             0.0   
6351224   702  CASH_OUT  10000000.00     10000000.0             0.0   
6362455   730  CASH_OUT  10000000.00     10000000.0             0.0   
6362581   741  CASH_OUT  10000000.00     10000000.0             0.0   

         oldbalanceDest  newbalanceDest  isFraud  
11104               0.0             0.0    

In [None]:
dataset_main_clean = dataset_main.drop_duplicates()

# Kiểm tra lại để đảm bảo rằng các hàng trùng lặp đã được loại bỏ
duplicates_after = dataset_main_clean.duplicated()
num_duplicates_after = duplicates_after.sum()
print(f'Số lượng các hàng trùng lặp sau khi loại bỏ: {num_duplicates_after}')

Số lượng các hàng trùng lặp sau khi loại bỏ: 0


## Step 2: Exploratory Analyist

In [None]:
# Đếm số lượng giao dịch trong mỗi lớp
class_count_df = pd.DataFrame(dataset_main_clean['isFraud'].value_counts()).reset_index()
class_count_df.columns = ['Class', 'Counts']

class_count_df.value_counts()

Class  Counts 
0      6353880    1
1      8197       1
Name: count, dtype: int64

### Create Label


In [None]:
from sklearn.preprocessing import LabelEncoder

# Khởi tạo và fit LabelEncoder
label_encoder = LabelEncoder()
dataset_main_clean['type'] = label_encoder.fit_transform(dataset_main_clean['type'])

# Hiển thị danh sách các phương thức và số tương ứng
print("Các phương thức giao dịch và số tương ứng:")
for method, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(f"{method}: {code}")

Các phương thức giao dịch và số tương ứng:
CASH_IN: 0
CASH_OUT: 1
DEBIT: 2
PAYMENT: 3
TRANSFER: 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_main_clean['type'] = label_encoder.fit_transform(dataset_main_clean['type'])


In [None]:
dataset_main_clean.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,3,9839.64,170136.0,160296.36,0.0,0.0,0
1,1,3,1864.28,21249.0,19384.72,0.0,0.0,0
2,1,4,181.0,181.0,0.0,0.0,0.0,1
3,1,1,181.0,181.0,0.0,21182.0,0.0,1
4,1,3,11668.14,41554.0,29885.86,0.0,0.0,0


## Step 3: Handeling Imbalanced Dataset

### Handle Large dataset first

#### Step 1: Optimize Data Types

In [None]:
dataset_main_clean['amount'] = dataset_main_clean['amount'].astype(np.float32)
dataset_main_clean['oldbalanceOrg'] = dataset_main_clean['oldbalanceOrg'].astype(np.float32)
dataset_main_clean['newbalanceOrig'] = dataset_main_clean['newbalanceOrig'].astype(np.float32)
dataset_main_clean['oldbalanceDest'] = dataset_main_clean['oldbalanceDest'].astype(np.float32)
dataset_main_clean['newbalanceDest'] = dataset_main_clean['newbalanceDest'].astype(np.float32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_main_clean['amount'] = dataset_main_clean['amount'].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_main_clean['oldbalanceOrg'] = dataset_main_clean['oldbalanceOrg'].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_main_clean['newbalanceOri

In [None]:
dataset_main_clean.head(10)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,3,9839.639648,170136.0,160296.359375,0.0,0.0,0
1,1,3,1864.280029,21249.0,19384.720703,0.0,0.0,0
2,1,4,181.0,181.0,0.0,0.0,0.0,1
3,1,1,181.0,181.0,0.0,21182.0,0.0,1
4,1,3,11668.139648,41554.0,29885.859375,0.0,0.0,0
5,1,3,7817.709961,53860.0,46042.289062,0.0,0.0,0
6,1,3,7107.77002,183195.0,176087.234375,0.0,0.0,0
7,1,3,7861.640137,176087.234375,168225.59375,0.0,0.0,0
8,1,3,4024.360107,2671.0,0.0,0.0,0.0,0
9,1,2,5337.77002,41720.0,36382.230469,41898.0,40348.789062,0


#### Step 2 train

In [None]:
X = dataset_main_clean.drop(columns=['isFraud'])
y = dataset_main_clean['isFraud']
print(y.value_counts())

isFraud
0    6353880
1       8197
Name: count, dtype: int64


In [None]:
import warnings
# Suppress warnings for clean output
warnings.filterwarnings('ignore')

In [None]:
def tomek_links_undersampling(X, y):
    tl = TomekLinks(sampling_strategy='auto')
    X_res, y_res = tl.fit_resample(X, y)
    return X_res, y_res


In [None]:
def incremental_birch_clustering_borderline_smote(X, y, n_clusters=10, threshold=0.1, batch_size=5000, min_samples=6):
    brc = Birch(n_clusters=n_clusters, threshold=threshold)

    X_res = []
    y_res = []

    for start in range(0, X.shape[0], batch_size):
        end = min(start + batch_size, X.shape[0])
        X_batch = X[start:end]
        y_batch = y[start:end]

        X_clusters = brc.fit_predict(X_batch)

        for cluster in np.unique(X_clusters):
            X_cluster = X_batch[X_clusters == cluster]
            y_cluster = y_batch[X_clusters == cluster]

            if len(X_cluster) >= min_samples:  # Ensure cluster has enough samples
                if len(set(y_cluster)) > 1:  # Only resample clusters with both classes
                    # Dynamically set the k_neighbors parameter based on cluster size
                    k_neighbors = min(len(X_cluster) - 1, 5)
                    sm = BorderlineSMOTE(sampling_strategy='minority', k_neighbors=k_neighbors, kind='borderline-1')

                    try:
                        X_resampled, y_resampled = sm.fit_resample(X_cluster, y_cluster)
                        X_res.append(X_resampled)
                        y_res.append(y_resampled)
                    except ValueError as e:
                        print(f"Skipping cluster {cluster} due to error: {e}")
                        X_res.append(X_cluster)
                        y_res.append(y_cluster)
                else:
                    X_res.append(X_cluster)
                    y_res.append(y_cluster)
            else:
                print(f"Skipping cluster {cluster} due to insufficient samples: {len(X_cluster)}")
                X_res.append(X_cluster)
                y_res.append(y_cluster)

    X_res = np.vstack(X_res)
    y_res = np.hstack(y_res)

    return X_res, y_res

In [None]:
# # Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Step 1: Tomek Links Undersampling
X_tomek, y_tomek = tomek_links_undersampling(X_train, y_train)
print('After Tomek Links Undersampling:', Counter(y_tomek))

After Tomek Links Undersampling: Counter({0: 5082173, 1: 6558})


bỏ qua các cluster trong quá trình xử lý là do trong cluster đó không đủ số lượng mẫu cần thiết để thực hiện kỹ thuật Borderline SMOTE

Nếu cluster không đủ số lượng mẫu này, chúng ta sẽ bỏ qua cluster đó để tránh gặp lỗi.

In [None]:
# Step 2: BIRCH Clustering and Borderline SMOTE
X_birch_smote, y_birch_smote = incremental_birch_clustering_borderline_smote(X_tomek, y_tomek, n_clusters=10, threshold=0.1, batch_size=5000)
print('After BIRCH Clustering Borderline SMOTE:', Counter(y_birch_smote))

Skipping cluster 4 due to insufficient samples: 1
Skipping cluster 9 due to insufficient samples: 1
Skipping cluster 5 due to insufficient samples: 1
Skipping cluster 8 due to insufficient samples: 5
Skipping cluster 4 due to insufficient samples: 2
Skipping cluster 6 due to insufficient samples: 1
Skipping cluster 8 due to insufficient samples: 4
Skipping cluster 9 due to error: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 3, n_samples = 2
Skipping cluster 7 due to insufficient samples: 1
Skipping cluster 8 due to error: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 5, n_samples = 2
Skipping cluster 1 due to error: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 4, n_samples = 2
Skipping cluster 2 due to insufficient samples: 3
Skipping cluster 7 due to insufficient samples: 1
Skipping cluster 0 due to error: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 4, n_samples 

## test and train the model

In [None]:
# Huấn luyện mô hình trên tập dữ liệu đã được cân bằng
clf = RandomForestClassifier(random_state=42)
clf.fit(X_birch_smote, y_birch_smote)

# Dự đoán và đánh giá
y_pred = clf.predict(X_test)

# In ra các chỉ số đánh giá
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

Confusion Matrix:
[[1270310     467]
 [    171    1468]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270777
           1       0.76      0.90      0.82      1639

    accuracy                           1.00   1272416
   macro avg       0.88      0.95      0.91   1272416
weighted avg       1.00      1.00      1.00   1272416


Accuracy Score:
0.9994985916555592
