In [1]:
# Importing libraries

import pandas as pd 
import numpy as np
from numpy import float32
from tqdm import tqdm

from typing import Optional
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, confusion_matrix, precision_score,roc_auc_score,f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.multiprocessing import set_start_method

import pytorch_lightning as pl

from torch.utils.data import DataLoader, Dataset
import os

import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
random_seed = 123
torch.manual_seed(random_seed)

BATCH_SIZE=1000
AVAIL_GPUS = min(1, torch.cuda.device_count())
NUM_WORKERS=0

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
#df = pd.read_csv("/kaggle/input/credit-card-fraud/card_transdata.csv")
df = pd.read_csv("/input/creditcardfraud/creditcard.csv")
#df.rename(columns={'fraud':'Class'},inplace=True)
# df = df.drop(['type','nameDest','nameOrign'],axis=1)
from sklearn.preprocessing import StandardScaler
Mx = df.iloc[:, :-1]  # 所有行，除了最后一列
nx = df.iloc[:, -1]   # 所有行的最后一列

# 创建 StandardScaler 对象
scaler = StandardScaler()

# 对特征进行拟合和转换
Mx_scaled = pd.DataFrame(scaler.fit_transform(Mx), columns=Mx.columns)

# 如果你想保留原始的目标变量，可以这样做：
scaled_data = pd.concat([Mx_scaled, nx], axis=1)

# 现在，scaled_data 就是标准化后的数据集
# 你可以查看结果
scaled_data.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.996583,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,...,-0.024923,0.382854,-0.176911,0.110507,0.246585,-0.39217,0.330892,-0.063781,0.244964,0
1,-1.996583,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,...,-0.307377,-0.880077,0.162201,-0.561131,0.320694,0.261069,-0.022256,0.044608,-0.342475,0
2,-1.996562,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,...,0.337632,1.063358,1.45632,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021,1.160686,0
3,-1.996562,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,...,-0.147443,0.007267,-0.304777,-1.941027,1.241904,-0.460217,0.155396,0.186189,0.140534,0
4,-1.996541,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,...,-0.012839,1.100011,-0.220123,0.23325,-0.395202,1.041611,0.54362,0.651816,-0.073403,0


In [4]:
# Creating function to plot confusion metrics for evaluation
def plot_cm(labels, predictions, p=0.5):
    cm = confusion_matrix(labels, predictions)
    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

In [5]:
train, test = train_test_split(df, test_size=0.2, stratify=df['Class'])
train.shape, test.shape

((227845, 31), (56962, 31))

In [6]:
# Normalizing the data
data_mean = train.iloc[:,:-1].mean()
data_std = train.iloc[:,:-1].std()
train_norm = (train.iloc[:,:-1] - data_mean)/data_std
test_norm = (test.iloc[:,:-1] - data_mean)/data_std
train_norm['Class'] =  train.iloc[:, -1]
test_norm['Class'] =  test.iloc[:, -1]

In [7]:
train_norm['Class'].value_counts()

Class
0    227451
1       394
Name: count, dtype: int64

In [8]:
# Creating data model to use with the pytorch Lightning package
class CreditCardDataFinal(Dataset):
    def __init__(self, data: pd.DataFrame):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        label = torch.tensor(self.data[index][-1], dtype=torch.float32)
        row = torch.tensor(self.data[index][:-1], dtype=torch.float32)
        return row, label
    
class CreditCardDataModel(pl.LightningDataModule):
    def __init__(self, data: pd.DataFrame, batch_size=BATCH_SIZE, num_workers=0):
        super().__init__()
        self.data = data
        self.batch_size = batch_size
        self.num_workers = num_workers
    
    def setup(self, stage: Optional[str] = None):
        train_df, test_df = train_test_split(self.data, random_state=123, test_size=0.2, stratify=self.data['Class'])
        self.train_df = torch.tensor(train_norm.to_numpy(float32), dtype=torch.float32)
        self.test_df = torch.tensor(test_norm.to_numpy(float32), dtype=torch.float32)

    def train_dataloader(self):
        return DataLoader(dataset=CreditCardDataFinal(self.train_df), batch_size=self.batch_size, num_workers=self.num_workers)
    
    def test_dataloader(self):
        return DataLoader(dataset=CreditCardDataFinal(self.test_df), batch_size=self.batch_size, num_workers=self.num_workers)
    

In [9]:
# Building the main neural network to predict if the data is fraud or not. 
class ModelCreditCard(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.sequential = nn.Sequential(
            nn.Linear(input_size, 100),
            nn.LeakyReLU(0.1),
            nn.Linear(100, 50),
            nn.LeakyReLU(0.1),
            nn.Linear(50, 25),
            nn.LeakyReLU(0.1),
            nn.Linear(25, 1)
        )
    
    def forward(self, x):
        tensor = torch.sigmoid(self.sequential(x))
        return tensor
    

class ModelTraining(pl.LightningModule):
    def __init__(self, model, lr=1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.model = model
        
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        y_hat = self.model(x)
        y = y.unsqueeze(1)
        loss = self.binary_loss(y_hat, y)
        return {"loss":loss}
    
    def binary_loss(self, y_hat,y):
        return F.binary_cross_entropy(y_hat, y)
    
    def configure_optimizers(self):
        lr = self.hparams.lr
        opt_g = torch.optim.Adam(self.model.parameters(), lr )
        return [opt_g], []

In [10]:
trainer = pl.Trainer(max_epochs=10, accelerator='cuda', devices=1)

In [11]:
model_card = ModelCreditCard(30)
model = ModelTraining(model_card)

dm = CreditCardDataModel(train_norm)

trainer = pl.Trainer(max_epochs=5, accelerator='gpu', devices=1)
trainer.fit(model, dm)

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/parsing.py:208: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

  label = torch.tensor(self.data[index][-1], dtype=torch.float32)
  row = torch.tensor(self.data[index][:-1], dtype=torch.float32)
  x = torch.tensor(x, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


In [12]:
test_pred = torch.tensor(test_norm.drop(["Class"], axis=1).to_numpy()).float()
test_true =  test_norm['Class'].to_numpy()
test_true = torch.tensor(test_true).unsqueeze(1).float()

In [13]:
test_output_real = model_card.forward(test_pred)
test_pred_real = [1 if i > 0.5 else 0 for i in test_output_real]
recall_score(test_true, test_pred_real)

0.8469387755102041

In [16]:
class CreditCardData(Dataset):
    def __init__(self, data: pd.DataFrame):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = torch.tensor(self.data.iloc[index].values).float()
        return row

In [17]:
class DataModule(pl.LightningDataModule):
    def __init__(self, data: pd.DataFrame, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS):
        super().__init__()
        self.data = data
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.data_mean = None
        self.data_std = None
        
    def prepare_data(self):
        pass
    
    def setup(self, stage: Optional[str] = None):
        train_df, test_df = train_test_split(self.data, random_state=123, test_size=0.2)
        self.data_mean = train_df.mean()
        self.data_std = train_df.std()
        train_norm = (train_df - self.data_mean) / self.data_std
        test_norm = (test_df - self.data_mean) / self.data_std
        self.train_df = train_norm
        self.test_df = test_norm
    
    def train_dataloader(self):
        return DataLoader(dataset=CreditCardData(self.train_df), batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
    
    def valid_dataloader(self):
        return DataLoader(CreditCardData(self.val_df), batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
    
    def test_dataloader(self):
        return DataLoader(CreditCardData(self.test_df), batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=True)
        

In [18]:
class GatedSelfAttention(nn.Module):
    def __init__(self, input_dim):
        super(GatedSelfAttention, self).__init__()
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.gate = nn.Linear(input_dim, input_dim)  # 新增的门控层

    def forward(self, x):
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)
        G = torch.sigmoid(self.gate(x))  # 计算门控信号，并通过Sigmoid激活
        
        # 计算注意力权重
        attention_scores = torch.bmm(Q, K.transpose(1, 2)) / (x.size(-1) ** 0.5)
        attention_weights = F.softmax(attention_scores, dim=-1)
        
        # 应用门控机制到值向量上
        gated_V = V * G  # 逐元素乘法
        
        # 应用注意力权重到门控后的值向量
        attention_output = torch.bmm(attention_weights, gated_V)
        
        return attention_output

In [19]:
class Discriminator(nn.Module):
    def __init__(self, input_dim=30):
        super(Discriminator, self).__init__()

        self.conv_layers = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2),
            nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2),
            nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2)
        )

        self.attention = GatedSelfAttention(64)  # 使用新的门控自注意力模块

        self.fc_layers = nn.Sequential(
            nn.Linear(64 * input_dim, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv_layers(x)
        x = x.transpose(1, 2)
        x = self.attention(x)
        x = x.view(x.size(0), -1)
        x = self.fc_layers(x)
        return torch.sigmoid(x)

In [20]:
#classic generator
class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super().__init__()
        self.latent_dim = latent_dim
        self.output_dim = output_dim
        
        self.model = nn.Sequential(
            #nn.Linear(latent_dim, output_dim)
            nn.Linear(latent_dim, 100),
            nn.LeakyReLU(0.2),
            nn.Linear(100, 100),
            nn.LeakyReLU(0.2),
            nn.Linear(100, output_dim)
        )

    def forward(self, z):
        return self.model(z)

In [21]:
class GAN(pl.LightningModule):
    def __init__(self, latent_dim=100, lr=0.002):
        super().__init__()
        self.save_hyperparameters()
        
        self.generator = Generator(latent_dim=self.hparams.latent_dim,output_dim=30)  # 假设输出维度为28)
        self.discriminator = Discriminator()
        
        self.validation_z = torch.randn(6, self.hparams.latent_dim).to(self.device)

        self.automatic_optimization = False  # Disable automatic optimization
        
    def setup(self, stage):
        # 获取数据模块
        data_module = self.trainer.datamodule
        
        # 获取数据统计信息
        self.data_mean = torch.tensor(data_module.data_mean.values, dtype=torch.float32)
        self.data_std = torch.tensor(data_module.data_std.values, dtype=torch.float32)

    def forward(self, z):
        # 生成标准化的数据
        x_normalized = self.generator(z)
        # 使用数据的均值和方差进行反标准化
        #x = x_normalized * self.data_std + self.data_mean
        x = x_normalized
        return x
    
    def adversarial_loss(self, y_hat, y):
        return F.binary_cross_entropy(y_hat, y)
    
    def training_step(self, batch, batch_idx):
        real_data = batch #.to(self.device)
        z = torch.randn(real_data.size(0), self.hparams.latent_dim).to(self.device)
        
        # Access optimizers manually
        opt_g, opt_d = self.optimizers()
        
        # Train Complementary GAN:
        
        # Train CCFD-generator: max log(D(G(z)))
        fake_data = self.generator(z)
        y_hat_fake = self.discriminator(fake_data)
        g_loss = self.adversarial_loss(y_hat_fake, torch.ones_like(y_hat_fake))
        
        opt_g.zero_grad()
        self.manual_backward(g_loss)
        opt_g.step()
        
        # Train CCFD-discriminator: max log(D(x)) + log(1- D(G(z)))
        y_hat_real = self.discriminator(real_data)
        y_real = torch.ones(real_data.size(0), 1).to(self.device)
        real_loss = self.adversarial_loss(y_hat_real, y_real)
        
        y_hat_fake = self.discriminator(fake_data.detach())
        y_fake = torch.zeros(real_data.size(0), 1).to(self.device)
        fake_loss = self.adversarial_loss(y_hat_fake, y_fake)
        
        d_loss = (real_loss + fake_loss) / 2
        
        opt_d.zero_grad()
        self.manual_backward(d_loss)
        opt_d.step()
        
        self.log("g_loss", g_loss, prog_bar=True, logger=True)
        self.log("d_loss", d_loss, prog_bar=True, logger=True)
        
    def configure_optimizers(self):
        lr = self.hparams.lr
        opt_g = torch.optim.Adam(self.generator.parameters(), lr)
        opt_d = torch.optim.Adam(self.discriminator.parameters(), lr)
        return [opt_g, opt_d]

In [22]:
only_pos = train_norm[train_norm['Class'] == 1 ].copy()
only_pos = only_pos.drop(['Class'], axis=1)
only_pos.shape

(394, 30)

In [23]:
model = GAN()
model = model.to(device)

In [24]:
dm = DataModule(only_pos)
trainer = pl.Trainer(max_epochs=500)
trainer.fit(model, dm)

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

In [25]:
z = torch.randn(227451,100)
#227451 730078
output = model(z)

z = torch.randn(10000,100,16)
t = torch.randint(0, model.hparams.timesteps, (100, 1), dtype=torch.float, device=model.device)
output = model(z, t)

In [26]:
output

tensor([[-2.6416, -1.9761, -1.5401,  ..., -0.6735,  3.5284,  0.8151],
        [-0.7106,  1.4402, -3.0575,  ..., -0.6843,  3.1969,  0.7707],
        [-3.0817, -6.2356,  1.6204,  ...,  0.9288,  0.6119,  2.9910],
        ...,
        [-0.0269, -0.1122, -0.3418,  ..., -0.1704, -0.2492, -0.5327],
        [-0.5119, -1.6599,  0.1305,  ...,  1.0293, -0.6446,  0.9336],
        [-3.4466, -1.5024, -2.0621,  ..., -0.6449,  4.8904,  1.8447]],
       grad_fn=<AddmmBackward0>)

In [27]:
only_pos_df =  pd.DataFrame(output.detach().numpy())
only_pos_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-2.641560,-1.976067,-1.540138,-1.383987,0.728703,0.777461,1.241864,-0.927886,0.425667,0.096496,...,2.996011,-0.552803,0.550084,0.954487,-3.928111,-0.330122,-1.965147,-0.673526,3.528360,0.815128
1,-0.710555,1.440228,-3.057508,0.273666,-0.497588,2.195228,0.454073,0.292704,0.505700,1.388119,...,0.613176,0.276237,-0.022336,0.177680,-3.022206,0.432996,-0.251197,-0.684285,3.196902,0.770665
2,-3.081708,-6.235551,1.620369,-4.568712,4.286985,-2.173527,4.022481,0.794505,1.958356,-2.818753,...,2.576996,-0.465570,0.601161,0.289839,-2.216034,-0.048738,-3.434725,0.928773,0.611906,2.991007
3,0.015660,0.216741,-0.464185,-0.014711,-0.122397,0.250846,-0.192409,0.044336,-0.261046,0.094011,...,0.051491,0.045075,-0.331449,-0.210471,0.099484,0.294869,0.169326,-0.129763,-0.127137,-0.249410
4,-0.391105,1.311822,-2.225316,0.588304,-1.078543,1.950081,-0.359852,-0.044559,-0.132108,1.559947,...,0.661241,-0.084365,-0.296366,1.098889,-2.159106,0.501519,-0.036584,-0.135677,2.197541,0.459307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227446,-0.444597,0.549325,-1.709663,0.074521,-0.520293,1.257302,-0.153161,-0.140964,-0.147480,1.248211,...,0.676547,-0.002290,-0.428365,1.066271,-1.755755,0.401285,-0.197728,0.115720,1.584828,0.614690
227447,-0.281702,0.088270,-0.528383,-0.139688,-0.182242,0.366270,-0.214762,-0.070377,-0.393761,0.569488,...,0.435953,-0.007772,-0.416317,0.470014,-0.660775,0.192486,0.024400,0.087208,0.619964,0.275251
227448,-0.026909,-0.112187,-0.341779,-0.193211,-0.021847,0.321943,-0.107296,-0.257433,-0.230307,-0.127958,...,0.493512,-0.450669,-0.457724,0.100568,0.395795,0.549172,0.061587,-0.170438,-0.249233,-0.532724
227449,-0.511930,-1.659882,0.130521,-1.391057,1.238460,-0.657452,0.958512,0.576717,0.137283,-0.435263,...,0.262065,0.159768,-0.616801,0.557368,-0.232300,0.565052,-0.730139,1.029283,-0.644564,0.933621


In [28]:
only_pos_df['Class'] = 1

In [29]:
only_neg_real_df = train_norm[train_norm['Class'] == 0]

In [30]:
train_combined_fake_pos_only = only_pos_df
train_combined_fake_pos_only.columns = only_neg_real_df.columns
train_combined_fake_pos_only = pd.concat([only_pos_df, only_neg_real_df], ignore_index=True)
train_combined_fake_pos_only = train_combined_fake_pos_only.sample(frac=1) # Shuffle the dataset

In [31]:
train_combined_fake_pos_only['Class'].value_counts()

Class
1    227451
0    227451
Name: count, dtype: int64

In [32]:
model_card_with_gan = ModelCreditCard(30)
model_gan = ModelTraining(model_card_with_gan)
dm_only_gan = CreditCardDataModel(train_combined_fake_pos_only)
trainer = pl.Trainer(max_epochs=10, accelerator='cuda', devices=1)
trainer.fit(model_gan, dm_only_gan)

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/parsing.py:208: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

  label = torch.tensor(self.data[index][-1], dtype=torch.float32)
  row = torch.tensor(self.data[index][:-1], dtype=torch.float32)
  x = torch.tensor(x, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)
