In [None]:

import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Select the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Load the ransomware transaction spreadsheet.
# The location can be overridden with the RANSOMWARE_DATASET_PATH environment
# variable so the notebook is not tied to a single machine; the original
# Windows path is kept as the default.
import os

dataset_path = os.environ.get(
    "RANSOMWARE_DATASET_PATH",
    "C:\\uit\\elliptic\\Ransomware_Transaction_Dataset.xlsx",
)
df = pd.read_excel(dataset_path)

# Show the column names actually present in the file.
print("Cột trong dataset:", df.columns.tolist())

# Keep only the columns used downstream; .copy() detaches the result from the loaded frame.
df = df[['Senders Address', 'Sent Amount(BTC)', 'Receivers Address', 'Received Amount(BTC)',
         'Date and Time', 'Tier (I,II, III, IV)', 'Ransomware Family', 'Transaction Flag']].copy()

# Convert 'Date and Time' into 'Date Numeric': seconds since the earliest valid
# timestamp divided by the full time span. Unparseable timestamps become NaT
# (errors='coerce') and receive 0.
df['Date and Time'] = pd.to_datetime(df['Date and Time'], errors='coerce')
valid_dates = df['Date and Time'].notna()
if not valid_dates.any():
    # No parseable timestamp at all.
    df['Date Numeric'] = 0
else:
    date_series = df.loc[valid_dates, 'Date and Time']
    min_time, max_time = date_series.min(), date_series.max()
    if pd.isna(min_time) or pd.isna(max_time):
        df['Date Numeric'] = 0
    else:
        time_diff = (date_series - min_time).dt.total_seconds()
        time_range = (max_time - min_time).total_seconds()
        # A zero span means every valid timestamp is identical; use 0 instead of dividing.
        if time_range != 0:
            scaled_time = time_diff / time_range
        else:
            scaled_time = 0
        df.loc[valid_dates, 'Date Numeric'] = scaled_time
        df.loc[~valid_dates, 'Date Numeric'] = 0

# Integer-encode the tier column; missing tiers are replaced by 'Base' first.
tier_encoder = LabelEncoder()
tier_values = df['Tier (I,II, III, IV)'].fillna('Base')
df['Tier Encoded'] = tier_encoder.fit_transform(tier_values)

# Integer-encode the ransomware family; missing families are replaced by 'Netwalker' first.
family_encoder = LabelEncoder()
family_values = df['Ransomware Family'].fillna('Netwalker')
df['Family Encoded'] = family_encoder.fit_transform(family_values)

# Binary label: 'Normal' -> 0, 'Suspicious' -> 1; any other flag value falls back to 0.
flag_to_label = {'Normal': 0, 'Suspicious': 1}
df['Label'] = df['Transaction Flag'].map(flag_to_label).fillna(0)

# Split into train / validation / test (60% / 20% / 20%) before scaling.
train_df = df.sample(frac=0.6, random_state=42)
temp_df = df.drop(train_df.index)
val_df = temp_df.sample(frac=0.5, random_state=42)
test_df = temp_df.drop(val_df.index)

# Min-max scale the BTC amount columns: the scaler is fitted on the training
# rows only and then applied unchanged to the validation and test rows.
numeric_cols = ['Sent Amount(BTC)', 'Received Amount(BTC)']
train_numeric_cols, val_numeric_cols, test_numeric_cols = (
    split[numeric_cols] for split in (train_df, val_df, test_df)
)
scaler = MinMaxScaler().fit(train_numeric_cols)
train_df[numeric_cols] = scaler.transform(train_numeric_cols)
val_df[numeric_cols] = scaler.transform(val_numeric_cols)
test_df[numeric_cols] = scaler.transform(test_numeric_cols)

# Stitch the three scaled splits back together in the original row order.
split_frames = [train_df, val_df, test_df]
df_combined = pd.concat(split_frames).sort_index()

# One graph node per transaction: feature matrix and label vector as NumPy arrays.
feature_columns = ['Sent Amount(BTC)', 'Received Amount(BTC)', 'Date Numeric', 'Tier Encoded', 'Family Encoded']
node_features = df_combined[feature_columns].to_numpy()
node_labels = df_combined['Label'].to_numpy()

# Build the transaction graph through an address lookup instead of an O(n^2)
# pairwise comparison. Nodes are row positions in df_combined; an edge i -> j is
# added when the receiver address of transaction i is the sender address of
# transaction j.
from collections import defaultdict

# Pull the address columns out once; per-row df.iloc[...] lookups are very slow.
receiver_addresses = df_combined['Receivers Address'].to_numpy()
sender_addresses = df_combined['Senders Address'].to_numpy()

# Map each receiver address to the positions of the transactions that paid into it.
receiver_to_tx = defaultdict(list)
for i, receiver in enumerate(receiver_addresses):
    # Missing addresses carry no identity, so they must not connect transactions.
    if pd.notna(receiver):
        receiver_to_tx[receiver].append(i)

edge_index = []
for j, sender in enumerate(sender_addresses):
    if pd.isna(sender):
        continue
    # .get() avoids inserting empty lists into the defaultdict for unknown senders.
    for i in receiver_to_tx.get(sender, ()):
        if i != j:  # no self-loops
            edge_index.append([i, j])  # from tx_i (receiver side) to tx_j (sender side)

if edge_index:
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
else:
    # An empty list would otherwise become a 1-D tensor of shape [0]; graph
    # layers expect connectivity of shape [2, num_edges].
    edge_index = torch.empty((2, 0), dtype=torch.long)


In [None]:


# Wrap features, connectivity and labels in a Data object, with every tensor
# placed on the selected device.
x_tensor = torch.tensor(node_features, dtype=torch.float).to(device)
y_tensor = torch.tensor(node_labels, dtype=torch.long).to(device)
data = Data(x=x_tensor, edge_index=edge_index.to(device), y=y_tensor)

# GCN model definition
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network returning per-node log-probabilities."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two GCNConv layers: input_dim -> hidden_dim -> output_dim."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply conv1, ReLU and conv2 to data.x over data.edge_index, then log-softmax along dim 1."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

# Model sizes: the input width follows the feature matrix; 16 hidden units; 2 output classes.
input_dim = node_features.shape[1]
hidden_dim, output_dim = 16, 2

# Build the network and move it to the selected device before the optimizer
# collects its parameters.
model = GCN(input_dim, hidden_dim, output_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Build boolean train/val/test node masks.
# Node i of the graph is row position i of df_combined, so the index labels of
# each split are translated to row positions explicitly instead of being used
# as tensor positions directly (which is only correct for a 0..n-1 index).
num_nodes = len(node_labels)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_indices = train_df.index
val_indices = val_df.index
test_indices = test_df.index

# get_indexer returns, for every label, its position inside df_combined.index.
train_positions = torch.as_tensor(df_combined.index.get_indexer(train_indices), dtype=torch.long)
val_positions = torch.as_tensor(df_combined.index.get_indexer(val_indices), dtype=torch.long)
test_positions = torch.as_tensor(df_combined.index.get_indexer(test_indices), dtype=torch.long)

train_mask[train_positions] = True
val_mask[val_positions] = True
test_mask[test_positions] = True

data.train_mask = train_mask.to(device)
data.val_mask = val_mask.to(device)
data.test_mask = test_mask.to(device)

# Train for up to 200 epochs with early stopping on the validation loss.
best_val_loss = float('inf')
patience = 20  # epochs without validation improvement tolerated before stopping
early_stop_counter = 0

for epoch in range(200):
    # Training step on the training nodes only.
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Validation: run a fresh forward pass in eval mode so the loss reflects the
    # weights that were just updated (and that get checkpointed below), rather
    # than reusing the train-mode output computed before the optimizer step.
    model.eval()
    with torch.no_grad():
        val_out = model(data)
        # .item() keeps a plain float instead of holding on to a tensor.
        val_loss = criterion(val_out[data.val_mask], data.y[data.val_mask]).item()

    if val_loss < best_val_loss:
        # New best: checkpoint the weights and reset the patience counter.
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_gcn.pth')
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}')

# Restore the checkpoint written by the training loop.
model.load_state_dict(torch.load('best_gcn.pth'))

# Evaluate on the test nodes.
model.eval()
with torch.no_grad():
    out = model(data)
    _, pred = out.max(dim=1)  # predicted class = index of the largest log-probability
    pred = pred[data.test_mask].cpu().numpy()
    true = data.y[data.test_mask].cpu().numpy()

# Fix the reported classes to [0, 1] so the per-class arrays always have two
# entries; otherwise a test split (and predictions) containing a single class
# yields length-1 arrays and the [1] lookups below raise IndexError.
# zero_division=0 scores a class with no predicted/true samples as 0.
class_labels = [0, 1]
acc = accuracy_score(true, pred)
precision = precision_score(true, pred, average=None, labels=class_labels, zero_division=0)
recall = recall_score(true, pred, average=None, labels=class_labels, zero_division=0)
f1 = f1_score(true, pred, average=None, labels=class_labels, zero_division=0)
micro_f1 = f1_score(true, pred, average='micro')

print(f'Accuracy: {acc:.4f}')
print(f"Class 0 (Normal) - Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1: {f1[0]:.4f}")
print(f"Class 1 (Suspicious) - Precision: {precision[1]:.4f}, Recall: {recall[1]:.4f}, F1: {f1[1]:.4f}")
print(f"Micro F1: {micro_f1:.4f}")

In [None]:
# PyTorch Geometric setup: the package must already be installed (this cell contains no install command)


import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler

# Load the dataset from Kaggle.
# Assumes the file was uploaded to Kaggle Datasets as "Ransomware_Transaction_Dataset.xlsx".
kaggle_dataset_path = "/kaggle/input/ransomcoin/Ransomware_Transaction_Dataset.xlsx"
df = pd.read_excel(kaggle_dataset_path)

In [None]:
# Show the column names actually present in the loaded frame.
column_names = df.columns.tolist()
print("Cột trong dataset:", column_names)

In [None]:
# Keep only the columns used by the rest of the notebook; .copy() detaches the result.
required_columns = [
    'Senders Address',
    'Sent Amount(BTC)',
    'Receivers Address',
    'Received Amount(BTC)',
    'Date and Time',
    'Tier (I,II, III, IV)',
    'Ransomware Family',
    'Transaction Flag',
]
df = df[required_columns].copy()

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Data preparation
# Convert 'Date and Time' into 'Date Numeric': seconds since the earliest valid
# timestamp divided by the full time span. Unparseable timestamps become NaT
# (errors='coerce') and receive 0.
df['Date and Time'] = pd.to_datetime(df['Date and Time'], errors='coerce')
valid_dates = df['Date and Time'].notna()
if not valid_dates.any():
    # No parseable timestamp at all.
    df['Date Numeric'] = 0
else:
    date_series = df.loc[valid_dates, 'Date and Time']
    min_time, max_time = date_series.min(), date_series.max()
    if pd.isna(min_time) or pd.isna(max_time):
        df['Date Numeric'] = 0
    else:
        time_diff = (date_series - min_time).dt.total_seconds()
        time_range = (max_time - min_time).total_seconds()
        # A zero span means every valid timestamp is identical; use 0 instead of dividing.
        if time_range != 0:
            scaled_time = time_diff / time_range
        else:
            scaled_time = 0
        df.loc[valid_dates, 'Date Numeric'] = scaled_time
        df.loc[~valid_dates, 'Date Numeric'] = 0

# Feature construction
# Integer-encode the tier column; missing tiers are replaced by 'Base' first.
tier_encoder = LabelEncoder()
df['Tier Encoded'] = tier_encoder.fit_transform(df['Tier (I,II, III, IV)'].fillna('Base'))

# Integer-encode the ransomware family; missing families are replaced by 'Netwalker' first.
family_encoder = LabelEncoder()
df['Family Encoded'] = family_encoder.fit_transform(df['Ransomware Family'].fillna('Netwalker'))

# Binary label: 'Normal' -> 0, 'Suspicious' -> 1.
# Flags outside the mapping (including missing values) would otherwise become
# NaN labels, which the classifiers and the integer tensor conversion cannot
# handle; they fall back to 0 and the column is stored as integers.
df['Label'] = df['Transaction Flag'].map({'Normal': 0, 'Suspicious': 1}).fillna(0).astype(int)

# Columns holding BTC amounts that get min-max scaled.
numeric_cols = ['Sent Amount(BTC)', 'Received Amount(BTC)']

# Split 60/20/20 BEFORE any scaling. The earlier whole-dataset fit_transform
# was removed: it fitted the scaler on validation/test rows and overwrote df
# in place before the split.
train_df = df.sample(frac=0.6, random_state=42)  # 60% train
temp_df = df.drop(train_df.index)
val_df = temp_df.sample(frac=0.5, random_state=42)  # 20% val
test_df = temp_df.drop(val_df.index)  # 20% test

# Fit the scaler on the training rows only, then apply it to every split.
scaler = MinMaxScaler()
train_numeric_cols = train_df[numeric_cols]
val_numeric_cols = val_df[numeric_cols]
test_numeric_cols = test_df[numeric_cols]
scaler.fit(train_numeric_cols)
train_df[numeric_cols] = scaler.transform(train_numeric_cols)
val_df[numeric_cols] = scaler.transform(val_numeric_cols)
test_df[numeric_cols] = scaler.transform(test_numeric_cols)

# Feature matrix and label vector for each split.
features = ['Sent Amount(BTC)', 'Received Amount(BTC)', 'Date Numeric', 'Tier Encoded', 'Family Encoded']
X_train, y_train = train_df[features], train_df['Label']
X_val, y_val = val_df[features], val_df['Label']
X_test, y_test = test_df[features], test_df['Label']

In [None]:
# Train the random forest baseline (100 trees, fixed seed).
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
# Evaluate the random forest on the test split.
rf_pred = rf_model.predict(X_test)

# Fix the reported classes to [0, 1] so the per-class arrays always have two
# entries; otherwise a test split (and predictions) containing a single class
# yields length-1 arrays and the [1] lookups below raise IndexError.
# zero_division=0 scores a class with no predicted/true samples as 0.
rf_class_labels = [0, 1]
rf_acc = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred, average=None, labels=rf_class_labels, zero_division=0)
rf_recall = recall_score(y_test, rf_pred, average=None, labels=rf_class_labels, zero_division=0)
rf_f1 = f1_score(y_test, rf_pred, average=None, labels=rf_class_labels, zero_division=0)
rf_micro_f1 = f1_score(y_test, rf_pred, average='micro')

print("Random Forest Results:")
print(f"Accuracy: {rf_acc:.4f}")
print(f"Class 0 (Normal) - Precision: {rf_precision[0]:.4f}, Recall: {rf_recall[0]:.4f}, F1: {rf_f1[0]:.4f}")
print(f"Class 1 (Suspicious) - Precision: {rf_precision[1]:.4f}, Recall: {rf_recall[1]:.4f}, F1: {rf_f1[1]:.4f}")
print(f"Micro F1: {rf_micro_f1:.4f}")

In [None]:
# Train XGBoost with early stopping monitored on the validation split.
# early_stopping_rounds is configured on the estimator: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0, where the old call raises a
# TypeError. use_label_encoder is dropped because those releases no longer use
# it (labels here are already 0/1).
# NOTE(review): this form requires XGBoost >= 1.6 — confirm the pinned version.
xgb_model = XGBClassifier(eval_metric='logloss', early_stopping_rounds=10, random_state=42)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
# Evaluate XGBoost on the test split.
xgb_pred = xgb_model.predict(X_test)

# Fix the reported classes to [0, 1] so the per-class arrays always have two
# entries; otherwise a test split (and predictions) containing a single class
# yields length-1 arrays and the [1] lookups below raise IndexError.
# zero_division=0 scores a class with no predicted/true samples as 0.
xgb_class_labels = [0, 1]
xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_precision = precision_score(y_test, xgb_pred, average=None, labels=xgb_class_labels, zero_division=0)
xgb_recall = recall_score(y_test, xgb_pred, average=None, labels=xgb_class_labels, zero_division=0)
xgb_f1 = f1_score(y_test, xgb_pred, average=None, labels=xgb_class_labels, zero_division=0)
xgb_micro_f1 = f1_score(y_test, xgb_pred, average='micro')

print("\nXGBoost Results:")
print(f"Accuracy: {xgb_acc:.4f}")
print(f"Class 0 (Normal) - Precision: {xgb_precision[0]:.4f}, Recall: {xgb_recall[0]:.4f}, F1: {xgb_f1[0]:.4f}")
print(f"Class 1 (Suspicious) - Precision: {xgb_precision[1]:.4f}, Recall: {xgb_recall[1]:.4f}, F1: {xgb_f1[1]:.4f}")
print(f"Micro F1: {xgb_micro_f1:.4f}")

In [None]:
# Min-max scale the two BTC amount columns of df in place.
# NOTE(review): the scaler is fitted on every row of df here, not only on
# training rows — confirm this is intended for the features built from df.
scaler = MinMaxScaler()
numeric_cols = ['Sent Amount(BTC)', 'Received Amount(BTC)']
df.loc[:, numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:


# Make sure 'Date Numeric' exists; it is recomputed only when the column is missing.
if 'Date Numeric' not in df.columns:
    # errors='coerce' turns unparseable timestamps into NaT instead of raising.
    df['Date and Time'] = pd.to_datetime(df['Date and Time'], errors='coerce')
    # Seconds elapsed since the earliest valid timestamp (NaN where the timestamp is NaT).
    elapsed_seconds = (df['Date and Time'] - df['Date and Time'].min()).dt.total_seconds()
    total_span = elapsed_seconds.max()
    if pd.notna(total_span) and total_span != 0:
        # Scale to [0, 1]; rows without a valid timestamp get 0.
        df['Date Numeric'] = (elapsed_seconds / total_span).fillna(0)
    else:
        # No valid timestamp, or all timestamps identical: avoid dividing by zero.
        df['Date Numeric'] = 0

# Continue with create_node_features...

In [None]:
# Per-transaction (node) features - supplementary helper for the GNN
def create_transaction_features(df):
    """Build per-transaction feature rows and binary labels from a transaction frame.

    Each row contributes [sent amount, received amount, 'Date Numeric', tier code,
    family code]. Tier and family values that are not in the lookup tables are
    coded as 0. The label is 1 when 'Transaction Flag' equals 'Suspicious',
    otherwise 0.

    Returns:
        A tuple (node_features, node_labels) of NumPy arrays.
    """
    tier_codes = {'Base': 0, 'Tier One': 1, 'Tier Two': 2, 'Tier Three': 3, 'Tier Four': 4}
    family_codes = {'Netwalker': 1, 'Qlocker': 2, 'DarkSide': 3}

    # Walk the six needed columns in lockstep instead of materialising each row.
    records = zip(
        df['Sent Amount(BTC)'],
        df['Received Amount(BTC)'],
        df['Date Numeric'],
        df['Tier (I,II, III, IV)'],
        df['Ransomware Family'],
        df['Transaction Flag'],
    )

    node_features, node_labels = [], []
    for sent, received, when, tier, family, flag in records:
        node_features.append(
            [sent, received, when, tier_codes.get(tier, 0), family_codes.get(family, 0)]
        )
        node_labels.append(int(flag == 'Suspicious'))
    return np.array(node_features), np.array(node_labels)

# Build the edge list from df, which is assumed to be already scaled.
# Node ids are row POSITIONS (0..n-1), so they line up with feature matrices
# built from df's values even when the index labels are not 0..n-1.
receiver_addresses = df['Receivers Address'].to_numpy()
sender_addresses = df['Senders Address'].to_numpy()

# Map each receiver address to the positions of the transactions that paid into it.
receiver_to_tx_ids = {}
for i, receiver in enumerate(receiver_addresses):
    # Missing addresses carry no identity, so they must not connect transactions.
    if pd.notna(receiver):
        receiver_to_tx_ids.setdefault(receiver, []).append(i)

# An edge i -> j is added when the receiver of tx_i is the sender of tx_j.
edge_index = []
for j, sender in enumerate(sender_addresses):
    if pd.isna(sender):
        continue
    for i in receiver_to_tx_ids.get(sender, ()):
        if i != j:  # no self-loops
            edge_index.append([i, j])  # from tx_i (receiver side) to tx_j (sender side)

if edge_index:
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
else:
    # An empty list would otherwise become a 1-D tensor of shape [0]; graph
    # layers expect connectivity of shape [2, num_edges].
    edge_index = torch.empty((2, 0), dtype=torch.long)

In [None]:
# One graph node per transaction: feature matrix and label vector taken from df.
feature_columns = ['Sent Amount(BTC)', 'Received Amount(BTC)', 'Date Numeric', 'Tier Encoded', 'Family Encoded']
node_features = df[feature_columns].to_numpy()
node_labels = df['Label'].to_numpy()

# Wrap features, connectivity and labels in a Data object.
x_tensor = torch.tensor(node_features, dtype=torch.float)
y_tensor = torch.tensor(node_labels, dtype=torch.long)
data = Data(x=x_tensor, edge_index=edge_index, y=y_tensor)

# GCN model definition
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network returning per-node log-probabilities."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two GCNConv layers: input_dim -> hidden_dim -> output_dim."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Apply conv1, ReLU and conv2 to data.x over data.edge_index, then log-softmax along dim 1."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

In [None]:
# Stratified 60/20/20 train/val/test split over node positions.
from sklearn.model_selection import train_test_split

idx = np.arange(len(node_labels))
train_idx, temp_idx = train_test_split(
    idx, test_size=0.4, stratify=node_labels, random_state=42
)
val_idx, test_idx = train_test_split(
    temp_idx, test_size=0.5, stratify=node_labels[temp_idx], random_state=42
)

# One boolean mask per split, set to True at the positions assigned to that split.
train_mask, val_mask, test_mask = (
    torch.zeros(len(node_labels), dtype=torch.bool) for _ in range(3)
)
for split_mask, split_idx in ((train_mask, train_idx), (val_mask, val_idx), (test_mask, test_idx)):
    split_mask[split_idx] = True

data.train_mask, data.val_mask, data.test_mask = train_mask, val_mask, test_mask

In [None]:
# Stitch the three splits back together in the original row order.
split_frames = [train_df, val_df, test_df]
df_combined = pd.concat(split_frames).sort_index()

In [None]:
# Train for up to 200 epochs with early stopping on the validation loss.
# NOTE(review): model, optimizer and criterion are not created in this part of
# the notebook; they must already exist from an earlier cell — confirm that
# reusing them here is intended.

# The graph must live on the same device as the model parameters, otherwise the
# forward pass fails when the model is on the GPU and the data is on the CPU.
data = data.to(next(model.parameters()).device)

best_val_loss = float('inf')
patience = 20  # epochs without validation improvement tolerated before stopping
early_stop_counter = 0

for epoch in range(200):
    # Training step on the training nodes only.
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Validation: run a fresh forward pass in eval mode so the loss reflects the
    # weights that were just updated (and that get checkpointed below), rather
    # than reusing the train-mode output computed before the optimizer step.
    model.eval()
    with torch.no_grad():
        val_out = model(data)
        # .item() keeps a plain float instead of holding on to a tensor.
        val_loss = criterion(val_out[data.val_mask], data.y[data.val_mask]).item()

    if val_loss < best_val_loss:
        # New best: checkpoint the weights and reset the patience counter.
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_gcn.pth')
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}')

In [None]:
# Restore the checkpoint written by the training loop.
model.load_state_dict(torch.load('best_gcn.pth'))

# Evaluate on the test nodes.
model.eval()
with torch.no_grad():
    out = model(data)
    _, pred = out.max(dim=1)  # predicted class = index of the largest log-probability
    pred = pred[data.test_mask].cpu().numpy()
    true = data.y[data.test_mask].cpu().numpy()

# Fix the reported classes to [0, 1] so the per-class arrays always have two
# entries; otherwise a test split (and predictions) containing a single class
# yields length-1 arrays and the [1] lookups below raise IndexError.
# zero_division=0 scores a class with no predicted/true samples as 0.
class_labels = [0, 1]
acc = accuracy_score(true, pred)
precision = precision_score(true, pred, average=None, labels=class_labels, zero_division=0)
recall = recall_score(true, pred, average=None, labels=class_labels, zero_division=0)
f1 = f1_score(true, pred, average=None, labels=class_labels, zero_division=0)
micro_f1 = f1_score(true, pred, average='micro')

print(f'Accuracy: {acc:.4f}')
print(f"Class 0 (Normal) - Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1: {f1[0]:.4f}")
print(f"Class 1 (Suspicious) - Precision: {precision[1]:.4f}, Recall: {recall[1]:.4f}, F1: {f1[1]:.4f}")
print(f"Micro F1: {micro_f1:.4f}")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate on the test nodes.
model.eval()
with torch.no_grad():
    out = model(data)
    _, pred = out.max(dim=1)  # predicted class = index of the largest log-probability
    pred = pred[data.test_mask].cpu().numpy()  # predictions as a NumPy array
    true = data.y[data.test_mask].cpu().numpy()  # ground-truth labels as a NumPy array

# Per-class precision, recall and F1.
# Fix the reported classes to [0, 1] so the arrays always have two entries;
# otherwise a test split (and predictions) containing a single class yields
# length-1 arrays and the [1] lookups below raise IndexError.
# zero_division=0 scores a class with no predicted/true samples as 0.
class_labels = [0, 1]
precision = precision_score(true, pred, average=None, labels=class_labels, zero_division=0)
recall = recall_score(true, pred, average=None, labels=class_labels, zero_division=0)
f1 = f1_score(true, pred, average=None, labels=class_labels, zero_division=0)

# Report the per-class results.
print(f"Class 0 (Normal) - Precision: {precision[0]:.4f}, Recall: {recall[0]:.4f}, F1: {f1[0]:.4f}")
print(f"Class 1 (Suspicious) - Precision: {precision[1]:.4f}, Recall: {recall[1]:.4f}, F1: {f1[1]:.4f}")

# Compute and report the micro-averaged F1.
micro_f1 = f1_score(true, pred, average='micro')
print(f"Micro F1: {micro_f1:.4f}")