In [1]:
import pandas as pd
import random
import uuid
import time

def generate_fake_transaction_data(n=5000, seed=None, output_path="synthetic_transactions.csv"):
    """Generate a synthetic transaction dataset and write it to a CSV file.

    Each row holds a random UUID, random 40-hex-digit sender/receiver
    addresses, and uniformly drawn numeric features. The ``is_fraud`` label is
    assigned by a deterministic rule on those features (see below), not sampled.

    Args:
        n: Number of rows to generate. ``0`` produces a header-only CSV.
        seed: Optional seed for the private random generator. With a fixed
            seed every column except ``timestamp`` is reproducible;
            ``timestamp`` is an offset from the wall clock at call time.
            ``None`` (the default) seeds from system entropy.
        output_path: Destination of the CSV file. Defaults to
            ``"synthetic_transactions.csv"`` in the current working directory.

    Returns:
        None. The dataset is written to ``output_path``.
    """
    # Private generator: seeding it never disturbs the global `random` state.
    rng = random.Random(seed)
    hex_digits = 'abcdef0123456789'
    # Read the clock once so every row is offset from the same reference time.
    now = int(time.time())

    data = []
    for _ in range(n):
        # Version-4 UUID built from the seeded generator so IDs are reproducible.
        transaction_id = str(uuid.UUID(int=rng.getrandbits(128), version=4))
        sender = "0x" + ''.join(rng.choices(hex_digits, k=40))
        receiver = "0x" + ''.join(rng.choices(hex_digits, k=40))
        amount = round(rng.uniform(0.001, 100), 4)
        timestamp = now - rng.randint(1, 31536000)  # up to 365 days in the past
        gas_fee = round(rng.uniform(0.0001, 0.1), 6)
        transaction_count = rng.randint(1, 500)
        wallet_age = rng.randint(1, 365)

        # Rule-based label: high gas fee combined with a high transaction
        # count, or a large amount. With the uniform ranges above this marks
        # roughly 36% of rows as fraud (0.2 + 0.2 - 0.04), not 10%.
        is_fraud = 1 if (gas_fee > 0.05 and transaction_count > 300) or (amount > 80) else 0

        data.append([transaction_id, sender, receiver, amount, timestamp, gas_fee, transaction_count, wallet_age, is_fraud])

    df = pd.DataFrame(data, columns=["transaction_id", "sender_address", "receiver_address", "amount", "timestamp", "gas_fee",
                                     "transaction_count", "wallet_age", "is_fraud"])
    df.to_csv(output_path, index=False)

# Build the synthetic dataset with the default row count (n=5000) and write it
# to "synthetic_transactions.csv" in the current working directory.
generate_fake_transaction_data()


In [2]:
import pandas as pd

# Read the generated transactions back from the CSV written by the previous cell
df = pd.read_csv("synthetic_transactions.csv")

# Preview the first five rows as plain text
print(df.head(5))


                         transaction_id  \
0  468cdbfc-7656-4ba1-b19a-63c48641742d   
1  54f0ad62-1504-4492-9b9b-a52fb755deb9   
2  2a863a31-e577-416a-8403-0abd52e2207e   
3  85ca3850-d2b9-4a3c-aec9-2085ec97e9e4   
4  13b33c3b-88c5-4bfd-ab3a-71c548255738   

                               sender_address  \
0  0xacf553a6c48626b8352391171f35474c1d5fa884   
1  0x890bd96226890bc0fc02af0c5199550d4bf74a5f   
2  0x36440a7ac59ea20733dcf64fbd114cd2e5884bc2   
3  0x07c98f452383a76cf60e2813567885e0c6fda4b1   
4  0xca5818a381cac32c3106c62393fa73c90a002b1e   

                             receiver_address   amount   timestamp   gas_fee  \
0  0xd5ad3554f84e5b4763be790c92044914ec6bc9fb  36.0507  1716224983  0.078713   
1  0x84bae551539f7c59fcc7974f2ee38b671391c07c   8.4255  1727036627  0.067098   
2  0x7249fb7c701ee8db060bc37f062bc1dbc7618f6a  71.0799  1724626429  0.083439   
3  0x20952be50ca832907ffc048fe731d9a7e6e62d3b  69.0019  1734362615  0.068309   
4  0x5fa0ced60a9bc916e6d7b07edfcf2665de888d0d 

In [3]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Feature matrix (numeric columns only) and the rule-based ground-truth label
features = ["amount", "gas_fee", "transaction_count", "wallet_age"]
X, y = df[features], df["is_fraud"]

# Hold out 20% of the rows for evaluation; fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the Isolation Forest on the training features only (labels are not used)
model = IsolationForest(
    n_estimators=100,
    contamination=0.1,
    random_state=42,
).fit(X_train)

# predict() yields -1 for anomalies and 1 for normal points;
# remap to 1 = flagged as fraud, 0 = normal so it lines up with `is_fraud`
y_pred = [int(label == -1) for label in model.predict(X_test)]

# Score the flagged anomalies against the held-out labels
precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

Precision: 0.5221, Recall: 0.1630, F1-score: 0.2484
