In [None]:
import pandas as pd
import numpy as np
import random
import time
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Location of the transaction dataset, relative to the notebook.
file_path = "transaction_dataset.csv"

# Read the CSV and discard the irrelevant columns in a single chained step.
df = pd.read_csv(file_path).drop(columns=['Unnamed: 0', 'Index', 'Address'])

# 'FLAG' is the target; every other remaining column is a candidate feature.
y = df['FLAG']
X = df.drop(columns='FLAG')

In [4]:
# Show the column labels currently held by df.
df.columns

Index(['FLAG', 'Avg min between sent tnx', 'Avg min between received tnx',
       'Time Diff between first and last (Mins)', 'Sent tnx', 'Received Tnx',
       'Number of Created Contracts', 'Unique Received From Addresses',
       'Unique Sent To Addresses', 'min value received', 'max value received ',
       'avg val received', 'min val sent', 'max val sent', 'avg val sent',
       'min value sent to contract', 'max val sent to contract',
       'avg value sent to contract',
       'total transactions (including tnx to create contract',
       'total Ether sent', 'total ether received',
       'total ether sent contracts', 'total ether balance',
       ' Total ERC20 tnxs', ' ERC20 total Ether received',
       ' ERC20 total ether sent', ' ERC20 total Ether sent contract',
       ' ERC20 uniq sent addr', ' ERC20 uniq rec addr',
       ' ERC20 uniq sent addr.1', ' ERC20 uniq rec contract addr',
       ' ERC20 avg time between sent tnx', ' ERC20 avg time between rec tnx',
       ' ERC20

In [None]:

# Names of the feature columns the model consumes. The order of this list is
# also the column order used whenever a frame is indexed with it.
expected_columns = [
    'Transaction Frequency',
    'Average Sent Amount',
    'Average Received Amount',
    'Unique Sent Addresses',
    'Unique Received Addresses',
    'Transaction Time Consistency',
    'Time Diff between Transactions (Minutes)',
    'Account Age',
    'Total Sent Transactions',
    'Total Received Transactions',
    'Device Fingerprint',
    'IP Address',
    'Geolocation',
]

In [None]:
def feature_engineering(df):
    """
    Generate new features based on Sybil attack detection requirements.

    For every derived feature the raw source column is used when it exists;
    otherwise an already-present derived column is kept, and if neither is
    available the feature is filled with a default (0 for numeric features,
    'Unknown' for Device Fingerprint / IP Address / Geolocation).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding raw transaction columns and/or previously derived ones.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the input columns plus the derived features.
        The input frame itself is left unmodified.
    """
    # Work on a copy: adding columns to the caller's frame would make the
    # function non-idempotent across notebook re-runs and would emit
    # SettingWithCopy warnings when the caller passes a slice.
    df = df.copy()

    # Transaction Frequency and Total Transaction Counts need both raw counters.
    if 'Sent tnx' in df.columns and 'Received Tnx' in df.columns:
        df['Transaction Frequency'] = df['Sent tnx'] + df['Received Tnx']
        df['Total Sent Transactions'] = df['Sent tnx']
        df['Total Received Transactions'] = df['Received Tnx']
    else:
        # Keep existing derived values if present, otherwise default to 0.
        for derived in ('Transaction Frequency',
                        'Total Sent Transactions',
                        'Total Received Transactions'):
            df[derived] = df.get(derived, 0)

    # Derived feature -> raw column it is copied from.
    raw_sources = {
        'Average Sent Amount': 'avg val sent',
        'Average Received Amount': 'avg val received',
        'Unique Sent Addresses': 'Unique Sent To Addresses',
        'Unique Received Addresses': 'Unique Received From Addresses',
        'Transaction Time Consistency': 'Time Diff between first and last (Mins)',
    }
    for derived, raw in raw_sources.items():
        if raw in df.columns:
            df[derived] = df[raw]
        else:
            df[derived] = df.get(derived, 0)

    # Features with no raw source: only filled in when absent.
    fallback_values = {
        'Account Age': 0,
        'Device Fingerprint': 'Unknown',
        'IP Address': 'Unknown',
        'Geolocation': 'Unknown',
        'Time Diff between Transactions (Minutes)': 0,
    }
    for column, default in fallback_values.items():
        if column not in df.columns:
            df[column] = default

    return df

In [None]:

# Apply feature engineering on historical data.
# df is rebound to the frame returned by feature_engineering, which carries
# the derived feature columns in addition to the raw ones.
df = feature_engineering(df)

In [None]:
# The training frame is limited to the expected features plus the 'FLAG'
# target. If the dataset carries no 'FLAG' column, random 0/1 labels are
# generated for demonstration purposes only.
if 'FLAG' not in df.columns:
    df['FLAG'] = np.random.randint(0, 2, size=len(df))
df = df[[*expected_columns, 'FLAG']]

# Separate the target from the feature matrix
y = df['FLAG']
X = df[expected_columns]

##  Build and Train the Model Pipeline

In [None]:

# Identify numerical and categorical columns among the expected features.
# The dtype predicate is used instead of matching the literal names
# 'int64' / 'float64' / 'object': a column with any other dtype (e.g. int32,
# float32, category or a dedicated string dtype) would otherwise end up in
# neither list and never be handed to a transformer. Every non-numeric column
# is treated as categorical, which matches the fallback in align_features.
numerical_cols = [col for col in expected_columns if pd.api.types.is_numeric_dtype(df[col])]
categorical_cols = [col for col in expected_columns if col not in numerical_cols]

In [None]:
# Numeric branch: fill missing values with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode (handle_unknown='ignore' is set on the encoder).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:



# End-to-end model: the preprocessor feeds a random forest classifier whose
# random_state is pinned to 42.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [None]:
# Hold out 20% of the rows (stratified on the target, seed 42) and fit the
# pipeline on the remaining rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
pipeline.fit(X_train, y_train)

In [None]:


# Score the pipeline on the held-out split: overall accuracy first, then the
# per-class precision / recall report.
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("\nTest Accuracy: {:.4f}".format(test_accuracy))
print("\nClassification Report:\n", report_text)

## Real-Time Sybil Attack Detection Simulation

In [None]:


def align_features(df_sim):
    """
    Ensure that the simulation DataFrame has exactly the same columns (and order)
    as defined in expected_columns. Missing columns are filled with default values.

    Relies on the module-level ``expected_columns`` and ``numerical_cols`` lists.

    Parameters
    ----------
    df_sim : pandas.DataFrame
        Frame to align; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``expected_columns``, in that order. A missing
        column is filled with 0 when it is listed in ``numerical_cols`` and
        with 'Unknown' otherwise.
    """
    # Copy first so the default columns are not written into the caller's frame.
    aligned = df_sim.copy()
    for col in expected_columns:
        if col not in aligned.columns:
            aligned[col] = 0 if col in numerical_cols else 'Unknown'
    return aligned[expected_columns]

In [None]:

def simulate_real_time_transaction(rng=None):
    """
    Simulate a single real-time transaction with raw data.

    Parameters
    ----------
    rng : random.Random, optional
        Source of randomness. When omitted, the global ``random`` module is
        used, exactly as before. Passing a seeded ``random.Random`` instance
        makes the simulated values reproducible.

    Returns
    -------
    dict
        Raw transaction fields keyed by column name. 'Timestamp' is the
        current ``datetime.now()`` and is therefore never reproducible.
    """
    if rng is None:
        rng = random

    transaction_data = {
        'Sent tnx': rng.randint(1, 10),
        'Received Tnx': rng.randint(1, 10),
        'avg val sent': rng.uniform(0.01, 5.0),
        'avg val received': rng.uniform(0.01, 5.0),
        'Unique Sent To Addresses': rng.randint(1, 20),
        'Unique Received From Addresses': rng.randint(1, 20),
        'Time Diff between first and last (Mins)': rng.uniform(1, 120),
        'Timestamp': datetime.now(),
        'Device Fingerprint': rng.choice(['DeviceA', 'DeviceB', 'DeviceC']),
        'IP Address': rng.choice(['192.168.0.1', '192.168.0.2']),
        'Geolocation': rng.choice(['US', 'EU', 'Asia']),
        'Account Age': rng.randint(1, 100)
    }
    return transaction_data

In [None]:



def feature_engineering_for_real_time(transaction_data, previous_data=None):
    """
    Turn one raw transaction into a single-row feature frame for prediction.

    The derived features are computed from ``transaction_data``; the row is
    then passed through ``feature_engineering`` and ``align_features`` so that
    it ends up with the expected feature columns.

    ``previous_data`` is the preceding raw transaction (or None). When given,
    the gap between the two 'Timestamp' values is reported in minutes;
    otherwise the gap is 0.
    """
    # Raw fields the derived features are built from
    sent_count = transaction_data['Sent tnx']
    received_count = transaction_data['Received Tnx']
    mean_sent = transaction_data['avg val sent']
    mean_received = transaction_data['avg val received']
    distinct_sent = transaction_data['Unique Sent To Addresses']
    distinct_received = transaction_data['Unique Received From Addresses']
    first_to_last_minutes = transaction_data['Time Diff between first and last (Mins)']

    # Minutes elapsed since the previous transaction, if there was one
    if previous_data is None:
        minutes_since_previous = 0
    else:
        elapsed = transaction_data['Timestamp'] - previous_data['Timestamp']
        minutes_since_previous = elapsed.total_seconds() / 60.0

    # One record holding every feature used for prediction
    record = {
        'Transaction Frequency': sent_count + received_count,
        'Average Sent Amount': mean_sent,
        'Average Received Amount': mean_received,
        'Unique Sent Addresses': distinct_sent,
        'Unique Received Addresses': distinct_received,
        'Transaction Time Consistency': first_to_last_minutes,
        'Time Diff between Transactions (Minutes)': minutes_since_previous,
        'Account Age': transaction_data['Account Age'],
        'Total Sent Transactions': transaction_data['Sent tnx'],
        'Total Received Transactions': transaction_data['Received Tnx'],
        'Device Fingerprint': transaction_data['Device Fingerprint'],
        'IP Address': transaction_data['IP Address'],
        'Geolocation': transaction_data['Geolocation']
    }

    # Single-row frame -> fill any missing derived features -> align columns
    single_row = pd.DataFrame([record])
    return align_features(feature_engineering(single_row))

In [None]:


def real_time_monitoring(num_transactions=10, delay=2):
    """
    Run a simulated stream of transactions through the trained pipeline.

    For each of ``num_transactions`` iterations a transaction is simulated,
    converted to model features, scored with the module-level ``pipeline``,
    and the verdict is printed. The function sleeps ``delay`` seconds after
    every transaction and returns nothing.
    """
    print("\n--- Starting Real-Time Sybil Attack Detection Simulation ---")

    # Raw transaction from the preceding iteration (None on the first pass);
    # needed to compute the time gap between consecutive transactions.
    previous_data = None

    for i in range(num_transactions):
        current = simulate_real_time_transaction()
        model_input = feature_engineering_for_real_time(current, previous_data)

        # Probability of class 1 and the hard label, both from the pipeline
        sybil_prob = pipeline.predict_proba(model_input)[0, 1]
        prediction = pipeline.predict(model_input)[0]

        print(f"\nTransaction {i + 1}:")
        print(f"Predicted Sybil Attack Probability: {sybil_prob:.4f}")
        if prediction != 1:
            print("Transaction appears legitimate. Proceeding with processing.")
        else:
            print("⚠️ Sybil Attack Detected! Stopping transaction.")

        previous_data = current
        time.sleep(delay)

    print("\n--- Simulation Completed ---")

# Run the real-time Sybil attack monitoring simulation:
# 10 simulated transactions, with a 2-second pause after each one.
real_time_monitoring(num_transactions=10, delay=2)



Test Accuracy: 0.9340

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96      1533
           1       0.91      0.78      0.84       436

    accuracy                           0.93      1969
   macro avg       0.93      0.88      0.90      1969
weighted avg       0.93      0.93      0.93      1969


--- Starting Real-Time Sybil Attack Detection Simulation ---

Transaction 1:
Predicted Sybil Attack Probability: 0.4900
Transaction appears legitimate. Proceeding with processing.

Transaction 2:
Predicted Sybil Attack Probability: 0.3600
Transaction appears legitimate. Proceeding with processing.

Transaction 3:
Predicted Sybil Attack Probability: 0.5700
⚠️ Sybil Attack Detected! Stopping transaction.

Transaction 4:
Predicted Sybil Attack Probability: 0.3900
Transaction appears legitimate. Proceeding with processing.

Transaction 5:
Predicted Sybil Attack Probability: 0.2100
Transaction appears legitimate. Proceedi

In [9]:
import joblib

# Persist the ordered list of feature column names taken from X.
training_columns = X.columns.tolist()
joblib.dump(training_columns, "feature_columns.pkl")
# This cell writes only the feature-column list; the model is dumped by a
# separate joblib.dump call, so the message reports just this file.
print("Feature columns saved as feature_columns.pkl!")

Model saved as model.pkl and feature columns saved as feature_columns.pkl!


In [None]:

# Serialize the whole pipeline object (preprocessor + classifier) to model.pkl.
joblib.dump(pipeline, "model.pkl")

['model.pkl']

: 