<a href="https://colab.research.google.com/github/Kcreation25/hybrid/blob/main/hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
uploaded = files.upload()


Saving BitcoinHeistData.csv.zip to BitcoinHeistData.csv.zip
Saving no label.csv to no label.csv


In [None]:
# ==============================================================
# 🔐 HYBRID RANSOMWARE DETECTION MODEL
# Combines Supervised (with SMOTE) + Unsupervised (Isolation Forest)
# Weighted Ensemble: 65% Supervised + 35% Unsupervised
# ==============================================================

import pandas as pd
import numpy as np
import time
from sklearn.ensemble import IsolationForest, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# -----------------------------
# 1️⃣ LOAD DATASETS
# -----------------------------
df_supervised = pd.read_csv("BitcoinHeistData.csv.zip")   # Labeled dataset
df_unsupervised = pd.read_csv("no label.csv")              # Unlabeled dataset

# -----------------------------
# 2️⃣ SUPERVISED MODEL (with SMOTE and Scaling)
# -----------------------------
print("\n🏁 Training Supervised Model with SMOTE and Scaling...")

# Prepare features and target for supervised model
# Classify 'white' as 0 (non-malicious) and all other labels as 1 (malicious)
X = df_supervised.drop("label", axis=1)
y = df_supervised["label"].apply(lambda x: 0 if x == "white" else 1)


# Encode categorical columns
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Before SMOTE:")
print(y_train.value_counts())

# Create a pipeline with SMOTE and RobustScaler
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', RobustScaler()),
    ('hgb', HistGradientBoostingClassifier(max_iter=100, learning_rate=0.1, max_depth=5, random_state=42))
])

start_time = time.time()
pipeline.fit(X_train, y_train)
supervised_time = time.time() - start_time

# Predict on test set
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]  # probability of ransomware


supervised_acc = accuracy_score(y_test, y_pred)

print(f"\n✅ Supervised Model Accuracy (with SMOTE and Scaling): {supervised_acc:.4f}")
print("⏱️ Training Time (Supervised):", round(supervised_time, 3), "seconds")
print("\n📊 Classification Report (Supervised):")
print(classification_report(y_test, y_pred, target_names=["Non-malicious", "Malicious"]))

# -----------------------------
# 3️⃣ UNSUPERVISED MODEL (Isolation Forest)
# -----------------------------
print("\n🏁 Training Unsupervised Model (Isolation Forest)...")

# Drop non-numeric columns
X_unsup = df_unsupervised.drop(columns=["address"], errors="ignore")

# Scale unsupervised data
scaler_unsup = RobustScaler()
X_unsup_scaled = scaler_unsup.fit_transform(X_unsup)


unsup_model = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
    n_jobs=-1
)

start_time = time.time()
unsup_model.fit(X_unsup_scaled)
unsup_time = time.time() - start_time

unsup_scores = -unsup_model.decision_function(X_unsup_scaled)  # higher = more suspicious
unsup_flags = unsup_model.predict(X_unsup_scaled)  # -1 = anomaly

print("⏱️ Training Time (Unsupervised):", round(unsup_time, 3), "seconds")
print(f"Total anomalies detected (unsupervised): {(unsup_flags == -1).sum()}")

# -----------------------------
# 4️⃣ HYBRID ENSEMBLE COMBINATION
# -----------------------------
print("\n🔗 Combining Supervised + Unsupervised Models...")

# Normalize both score sets to 0–1
supervised_scaled = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min())

# Scale X_test before generating unsupervised scores for hybrid
X_test_scaled = scaler_unsup.transform(X_test.drop(columns=["address"], errors="ignore"))
unsup_scores_test = -unsup_model.decision_function(X_test_scaled)
unsup_scaled = (unsup_scores_test - unsup_scores_test.min()) / (unsup_scores_test.max() - unsup_scores_test.min())


hybrid_score = 0.65 * supervised_scaled + 0.35 * unsup_scaled
hybrid_pred = (hybrid_score > 0.5).astype(int)

# -----------------------------
# 5️⃣ HYBRID PERFORMANCE EVALUATION
# -----------------------------
print("\n📊 Hybrid Model Evaluation")
print("Accuracy:", accuracy_score(y_test, hybrid_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, hybrid_pred))
print("\nClassification Report:\n", classification_report(y_test, hybrid_pred))

# -----------------------------
# 6️⃣ TIME TO DETECTION COMPARISON
# -----------------------------
print("\n⏱️ Detection Time Comparison:")
print(f"Supervised Model: {supervised_time:.3f} sec")
print(f"Unsupervised Model: {unsup_time:.3f} sec")
print(f"Hybrid Model (approx): {supervised_time + unsup_time:.3f} sec")

# -----------------------------
# 7️⃣ TEST ON UNKNOWN DATA
# -----------------------------
print("\n🧩 Testing Hybrid Model on Unknown (Unlabeled) Dataset...")

# Scale the unknown data using the same scaler as the unsupervised model
X_unknown_scaled = scaler_unsup.transform(df_unsupervised.drop(columns=["address"], errors="ignore"))
unsup_scores_unknown = -unsup_model.decision_function(X_unknown_scaled)
unsup_scaled_unknown = (unsup_scores_unknown - unsup_scores_unknown.min()) / (unsup_scores_unknown.max() - unsup_scores_unknown.min())


# Combine supervised average probability with unsupervised anomaly scores
hybrid_unsup_score = 0.65 * np.mean(y_prob) + 0.35 * unsup_scaled_unknown
threshold = np.median(hybrid_unsup_score)
unknown_flags = (hybrid_unsup_score > threshold).astype(int)


print(f"🔍 Unknown pattern detection (unlabeled dataset): Detected {unknown_flags.sum()} suspicious entries.")

print("\n✅ Hybrid Model Execution Complete!")


🏁 Training Supervised Model with SMOTE and Scaling...
Before SMOTE:
label
0    2300227
1      33130
Name: count, dtype: int64

✅ Supervised Model Accuracy (with SMOTE and Scaling): 0.9092
⏱️ Training Time (Supervised): 68.583 seconds

📊 Classification Report (Supervised):
               precision    recall  f1-score   support

Non-malicious       0.99      0.91      0.95    575057
    Malicious       0.09      0.60      0.16      8283

     accuracy                           0.91    583340
    macro avg       0.54      0.76      0.56    583340
 weighted avg       0.98      0.91      0.94    583340


🏁 Training Unsupervised Model (Isolation Forest)...
⏱️ Training Time (Unsupervised): 5.828 seconds
Total anomalies detected (unsupervised): 10486

🔗 Combining Supervised + Unsupervised Models...

📊 Hybrid Model Evaluation
Accuracy: 0.9440086399012583

Confusion Matrix:
 [[547070  27987]
 [  4675   3608]]

Classification Report:
               precision    recall  f1-score   support

      