<a href="https://colab.research.google.com/github/Kcreation25/hybrid/blob/main/hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
uploaded = files.upload()


Saving BitcoinHeistData.csv.zip to BitcoinHeistData.csv.zip
Saving no label.csv to no label.csv


In [None]:
# ==============================================================
# 🔐 HYBRID RANSOMWARE DETECTION MODEL
# Combines Supervised (with SMOTE) + Unsupervised (Isolation Forest)
# Weighted Ensemble: 65% Supervised + 35% Unsupervised
# ==============================================================

import pandas as pd
import numpy as np
import time
from sklearn.ensemble import IsolationForest, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# -----------------------------
# 1️⃣ LOAD DATASETS
# -----------------------------
df_supervised = pd.read_csv("BitcoinHeistData.csv.zip")   # Labeled dataset
df_unsupervised = pd.read_csv("no label.csv")              # Unlabeled dataset

# -----------------------------
# 2️⃣ SUPERVISED MODEL (with SMOTE and Scaling)
# -----------------------------
print("\n🏁 Training Supervised Model with SMOTE and Scaling...")

# Prepare features and target for supervised model
# Classify 'white' as 0 (non-malicious) and all other labels as 1 (malicious)
X = df_supervised.drop("label", axis=1)
y = df_supervised["label"].apply(lambda x: 0 if x == "white" else 1)


# Encode categorical columns
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Before SMOTE:")
print(y_train.value_counts())

# Create a pipeline with SMOTE and RobustScaler
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', RobustScaler()),
    ('hgb', HistGradientBoostingClassifier(max_iter=100, learning_rate=0.1, max_depth=5, random_state=42))
])

start_time = time.time()
pipeline.fit(X_train, y_train)
supervised_time = time.time() - start_time

# Predict on test set
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]  # probability of ransomware


supervised_acc = accuracy_score(y_test, y_pred)

print(f"\n✅ Supervised Model Accuracy (with SMOTE and Scaling): {supervised_acc:.4f}")
print("⏱️ Training Time (Supervised):", round(supervised_time, 3), "seconds")
print("\n📊 Classification Report (Supervised):")
print(classification_report(y_test, y_pred, target_names=["Non-malicious", "Malicious"]))

# -----------------------------
# 3️⃣ UNSUPERVISED MODEL (Isolation Forest)
# -----------------------------
print("\n🏁 Training Unsupervised Model (Isolation Forest)...")

# Drop non-numeric columns
X_unsup = df_unsupervised.drop(columns=["address"], errors="ignore")

# Scale unsupervised data
scaler_unsup = RobustScaler()
X_unsup_scaled = scaler_unsup.fit_transform(X_unsup)


unsup_model = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
    n_jobs=-1
)

start_time = time.time()
unsup_model.fit(X_unsup_scaled)
unsup_time = time.time() - start_time

unsup_scores = -unsup_model.decision_function(X_unsup_scaled)  # higher = more suspicious
unsup_flags = unsup_model.predict(X_unsup_scaled)  # -1 = anomaly

print("⏱️ Training Time (Unsupervised):", round(unsup_time, 3), "seconds")
print(f"Total anomalies detected (unsupervised): {(unsup_flags == -1).sum()}")

# -----------------------------
# 4️⃣ HYBRID ENSEMBLE COMBINATION
# -----------------------------
print("\n🔗 Combining Supervised + Unsupervised Models...")

# Normalize both score sets to 0–1
supervised_scaled = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min())

# Scale X_test before generating unsupervised scores for hybrid
X_test_scaled = scaler_unsup.transform(X_test.drop(columns=["address"], errors="ignore"))
unsup_scores_test = -unsup_model.decision_function(X_test_scaled)
unsup_scaled = (unsup_scores_test - unsup_scores_test.min()) / (unsup_scores_test.max() - unsup_scores_test.min())


hybrid_score = 0.65 * supervised_scaled + 0.35 * unsup_scaled
hybrid_pred = (hybrid_score > 0.5).astype(int)

# -----------------------------
# 5️⃣ HYBRID PERFORMANCE EVALUATION
# -----------------------------
print("\n📊 Hybrid Model Evaluation")
print("Accuracy:", accuracy_score(y_test, hybrid_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, hybrid_pred))
print("\nClassification Report:\n", classification_report(y_test, hybrid_pred))

# -----------------------------
# 6️⃣ TIME TO DETECTION COMPARISON
# -----------------------------
print("\n⏱️ Detection Time Comparison:")
print(f"Supervised Model: {supervised_time:.3f} sec")
print(f"Unsupervised Model: {unsup_time:.3f} sec")
print(f"Hybrid Model (approx): {supervised_time + unsup_time:.3f} sec")

# -----------------------------
# 7️⃣ TEST ON UNKNOWN DATA
# -----------------------------
print("\n🧩 Testing Hybrid Model on Unknown (Unlabeled) Dataset...")

# Scale the unknown data using the same scaler as the unsupervised model
X_unknown_scaled = scaler_unsup.transform(df_unsupervised.drop(columns=["address"], errors="ignore"))
unsup_scores_unknown = -unsup_model.decision_function(X_unknown_scaled)
unsup_scaled_unknown = (unsup_scores_unknown - unsup_scores_unknown.min()) / (unsup_scores_unknown.max() - unsup_scores_unknown.min())


# Combine supervised average probability with unsupervised anomaly scores
hybrid_unsup_score = 0.65 * np.mean(y_prob) + 0.35 * unsup_scaled_unknown
threshold = np.median(hybrid_unsup_score)
unknown_flags = (hybrid_unsup_score > threshold).astype(int)


print(f"🔍 Unknown pattern detection (unlabeled dataset): Detected {unknown_flags.sum()} suspicious entries.")

print("\n✅ Hybrid Model Execution Complete!")


🏁 Training Supervised Model with SMOTE and Scaling...
Before SMOTE:
label
0    2300227
1      33130
Name: count, dtype: int64

✅ Supervised Model Accuracy (with SMOTE and Scaling): 0.9092
⏱️ Training Time (Supervised): 68.583 seconds

📊 Classification Report (Supervised):
               precision    recall  f1-score   support

Non-malicious       0.99      0.91      0.95    575057
    Malicious       0.09      0.60      0.16      8283

     accuracy                           0.91    583340
    macro avg       0.54      0.76      0.56    583340
 weighted avg       0.98      0.91      0.94    583340


🏁 Training Unsupervised Model (Isolation Forest)...
⏱️ Training Time (Unsupervised): 5.828 seconds
Total anomalies detected (unsupervised): 10486

🔗 Combining Supervised + Unsupervised Models...

📊 Hybrid Model Evaluation
Accuracy: 0.9440086399012583

Confusion Matrix:
 [[547070  27987]
 [  4675   3608]]

Classification Report:
               precision    recall  f1-score   support

      

In [None]:
print("Original df_supervised label counts:")
print(df_supervised['label'].value_counts())

print("\nFiltered df_supervised_filtered label counts:")
print(df_supervised_filtered['label'].value_counts())

Original df_supervised label counts:
label
white                          2875284
paduaCryptoWall                  12390
montrealCryptoLocker              9315
princetonCerber                   9223
princetonLocky                    6625
montrealCryptXXX                  2419
montrealNoobCrypt                  483
montrealDMALockerv3                354
montrealDMALocker                  251
montrealSamSam                      62
montrealCryptoTorLocker2015         55
montrealGlobeImposter               55
montrealGlobev3                     34
montrealGlobe                       32
montrealWannaCry                    28
montrealRazy                        13
montrealAPT                         11
paduaKeRanger                       10
montrealFlyper                       9
montrealXTPLocker                    8
montrealVenusLocker                  7
montrealCryptConsole                 7
montrealXLockerv5.0                  7
montrealEDA2                         6
montrealJigSaw       

In [None]:
print(X_test.isnull().sum())

address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
dtype: int64


In [None]:
# ==========================================
# 🔒 HYBRID RANSOMWARE DETECTION PROGRAM
# Combines Supervised + Unsupervised Detection
# Accepts any CSV dataset automatically
# ==========================================

import pandas as pd
import numpy as np
import time
from sklearn.ensemble import IsolationForest, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import os

# ------------------------------------------
# 1️⃣ LOAD ANY CSV DATASET
# ------------------------------------------
file_path = input("📂 Enter path to your dataset (CSV or ZIP file): ").strip()

if not os.path.exists(file_path):
    print("❌ File not found. Please check the path and try again.")
    exit()

print("✅ File found. Loading dataset...")
df = pd.read_csv(file_path, keep_default_na=True)
print(f"📊 Dataset loaded successfully with {len(df)} rows and {len(df.columns)} columns.\n")

# ------------------------------------------
# 2️⃣ CHECK IF LABELED OR UNLABELED
# ------------------------------------------
is_labeled = 'label' in df.columns
print(f"🧾 Dataset Type: {'Labeled (Supervised + Unsupervised)' if is_labeled else 'Unlabeled (Unsupervised only)'}\n")

# ------------------------------------------
# 3️⃣ UNSUPERVISED MODEL (Isolation Forest)
# ------------------------------------------
print("🤖 Running Unsupervised Detection (Isolation Forest)...")
unsup_df = df.select_dtypes(include=[np.number]).copy()

unsup_model = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
    n_jobs=-1
)

start_time = time.time()
unsup_model.fit(unsup_df)
unsup_time = time.time() - start_time

unsup_scores = -unsup_model.decision_function(unsup_df)
unsup_flags = unsup_model.predict(unsup_df)
unsup_flags = np.where(unsup_flags == -1, 1, 0)  # 1 = anomaly (ransomware)

print(f"✅ Unsupervised Detection Completed in {unsup_time:.2f} sec")
print(f"🚨 Total anomalies detected (unsupervised): {unsup_flags.sum()} / {len(df)}\n")

df["unsup_score"] = unsup_scores
df["unsup_flag"] = unsup_flags

# ------------------------------------------
# 4️⃣ SUPERVISED MODEL (if labels available)
# ------------------------------------------
if is_labeled:
    print("🧠 Running Supervised Model (HistGradientBoosting + SMOTE)...")

    X = df.drop(columns=["label"], errors="ignore").copy()
    y = df["label"].apply(lambda x: 0 if x == "benign" else 1)

    # Encode categorical columns
    for col in X.select_dtypes(include='object').columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))

    # Split & balance data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

    supervised_model = HistGradientBoostingClassifier(
        max_iter=100, learning_rate=0.1, max_depth=5, random_state=42
    )

    start_time = time.time()
    supervised_model.fit(X_train_bal, y_train_bal)
    supervised_time = time.time() - start_time

    y_prob = supervised_model.predict_proba(X_test)[:, 1]
    y_pred = supervised_model.predict(X_test)
    supervised_acc = accuracy_score(y_test, y_pred)

    print(f"✅ Supervised Model Accuracy: {supervised_acc:.4f}")
    print(f"⏱️ Training Time: {supervised_time:.2f} sec\n")

    # Scale supervised scores
    supervised_scaled = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min())

else:
    supervised_scaled = np.zeros(len(df))  # No supervised data
    supervised_acc = 0
    supervised_time = 0

# ------------------------------------------
# 5️⃣ HYBRID ENSEMBLE COMBINATION
# ------------------------------------------
print("⚙️ Combining Models into Hybrid Ensemble (65% Supervised + 35% Unsupervised)...")

# Normalize unsupervised scores
unsup_scaled = (unsup_scores - unsup_scores.min()) / (unsup_scores.max() - unsup_scores.min())

# Match lengths for hybrid calculation
min_len = min(len(supervised_scaled), len(unsup_scaled))
hybrid_score = 0.65 * supervised_scaled[:min_len] + 0.35 * unsup_scaled[:min_len]
hybrid_pred = (hybrid_score > 0.5).astype(int)

df["hybrid_score"] = np.pad(hybrid_score, (0, len(df) - len(hybrid_score)), 'constant')
df["hybrid_flag"] = np.pad(hybrid_pred, (0, len(df) - len(hybrid_pred)), 'constant')

print("✅ Hybrid Model Combination Complete!\n")

# ------------------------------------------
# 6️⃣ EVALUATION (if labeled data available)
# ------------------------------------------
if is_labeled:
    print("📊 Hybrid Model Evaluation on Known Data:")
    print("Accuracy:", accuracy_score(y_test[:min_len], hybrid_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test[:min_len], hybrid_pred))
    print("\nClassification Report:\n", classification_report(y_test[:min_len], hybrid_pred))
else:
    print("ℹ️ Unlabeled dataset – accuracy not available (unsupervised mode).\n")

# ------------------------------------------
# 7️⃣ DISPLAY TOP SUSPICIOUS TRANSACTIONS
# ------------------------------------------
top_suspicious = df.sort_values(by="hybrid_score", ascending=False).head(10)
print("🔎 Top 10 Suspicious Transactions:\n")
print(top_suspicious.head(10))

# ------------------------------------------
# 8️⃣ SAVE RESULTS
# ------------------------------------------
output_path = "ransomware_detection_results.csv"
df.to_csv(output_path, index=False)
print(f"\n💾 Results saved to: {output_path}")
print(f"✅ Hybrid Ransomware Detection Completed Successfully!\n")

# ------------------------------------------
# 9️⃣ SUMMARY
# ------------------------------------------
print("📘 SUMMARY REPORT")
print(f"🧠 Supervised Accuracy: {supervised_acc:.4f}")
print(f"🚨 Total Suspicious (Hybrid): {df['hybrid_flag'].sum()} / {len(df)}")
print(f"⏱️ Total Processing Time: {unsup_time + supervised_time:.2f} sec")
print("\nProgram finished successfully ✅")


📂 Enter path to your dataset (CSV or ZIP file): CSV
