<a href="https://colab.research.google.com/github/Nawin03-DS/ML-Project/blob/main/Microsoft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Graph-based Threat Detection System for Insider Threats

In [None]:
pip install networkx pandas torch torch-geometric scikit-learn

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuff

1. Simulate Access Data

In [None]:
import pandas as pd
import random

# Simulated log: user, device, file, hour of day
users = ['U1', 'U2', 'U3', 'U4']
devices = ['D1', 'D2', 'D3']
files = ['F1', 'F2', 'F3', 'F4']


def simulate_access_log(n_events=100, seed=42):
    """Generate random access events over the module-level users/devices/files.

    Args:
        n_events: Number of log rows to generate.
        seed: Seed for a private ``random.Random`` instance, so reruns give the
            same log without touching the global ``random`` state. Pass ``None``
            for a different log on every call.

    Returns:
        A list of dicts with keys 'user', 'device', 'file' and 'hour' (0-23).
    """
    rng = random.Random(seed)
    return [
        {
            'user': rng.choice(users),
            'device': rng.choice(devices),
            'file': rng.choice(files),
            'hour': rng.randint(0, 23),
        }
        for _ in range(n_events)
    ]


data = simulate_access_log()

# Explicit columns keep the frame well-formed even when no events are generated.
df = pd.DataFrame(data, columns=['user', 'device', 'file', 'hour'])
print(df.head())

  user device file  hour
0   U2     D3   F4    19
1   U3     D2   F1     2
2   U3     D3   F1     0
3   U2     D3   F3     7
4   U4     D3   F1    20


2. Build a Heterogeneous Graph using NetworkX

In [None]:
import networkx as nx

# Undirected simple graph: repeated (user, device) or (user, file) pairs share one edge.
G = nx.Graph()

# Register every entity up front, tagged with its entity type.
G.add_nodes_from(users, type='user')
G.add_nodes_from(devices, type='device')
G.add_nodes_from(files, type='file')

# Each log row contributes a user-device 'uses' edge and a user-file 'accesses' edge.
for _, event in df.iterrows():
    G.add_edges_from([
        (event['user'], event['device'], {'relation': 'uses'}),
        (event['user'], event['file'], {'relation': 'accesses'}),
    ])

# Persist the edge list to disk, then report the graph size.
nx.write_edgelist(G, "graph.edgelist")
print("Graph created with", G.number_of_nodes(), "nodes and", G.number_of_edges(), "edges.")

Graph created with 11 nodes and 28 edges.


3. Convert to PyTorch Geometric Data

In [None]:
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data
import torch

# Give every node a one-element feature vector: 1.0 for users, 0.0 for everything else.
for _, attrs in G.nodes(data=True):
    attrs['x'] = [1.0] if attrs['type'] == 'user' else [0.0]

data = from_networkx(G)

# Rebuild x explicitly as a float tensor, one row per node in G's node order.
feature_rows = [attrs['x'] for _, attrs in G.nodes(data=True)]
data.x = torch.tensor(feature_rows, dtype=torch.float)
print(data)

Data(x=[11, 1], edge_index=[2, 56], type=[11], relation=[56])


4. Train a Simple GNN Encoder

In [None]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

class GNN(torch.nn.Module):
    """Two-layer GCN encoder producing one embedding vector per node.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output width of the first graph convolution.
        out_channels: Size of the embedding returned for each node. Defaults
            to 2, the value that was previously hard-coded.
    """

    def __init__(self, in_channels, hidden_channels, out_channels=2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2 and return the raw (unactivated) output."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# Fix the seed so weight initialisation (and therefore the embeddings) is reproducible.
torch.manual_seed(42)

model = GNN(in_channels=1, hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Unsupervised objective: reconstruct the graph's edges from the embeddings
# (inner-product decoder). The previous loss pulled every embedding towards the
# batch mean, so its optimum was one constant vector for all nodes, leaving
# nothing for distance-based anomaly detection to separate.
# The target is a dense N x N matrix, which is fine for small graphs like this one.
num_nodes = data.x.size(0)
adjacency = torch.zeros(num_nodes, num_nodes)
adjacency[data.edge_index[0], data.edge_index[1]] = 1.0

# Self-pairs are excluded: a node's inner product with itself is never negative,
# yet the graph has no self-loops, so those entries would only add noise.
off_diagonal = ~torch.eye(num_nodes, dtype=torch.bool)
edge_target = adjacency[off_diagonal]

# Up-weight the (rarer) positive pairs so that "predict no edge anywhere" is not optimal.
num_positive = edge_target.sum()
pos_weight = (edge_target.numel() - num_positive) / num_positive.clamp(min=1.0)

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    edge_logits = (out @ out.t())[off_diagonal]
    loss = F.binary_cross_entropy_with_logits(edge_logits, edge_target, pos_weight=pos_weight)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')


  loss = F.mse_loss(out, out.mean(dim=0))  # unsupervised


Epoch 0, Loss: 0.0014
Epoch 10, Loss: 0.0002
Epoch 20, Loss: 0.0001
Epoch 30, Loss: 0.0000
Epoch 40, Loss: 0.0000
Epoch 50, Loss: 0.0000
Epoch 60, Loss: 0.0000
Epoch 70, Loss: 0.0000
Epoch 80, Loss: 0.0000
Epoch 90, Loss: 0.0000


5. Detect Anomalies via Embedding Distance

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

# Embed every node with the trained encoder (inference only, no gradient tracking).
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index).numpy()

# Mean Euclidean distance from each node to all nodes (its own zero distance included).
dist_matrix = euclidean_distances(embeddings)
mean_dist = dist_matrix.mean(axis=1)

# Flag nodes whose mean distance lies strictly above the 95th percentile.
threshold = np.percentile(mean_dist, 95)
anomalies = np.flatnonzero(mean_dist > threshold).tolist()

# Materialise the node list once so indices can be mapped back to node names.
node_ids = list(G.nodes)
print("Anomalous node indices:", anomalies)
print("Anomalous nodes:", [node_ids[i] for i in anomalies])

Anomalous node indices: []
Anomalous nodes: []


## 2. End-to-end graph-based cybersecurity project: phishing domain detection using Graph ML

# **Project Title: Graph-based Detection of Phishing Domains**

Build a Graph Neural Network (GNN) model to detect phishing websites by analyzing the relationships between domains, IP addresses, WHOIS info, and SSL certificate metadata.

Instead of treating each domain in isolation, we represent the ecosystem of domain infrastructure as a graph and apply graph learning to detect suspicious patterns.

In [None]:
pip install torch torch-geometric pandas networkx scikit-learn



1️. Simulate Data (Domain Relationships)


In [None]:
import pandas as pd
import random

domains = [f'domain{i}.com' for i in range(20)]
ips = [f'192.168.0.{i}' for i in range(1, 6)]
ssl_issuers = ['Let\'s Encrypt', 'UnknownCA', 'Google Trust Services']
whois_countries = ['US', 'RU', 'CN', 'IN']


def simulate_domain_records(domain_names, phishing_rate=0.2, seed=42):
    """Attach random infrastructure attributes and a label to each domain.

    Args:
        domain_names: Iterable of domain names to generate one record for each.
        phishing_rate: Probability that a domain is labelled 1 (phishing).
        seed: Seed for a private ``random.Random`` instance, so reruns give the
            same records without touching the global ``random`` state. Pass
            ``None`` for different records on every call.

    Returns:
        A list of dicts with keys 'domain', 'ip', 'ssl', 'whois' and 'label'
        (1 = phishing, 0 = benign).
    """
    rng = random.Random(seed)
    return [
        {
            'domain': domain,
            'ip': rng.choice(ips),
            'ssl': rng.choice(ssl_issuers),
            'whois': rng.choice(whois_countries),
            'label': 1 if rng.random() < phishing_rate else 0,  # 1 = phishing, 0 = benign
        }
        for domain in domain_names
    ]


data = simulate_domain_records(domains)

# Explicit columns keep the frame well-formed even for an empty domain list.
df = pd.DataFrame(data, columns=['domain', 'ip', 'ssl', 'whois', 'label'])
print(df.head())

        domain           ip                    ssl whois  label
0  domain0.com  192.168.0.3          Let's Encrypt    CN      0
1  domain1.com  192.168.0.5              UnknownCA    IN      0
2  domain2.com  192.168.0.1  Google Trust Services    IN      0
3  domain3.com  192.168.0.2          Let's Encrypt    CN      0
4  domain4.com  192.168.0.4          Let's Encrypt    CN      1


2️. Build Domain Graph (NetworkX)

In [None]:
import networkx as nx

G = nx.Graph()

for _, record in df.iterrows():
    domain_node = record['domain']

    # Prefix infrastructure values so different node types can never share a name.
    infra_nodes = {
        'ip': f"IP:{record['ip']}",
        'ssl': f"SSL:{record['ssl']}",
        'whois': f"WHOIS:{record['whois']}",
    }

    # Only domain nodes carry a label; infrastructure nodes are shared across domains.
    G.add_node(domain_node, type='domain', label=record['label'])
    for node_type, node_id in infra_nodes.items():
        G.add_node(node_id, type=node_type)

    # Link the domain to each piece of infrastructure it uses.
    for node_id in infra_nodes.values():
        G.add_edge(domain_node, node_id)

print("Graph has", G.number_of_nodes(), "nodes and", G.number_of_edges(), "edges.")

Graph has 32 nodes and 60 edges.


3️. Convert to PyTorch Geometric Format

In [None]:
from torch_geometric.utils import from_networkx
import torch

# One-hot node features, keyed by node type.
type_map = {'domain': [1, 0, 0, 0], 'ip': [0, 1, 0, 0], 'ssl': [0, 0, 1, 0], 'whois': [0, 0, 0, 1]}
for _, attrs in G.nodes(data=True):
    attrs['x'] = type_map[attrs['type']]
    # Nodes without a label (the non-domain ones) get the placeholder -1.
    attrs.setdefault('label', -1)

# Labels of the domain nodes only, in graph node order.
domain_nodes = [n for n, attrs in G.nodes(data=True) if attrs['type'] == 'domain']
domain_labels = torch.tensor([G.nodes[n]['label'] for n in domain_nodes])

data = from_networkx(G)
data.x = torch.tensor([attrs['x'] for _, attrs in G.nodes(data=True)], dtype=torch.float)

# Per-node targets: the real label for domains, 0 for every other node type.
data.y = torch.tensor(
    [attrs['label'] if attrs['type'] == 'domain' else 0 for _, attrs in G.nodes(data=True)],
    dtype=torch.long,
)

print(data)

Data(x=[32, 4], edge_index=[2, 120], type=[32], label=[32], y=[32])


4️. Train GNN Classifier for Phishing Detection

In [None]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class DomainClassifier(torch.nn.Module):
    """Two-layer GCN that emits two-class log-probabilities for every node."""

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 2)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2, then log-softmax across the class dimension."""
        hidden = F.relu(self.conv1(x, edge_index))
        logits = self.conv2(hidden, edge_index)
        return logits.log_softmax(dim=1)

# Fix the seed so weight initialisation (and therefore the predictions) is reproducible.
torch.manual_seed(42)

model = DomainClassifier(4, 16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train only on domain nodes
mask = torch.tensor([G.nodes[n]['type'] == 'domain' for n in G.nodes()], dtype=torch.bool)

# Phishing labels are the minority class, and an unweighted loss is minimised
# well enough by predicting "benign" everywhere. Inverse-frequency weights make
# both classes contribute equally; the clamp avoids dividing by zero when a
# class is absent from the sample.
class_counts = torch.bincount(data.y[mask], minlength=2).float()
class_weights = class_counts.sum() / (2.0 * class_counts.clamp(min=1.0))

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.nll_loss(out[mask], data.y[mask], weight=class_weights)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 0.6944
Epoch 10, Loss: 0.4588
Epoch 20, Loss: 0.3284
Epoch 30, Loss: 0.3297
Epoch 40, Loss: 0.3253
Epoch 50, Loss: 0.3213
Epoch 60, Loss: 0.3216
Epoch 70, Loss: 0.3206
Epoch 80, Loss: 0.3204
Epoch 90, Loss: 0.3199


5️. Predict and Flag Phishing Domains

In [None]:
# Score all nodes once, keeping the most likely class for the domain nodes only.
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    pred = out[mask].argmax(dim=1)

domain_preds = pred.tolist()

# Same node order as the mask, so names and predictions line up pairwise.
domain_names = [n for n, attrs in G.nodes(data=True) if attrs['type'] == 'domain']

# Class 1 is reported as phishing; anything else is reported as benign.
verdicts = {1: 'Phishing'}
for d, p in zip(domain_names, domain_preds):
    label = verdicts.get(p, 'Benign')
    print(f"{d}: {label}")

domain0.com: Benign
domain1.com: Benign
domain2.com: Benign
domain3.com: Benign
domain4.com: Benign
domain5.com: Benign
domain6.com: Benign
domain7.com: Benign
domain8.com: Benign
domain9.com: Benign
domain10.com: Benign
domain11.com: Benign
domain12.com: Benign
domain13.com: Benign
domain14.com: Benign
domain15.com: Benign
domain16.com: Benign
domain17.com: Benign
domain18.com: Benign
domain19.com: Benign
