<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Threat_Intelligence_Information_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Threat Intelligence Information Extraction Pipeline**

In [51]:
!pip install -q rake_nltk keybert pyvis

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/756.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [59]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
from keybert import KeyBERT
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
from gensim.models import Word2Vec
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Initialize global models to avoid reloading
nlp = spacy.load("en_core_web_sm")
bert_model = KeyBERT()
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [60]:
def preprocess_text(text):
    """
    Preprocess text while preserving important technical indicators.

    Args:
        text (str): Input text to preprocess

    Returns:
        str: Preprocessed text
    """
    # Convert to lowercase
    text = text.lower()

    # Preserve technical indicators
    text = re.sub(r'[^\w\s\.-:/@]', ' ', text)  # Keep special chars for IPs/URLs/paths

    # Split camelCase and PascalCase
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

In [61]:
def extract_technical_patterns(text):
    """
    Extract technical indicators commonly found in threat intelligence.

    Args:
        text (str): Input text to analyze

    Returns:
        dict: Dictionary of found patterns categorized by type
    """
    patterns = {
        'ip_addresses': r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
        'domains': r'\b[a-zA-Z0-9][a-zA-Z0-9-._]+\.[a-zA-Z]{2,}\b',
        'md5_hashes': r'\b[a-fA-F0-9]{32}\b',
        'sha256_hashes': r'\b[a-fA-F0-9]{64}\b',
        'cve_ids': r'CVE-\d{4}-\d{4,7}',
        'urls': r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[^\s]*',
        'file_paths': r'(?:[a-zA-Z]:\|/)[^\s/:\"|<>]+',
        'registry_keys': r'HKEY_[A-Z_]+\[^\s]+'
    }

    results = defaultdict(list)
    for pattern_type, pattern in patterns.items():
        matches = re.finditer(pattern, text, re.IGNORECASE)
        results[pattern_type].extend([match.group() for match in matches])

    return dict(results)

In [62]:
def extract_domain_specific_terms(text):
    """
    Extract cyber security and threat intelligence specific terminology.

    Args:
        text (str): Input text to analyze

    Returns:
        dict: Dictionary of domain-specific terms categorized by type
    """
    threat_indicators = {
        'malware_terms': ['malware', 'ransomware', 'trojan', 'botnet', 'worm', 'rootkit', 'keylogger'],
        'attack_patterns': ['phishing', 'spear-phishing', 'ddos', 'bruteforce', 'zero-day', 'exploit'],
        'threat_actors': ['apt', 'threat actor', 'threat group', 'adversary'],
        'security_terms': ['vulnerability', 'payload', 'backdoor', 'c2', 'command and control']
    }

    results = defaultdict(list)
    doc = nlp(text.lower())

    # Extract terms based on categories
    for category, terms in threat_indicators.items():
        for term in terms:
            if term in text.lower():
                # Find the complete phrase containing the term
                matches = re.finditer(r'\b\w+\s*' + term + r'\s*\w+\b', text.lower())
                results[category].extend([match.group() for match in matches])
                # Also add single term matches
                if term in text.lower():
                    results[category].append(term)

    # Extract potential new terms using NER
    for ent in doc.ents:
        if ent.label_ in ['ORG', 'PRODUCT']:
            results['potential_threats'].append(ent.text)

    return dict(results)

In [63]:
def extract_contextual_keywords(text, top_n=15):
    """
    Extract keywords using multiple methods including statistical and contextual approaches.

    Args:
        text (str): Input text to analyze
        top_n (int): Number of top keywords to extract

    Returns:
        dict: Dictionary containing keywords extracted through different methods
    """
    # TF-IDF extraction
    vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                  stop_words='english',
                                  max_features=100)
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
        feature_names = vectorizer.get_feature_names_out()
        tfidf_scores = dict(zip(feature_names, tfidf_matrix.toarray()[0]))
        tfidf_keywords = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    except:
        tfidf_keywords = []

    # BERT-based extraction
    try:
        bert_keywords = bert_model.extract_keywords(text,
                                                    keyphrase_ngram_range=(1,3),
                                                    stop_words='english',
                                                    top_n=top_n)
    except:
        bert_keywords = []

    # Spacy NER and noun phrases
    doc = nlp(text)
    ner_terms = [ent.text for ent in doc.ents]
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    return {
        'tfidf_keywords': tfidf_keywords,
        'bert_keywords': bert_keywords,
        'ner_terms': ner_terms,
        'noun_phrases': noun_phrases
    }

In [64]:
def find_similar_terms(keywords, similarity_threshold=0.8):
    """
    Find similar terms using multiple similarity measures.

    Args:
        keywords (list): List of keywords to analyze
        similarity_threshold (float): Threshold for considering terms similar

    Returns:
        dict: Dictionary of terms and their similar variants
    """
    similar_terms = defaultdict(set)

    # WordNet synonyms
    for word in keywords:
        synsets = wordnet.synsets(word)
        for syn in synsets:
            similar_terms[word].update(lemma.name() for lemma in syn.lemmas())

    # Sentence transformer similarity
    if len(keywords) > 1:
        try:
            embeddings = sentence_transformer.encode(keywords)
            similarity_matrix = np.inner(embeddings, embeddings)
            for i, word in enumerate(keywords):
                similar_indices = np.where(similarity_matrix[i] > similarity_threshold)[0]
                similar_terms[word].update([keywords[idx] for idx in similar_indices if idx != i])
        except:
            pass

    # Clean up similar terms
    for word in similar_terms:
        similar_terms[word] = list(similar_terms[word])

    return dict(similar_terms)

In [65]:
def cluster_keywords(keywords, n_clusters=None, min_cluster_size=2):
    """
    Cluster keywords based on semantic similarity.

    Args:
        keywords (list): List of keywords to cluster
        n_clusters (int): Number of clusters (optional)
        min_cluster_size (int): Minimum size for a cluster

    Returns:
        dict: Dictionary of clusters and their keywords
    """
    if len(keywords) < min_cluster_size:
        return {'single_cluster': keywords}

    try:
        # Generate embeddings
        embeddings = sentence_transformer.encode(keywords)
        # Determine optimal number of clusters
        if n_clusters is None:
            n_clusters = min(max(2, len(keywords) // 5), 10)
        # Perform clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(embeddings)
        # Calculate cluster centers and distances
        cluster_centers = kmeans.cluster_centers_
        distances = np.linalg.norm(embeddings[:, np.newaxis] - cluster_centers, axis=2)
        # Organize results with confidence scores
        clustered_keywords = defaultdict(list)
        for keyword, cluster_id, dist in zip(keywords, clusters, distances):
            confidence = 1 - (dist[cluster_id] / np.max(dist))
            clustered_keywords[f'cluster_{cluster_id}'].append({
                'keyword': keyword,
                'confidence': float(confidence)
            })
        return dict(clustered_keywords)
    except:
        return {'error_cluster': keywords}

In [66]:
def analyze_threat_intelligence(text):
    """
    Analyze threat intelligence text and extract relevant information.

    Args:
        text (str): Input threat intelligence text

    Returns:
        dict: Comprehensive analysis results
    """
    # Preprocess text
    processed_text = preprocess_text(text)
    # Extract different types of information
    technical_indicators = extract_technical_patterns(text)
    domain_terms = extract_domain_specific_terms(processed_text)
    contextual_kw = extract_contextual_keywords(processed_text)
    # Combine all unique keywords
    all_keywords = set()
    # Add technical indicators
    for indicator_type in technical_indicators.values():
        all_keywords.update(indicator_type)
    # Add domain terms
    for term_type in domain_terms.values():
        all_keywords.update(term_type)
    # Add contextual keywords
    all_keywords.update([kw[0] for kw in contextual_kw['bert_keywords']])
    all_keywords.update(contextual_kw['ner_terms'])
    # Convert to list and find similar terms
    all_keywords = list(all_keywords)
    similar_terms = find_similar_terms(all_keywords)
    # Cluster keywords
    clusters = cluster_keywords(all_keywords)
    return {
        'technical_indicators': technical_indicators,
        'domain_specific_terms': domain_terms,
        'contextual_keywords': contextual_kw,
        'similar_terms': similar_terms,
        'clustered_keywords': clusters
    }

In [67]:
# Execute the analysis immediately after initialization
sample_text = """
APT29 (also known as Cozy Bear) has been observed using WellMess malware targeting COVID-19 research organizations.
The threat actor employs sophisticated spear-phishing campaigns and exploits CVE-2020-0688 to deploy backdoors.
Command and control traffic was observed at 192.168.1.1 and malicious.domain.com.
The attacks involved SHA256 hash abc123def456 and registry modifications at HKEY_LOCAL_MACHINE\Software\Microsoft.
"""

results = analyze_threat_intelligence(sample_text)

# Print results in a structured way
print("\nTechnical Indicators:")
for indicator_type, indicators in results['technical_indicators'].items():
    if indicators:
        print(f"\n{indicator_type.replace('_', ' ').title()}:")
        for indicator in indicators:
            print(f"- {indicator}")

print("\nDomain-Specific Terms:")
for term_type, terms in results['domain_specific_terms'].items():
    if terms:
        print(f"\n{term_type.replace('_', ' ').title()}:")
        for term in terms:
            print(f"- {term}")

print("\nClustered Keywords:")
for cluster_name, keywords in results['clustered_keywords'].items():
    print(f"\n{cluster_name}:")
    for kw in keywords:
        if isinstance(kw, dict):
            print(f"- {kw['keyword']} (confidence: {kw['confidence']:.2f})")
        else:
            print(f"- {kw}")


Technical Indicators:

Ip Addresses:
- 192.168.1.1

Domains:
- malicious.domain.com

Cve Ids:
- CVE-2020-0688

Domain-Specific Terms:

Malware Terms:
- wellmess malware targeting
- malware

Attack Patterns:
- spear phishing campaigns
- phishing
- and exploits
- exploit

Threat Actors:
- apt
- the threat actor employs
- threat actor

Security Terms:
- deploy backdoors
- backdoor
- command and control

Potential Threats:
- cozy bear
- hkey_local_machine
- microsoft

Clustered Keywords:

cluster_4:
- 192.168.1.1 (confidence: 0.35)
- spear phishing campaigns (confidence: 0.50)
- phishing (confidence: 0.50)

cluster_2:
- apt29 known cozy (confidence: 0.64)
- cozy bear (confidence: 0.36)
- apt29 known (confidence: 0.48)
- apt (confidence: 0.46)

cluster_1:
- using wellmess malware (confidence: 0.60)
- hkey_local_machine (confidence: 0.19)
- malware targeting (confidence: 0.55)
- malware targeting covid (confidence: 0.43)
- wellmess malware (confidence: 0.61)
- wellmess malware targeting (co