<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Threat_Intelligence_Information_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Threat Intelligence Information Extraction Pipeline**

In [5]:
!pip install -q rake_nltk keybert

In [15]:
# Import libraries
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
from keybert import KeyBERT
import spacy
from gensim import corpora, models
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:
# 1. Preprocessing
def preprocess_text(text):
    # Basic cleaning
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # Add threat intelligence-specific cleanup here (e.g., IOC patterns)
    return text

In [17]:
# 2. Keyword Extraction Modules
def tfidf_extractor(text, n=15):
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
    tfidf_matrix = vectorizer.fit_transform([text])
    features = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]
    return features[np.argsort(scores)[-n:][::-1]].tolist()

def rake_extractor(text, n=15):
    r = Rake()
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()[:n]

def keybert_extractor(text, n=15):
    kw_model = KeyBERT()
    return [kw[0] for kw in kw_model.extract_keywords(text, keyphrase_ngram_range=(1,3), top_n=n)]

def ner_extractor(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in ['ORG', 'GPE', 'PRODUCT', 'MALWARE']]

In [18]:
# 3. Aggregation and Deduplication
def aggregate_keywords(text):
    processed_text = preprocess_text(text)

    keywords = []
    keywords += tfidf_extractor(processed_text)
    keywords += rake_extractor(processed_text)
    keywords += keybert_extractor(processed_text)
    keywords += ner_extractor(processed_text)
    # Add your LLM-generated keywords here

    # Deduplicate and clean
    return list(set([kw.strip() for kw in keywords if len(kw) > 2]))

In [19]:
# 4. Clustering for Grouping
def cluster_keywords(keywords):
    # Generate embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(keywords)

    # Cluster using K-Means
    n_clusters = min(5, len(keywords))
    kmeans = KMeans(n_clusters=n_clusters).fit(embeddings)

    # Organize results
    clusters = {}
    for idx, label in enumerate(kmeans.labels_):
        clusters.setdefault(label, []).append(keywords[idx])
    return clusters

In [20]:
# 5. Full Pipeline
def threat_intel_pipeline(text):
    # Step 1-3: Extract and aggregate
    all_keywords = aggregate_keywords(text)

    # Step 4: Cluster
    clustered_keywords = cluster_keywords(all_keywords)

    # Step 5: Validation (add LLM validation here)
    return {
        'raw_keywords': all_keywords,
        'clustered_keywords': clustered_keywords
    }

In [21]:
# Example usage
sample_text = "APT29 uses WellMess malware targeting COVID-19 research. The group employs spear-phishing and CVE-2020-0688 exploits..."
result = threat_intel_pipeline(sample_text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]