<a href="https://colab.research.google.com/github/Ndyis/CTI-NLP-MODELS/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Step 1: Install Required Libraries**

In [None]:
!pip install transformers datasets seqeval torch pandas scikit-learn evaluate

In [None]:
# Install required dependencies
!pip install spacy tqdm nltk
!python -m spacy download en_core_web_sm

In [9]:
!pip install tabulate



# **Step 2: Parse and Prepare Your Data**

In [5]:
import pandas as pd
import json
import os
from tqdm import tqdm
import torch
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# File path
file_path = "/content/drive/MyDrive/Colab Notebooks/cti-news.json"

# Function to read JSON data line by line
def read_json_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading JSON data"):
            try:
                # Each line is a separate JSON object
                json_obj = json.loads(line.strip())
                data.append(json_obj)
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON line: {e}")
    return data

# Function to extract text content based on source
def extract_text_content(item):
    try:
        if item['key'] == 'rss':
            # For RSS items, combine title and full content
            value = item['value']
            title = value.get('title', '')
            content = value.get('full_content', '')
            if content is None:  # Handle null content
                content = ''
            return f"{title}\n\n{content}".strip()

        elif item['key'] == 'telegram':
            # For Telegram items, use the text field
            return item['value'].get('text', '').strip()

        elif item['key'] == 'twitter':
            # For Twitter items, use the text field
            return item['value'].get('text', '').strip()

        return ""
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""

# Read the JSON data
print("Reading JSON data...")
json_data = read_json_data(file_path)
print(f"Loaded {len(json_data)} records from JSON file")

# Convert to DataFrame and extract text content
print("Processing data...")
processed_data = []

for item in tqdm(json_data, desc="Extracting text content"):
    text = extract_text_content(item)
    if text:  # Only include non-empty text
        processed_data.append({
            'id': item.get('offset', len(processed_data)),
            'timestamp': item.get('timestamp'),
            'source': item['key'],
            'text_content': text
        })

# Create DataFrame
df = pd.DataFrame(processed_data)

# Remove any remaining empty content
df = df[df['text_content'] != '']

# Display statistics
print(f"\nExtracted {len(df)} text samples")
print(f"Sources distribution: {df['source'].value_counts().to_dict()}")

# Display a sample
print("\nSample text content:")
if not df.empty:
    print(df['text_content'].iloc[0][:500] + "...")  # Show first 500 chars of first item

# Save processed data to CSV for easier handling in next steps
output_path = "/content/drive/MyDrive/Colab Notebooks/cti-news-processed.csv"
df.to_csv(output_path, index=False)
print(f"\nProcessed data saved to {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB
Reading JSON data...


Reading JSON data: 500it [00:00, 885.20it/s]


Loaded 500 records from JSON file
Processing data...


Extracting text content: 100%|██████████| 500/500 [00:00<00:00, 119136.06it/s]



Extracted 500 text samples
Sources distribution: {'rss': 360, 'telegram': 100, 'twitter': 40}

Sample text content:
Watering Hole Attacks Push ScanBox Keylogger

Share this article:
Researchers uncover a watering hole attack likely carried out by APT TA423, which attempts to plant the ScanBox JavaScript-based reconnaissance tool.
A China-based threat actor has ramped up efforts to distribute the ScanBox reconnaissance framework to victims that include domestic Australian organizations and offshore energy firms in the South China Sea. The bait used by the advanced threat group (APT) is targeted messages that s...

Processed data saved to /content/drive/MyDrive/Colab Notebooks/cti-news-processed.csv


# **Step 3: Tokenize and Prepare Data for NER**

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from tqdm import tqdm
import re
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from google.colab import drive

# Download NLTK data - fix the punkt_tab issue
nltk.download('punkt')
nltk.download('punkt_tab')  # Add this line

# Mount Google Drive
drive.mount('/content/drive')

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the processed data
processed_data_path = "/content/drive/MyDrive/Colab Notebooks/cti-news-processed.csv"
df = pd.read_csv(processed_data_path)
print(f"Loaded {len(df)} processed records")

# Load tokenizer - using the multilingual model that performed better in previous tests
model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Loaded tokenizer: {model_name}")

# Load spaCy for sentence segmentation
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])
nlp.max_length = 3000000  # Increase max length for large texts

# Function to split text into sentences - using a more robust approach
def split_into_sentences(text):
    try:
        # Use NLTK for sentence tokenization
        sentences = sent_tokenize(text)
        # Filter out very short sentences (likely not useful for NER)
        return [s.strip() for s in sentences if len(s.strip()) > 10]
    except LookupError:
        # Fallback to a simple regex-based approach if NLTK fails
        print("Using fallback sentence splitter")
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
        return [s.strip() for s in sentences if len(s.strip()) > 10]

# Function to clean text
def clean_text(text):
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove URLs (simplified pattern)
    text = re.sub(r'https?://\S+', '', text)
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    return text.strip()

# Process each text into sentences
print("Processing texts into sentences...")
all_sentences = []
sentence_sources = []  # Keep track of which source each sentence came from

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing texts"):
    text = row['text_content']
    source = row['source']

    # Clean the text
    cleaned_text = clean_text(text)

    # Split into sentences
    sentences = split_into_sentences(cleaned_text)

    # Add to our collections
    all_sentences.extend(sentences)
    sentence_sources.extend([source] * len(sentences))

print(f"Generated {len(all_sentences)} sentences from {len(df)} documents")

# Create a DataFrame with sentences
sentences_df = pd.DataFrame({
    'sentence': all_sentences,
    'source': sentence_sources
})

# Sample some sentences to verify
print("\nSample sentences:")
for i in range(min(5, len(sentences_df))):
    print(f"[{sentences_df.iloc[i]['source']}] {sentences_df.iloc[i]['sentence'][:100]}...")

# Tokenize sentences using the transformer tokenizer
print("\nTokenizing sentences...")
tokenized_sentences = []

for sentence in tqdm(sentences_df['sentence'], desc="Tokenizing"):
    # Tokenize with the transformer tokenizer
    tokens = tokenizer.tokenize(sentence)
    # Only keep sentences with a reasonable number of tokens
    if 5 <= len(tokens) <= 128:
        tokenized_sentences.append({
            'sentence': sentence,
            'tokens': tokens
        })

print(f"Kept {len(tokenized_sentences)} tokenized sentences within length limits")

# Save the tokenized sentences for the next step
tokenized_df = pd.DataFrame(tokenized_sentences)
tokenized_path = "/content/drive/MyDrive/Colab Notebooks/cti-tokenized-sentences.csv"
tokenized_df.to_csv(tokenized_path, index=False)
print(f"Saved tokenized sentences to {tokenized_path}")

# Display a sample of tokenized sentences
print("\nSample tokenized sentence:")
if not tokenized_df.empty:
    sample_idx = min(3, len(tokenized_df) - 1)
    sample = tokenized_df.iloc[sample_idx]
    print(f"Sentence: {sample['sentence'][:100]}...")
    print(f"Tokens: {sample['tokens'][:20]}...")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Loaded 500 processed records
Loaded tokenizer: Davlan/bert-base-multilingual-cased-ner-hrl
Processing texts into sentences...


Processing texts: 100%|██████████| 500/500 [00:00<00:00, 1010.73it/s]


Generated 9641 sentences from 500 documents

Sample sentences:
[rss] Watering Hole Attacks Push ScanBox Keylogger Share this article: Researchers uncover a watering hole...
[rss] A China-based threat actor has ramped up efforts to distribute the ScanBox reconnaissance framework ...
[rss] The bait used by the advanced threat group (APT) is targeted messages that supposedly link back to A...
[rss] The cyber-espionage campaigns are believed to have launched April 2022 through mid-June 2022, accord...
[rss] The threat actor, according to researchers, is believed to be the China-based APT TA423, also known ...

Tokenizing sentences...


Tokenizing:   0%|          | 0/9641 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (632 > 512). Running this sequence through the model will result in indexing errors
Tokenizing: 100%|██████████| 9641/9641 [00:01<00:00, 6035.71it/s]


Kept 9352 tokenized sentences within length limits
Saved tokenized sentences to /content/drive/MyDrive/Colab Notebooks/cti-tokenized-sentences.csv

Sample tokenized sentence:
Sentence: The cyber-espionage campaigns are believed to have launched April 2022 through mid-June 2022, accord...
Tokens: ['The', 'c', '##y', '##ber', '-', 'es', '##pion', '##age', 'campaigns', 'are', 'believed', 'to', 'have', 'launched', 'April', '2022', 'through', 'mid', '-', 'June']...


# **Step 4: Create Initial NER Annotations**

In [4]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm
import re
import json
import spacy
import numpy as np
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenized sentences
tokenized_path = "/content/drive/MyDrive/Colab Notebooks/cti-tokenized-sentences.csv"
tokenized_df = pd.read_csv(tokenized_path)
print(f"Loaded {len(tokenized_df)} tokenized sentences")

# Load spaCy for additional NER
nlp = spacy.load("en_core_web_lg")

# Define CTI-specific keywords with expanded lists for better coverage
cti_keywords = {
    "THREAT_ACTOR": [
        "APT", "threat actor", "hacker group", "threat group", "cybercriminal",
        "nation-state actor", "Lazarus", "Fancy Bear", "Cozy Bear", "APT29", "APT28",
        "Sandworm", "TA423", "Red Ladon", "Kimsuky", "Conti", "LockBit", "BlackCat",
        "Nobelium", "Mustang Panda", "Volt Typhoon", "Scattered Spider", "Midnight Blizzard",
        "Cl0p", "Lapsus$", "REvil", "DarkSide", "BlackMatter", "FIN7", "FIN8"
    ],
    "MALWARE": [
        "malware", "ransomware", "trojan", "virus", "worm", "backdoor", "spyware",
        "keylogger", "rootkit", "bootkit", "adware", "ScanBox", "SUNBURST", "BLINDINGCAN",
        "Emotet", "TrickBot", "Ryuk", "WannaCry", "NotPetya", "BlackEnergy", "Stuxnet",
        "Duqu", "Flame", "Gauss", "ZeuS", "SpyEye", "Dridex", "Locky", "CryptoLocker",
        "BlackEnergy", "DarkHotel", "Industroyer", "BlackEnergy", "PlugX", "Cobalt Strike"
    ],
    "VULNERABILITY": [
        "vulnerability", "exploit", "zero-day", "CVE-", "security flaw", "security bug",
        "security vulnerability", "RCE", "code execution", "buffer overflow", "SQL injection",
        "cross-site scripting", "XSS", "CSRF", "path traversal", "command injection",
        "privilege escalation", "authentication bypass", "memory corruption", "use-after-free",
        "integer overflow", "race condition", "insecure deserialization", "SSRF", "XXE"
    ],
    "ATTACK_PATTERN": [
        "phishing", "spear-phishing", "watering hole", "DDoS", "brute force",
        "credential stuffing", "social engineering", "XSS", "SQL injection",
        "man-in-the-middle", "supply chain attack", "lateral movement", "privilege escalation",
        "data exfiltration", "command and control", "C2", "living off the land", "fileless malware",
        "password spraying", "credential harvesting", "business email compromise", "BEC",
        "island hopping", "drive-by download", "typosquatting", "smishing", "vishing"
    ]
}

# Helper function to convert tensor to native Python type
def to_python_type(obj):
    if isinstance(obj, torch.Tensor):
        return obj.item()
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, np.float32) or isinstance(obj, np.float64):
        return float(obj)
    elif isinstance(obj, np.int32) or isinstance(obj, np.int64):
        return int(obj)
    return obj

# Load pre-trained NER model
print("Loading pre-trained NER model...")
model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Move model to GPU if available
model = model.to(device)
print(f"Model loaded and moved to {device}")

# Create NER pipeline
ner_pipeline = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# Function to find keyword matches in text
def find_keyword_matches(text, entity_type, keywords):
    matches = []
    text_lower = text.lower()

    for keyword in keywords:
        keyword_lower = keyword.lower()
        start = 0
        while True:
            pos = text_lower.find(keyword_lower, start)
            if pos == -1:
                break

            # Add entity with proper case from the original text
            entity_text = text[pos:pos + len(keyword)]
            matches.append({
                'entity_type': entity_type,
                'text': entity_text,
                'start': pos,
                'end': pos + len(keyword),
                'source': 'keyword'
            })

            start = pos + 1

    return matches

# Function to convert spaCy entities to our format
def convert_spacy_entities(doc):
    entities = []
    for ent in doc.ents:
        # Map spaCy entity types to our schema
        entity_type = None
        if ent.label_ in ["ORG", "ORGANIZATION"]:
            entity_type = "ORG"
        elif ent.label_ in ["PERSON", "PER"]:
            entity_type = "PER"
        elif ent.label_ in ["LOC", "GPE", "LOCATION"]:
            entity_type = "LOC"
        elif ent.label_ in ["MISC"]:
            entity_type = "MISC"

        if entity_type:
            entities.append({
                'entity_type': entity_type,
                'text': ent.text,
                'start': ent.start_char,
                'end': ent.end_char,
                'source': 'spacy'
            })

    return entities

# Function to convert transformer NER entities to our format
def convert_transformer_entities(entities, text):
    converted = []
    for entity in entities:
        # Map transformer entity types to our schema
        entity_type = None
        if entity['entity_group'] in ["ORG", "ORGANIZATION", "I-ORG", "B-ORG"]:
            entity_type = "ORG"
        elif entity['entity_group'] in ["PER", "PERSON", "I-PER", "B-PER"]:
            entity_type = "PER"
        elif entity['entity_group'] in ["LOC", "LOCATION", "I-LOC", "B-LOC"]:
            entity_type = "LOC"
        elif entity['entity_group'] in ["MISC", "I-MISC", "B-MISC"]:
            entity_type = "MISC"

        if entity_type:
            # Convert score to Python float
            score = to_python_type(entity.get('score', 0.0))

            converted.append({
                'entity_type': entity_type,
                'text': entity['word'],
                'start': entity['start'],
                'end': entity['end'],
                'source': 'transformer',
                'score': score
            })

    return converted

# Process each sentence to create annotations
print("Creating initial NER annotations...")
annotated_sentences = []

for idx, row in tqdm(tokenized_df.iterrows(), total=len(tokenized_df), desc="Annotating sentences"):
    sentence = row['sentence']
    tokens = eval(row['tokens']) if isinstance(row['tokens'], str) else row['tokens']

    # Get entities from transformer model
    try:
        transformer_entities = ner_pipeline(sentence)
        transformer_entities = convert_transformer_entities(transformer_entities, sentence)
    except Exception as e:
        print(f"Error with transformer NER for sentence {idx}: {e}")
        transformer_entities = []

    # Get entities from spaCy
    try:
        doc = nlp(sentence)
        spacy_entities = convert_spacy_entities(doc)
    except Exception as e:
        print(f"Error with spaCy NER for sentence {idx}: {e}")
        spacy_entities = []

    # Get keyword matches for CTI entities
    keyword_entities = []
    for entity_type, keywords in cti_keywords.items():
        matches = find_keyword_matches(sentence, entity_type, keywords)
        keyword_entities.extend(matches)

    # Combine all entities
    all_entities = transformer_entities + spacy_entities + keyword_entities

    # Remove overlapping entities (prefer keyword > transformer > spaCy)
    non_overlapping = []
    all_entities.sort(key=lambda x: (x['start'], -len(x['text'])))  # Sort by start position and then by length (descending)

    for entity in all_entities:
        # Check if this entity overlaps with any already selected entity
        overlaps = False
        for selected in non_overlapping:
            if (entity['start'] < selected['end'] and entity['end'] > selected['start']):
                # If keyword entity, it takes precedence
                if entity['source'] == 'keyword' and selected['source'] != 'keyword':
                    non_overlapping.remove(selected)
                    non_overlapping.append(entity)
                overlaps = True
                break

        if not overlaps:
            non_overlapping.append(entity)

    # Sort entities by start position
    non_overlapping.sort(key=lambda x: x['start'])

    # Add to annotated sentences
    annotated_sentences.append({
        'sentence': sentence,
        'tokens': tokens,
        'entities': non_overlapping
    })

print(f"Created annotations for {len(annotated_sentences)} sentences")

# Save the annotated sentences
annotated_path = "/content/drive/MyDrive/Colab Notebooks/cti-annotated-sentences.json"
with open(annotated_path, 'w', encoding='utf-8') as f:
    json.dump(annotated_sentences, f, ensure_ascii=False, indent=2)

print(f"Saved annotated sentences to {annotated_path}")

# Display a sample of annotated sentences
print("\nSample annotated sentence:")
if annotated_sentences:
    sample_idx = min(3, len(annotated_sentences) - 1)
    sample = annotated_sentences[sample_idx]
    print(f"Sentence: {sample['sentence'][:100]}...")
    print(f"Entities:")
    for entity in sample['entities']:
        print(f"  - {entity['text']} ({entity['entity_type']}) [{entity['start']}:{entity['end']}] from {entity['source']}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Loaded 9352 tokenized sentences
Loading pre-trained NER model...


Device set to use cuda:0


Model loaded and moved to cuda
Creating initial NER annotations...


Annotating sentences: 100%|██████████| 9352/9352 [03:28<00:00, 44.79it/s]


Created annotations for 9352 sentences
Saved annotated sentences to /content/drive/MyDrive/Colab Notebooks/cti-annotated-sentences.json

Sample annotated sentence:
Sentence: The cyber-espionage campaigns are believed to have launched April 2022 through mid-June 2022, accord...
Entities:
  - Proofpoint’s Threat Research Team (ORG) [112:145] from spacy
  - PwC (ORG) [150:153] from transformer
  - Threat Intelligence (ORG) [156:175] from transformer


# **# Step 5: Convert Annotations to Training Format**

In [5]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
from transformers import AutoTokenizer
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the annotated sentences
annotated_path = "/content/drive/MyDrive/Colab Notebooks/cti-annotated-sentences.json"
with open(annotated_path, 'r', encoding='utf-8') as f:
    annotated_sentences = json.load(f)

print(f"Loaded {len(annotated_sentences)} annotated sentences")

# Load tokenizer
model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define our NER label list
label_list = [
    "O",  # Outside (not an entity)
    # Person entities
    "B-PER", "I-PER",
    # Organization entities
    "B-ORG", "I-ORG",
    # Location entities
    "B-LOC", "I-LOC",
    # Miscellaneous entities
    "B-MISC", "I-MISC",
    # CTI-specific entities
    "B-THREAT_ACTOR", "I-THREAT_ACTOR",
    "B-MALWARE", "I-MALWARE",
    "B-VULNERABILITY", "I-VULNERABILITY",
    "B-ATTACK_PATTERN", "I-ATTACK_PATTERN"
]

# Create label mappings
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# Function to convert entity annotations to token-level BIO tags
def create_token_labels(sentence, entities, tokenizer):
    # Tokenize the sentence
    tokenized = tokenizer(sentence, return_offsets_mapping=True, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(tokenized['input_ids'])
    offset_mapping = tokenized['offset_mapping']

    # Initialize all labels as "O"
    labels = ["O"] * len(tokens)

    # For each entity, find the corresponding tokens and assign BIO tags
    for entity in entities:
        entity_type = entity['entity_type']
        start_char = entity['start']
        end_char = entity['end']

        # Find tokens that overlap with this entity
        entity_tokens = []
        for i, (token_start, token_end) in enumerate(offset_mapping):
            # Check if this token overlaps with the entity
            if token_end > start_char and token_start < end_char:
                entity_tokens.append(i)

        # Assign BIO tags to the entity tokens
        if entity_tokens:
            # First token gets B- tag
            labels[entity_tokens[0]] = f"B-{entity_type}"

            # Remaining tokens get I- tag
            for i in entity_tokens[1:]:
                labels[i] = f"I-{entity_type}"

    return {
        'tokens': tokens,
        'labels': labels,
        'offset_mapping': offset_mapping
    }

# Convert all annotated sentences to token-level training data
print("Converting annotations to token-level training data...")
training_data = []

for idx, item in tqdm(enumerate(annotated_sentences), total=len(annotated_sentences), desc="Processing"):
    sentence = item['sentence']
    entities = item['entities']

    # Create token-level labels
    token_data = create_token_labels(sentence, entities, tokenizer)

    # Add to training data
    training_data.append({
        'id': idx,
        'sentence': sentence,
        'tokens': token_data['tokens'],
        'labels': token_data['labels'],
        'offset_mapping': token_data['offset_mapping']
    })

print(f"Created {len(training_data)} token-level training examples")

# Split into train/test sets (80/20 split)
random.seed(42)  # For reproducibility
random.shuffle(training_data)

train_size = int(0.8 * len(training_data))
train_data = training_data[:train_size]
test_data = training_data[train_size:]

print(f"Split into {len(train_data)} training and {len(test_data)} test examples")

# Save the training and test data
train_path = "/content/drive/MyDrive/Colab Notebooks/cti-train-data.json"
test_path = "/content/drive/MyDrive/Colab Notebooks/cti-test-data.json"

with open(train_path, 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)

with open(test_path, 'w', encoding='utf-8') as f:
    json.dump(test_data, f, ensure_ascii=False, indent=2)

print(f"Saved training data to {train_path}")
print(f"Saved test data to {test_path}")

# Display a sample of the training data
print("\nSample training example:")
if train_data:
    sample = train_data[0]
    print(f"Sentence: {sample['sentence'][:100]}...")
    print(f"Tokens: {sample['tokens'][:10]}...")
    print(f"Labels: {sample['labels'][:10]}...")

    # Count entity types in the training data
    entity_counts = {}
    for item in train_data:
        for label in item['labels']:
            if label != "O":
                entity_type = label.split('-')[1]
                entity_counts[entity_type] = entity_counts.get(entity_type, 0) + 1

    print("\nEntity type counts in training data:")
    for entity_type, count in sorted(entity_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  - {entity_type}: {count}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 9352 annotated sentences
Converting annotations to token-level training data...


Processing: 100%|██████████| 9352/9352 [00:01<00:00, 5460.72it/s]


Created 9352 token-level training examples
Split into 7481 training and 1871 test examples
Saved training data to /content/drive/MyDrive/Colab Notebooks/cti-train-data.json
Saved test data to /content/drive/MyDrive/Colab Notebooks/cti-test-data.json

Sample training example:
Sentence: Users of Zymbra Collaboration should install the update as soon as possible....
Tokens: ['User', '##s', 'of', 'Z', '##ym', '##bra', 'Coll', '##aboration', 'should', 'instal']...
Labels: ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O']...

Entity type counts in training data:
  - ORG: 18886
  - PER: 5096
  - VULNERABILITY: 3016
  - LOC: 2941
  - MALWARE: 2173
  - THREAT_ACTOR: 1572
  - ATTACK_PATTERN: 1247


# **Step 6: Create Token-Level Training Data**

In [6]:
import json
import torch
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from datasets import Dataset
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the training and test data
train_path = "/content/drive/MyDrive/Colab Notebooks/cti-train-data.json"
test_path = "/content/drive/MyDrive/Colab Notebooks/cti-test-data.json"

with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Loaded {len(train_data)} training and {len(test_data)} test examples")

# Load tokenizer
model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define our NER label list
label_list = [
    "O",  # Outside (not an entity)
    # Person entities
    "B-PER", "I-PER",
    # Organization entities
    "B-ORG", "I-ORG",
    # Location entities
    "B-LOC", "I-LOC",
    # Miscellaneous entities
    "B-MISC", "I-MISC",
    # CTI-specific entities
    "B-THREAT_ACTOR", "I-THREAT_ACTOR",
    "B-MALWARE", "I-MALWARE",
    "B-VULNERABILITY", "I-VULNERABILITY",
    "B-ATTACK_PATTERN", "I-ATTACK_PATTERN"
]

# Create label mappings
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# Function to convert our data to the format expected by the Hugging Face Trainer
def convert_to_hf_dataset(data):
    # Convert labels to IDs
    for item in data:
        item['label_ids'] = [label2id[label] for label in item['labels']]

    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict({
        'id': [item['id'] for item in data],
        'tokens': [item['tokens'] for item in data],
        'labels': [item['label_ids'] for item in data]
    })

    return dataset

# Convert our data to Hugging Face datasets
train_dataset = convert_to_hf_dataset(train_data)
test_dataset = convert_to_hf_dataset(test_data)

print(f"Created Hugging Face datasets with {len(train_dataset)} training and {len(test_dataset)} test examples")

# Function to tokenize and align labels for the model
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to -100
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization and label alignment
train_tokenized = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_dataset.column_names
)

test_tokenized = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_dataset.column_names
)

print(f"Tokenized datasets created with {len(train_tokenized)} training and {len(test_tokenized)} test examples")

# Create data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Save the prepared datasets
train_tokenized.save_to_disk("/content/drive/MyDrive/Colab Notebooks/cti-train-tokenized")
test_tokenized.save_to_disk("/content/drive/MyDrive/Colab Notebooks/cti-test-tokenized")

print("Saved tokenized datasets to disk")

# Display a sample from the tokenized dataset
print("\nSample from tokenized dataset:")
sample = train_tokenized[0]
print(f"Input IDs (first 10): {sample['input_ids'][:10]}")
print(f"Attention Mask (first 10): {sample['attention_mask'][:10]}")
print(f"Labels (first 10): {sample['labels'][:10]}")

# Count the number of examples with each entity type
entity_counts = {label: 0 for label in label_list if label != "O"}

for example in train_tokenized:
    labels = example["labels"]
    for label_id in labels:
        if label_id >= 0 and label_id < len(label_list) and label_list[label_id] != "O":
            entity_counts[label_list[label_id]] += 1

print("\nEntity counts in tokenized training data:")
for entity, count in sorted(entity_counts.items(), key=lambda x: x[1], reverse=True):
    if count > 0:
        print(f"  - {entity}: {count}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 7481 training and 1871 test examples
Created Hugging Face datasets with 7481 training and 1871 test examples


Map:   0%|          | 0/7481 [00:00<?, ? examples/s]

Map:   0%|          | 0/1871 [00:00<?, ? examples/s]

Tokenized datasets created with 7481 training and 1871 test examples


Saving the dataset (0/1 shards):   0%|          | 0/7481 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1871 [00:00<?, ? examples/s]

Saved tokenized datasets to disk

Sample from tokenized dataset:
Input IDs (first 10): [101, 93974, 108, 108, 187, 10108, 163, 108, 108, 25942]
Attention Mask (first 10): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Labels (first 10): [-100, 0, 0, -100, -100, 0, 3, 4, -100, -100]

Entity counts in tokenized training data:
  - I-ORG: 12785
  - B-ORG: 5620
  - I-PER: 3280
  - I-VULNERABILITY: 1833
  - B-LOC: 1752
  - B-PER: 1739
  - I-MALWARE: 1290
  - I-LOC: 1126
  - B-VULNERABILITY: 1124
  - I-THREAT_ACTOR: 911
  - B-MALWARE: 838
  - B-THREAT_ACTOR: 655
  - I-ATTACK_PATTERN: 617
  - B-ATTACK_PATTERN: 607


# **Step 7: Fine Tune Model**

In [10]:
import torch
import numpy as np
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from datasets import load_from_disk
import evaluate
from google.colab import drive
import pandas as pd
from tabulate import tabulate

# Mount Google Drive
drive.mount('/content/drive')

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Load the tokenized datasets
train_dataset = load_from_disk("/content/drive/MyDrive/Colab Notebooks/cti-train-tokenized")
test_dataset = load_from_disk("/content/drive/MyDrive/Colab Notebooks/cti-test-tokenized")

print(f"Loaded {len(train_dataset)} training and {len(test_dataset)} test examples")

# Load tokenizer
model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define our NER label list
label_list = [
    "O",  # Outside (not an entity)
    # Person entities
    "B-PER", "I-PER",
    # Organization entities
    "B-ORG", "I-ORG",
    # Location entities
    "B-LOC", "I-LOC",
    # Miscellaneous entities
    "B-MISC", "I-MISC",
    # CTI-specific entities
    "B-THREAT_ACTOR", "I-THREAT_ACTOR",
    "B-MALWARE", "I-MALWARE",
    "B-VULNERABILITY", "I-VULNERABILITY",
    "B-ATTACK_PATTERN", "I-ATTACK_PATTERN"
]

# Create label mappings
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# Load the model - add ignore_mismatched_sizes=True to fix the error
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True  # Add this parameter to fix the error
)

# Move model to GPU if available
model = model.to(device)
print(f"Model loaded and moved to {device}")

# Create data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load metric
metric = evaluate.load("seqeval")

# Function to compute metrics
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens)
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Extract results for each entity type
    entity_results = {}
    for key in results.keys():
        if key.startswith("overall"):
            continue
        entity_type = key.split("_")[1] if "_" in key else key
        metric_type = key.split("_")[0] if "_" in key else "all"

        if entity_type not in entity_results:
            entity_results[entity_type] = {}

        entity_results[entity_type][metric_type] = results[key]

    # Add overall results
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
        **{f"{entity}_{metric}": value
           for entity, metrics in entity_results.items()
           for metric, value in metrics.items()}
    }

# Define training arguments with more epochs (5 instead of 3)
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/NER-model-improved",  # Changed model name
    eval_strategy="epoch",  # Fixed deprecated parameter
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # Increased from 3 to 5
    weight_decay=0.01,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if available
    report_to="none"  # Disable wandb, tensorboard, etc.
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=None,  # Removed tokenizer to avoid deprecation warning
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
print("\nStarting fine-tuning...")
train_result = trainer.train()

# Function to format results in a nice table
def format_results_table(results_dict):
    # Extract epoch results
    epochs = []
    metrics = []

    for log in trainer.state.log_history:
        if 'eval_f1' in log:
            epoch = log.get('epoch', 0)

            # Extract main metrics
            main_metrics = {
                'Epoch': f"{epoch:.1f}",
                'Loss': f"{log.get('eval_loss', 0):.4f}",
                'Precision': f"{log.get('eval_precision', 0):.4f}",
                'Recall': f"{log.get('eval_recall', 0):.4f}",
                'F1': f"{log.get('eval_f1', 0):.4f}",
                'Accuracy': f"{log.get('eval_accuracy', 0):.4f}"
            }

            # Extract entity-specific metrics
            entity_metrics = {}
            for key, value in log.items():
                if key.startswith('eval_') and '_all' in key:
                    entity_type = key.split('_')[1].capitalize()
                    entity_metrics[f"{entity_type} F1"] = f"{value['f1']:.4f}"
                elif key.startswith('eval_') and any(x in key for x in ['THREAT', 'ATTACK']):
                    parts = key.split('_')
                    if len(parts) > 1:
                        entity_type = ' '.join([p.capitalize() for p in parts[1:]])
                        entity_metrics[f"{entity_type} F1"] = f"{value['f1']:.4f}"

            # Combine metrics
            combined_metrics = {**main_metrics, **entity_metrics}
            metrics.append(combined_metrics)
            epochs.append(epoch)

    # Create DataFrame and format as table
    if metrics:
        df = pd.DataFrame(metrics)
        table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
        return table
    else:
        return "No evaluation results found"

# Evaluate the final model
print("\nEvaluating the final model...")
evaluation_results = trainer.evaluate()

# Print formatted results
print("\nTraining Results by Epoch:")
results_table = format_results_table(evaluation_results)
print(results_table)

# Print final evaluation results
print("\nFinal Evaluation Results:")
for key, value in evaluation_results.items():
    if isinstance(value, dict):
        print(f"{key}: {value['f1']:.4f} (P: {value['precision']:.4f}, R: {value['recall']:.4f})")
    elif not key.startswith('eval_runtime') and not key.startswith('eval_samples_'):
        print(f"{key}: {value:.4f}")

# Save the fine-tuned model to Google Drive
drive_model_path = "/content/drive/MyDrive/Colab Notebooks/NER-model-improved"
model.save_pretrained(drive_model_path)
tokenizer.save_pretrained(drive_model_path)
print(f"\nModel saved to {drive_model_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB
Loaded 7481 training and 1871 test examples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-ner-hrl and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([17, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to cuda


  trainer = Trainer(



Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Pattern Attack,Loc All,Malware All,Org All,Per All,Actor Threat,Vulnerability All
1,No log,0.154491,0.76669,0.737504,0.751814,0.960129,"{'precision': 0.7027027027027027, 'recall': 0.3443708609271523, 'f1': 0.46222222222222226, 'number': 151}","{'precision': 0.9267326732673268, 'recall': 0.9052224371373307, 'f1': 0.9158512720156555, 'number': 517}","{'precision': 0.8071065989847716, 'recall': 0.8784530386740331, 'f1': 0.8412698412698412, 'number': 181}","{'precision': 0.7010752688172043, 'recall': 0.7540478026214341, 'f1': 0.7265973254086181, 'number': 1297}","{'precision': 0.7990762124711316, 'recall': 0.7954022988505747, 'f1': 0.7972350230414745, 'number': 435}","{'precision': 0.7058823529411765, 'recall': 0.5, 'f1': 0.5853658536585366, 'number': 192}","{'precision': 0.7771084337349398, 'recall': 0.5201612903225806, 'f1': 0.6231884057971014, 'number': 248}"
2,0.268300,0.122079,0.810901,0.80768,0.809287,0.965769,"{'precision': 0.78125, 'recall': 0.8278145695364238, 'f1': 0.8038585209003216, 'number': 151}","{'precision': 0.9456740442655935, 'recall': 0.9090909090909091, 'f1': 0.9270216962524654, 'number': 517}","{'precision': 0.9162011173184358, 'recall': 0.9060773480662984, 'f1': 0.9111111111111111, 'number': 181}","{'precision': 0.7213712618526623, 'recall': 0.7625289128758674, 'f1': 0.7413793103448276, 'number': 1297}","{'precision': 0.851581508515815, 'recall': 0.8045977011494253, 'f1': 0.8274231678486997, 'number': 435}","{'precision': 0.9030303030303031, 'recall': 0.7760416666666666, 'f1': 0.8347338935574229, 'number': 192}","{'precision': 0.8539823008849557, 'recall': 0.7782258064516129, 'f1': 0.8143459915611814, 'number': 248}"
3,0.116800,0.119142,0.861808,0.829858,0.845531,0.970187,"{'precision': 0.9047619047619048, 'recall': 0.8807947019867549, 'f1': 0.8926174496644295, 'number': 151}","{'precision': 0.9574898785425101, 'recall': 0.9148936170212766, 'f1': 0.9357072205736895, 'number': 517}","{'precision': 0.9597701149425287, 'recall': 0.9226519337016574, 'f1': 0.9408450704225353, 'number': 181}","{'precision': 0.7780320366132724, 'recall': 0.7864302235929067, 'f1': 0.7822085889570551, 'number': 1297}","{'precision': 0.8894736842105263, 'recall': 0.7770114942528735, 'f1': 0.8294478527607362, 'number': 435}","{'precision': 0.9485714285714286, 'recall': 0.8645833333333334, 'f1': 0.904632152588556, 'number': 192}","{'precision': 0.9210526315789473, 'recall': 0.8467741935483871, 'f1': 0.8823529411764707, 'number': 248}"
4,0.076300,0.12616,0.852383,0.846739,0.849552,0.971303,"{'precision': 0.8874172185430463, 'recall': 0.8874172185430463, 'f1': 0.8874172185430463, 'number': 151}","{'precision': 0.9554655870445344, 'recall': 0.9129593810444874, 'f1': 0.9337289812067259, 'number': 517}","{'precision': 0.9553072625698324, 'recall': 0.9447513812154696, 'f1': 0.9500000000000001, 'number': 181}","{'precision': 0.7846853677028052, 'recall': 0.7979953739398612, 'f1': 0.7912844036697247, 'number': 1297}","{'precision': 0.8090909090909091, 'recall': 0.8183908045977012, 'f1': 0.8137142857142856, 'number': 435}","{'precision': 0.9153439153439153, 'recall': 0.9010416666666666, 'f1': 0.9081364829396326, 'number': 192}","{'precision': 0.9475982532751092, 'recall': 0.875, 'f1': 0.909853249475891, 'number': 248}"
5,0.051200,0.128503,0.847485,0.858987,0.853197,0.97162,"{'precision': 0.8823529411764706, 'recall': 0.8940397350993378, 'f1': 0.8881578947368421, 'number': 151}","{'precision': 0.9496981891348089, 'recall': 0.9129593810444874, 'f1': 0.9309664694280079, 'number': 517}","{'precision': 0.9344262295081968, 'recall': 0.9447513812154696, 'f1': 0.9395604395604397, 'number': 181}","{'precision': 0.7750362844702468, 'recall': 0.8234387047031612, 'f1': 0.7985046728971962, 'number': 1297}","{'precision': 0.8198614318706697, 'recall': 0.8160919540229885, 'f1': 0.8179723502304148, 'number': 435}","{'precision': 0.9402173913043478, 'recall': 0.9010416666666666, 'f1': 0.9202127659574467, 'number': 192}","{'precision': 0.9444444444444444, 'recall': 0.8911290322580645, 'f1': 0.9170124481327802, 'number': 248}"



Evaluating the final model...



Training Results by Epoch:
|   Epoch |   Loss |   Precision |   Recall |     F1 |   Accuracy |   Pattern Attack F1 |   Loc F1 |   Malware F1 |   Org F1 |   Per F1 |   Actor Threat F1 |   Vulnerability F1 |
|--------:|-------:|------------:|---------:|-------:|-----------:|--------------------:|---------:|-------------:|---------:|---------:|------------------:|-------------------:|
|       1 | 0.1545 |      0.7667 |   0.7375 | 0.7518 |     0.9601 |              0.4622 |   0.9159 |       0.8413 |   0.7266 |   0.7972 |            0.5854 |             0.6232 |
|       2 | 0.1221 |      0.8109 |   0.8077 | 0.8093 |     0.9658 |              0.8039 |   0.927  |       0.9111 |   0.7414 |   0.8274 |            0.8347 |             0.8143 |
|       3 | 0.1191 |      0.8618 |   0.8299 | 0.8455 |     0.9702 |              0.8926 |   0.9357 |       0.9408 |   0.7822 |   0.8294 |            0.9046 |             0.8824 |
|       4 | 0.1262 |      0.8524 |   0.8467 | 0.8496 |     0.9713 |          