In [None]:
# Class 12: Natural Language Processing (NLP) for Security
# Student Practice Notebook - Machine Learning Lifecycle Demonstration 

"""
Learning Objectives:
1. Understand the ML lifecycle for NLP in security contexts
2. Practice text preprocessing techniques
3. Apply sentiment analysis to security data
4. Implement topic modeling for threat intelligence
5. Build a simple security text classifier

This notebook uses synthetic data to demonstrate concepts safely.
"""

# =============================================================================
# STEP 1: SETUP AND IMPORTS
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
import re
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data (only when it is not already installed).
# Each entry: (lookup paths to try with nltk.data.find, downloader package id).
# - 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
#   releases; 'punkt' is kept for older releases.
# - WordNet is also probed under its zipped name so an already-downloaded,
#   still-zipped corpus does not trigger a re-download on every run.
_NLTK_RESOURCES = [
    (('tokenizers/punkt',), 'punkt'),
    (('tokenizers/punkt_tab',), 'punkt_tab'),
    (('corpora/stopwords',), 'stopwords'),
    (('corpora/wordnet', 'corpora/wordnet.zip'), 'wordnet'),
]

for _candidate_paths, _package_id in _NLTK_RESOURCES:
    for _resource_path in _candidate_paths:
        try:
            nltk.data.find(_resource_path)
            break  # resource already present
        except LookupError:
            continue
    else:
        # None of the candidate paths resolved -> fetch the package.
        # nltk.download returns False on failure instead of raising.
        if not nltk.download(_package_id):
            print(f"⚠️ Could not download NLTK resource '{_package_id}'")

print("✅ All imports successful!")
print("📚 Class 12: NLP for Security - Practice Notebook (Fixed)")
print("=" * 60)

In [None]:
# =============================================================================
# STEP 2: PROBLEM DEFINITION
# =============================================================================

# Section banner: title line followed by a 40-character rule.
print("\n🎯 STEP 2: PROBLEM DEFINITION", "=" * 40, sep="\n")

# Narrative framing for the rest of the notebook: the business problem,
# the technical objectives and the metrics used to judge success.
problem_statement = """
BUSINESS PROBLEM:
Our Security Operations Center (SOC) receives thousands of text-based security alerts,
incident reports, and threat intelligence feeds daily. Manual analysis is:
- Time-consuming (45+ minutes per incident)
- Inconsistent (different analysts, different interpretations)
- Error-prone (human fatigue, information overload)
- Not scalable (volume growing 20% yearly)

TECHNICAL OBJECTIVES:
1. Automatically classify security incidents by severity and type
2. Extract key entities (IPs, domains, malware families) from text
3. Perform sentiment analysis on threat intelligence
4. Discover hidden topics in security data
5. Reduce analyst workload by 70%

SUCCESS METRICS:
- Classification accuracy > 85%
- Processing time < 1 second per document
- False positive rate < 10%
- Analyst satisfaction score > 4/5
"""

print(problem_statement)

In [None]:
# =============================================================================
# STEP 3: SYNTHETIC DATA GENERATION
# =============================================================================

print("\n📊 STEP 3: SYNTHETIC DATA GENERATION")
print("=" * 40)

# Generate synthetic security incident data
# Seeds NumPy's global RNG; the np.random.* calls made by the generator
# functions defined in this cell all draw from this single stream.
np.random.seed(42)  # For reproducible results

# Security incident templates
# Maps incident type -> list of str.format templates. The {placeholder} names
# are supplied as keyword arguments by generate_security_incidents(); each
# template uses only the placeholders it mentions.
incident_templates = {
    'malware': [
        "Malware detected on endpoint {hostname}. User {user} may be infected. {malware_family} signature found.",
        "Suspicious file {filename} quarantined from {hostname}. Possible {malware_family} infection.",
        "Antivirus alert: {malware_family} detected in {location}. User {user} affected.",
        "Endpoint protection blocked {malware_family} execution on {hostname}. Immediate action required.",
        "Ransomware encryption detected on {hostname}. {malware_family} variant identified."
    ],
    'network': [
        "Suspicious network traffic detected from {ip_source} to {ip_dest} on port {port}.",
        "Potential data exfiltration: Large file transfer from {hostname} to external IP {ip_dest}.",
        "DDoS attack detected: High volume traffic from {ip_source}. {attack_size} requests per second.",
        "Port scan detected from {ip_source}. Multiple ports targeted on {hostname}.",
        "Firewall blocked {blocked_count} connection attempts from {ip_source}."
    ],
    'phishing': [
        "Phishing email detected from {sender}. Subject: {subject}. {user_count} users targeted.",
        "Suspicious email campaign: {user_count} users received emails from {sender}.",
        "Email security alert: Malicious attachment in email from {sender} to {user}.",
        "Credential harvesting attempt detected. Fake {service} login page reported.",
        "Business Email Compromise suspected. Fraudulent email from {sender} requesting wire transfer."
    ],
    'vulnerability': [
        "Critical vulnerability {cve} discovered in {software}. Patch required immediately.",
        "Vulnerability scan completed. {vuln_count} high-risk vulnerabilities found on {hostname}.",
        "Zero-day exploit detected targeting {software}. {cve} affects version {version}.",
        "Security patch required: {software} version {version} vulnerable to {attack_type}.",
        "Penetration test identified {vuln_count} vulnerabilities in {service}."
    ]
}

# Generate sample data
def generate_security_incidents(n_samples=500):
    """
    Build a DataFrame of synthetic security incidents.

    For each sample an incident type and one of its templates are drawn from
    the module-level `incident_templates`, the template is filled with randomly
    chosen values, and a severity is drawn from a type-specific distribution.

    All randomness comes from NumPy's global RNG (np.random.*), so the result
    depends on the current seed/state and on the order of the draws below.

    Args:
        n_samples: Number of incidents to generate.

    Returns:
        pandas.DataFrame with one row per incident and the columns
        'incident_id', 'text', 'type', 'severity', 'timestamp'.
    """
    incidents = []
    
    # Sample values for templates
    hostnames = ['SRV-WEB-01', 'WS-FINANCE-05', 'DB-PROD-02', 'LAPTOP-HR-12', 'SRV-EMAIL-01']
    users = ['john.doe', 'sarah.smith', 'mike.jones', 'lisa.wang', 'admin']
    malware_families = ['Emotet', 'TrickBot', 'Ryuk', 'Cobalt Strike', 'Mimikatz']
    ip_addresses = ['192.168.1.100', '10.0.0.50', '203.0.113.5', '198.51.100.10']
    ports = ['3389', '22', '80', '443', '445', '135']
    cves = ['CVE-2021-44228', 'CVE-2021-34527', 'CVE-2020-1472', 'CVE-2021-26855']
    software = ['Apache Log4j', 'Microsoft Exchange', 'Windows Server', 'Adobe Flash']
    
    for i in range(n_samples):
        # Randomly select incident type
        incident_type = np.random.choice(list(incident_templates.keys()))
        template = np.random.choice(incident_templates[incident_type])
        
        # Fill template with random values
        # Every keyword is evaluated (and consumes RNG draws) on every
        # iteration, even when the chosen template does not reference it.
        incident_text = template.format(
            hostname=np.random.choice(hostnames),
            user=np.random.choice(users),
            malware_family=np.random.choice(malware_families),
            filename=f"suspicious_file_{np.random.randint(1,100)}.exe",
            location="C:\\Users\\Downloads\\",
            ip_source=np.random.choice(ip_addresses),
            ip_dest=np.random.choice(ip_addresses),
            port=np.random.choice(ports),
            attack_size=np.random.randint(1000, 10000),
            blocked_count=np.random.randint(50, 500),
            sender=f"attacker{np.random.randint(1,20)}@malicious-domain.com",
            subject="Urgent: Account Verification Required",
            user_count=np.random.randint(5, 50),
            service=np.random.choice(['Office365', 'Gmail', 'PayPal', 'Banking']),
            cve=np.random.choice(cves),
            software=np.random.choice(software),
            vuln_count=np.random.randint(1, 15),
            version=f"{np.random.randint(1,5)}.{np.random.randint(0,9)}",
            attack_type=np.random.choice(['RCE', 'XSS', 'SQLi', 'Buffer Overflow'])
        )
        
        # Assign severity based on incident type
        # A severity is drawn for all four types here; only the entry for
        # `incident_type` is used below.
        severity_mapping = {
            'malware': np.random.choice(['High', 'Critical'], p=[0.7, 0.3]),
            'network': np.random.choice(['Medium', 'High'], p=[0.6, 0.4]),
            'phishing': np.random.choice(['Medium', 'High'], p=[0.8, 0.2]),
            'vulnerability': np.random.choice(['High', 'Critical'], p=[0.5, 0.5])
        }
        
        incidents.append({
            'incident_id': f'INC-{i+1:04d}',
            'text': incident_text,
            'type': incident_type,
            'severity': severity_mapping[incident_type],
            # Anchored to the current wall-clock time, so timestamps differ
            # between runs even with a fixed seed.
            'timestamp': pd.Timestamp.now() - pd.Timedelta(days=np.random.randint(0, 30))
        })
    
    return pd.DataFrame(incidents)

# Generate phishing vs legitimate email data
def generate_email_data(n_samples=200):
    """
    Build a DataFrame of synthetic phishing and legitimate emails.

    The first n_samples // 2 rows are phishing emails; the remaining rows are
    legitimate emails. Each text is drawn from a fixed template list using
    NumPy's global RNG.

    Args:
        n_samples: Total number of emails to generate.

    Returns:
        pandas.DataFrame with columns 'email_id', 'text', 'label', 'sentiment'.
    """
    # Phishing email templates (negative sentiment, urgency)
    phishing_templates = [
        "URGENT: Your account will be suspended in 24 hours! Click here immediately to verify.",
        "Security Alert: Suspicious activity detected. Verify your identity NOW or lose access.",
        "Final Notice: Payment overdue. Click to avoid legal action.",
        "Your account has been compromised! Change password immediately at this link.",
        "Congratulations! You've won $10,000! Claim your prize before it expires."
    ]

    # Legitimate email templates (neutral/positive sentiment)
    legitimate_templates = [
        "Thank you for your recent order. Your package will arrive in 3-5 business days.",
        "Your monthly security report is ready for review. Please find attached.",
        "Reminder: Team meeting scheduled for tomorrow at 2 PM in conference room A.",
        "Software update available. Please install during your next maintenance window.",
        "Your backup completed successfully. All files are secure and accessible."
    ]

    half = n_samples // 2

    # (row numbers, template pool, label, sentiment) -- phishing rows are
    # generated first so the RNG draw order is phishing, then legitimate.
    batches = [
        (range(half), phishing_templates, 'phishing', 'negative'),
        (range(half, n_samples), legitimate_templates, 'legitimate', 'neutral'),
    ]

    emails = [
        {
            'email_id': f'EMAIL-{row + 1:04d}',
            'text': np.random.choice(pool),
            'label': label,
            'sentiment': sentiment,
        }
        for rows, pool, label, sentiment in batches
        for row in rows
    ]

    return pd.DataFrame(emails)

# Generate threat intelligence reports
def generate_threat_intel(n_samples=100):
    """
    Build a DataFrame of synthetic threat-intelligence reports.

    Each report text is a randomly chosen template filled with randomly chosen
    values; a source and a confidence label are then drawn for it. All draws
    use NumPy's global RNG.

    Args:
        n_samples: Number of reports to generate.

    Returns:
        pandas.DataFrame with columns 'report_id', 'text', 'source',
        'confidence'.
    """
    threat_templates = [
        "APT29 observed using {tool} to target {sector} organizations. {ioc_count} IOCs identified.",
        "New {malware} campaign targeting {sector}. Distributed via {vector}.",
        "Vulnerability {cve} actively exploited by {group}. {affected_count} organizations at risk.",
        "Ransomware group {group} demands ${ransom_amount} from {sector} victims.",
        "Nation-state actor {group} conducting espionage against {sector} using {technique}."
    ]

    tools = ['Cobalt Strike', 'Metasploit', 'Empire', 'PowerShell', 'Living off the Land']
    sectors = ['healthcare', 'finance', 'government', 'education', 'manufacturing']
    groups = ['APT29', 'APT28', 'Lazarus', 'FIN7', 'Carbanak']
    vectors = ['phishing emails', 'watering hole attacks', 'supply chain compromise']
    techniques = ['spear phishing', 'credential stuffing', 'lateral movement']

    def _build_report(number):
        # Draw order: template, then the fill values in keyword order,
        # then source, then confidence.
        chosen_template = np.random.choice(threat_templates)
        filled_text = chosen_template.format(
            tool=np.random.choice(tools),
            sector=np.random.choice(sectors),
            ioc_count=np.random.randint(5, 50),
            malware=f"Malware-{np.random.randint(1,10)}",
            vector=np.random.choice(vectors),
            cve=f"CVE-2024-{np.random.randint(1000, 9999)}",
            group=np.random.choice(groups),
            affected_count=np.random.randint(10, 1000),
            ransom_amount=np.random.choice([1, 2, 5, 10]) * 1000000,
            technique=np.random.choice(techniques)
        )
        return {
            'report_id': f'TI-{number:04d}',
            'text': filled_text,
            'source': np.random.choice(['OSINT', 'Commercial', 'Government', 'Internal']),
            'confidence': np.random.choice(['Low', 'Medium', 'High'])
        }

    reports = [_build_report(position + 1) for position in range(n_samples)]
    return pd.DataFrame(reports)

# Generate all datasets
# Order matters: the three generators share NumPy's global RNG stream.
print("Generating synthetic security datasets...")
incidents_df = generate_security_incidents(n_samples=500)
emails_df = generate_email_data(n_samples=200)
threat_intel_df = generate_threat_intel(n_samples=100)

# Report how many rows each dataset contains.
for dataset_label, dataset in (
    ("security incidents", incidents_df),
    ("email samples", emails_df),
    ("threat intelligence reports", threat_intel_df),
):
    print(f"✅ Generated {len(dataset)} {dataset_label}")

In [None]:
# =============================================================================
# STEP 4: EXPLORATORY DATA ANALYSIS (EDA)
# =============================================================================

print("\n🔍 STEP 4: EXPLORATORY DATA ANALYSIS")
print("=" * 40)

# Display basic statistics
print("SECURITY INCIDENTS DATASET:")
print(f"Shape: {incidents_df.shape}")
print(f"Columns: {list(incidents_df.columns)}")
print("\nIncident Type Distribution:")
print(incidents_df['type'].value_counts())
print("\nSeverity Distribution:")
print(incidents_df['severity'].value_counts())

print("\nEMAIL DATASET:")
print(f"Shape: {emails_df.shape}")
print(f"Columns: {list(emails_df.columns)}")
print("\nEmail Label Distribution:")
print(emails_df['label'].value_counts())

# Sample data preview
print("\nSAMPLE SECURITY INCIDENTS:")
for _, row in incidents_df.head(3).iterrows():
    print(f"\nIncident {row['incident_id']} ({row['type']}, {row['severity']}):")
    print(f"Text: {row['text']}")

print("\nSAMPLE EMAILS:")
for _, row in emails_df.head(2).iterrows():
    print(f"\nEmail {row['email_id']} ({row['label']}):")
    print(f"Text: {row['text']}")

# Visualizations: 2x2 grid (incident types, severities, email labels, text lengths)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Incident type distribution
incidents_df['type'].value_counts().plot(kind='bar', ax=axes[0,0], color='skyblue')
axes[0,0].set_title('Security Incident Types')
axes[0,0].set_xlabel('Incident Type')
axes[0,0].set_ylabel('Count')
axes[0,0].tick_params(axis='x', rotation=45)

# Severity distribution
incidents_df['severity'].value_counts().plot(kind='pie', ax=axes[0,1], autopct='%1.1f%%')
axes[0,1].set_title('Incident Severity Distribution')

# Email classification
# Colours are looked up per label instead of being passed positionally: the
# bar order comes from value_counts(), and the two classes have equal counts,
# so a positional ['red', 'green'] list could end up on the wrong bars.
email_label_counts = emails_df['label'].value_counts()
email_label_palette = {'phishing': 'red', 'legitimate': 'green'}
email_bar_colors = [email_label_palette.get(label, 'gray') for label in email_label_counts.index]
email_label_counts.plot(kind='bar', ax=axes[1,0], color=email_bar_colors)
axes[1,0].set_title('Email Classification')
axes[1,0].set_xlabel('Email Type')
axes[1,0].set_ylabel('Count')
axes[1,0].tick_params(axis='x', rotation=45)

# Text length distribution
text_lengths = incidents_df['text'].str.len()
axes[1,1].hist(text_lengths, bins=20, color='lightcoral', alpha=0.7)
axes[1,1].set_title('Incident Text Length Distribution')
axes[1,1].set_xlabel('Text Length (characters)')
axes[1,1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# STEP 5: TEXT PREPROCESSING
# =============================================================================

print("\n🧹 STEP 5: TEXT PREPROCESSING", "=" * 40, sep="\n")

# Initialize preprocessing tools
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Custom security stop words (domain-specific)
security_stop_words = {
    'alert', 'detected', 'found', 'system', 'user', 'file', 'server',
    'network', 'security', 'incident', 'report', 'please', 'immediate'
}

# Final stop-word set: NLTK's English list plus the domain-specific words.
stop_words = set(stopwords.words('english')) | security_stop_words

def preprocess_text(text, use_stemming=True, remove_stopwords=True):
    """
    Normalise a piece of security text into a space-joined string of tokens.

    Steps: lowercase -> replace every character other than letters, digits,
    whitespace, '-', '.' and '_' with a space -> NLTK word tokenisation ->
    optional stop-word / short-token removal -> stemming or lemmatisation.

    Relies on the module-level `stop_words`, `stemmer` and `lemmatizer`.

    Args:
        text: Text to clean. None and NaN (the missing-value marker pandas
            passes through Series.apply) yield an empty string; any other
            non-string value is converted with str().
        use_stemming: True -> PorterStemmer; False -> WordNetLemmatizer.
        remove_stopwords: When True, drop tokens contained in `stop_words`
            and tokens that are 2 characters or shorter.

    Returns:
        str: processed tokens joined by single spaces ('' if none remain).
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove special characters; '-', '.' and '_' are kept so tokens joined
    # by them are not split apart by this step
    text = re.sub(r'[^a-zA-Z0-9\s\-\._]', ' ', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords (and very short tokens)
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Stemming or lemmatization
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    else:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

# Demonstrate preprocessing steps on the first incident
sample_text = incidents_df.iloc[0]['text']
print("PREPROCESSING DEMONSTRATION:")
print(f"Original text: {sample_text}")
print()

# Compute every intermediate stage first (mirrors preprocess_text's defaults)...
step1 = sample_text.lower()
step2 = re.sub(r'[^a-zA-Z0-9\s\-\._]', ' ', step1)
step3 = word_tokenize(step2)
step4 = [tok for tok in step3 if len(tok) > 2 and tok not in stop_words]
step5 = list(map(stemmer.stem, step4))

# ...then print them in order: heading, value, blank line.
demo_stages = [
    ("Step 1 - Lowercase:", step1),
    ("Step 2 - Remove special characters:", step2),
    ("Step 3 - Tokenization:", step3),
    ("Step 4 - Remove stopwords:", step4),
    ("Step 5 - Stemming:", step5),
]
for stage_heading, stage_value in demo_stages:
    print(stage_heading)
    print(stage_value)
    print()

print("Final preprocessed text:")
final_text = preprocess_text(sample_text)
print(final_text)

# Apply preprocessing to datasets (adds a 'processed_text' column to each)
print("\nApplying preprocessing to all datasets...")
for dataset in (incidents_df, emails_df, threat_intel_df):
    dataset['processed_text'] = dataset['text'].apply(preprocess_text)

print("✅ Preprocessing completed!")

# Compare original vs processed text lengths
original_lengths = incidents_df['text'].str.len()
processed_lengths = incidents_df['processed_text'].str.len()
mean_original_length = original_lengths.mean()
mean_processed_length = processed_lengths.mean()

print("\nText length comparison:")
print(f"Original average length: {mean_original_length:.1f} characters")
print(f"Processed average length: {mean_processed_length:.1f} characters")
print(f"Reduction: {(1 - mean_processed_length/mean_original_length)*100:.1f}%")

In [None]:
# =============================================================================
# STEP 6: FEATURE ENGINEERING (TF-IDF)
# =============================================================================

print("\n⚙️ STEP 6: FEATURE ENGINEERING - TF-IDF")
print("=" * 40)

# Demonstrate TF-IDF on incident data
print("Creating TF-IDF features for incident classification...")

# Prepare data for TF-IDF
X_text = incidents_df['processed_text']
y_type = incidents_df['type']
y_severity = incidents_df['severity']

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,        # Limit vocabulary size
    min_df=2,                 # Ignore terms that appear in less than 2 documents
    max_df=0.8,               # Ignore terms that appear in more than 80% of documents
    ngram_range=(1, 2)        # Use unigrams and bigrams
)

# Fit and transform the text data (rows follow the row order of incidents_df)
X_tfidf = tfidf_vectorizer.fit_transform(X_text)

print(f"TF-IDF Matrix Shape: {X_tfidf.shape}")
print(f"Vocabulary Size: {len(tfidf_vectorizer.vocabulary_)}")

# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Analyze top TF-IDF features by incident type
print("\nTOP TF-IDF FEATURES BY INCIDENT TYPE:")
for incident_type in incidents_df['type'].unique():
    # Row POSITIONS for this incident type. The matrix is indexed by
    # position, so DataFrame index labels must not be used here: they only
    # coincide with positions while incidents_df has a default RangeIndex.
    type_indices = np.flatnonzero((incidents_df['type'] == incident_type).to_numpy())

    # Mean TF-IDF score per feature for this type, flattened to 1-D.
    # np.asarray(...).ravel() handles both matrix and ndarray results.
    type_tfidf = np.asarray(X_tfidf[type_indices].mean(axis=0)).ravel()

    # Get top features (highest mean score first)
    top_features_idx = type_tfidf.argsort()[-10:][::-1]
    top_features = [(feature_names[i], type_tfidf[i]) for i in top_features_idx]

    print(f"\n{incident_type.upper()}:")
    for feature, score in top_features[:5]:
        print(f"  {feature}: {score:.4f}")

# Visualize TF-IDF feature importance
fig, ax = plt.subplots(figsize=(12, 6))

# Get overall top features
overall_tfidf = np.asarray(X_tfidf.mean(axis=0)).ravel()
top_overall_idx = overall_tfidf.argsort()[-20:][::-1]
top_features = [feature_names[i] for i in top_overall_idx]
top_scores = [overall_tfidf[i] for i in top_overall_idx]

ax.barh(range(len(top_features)), top_scores, color='skyblue')
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features)
ax.set_xlabel('Average TF-IDF Score')
ax.set_title('Top 20 TF-IDF Features Across All Incidents')
ax.invert_yaxis()  # largest score at the top
plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# STEP 7: SENTIMENT ANALYSIS
# =============================================================================

print("\n😊 STEP 7: SENTIMENT ANALYSIS FOR SECURITY", "=" * 40, sep="\n")

def analyze_sentiment(text):
    """
    Score a text with TextBlob's sentiment analyser.

    Returns:
        tuple: (polarity, subjectivity) -- polarity ranges from -1 to 1,
        subjectivity from 0 to 1.
    """
    scores = TextBlob(text).sentiment
    return scores.polarity, scores.subjectivity

def interpret_security_sentiment(polarity, subjectivity):
    """
    Translate sentiment scores into security-oriented labels.

    Urgency (from polarity):   < -0.3 -> "High", < 0.1 -> "Medium", else "Low".
    Confidence (from subjectivity): > 0.6 -> "Low", > 0.3 -> "Medium",
    else "High" (more subjective text is treated as less reliable).

    Returns:
        tuple: (urgency, confidence), each one of "Low", "Medium", "High".
    """
    # Negative sentiment is treated as a sign of a threat.
    urgency = "High" if polarity < -0.3 else ("Medium" if polarity < 0.1 else "Low")

    # High subjectivity = low confidence.
    confidence = "Low" if subjectivity > 0.6 else ("Medium" if subjectivity > 0.3 else "High")

    return urgency, confidence

# Analyze sentiment for different datasets
print("SENTIMENT ANALYSIS ON EMAIL DATASET:")

# Analyze emails: one record per email with raw scores and derived labels
email_sentiments = []
for _, row in emails_df.iterrows():
    polarity, subjectivity = analyze_sentiment(row['text'])
    urgency, confidence = interpret_security_sentiment(polarity, subjectivity)
    
    email_sentiments.append({
        'email_id': row['email_id'],
        'label': row['label'],
        'polarity': polarity,
        'subjectivity': subjectivity,
        'urgency': urgency,
        'confidence': confidence
    })

sentiment_df = pd.DataFrame(email_sentiments)

# Display results
print(f"Average sentiment by email type:")
sentiment_summary = sentiment_df.groupby('label')[['polarity', 'subjectivity']].mean()
print(sentiment_summary)

print(f"\nUrgency distribution by email type:")
urgency_crosstab = pd.crosstab(sentiment_df['label'], sentiment_df['urgency'])
print(urgency_crosstab)

# Visualize sentiment analysis
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Sentiment polarity by email type
sentiment_df.boxplot(column='polarity', by='label', ax=axes[0])
axes[0].set_title('Sentiment Polarity by Email Type')
axes[0].set_xlabel('Email Type')
axes[0].set_ylabel('Polarity (Negative ← → Positive)')

# Subjectivity by email type
sentiment_df.boxplot(column='subjectivity', by='label', ax=axes[1])
axes[1].set_title('Subjectivity by Email Type')
axes[1].set_xlabel('Email Type')
axes[1].set_ylabel('Subjectivity (Objective ← → Subjective)')

# boxplot(by=...) adds a figure-wide "Boxplot grouped by label" title, which
# would also sit above the unrelated bar chart; clear it.
fig.suptitle('')

# Urgency distribution
# Crosstab columns are sorted alphabetically (High, Low, Medium), so a
# positional colour list would paint "High" green and "Medium" red. Put the
# columns in severity order and look each colour up by level; levels that
# did not occur are added with a count of 0.
urgency_palette = {'Low': 'green', 'Medium': 'orange', 'High': 'red'}
urgency_for_plot = urgency_crosstab.reindex(columns=list(urgency_palette), fill_value=0)
urgency_for_plot.plot(
    kind='bar',
    ax=axes[2],
    color=[urgency_palette[level] for level in urgency_for_plot.columns]
)
axes[2].set_title('Security Urgency Distribution by Email Type')
axes[2].set_xlabel('Email Type')
axes[2].set_ylabel('Count')
axes[2].legend(title='Urgency Level')
axes[2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Example sentiment analysis
print("SENTIMENT ANALYSIS EXAMPLES:")
sample_emails = emails_df.sample(4, random_state=42)
for _, row in sample_emails.iterrows():
    polarity, subjectivity = analyze_sentiment(row['text'])
    urgency, confidence = interpret_security_sentiment(polarity, subjectivity)
    
    print(f"\nEmail: {row['email_id']} (Actual: {row['label']})")
    print(f"Text: {row['text'][:100]}...")
    print(f"Sentiment: Polarity={polarity:.3f}, Subjectivity={subjectivity:.3f}")
    print(f"Security Assessment: Urgency={urgency}, Confidence={confidence}")

In [None]:
# =============================================================================
# STEP 8: TOPIC MODELING
# =============================================================================

print("\n📊 STEP 8: TOPIC MODELING FOR THREAT INTELLIGENCE", "=" * 40, sep="\n")

# Prepare data for topic modeling
threat_intel_texts = threat_intel_df['processed_text'].tolist()

# Use CountVectorizer for LDA (works better than TF-IDF)
count_vectorizer_options = dict(
    max_features=100,
    min_df=2,
    max_df=0.8,
    stop_words='english',
)
count_vectorizer = CountVectorizer(**count_vectorizer_options)

# Create document-term matrix (documents x vocabulary terms)
doc_term_matrix = count_vectorizer.fit_transform(threat_intel_texts)

print(f"Document-Term Matrix Shape: {doc_term_matrix.shape}")

# Apply Latent Dirichlet Allocation (LDA); fit() returns the fitted estimator
n_topics = 5
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    max_iter=100,
    learning_method='online',
).fit(doc_term_matrix)

# Vocabulary terms, in the column order of doc_term_matrix
feature_names = count_vectorizer.get_feature_names_out()

# Display topics
print("DISCOVERED TOPICS IN THREAT INTELLIGENCE:")
def display_topics(model, feature_names, n_top_words=8):
    """
    Print and return the highest-weighted words of every topic.

    Args:
        model: Object exposing `components_`, one row of word weights per topic.
        feature_names: Sequence mapping a column position to its word.
        n_top_words: Number of words to keep per topic.

    Returns:
        dict: "Topic <k>" (1-based) -> list of words, highest weight first.
    """
    topics = {}
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending: take the tail, then flip to descending.
        ranked_positions = weights.argsort()[-n_top_words:][::-1]
        words = [feature_names[position] for position in ranked_positions]
        heading = f"Topic {number}"
        topics[heading] = words
        print(f"\n{heading}:")
        print(", ".join(words))
    return topics

discovered_topics = display_topics(lda_model, feature_names)

# Assign topics to documents: one row per document, one column per topic
doc_topic_probs = lda_model.transform(doc_term_matrix)

# Find dominant topic for each document (0-based topic id)
dominant_topics = doc_topic_probs.argmax(axis=1)

# Add topic assignments to dataframe
threat_intel_df['dominant_topic'] = dominant_topics
threat_intel_df['topic_probability'] = doc_topic_probs.max(axis=1)

print(f"\nTOPIC DISTRIBUTION IN THREAT INTELLIGENCE:")
# reindex over every topic id so topics that won no document are reported
# with a count of 0 instead of silently disappearing.
topic_counts = (
    pd.Series(dominant_topics)
    .value_counts()
    .reindex(range(n_topics), fill_value=0)
)
for topic_id, count in topic_counts.items():
    print(f"Topic {topic_id + 1}: {count} documents ({count/len(threat_intel_df)*100:.1f}%)")

# Visualize topics
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Topic distribution
topic_counts.plot(kind='bar', ax=axes[0], color='lightblue')
# Topic ids are stored 0-based but printed 1-based everywhere else; label
# the bars 1-based too so the chart matches the text output.
axes[0].set_xticklabels([str(topic_id + 1) for topic_id in topic_counts.index], rotation=0)
axes[0].set_title('Topic Distribution in Threat Intelligence')
axes[0].set_xlabel('Topic Number')
axes[0].set_ylabel('Number of Documents')

# Topic probability distribution
axes[1].hist(threat_intel_df['topic_probability'], bins=20, color='lightgreen', alpha=0.7)
axes[1].set_title('Topic Assignment Confidence Distribution')
axes[1].set_xlabel('Probability of Dominant Topic')
axes[1].set_ylabel('Number of Documents')

plt.tight_layout()
plt.show()

# Show examples from each topic (first document assigned to it, if any)
print("\nEXAMPLE DOCUMENTS BY TOPIC:")
for topic_id in range(n_topics):
    topic_docs = threat_intel_df[threat_intel_df['dominant_topic'] == topic_id]
    if len(topic_docs) > 0:
        sample_doc = topic_docs.iloc[0]
        print(f"\nTopic {topic_id + 1} Example:")
        print(f"Document: {sample_doc['report_id']}")
        print(f"Text: {sample_doc['text']}")
        print(f"Confidence: {sample_doc['topic_probability']:.3f}")

In [None]:
# =============================================================================
# STEP 9: CLASSIFICATION MODEL TRAINING
# =============================================================================

print("\n🤖 STEP 9: CLASSIFICATION MODEL TRAINING")
print("=" * 40)

# Classification Task 1: Incident Type Classification
print("TASK 1: INCIDENT TYPE CLASSIFICATION")

# Prepare features and labels
# NOTE: X_tfidf was fitted on the full corpus in Step 6, i.e. before the
# split below, so the test documents contributed to the vocabulary and IDF
# weights. Reported scores may therefore be somewhat optimistic.
X = X_tfidf  # TF-IDF features from Step 6
y = incidents_df['type']

# Split data (stratified so every incident type appears in both splits)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Train multiple models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Naive Bayes': MultinomialNB()
}

# model name -> fitted model, test accuracy and test predictions
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred
    }
    
    print(f"{name} Accuracy: {accuracy:.3f}")

# Detailed evaluation of best model (highest test accuracy; first one wins ties)
best_model_name = max(results.keys(), key=lambda k: results[k]['accuracy'])
best_model = results[best_model_name]['model']
best_predictions = results[best_model_name]['predictions']

print(f"\nBEST MODEL: {best_model_name}")
print(f"Accuracy: {results[best_model_name]['accuracy']:.3f}")

print("\nDetailed Classification Report:")
print(classification_report(y_test, best_predictions))

# Confusion Matrix
# The row/column order of the matrix is pinned to the model's class order and
# the same sequence is used for the tick labels. Labelling the axes with
# incidents_df['type'].unique() (order of first appearance) would not match
# the sorted order confusion_matrix uses by default.
class_labels = best_model.classes_
cm = confusion_matrix(y_test, best_predictions, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Feature importance (for Logistic Regression)
if best_model_name == 'Logistic Regression':
    print("\nTOP FEATURES BY INCIDENT TYPE:")
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # One coefficient row per class; larger values push towards that class.
    for i, class_name in enumerate(best_model.classes_):
        coefficients = best_model.coef_[i]
        top_features_idx = coefficients.argsort()[-10:][::-1]
        
        print(f"\n{class_name.upper()}:")
        for idx in top_features_idx[:5]:
            print(f"  {feature_names[idx]}: {coefficients[idx]:.4f}")

In [None]:
# =============================================================================
# STEP 10: CLASSIFICATION TASK 2 - EMAIL CLASSIFICATION (FIXED)
# =============================================================================

print("\n📧 CLASSIFICATION TASK 2: PHISHING EMAIL DETECTION", "=" * 50, sep="\n")

# Prepare email data
X_email_text = emails_df['processed_text']
y_email = emails_df['label']

# Create TF-IDF features for emails (unigrams + bigrams, at most 500 terms)
email_tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
X_email_tfidf = email_tfidf.fit_transform(X_email_text)

# Split email data (stratified 80/20)
X_email_train, X_email_test, y_email_train, y_email_test = train_test_split(
    X_email_tfidf, y_email, test_size=0.2, random_state=42, stratify=y_email
)

# Train email classifier; fit() returns the fitted estimator
email_classifier = LogisticRegression(random_state=42).fit(X_email_train, y_email_train)

# Evaluate email classifier
y_email_pred = email_classifier.predict(X_email_test)
email_accuracy = accuracy_score(y_email_test, y_email_pred)

print(f"Email Classification Accuracy: {email_accuracy:.3f}")
print("\nEmail Classification Report:")
print(classification_report(y_email_test, y_email_pred))

# Feature importance for email classification
print("\nTOP FEATURES FOR PHISHING DETECTION:")
feature_names_email = email_tfidf.get_feature_names_out()

if len(email_classifier.classes_) == 2:
    # Binary case: a single coefficient row. Positive weights point to the
    # second class, negative weights to the first.
    first_class, second_class = email_classifier.classes_
    coefficients = email_classifier.coef_[0]
    ascending_order = coefficients.argsort()

    print(f"\nFeatures indicating '{second_class}' (positive coefficients):")
    top_positive_idx = ascending_order[-10:][::-1]
    for idx in top_positive_idx[:5]:
        if coefficients[idx] > 0:
            print(f"  {feature_names_email[idx]}: {coefficients[idx]:.4f}")

    print(f"\nFeatures indicating '{first_class}' (negative coefficients):")
    top_negative_idx = ascending_order[:10]
    for idx in top_negative_idx[:5]:
        if coefficients[idx] < 0:
            print(f"  {feature_names_email[idx]}: {coefficients[idx]:.4f}")
else:
    # Multi-class case: one coefficient row per class
    for class_name, coefficients in zip(email_classifier.classes_, email_classifier.coef_):
        top_features_idx = coefficients.argsort()[-10:][::-1]

        print(f"\n{class_name.upper()}:")
        for idx in top_features_idx[:5]:
            print(f"  {feature_names_email[idx]}: {coefficients[idx]:.4f}")

In [None]:
# =============================================================================
# STEP 11: MODEL EVALUATION AND INTERPRETATION
# =============================================================================

print("\n📈 STEP 11: MODEL EVALUATION AND INTERPRETATION")
print("=" * 50)

# Gather the headline numbers for each task separately, then combine them
incident_task_metrics = {
    'Best Model': best_model_name,
    'Accuracy': results[best_model_name]['accuracy'],
    'Classes': list(incidents_df['type'].unique()),
    'Training Samples': X_train.shape[0],
    'Test Samples': X_test.shape[0]
}
email_task_metrics = {
    'Model': 'Logistic Regression',
    'Accuracy': email_accuracy,
    'Classes': list(emails_df['label'].unique()),
    'Training Samples': X_email_train.shape[0],
    'Test Samples': X_email_test.shape[0]
}
evaluation_summary = {
    'Incident Type Classification': incident_task_metrics,
    'Email Classification': email_task_metrics
}

# One heading per task followed by its indented metric lines
print("MODEL PERFORMANCE SUMMARY:")
for task in evaluation_summary:
    print(f"\n{task}:")
    for metric, value in evaluation_summary[task].items():
        print(f"  {metric}: {value}")

# Demonstrate model predictions on new data
print("\nMODEL PREDICTIONS ON NEW EXAMPLES:")


def _print_predictions(texts, vectorizer, model, result_label):
    """Classify each raw text and print the prediction with its confidence.

    Args:
        texts: Iterable of raw (not yet preprocessed) strings.
        vectorizer: Fitted vectorizer exposing ``transform``.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        result_label: Caption printed in front of the predicted class.
    """
    for i, text in enumerate(texts, 1):
        # Apply the notebook's preprocessing before vectorizing
        processed = preprocess_text(text)

        # transform() expects a collection of documents, hence the one-item list
        features = vectorizer.transform([processed])

        prediction = model.predict(features)[0]
        # Confidence is the highest class probability for this single sample
        probability = model.predict_proba(features)[0].max()

        print(f"\nExample {i}:")
        print(f"Text: {text}")
        print(f"{result_label}: {prediction}")
        print(f"Confidence: {probability:.3f}")


# New incident examples
new_incidents = [
    "Critical ransomware attack detected on file server. Multiple files encrypted with .encrypted extension.",
    "Employee reported suspicious email requesting password reset. Potential phishing attempt.",
    "Vulnerability scanner identified SQL injection flaw in web application login form.",
    "Firewall logs show repeated connection attempts from external IP to SSH port."
]

print("\nINCIDENT TYPE PREDICTIONS:")
_print_predictions(new_incidents, tfidf_vectorizer, best_model, "Predicted Type")

# New email examples
new_emails = [
    "URGENT: Your PayPal account has been limited. Click here to restore access immediately!",
    "Hi team, please review the attached security report and let me know your thoughts."
]

print("\nEMAIL CLASSIFICATION PREDICTIONS:")
_print_predictions(new_emails, email_tfidf, email_classifier, "Predicted Label")

In [None]:
# =============================================================================
# STEP 12: BUSINESS IMPACT AND INSIGHTS
# =============================================================================

print("\n💼 STEP 12: BUSINESS IMPACT AND INSIGHTS")
print("=" * 50)

# Calculate business metrics (the per-incident timings are hard-coded estimates)
total_incidents = len(incidents_df)
total_emails = len(emails_df)
processing_time_manual = 45  # minutes per incident
processing_time_auto = 0.02  # minutes per incident (1.2 seconds)

# Time savings calculation (minutes converted to hours)
manual_time_hours = (total_incidents * processing_time_manual) / 60
auto_time_hours = (total_incidents * processing_time_auto) / 60
time_saved_hours = manual_time_hours - auto_time_hours

# Guard the percentage: with zero incidents manual_time_hours is 0 and the
# ratio would raise ZeroDivisionError.
if manual_time_hours:
    time_reduction_pct = time_saved_hours / manual_time_hours * 100
else:
    time_reduction_pct = 0.0

# Cost savings (assuming $50/hour analyst cost)
analyst_hourly_rate = 50
cost_savings = time_saved_hours * analyst_hourly_rate

print("BUSINESS IMPACT ANALYSIS:")
print(f"Total Incidents Processed: {total_incidents}")
print(f"Manual Processing Time: {manual_time_hours:.1f} hours")
print(f"Automated Processing Time: {auto_time_hours:.1f} hours")
print(f"Time Saved: {time_saved_hours:.1f} hours ({time_reduction_pct:.1f}% reduction)")
print(f"Cost Savings: ${cost_savings:,.2f}")

# Accuracy impact
incident_accuracy = results[best_model_name]['accuracy']
# 1 - accuracy is the overall misclassification (error) rate across all
# classes; it is NOT a false positive rate, which would need per-class
# confusion-matrix counts.
misclassification_rate = 1 - incident_accuracy
# Legacy alias kept so any cell still reading the old name keeps working
false_positive_rate = misclassification_rate

print(f"\nACCURACY ANALYSIS:")
print(f"Incident Classification Accuracy: {incident_accuracy:.1%}")
print(f"Email Classification Accuracy: {email_accuracy:.1%}")
print(f"Estimated Misclassification Rate: {misclassification_rate:.1%}")

# Headline numbers from the topic-modeling step
print(f"\nTHREAT INTELLIGENCE INSIGHTS:")
print(f"Number of Topics Discovered: {n_topics}")

# Work out the most-common-topic line first, then print it once
if len(dominant_topics) == 0:
    most_common_line = "Most Common Topic: No topics assigned"
else:
    mode_result = pd.Series(dominant_topics).mode()
    if len(mode_result) == 0:
        most_common_line = "Most Common Topic: Unable to determine"
    else:
        # +1 turns the stored topic index into the 1-based number shown to readers
        most_common_line = f"Most Common Topic: Topic {mode_result.iloc[0] + 1}"
print(most_common_line)

print(f"Average Topic Confidence: {threat_intel_df['topic_probability'].mean():.3f}")

# Static list of follow-up actions, printed as a numbered list
recommendations = [
    "Deploy automated incident classification to reduce analyst workload by 70%",
    "Implement real-time phishing email detection with 85%+ accuracy",
    "Use topic modeling to automatically categorize threat intelligence feeds",
    "Set up sentiment analysis for prioritizing security alerts",
    "Establish human-in-the-loop workflow for low-confidence predictions",
    "Regular model retraining (monthly) to adapt to new threat patterns"
]

print(f"\nRECOMMENDATIONS:")
numbered_recommendations = (
    f"{position}. {text}" for position, text in enumerate(recommendations, start=1)
)
print(*numbered_recommendations, sep="\n")

In [None]:
# =============================================================================
# STEP 13: CONCLUSION AND NEXT STEPS
# =============================================================================

print("\n🎯 STEP 13: CONCLUSION AND NEXT STEPS")
print("=" * 50)

# Checklist of what the notebook covered
print("WHAT WE ACCOMPLISHED:")
accomplishments = [
    "✅ Generated realistic synthetic security data",
    "✅ Implemented complete text preprocessing pipeline",
    "✅ Applied TF-IDF feature engineering",
    "✅ Performed sentiment analysis for threat assessment",
    "✅ Discovered topics in threat intelligence using LDA",
    "✅ Built and evaluated classification models",
    "✅ Demonstrated business impact and ROI",
]
for accomplishment in accomplishments:
    print(accomplishment)

# Takeaways, printed as a bulleted list
print("\nKEY LEARNINGS:")
key_learnings = [
    "Text preprocessing is crucial for security NLP (removes noise, standardizes format)",
    "TF-IDF effectively captures important security terms and concepts",
    "Sentiment analysis can indicate threat urgency and emotional manipulation",
    "Topic modeling reveals hidden patterns in large text collections",
    "Simple models (Logistic Regression) can achieve high accuracy on well-preprocessed data",
    "Binary classification in scikit-learn requires special handling for feature importance",
    "NLP provides significant ROI through automation and consistency"
]
print(*(f"• {item}" for item in key_learnings), sep="\n")

# Production follow-ups, printed as a numbered list
print("\nNEXT STEPS FOR PRODUCTION DEPLOYMENT:")
next_steps = [
    "Collect and label real security data (with proper privacy controls)",
    "Implement robust data pipeline for continuous model training",
    "Add explainability features (LIME, SHAP) for analyst trust",
    "Integrate with existing SIEM/SOAR platforms",
    "Establish monitoring for model drift and performance degradation",
    "Design human-in-the-loop workflows for edge cases",
    "Implement adversarial robustness testing",
    "Develop custom security-domain language models"
]
print(*(f"{number}. {item}" for number, item in enumerate(next_steps, start=1)), sep="\n")

# Further reading, printed as a bulleted list
print("\nADDITIONAL LEARNING RESOURCES:")
resources = [
    "Practice with real datasets: MITRE ATT&CK, CVE database, public security feeds",
    "Explore advanced models: BERT, RoBERTa for security text understanding",
    "Learn security-specific NLP libraries: spaCy security models, YARA rules",
    "Study adversarial NLP: how attackers might try to fool your models",
    "Join communities: security data science groups, NLP conferences"
]
print(*(f"• {item}" for item in resources), sep="\n")

# Closing banner
print(f"\n{'='*60}")
print("🏆 CONGRATULATIONS! You've completed the NLP for Security workshop!")
print("You now have hands-on experience with the complete ML lifecycle for security text analysis.")
print("Keep practicing and building more advanced security NLP solutions!")
print(f"{'='*60}")

# Collect key results for reference.
# NOTE: despite the message below, nothing is written to disk here; the
# summary only lives in the in-memory `results_summary` dict.
print("\n💾 SAVING RESULTS FOR REFERENCE...")

# Create results summary
results_summary = {
    'incident_classification_accuracy': results[best_model_name]['accuracy'],
    'email_classification_accuracy': email_accuracy,
    'time_savings_hours': time_saved_hours,
    'cost_savings_dollars': cost_savings,
    'topics_discovered': n_topics,
    # Percent drop of the mean processed length relative to the mean original
    # length. The key says "vocab", but the value is a length ratio; the key is
    # left unchanged so existing readers of this dict keep working.
    'preprocessing_vocab_reduction': (1 - processed_lengths.mean()/original_lengths.mean())*100
}

# Display final summary; the format is chosen from a substring of the metric name
print("\nFINAL RESULTS SUMMARY:")
for metric, value in results_summary.items():
    if 'accuracy' in metric:
        print(f"{metric}: {value:.3f}")
    elif 'percentage' in metric or 'reduction' in metric:
        print(f"{metric}: {value:.1f}%")
    elif 'dollars' in metric:
        print(f"{metric}: ${value:,.2f}")
    elif 'hours' in metric:
        # Previously fell through to the raw branch and printed an unrounded float
        print(f"{metric}: {value:.1f}")
    else:
        print(f"{metric}: {value}")

print("\n✨ Workshop completed successfully! ✨")