# Smart Documentation Matcher - Development & Testing
## Testing documentation link matching based on response content using OpenAI embeddings

### 1. Setup and Imports

In [1]:
# === SETUP AND IMPORTS ===
import json
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple
from collections import defaultdict
import pandas as pd
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load environment
load_dotenv()

print("✅ Setup complete!")

✅ Setup complete!


### 2. Load Documentation Database

In [3]:
# === LOAD DOCUMENTATION DATABASE ===
def load_documentation_database(file_path: str = "../data/official_docs/documentation_links.json"):
    """Load and flatten the documentation database"""
    try:
        with open(file_path, 'r') as f:
            doc_data = json.load(f)
        
        # Flatten the nested structure
        flattened_docs = []
        for category, docs in doc_data.items():
            for doc in docs:
                doc['category'] = category
                flattened_docs.append(doc)
        
        print(f"✅ Loaded {len(flattened_docs)} documentation links across {len(doc_data)} categories")
        print(f"Categories: {list(doc_data.keys())}")
        return flattened_docs, doc_data
        
    except FileNotFoundError:
        print(f"❌ Documentation file not found: {file_path}")
        return [], {}
    except json.JSONDecodeError as e:
        print(f"❌ Error parsing JSON: {e}")
        return [], {}

# Load the documentation
docs_flat, docs_nested = load_documentation_database()

# Display sample
if docs_flat:
    print("\n📋 Sample documentation entry:")
    sample_doc = docs_flat[0]
    for key, value in sample_doc.items():
        print(f"  {key}: {value}")

✅ Loaded 60 documentation links across 14 categories
Categories: ['docker', 'kubernetes', 'excel', 'bloomberg', 'python', 'aws', 'linux', 'networking', 'security', 'infrastructure', 'raspberry_pi', 'vpn', 'powershell', 'finance']

📋 Sample documentation entry:
  title: Get Started with Docker
  url: https://docs.docker.com/get-started/
  description: Official Docker getting started guide
  keywords: ['docker', 'container', 'beginner', 'tutorial', 'basics', 'installation', 'hello world']
  difficulty: beginner
  topics: ['containers', 'setup', 'fundamentals']
  status: validated
  category: docker


### 3. Smart Documentation Matcher Class (OpenAI Embeddings)

In [4]:
# === SMART DOCUMENTATION MATCHER (OpenAI Embeddings Only) ===
class SmartDocumentationMatcher:
    def __init__(self, documentation_data: List[Dict]):
        self.docs = documentation_data
        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        self.setup_embeddings()
        print("✅ Smart Documentation Matcher initialized with OpenAI embeddings!")
    
    def setup_embeddings(self):
        """Setup OpenAI embeddings for semantic matching"""
        # Create searchable text for each document
        self.doc_texts = []
        for doc in self.docs:
            # Combine title, description, keywords, and topics for matching
            text_parts = [
                doc.get('title', ''),
                doc.get('description', ''),
                ' '.join(doc.get('keywords', [])),
                ' '.join(doc.get('topics', [])),
                doc.get('category', '')
            ]
            self.doc_texts.append(' '.join(text_parts).lower())
        
        # Generate embeddings for all documentation
        print(f"🔄 Generating OpenAI embeddings for {len(self.doc_texts)} documentation entries...")
        
        try:
            response = self.client.embeddings.create(
                model="text-embedding-3-small",
                input=self.doc_texts
            )
            
            # Store embeddings as numpy array for easier similarity calculation
            self.doc_embeddings = np.array([item.embedding for item in response.data])
            print(f"✅ Documentation embeddings generated: {self.doc_embeddings.shape}")
            
        except Exception as e:
            print(f"❌ Error generating embeddings: {e}")
            self.doc_embeddings = None
    
    def cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Calculate cosine similarity between two vectors"""
        dot_product = np.dot(vec1, vec2)
        norm_a = np.linalg.norm(vec1)
        norm_b = np.linalg.norm(vec2)
        return dot_product / (norm_a * norm_b)
    
    def extract_keywords_from_response(self, response_text: str) -> List[str]:
        """Extract relevant keywords from the AI response"""
        # Clean and normalize the response
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', response_text.lower())
        
        # Define technology keywords to look for
        tech_keywords = {
            'docker', 'container', 'kubernetes', 'k8s', 'pod', 'kubectl',
            'excel', 'vlookup', 'pivot', 'vba', 'macro', 'power query', 'power pivot',
            'bloomberg', 'terminal', 'bdh', 'bdp', 'bds', 'bql',
            'python', 'script', 'programming', 'code',
            'aws', 'ec2', 'vpc', 's3', 'lambda', 'cloud',
            'linux', 'ubuntu', 'bash', 'command line', 'terminal',
            'network', 'dns', 'ip', 'subnet', 'router', 'firewall',
            'security', 'encryption', 'vpn', 'ssl', 'certificate',
            'ansible', 'terraform', 'infrastructure', 'automation',
            'github', 'git', 'ci/cd', 'actions', 'workflow',
            'proxmox', 'vmware', 'virtualization', 'hypervisor',
            'raspberry pi', 'pi-hole', 'iot',
            'openvpn', 'wireguard', 'pfsense',
            'powershell', 'cmdlet', 'scripting',
            'nmap', 'wireshark', 'kali', 'metasploit', 'penetration testing'
        }
        
        # Find keywords in the response
        found_keywords = []
        for keyword in tech_keywords:
            if keyword in cleaned_text:
                found_keywords.append(keyword)
        
        return found_keywords
    
    def match_documentation(self, response_text: str, top_k: int = 5, 
                          min_similarity: float = 0.1) -> List[Dict]:
        """Match documentation based on response content using OpenAI embeddings"""
        if self.doc_embeddings is None:
            return []
        
        # Extract keywords from response
        keywords = self.extract_keywords_from_response(response_text)
        
        # Create search query from response text and keywords
        search_text = response_text.lower() + ' ' + ' '.join(keywords)
        
        try:
            # Generate embedding for search text
            response = self.client.embeddings.create(
                model="text-embedding-3-small",
                input=[search_text]
            )
            
            search_embedding = np.array(response.data[0].embedding)
            
            # Calculate similarities with all documentation embeddings
            similarities = []
            for doc_embedding in self.doc_embeddings:
                similarity = self.cosine_similarity(search_embedding, doc_embedding)
                similarities.append(similarity)
            
            similarities = np.array(similarities)
            
            # Get top matches above threshold
            top_indices = np.argsort(similarities)[::-1][:top_k]
            
            matches = []
            for idx in top_indices:
                if similarities[idx] >= min_similarity:
                    doc = self.docs[idx].copy()
                    doc['similarity_score'] = similarities[idx]
                    doc['matched_keywords'] = keywords
                    matches.append(doc)
            
            return matches
            
        except Exception as e:
            print(f"❌ Error matching documentation: {e}")
            return []
    
    def get_category_stats(self) -> Dict[str, int]:
        """Get statistics about documentation categories"""
        category_counts = defaultdict(int)
        for doc in self.docs:
            category_counts[doc['category']] += 1
        return dict(category_counts)

# Initialize matcher
if docs_flat:
    doc_matcher = SmartDocumentationMatcher(docs_flat)
    
    # Show category statistics
    print("\n📊 Documentation categories:")
    stats = doc_matcher.get_category_stats()
    for category, count in stats.items():
        print(f"  {category}: {count} docs")
else:
    print("❌ No documentation loaded, skipping matcher initialization")

🔄 Generating OpenAI embeddings for 60 documentation entries...
✅ Documentation embeddings generated: (60, 1536)
✅ Smart Documentation Matcher initialized with OpenAI embeddings!

📊 Documentation categories:
  docker: 6 docs
  kubernetes: 6 docs
  excel: 6 docs
  bloomberg: 3 docs
  python: 5 docs
  aws: 6 docs
  linux: 4 docs
  networking: 4 docs
  security: 5 docs
  infrastructure: 5 docs
  raspberry_pi: 2 docs
  vpn: 2 docs
  powershell: 2 docs
  finance: 4 docs


### 4. Test Documentation Matching

In [5]:
# === TEST DOCUMENTATION MATCHING ===
def test_documentation_matching(response_text: str, expected_categories: List[str] = None):
    """Test documentation matching for a given response"""
    print(f"\n{'='*80}")
    print(f"🧪 TESTING DOCUMENTATION MATCHING (OpenAI Embeddings)")
    print(f"{'='*80}")
    print(f"Response text: {response_text[:200]}...")
    if expected_categories:
        print(f"Expected categories: {expected_categories}")
    print(f"{'='*80}")
    
    if not docs_flat:
        print("❌ No documentation loaded")
        return
    
    # Get matches
    matches = doc_matcher.match_documentation(response_text, top_k=5)
    
    # Extract keywords
    keywords = doc_matcher.extract_keywords_from_response(response_text)
    
    print(f"\n🔍 ANALYSIS:")
    print(f"Extracted keywords: {keywords}")
    print(f"Found {len(matches)} relevant documentation links")
    
    if matches:
        print(f"\n📚 MATCHED DOCUMENTATION:")
        for i, match in enumerate(matches, 1):
            print(f"\n{i}. 📖 {match['title']}")
            print(f"   Category: {match['category']}")
            print(f"   Similarity: {match['similarity_score']:.3f}")
            print(f"   URL: {match['url']}")
            print(f"   Description: {match['description']}")
            print(f"   Keywords: {', '.join(match['keywords'][:5])}...")
            print(f"   Difficulty: {match['difficulty']}")
    else:
        print("\n❌ No documentation matches found")
    
    return matches

# Test with sample responses
test_responses = [
    {
        "text": "To set up Docker containers, you first need to install Docker on your system. Then you can use 'docker run' commands to create containers from images. Docker Compose is useful for multi-container applications.",
        "expected": ["docker"]
    },
    {
        "text": "VLOOKUP is a powerful Excel function that allows you to search for data in a table. You can also use INDEX and MATCH functions for more flexibility. Power Query is great for data transformation.",
        "expected": ["excel"]
    },
    {
        "text": "Bloomberg Terminal functions like BDH and BDP are essential for financial data analysis. The Bloomberg Query Language (BQL) Builder helps create complex queries for panel data.",
        "expected": ["bloomberg"]
    },
    {
        "text": "To scan networks with Nmap, use commands like 'nmap -sn' for ping sweeps. Wireshark can help analyze the network traffic. Kali Linux includes many penetration testing tools.",
        "expected": ["security", "networking"]
    }
]

print("🚀 Starting documentation matching tests with OpenAI embeddings...")

🚀 Starting documentation matching tests with OpenAI embeddings...


In [6]:
# Run the tests
if docs_flat:
    for i, test_case in enumerate(test_responses, 1):
        print(f"\n\n🧪 TEST CASE {i}")
        matches = test_documentation_matching(
            test_case["text"], 
            test_case["expected"]
        )
else:
    print("❌ Cannot run tests - no documentation loaded")



🧪 TEST CASE 1

🧪 TESTING DOCUMENTATION MATCHING (OpenAI Embeddings)
Response text: To set up Docker containers, you first need to install Docker on your system. Then you can use 'docker run' commands to create containers from images. Docker Compose is useful for multi-container appl...
Expected categories: ['docker']

🔍 ANALYSIS:
Extracted keywords: ['docker', 'container']
Found 5 relevant documentation links

📚 MATCHED DOCUMENTATION:

1. 📖 Get Started with Docker
   Category: docker
   Similarity: 0.541
   URL: https://docs.docker.com/get-started/
   Description: Official Docker getting started guide
   Keywords: docker, container, beginner, tutorial, basics...
   Difficulty: beginner

2. 📖 Docker Compose
   Category: docker
   Similarity: 0.516
   URL: https://docs.docker.com/compose/
   Description: Multi-container Docker applications
   Keywords: docker, compose, multi-container, orchestration, yaml...
   Difficulty: intermediate

3. 📖 Dockerfile Reference
   Category: docker
   

### 5. Advanced Testing - Cross-Domain Scenarios

In [7]:
# === CROSS-DOMAIN TESTING ===
cross_domain_tests = [
    {
        "text": "You can automate Excel tasks using Python scripts. Libraries like pandas and openpyxl make it easy to manipulate spreadsheets programmatically. VBA is also useful for Excel automation.",
        "expected": ["excel", "python"],
        "description": "Excel + Python automation"
    },
    {
        "text": "Deploy your Python applications using Docker containers on AWS EC2 instances. Use GitHub Actions for CI/CD to automate deployments. Terraform can help provision the infrastructure.",
        "expected": ["docker", "aws", "python", "infrastructure"],
        "description": "Cloud deployment pipeline"
    },
    {
        "text": "Use Ansible playbooks to configure Ubuntu servers. You can automate the installation of Docker, set up networking, and deploy applications across multiple servers.",
        "expected": ["infrastructure", "linux", "docker"],
        "description": "Infrastructure automation"
    },
    {
        "text": "For financial modeling in Excel, create dynamic tables with Power Query data sources. Use Bloomberg add-in functions to pull real-time market data into your models.",
        "expected": ["excel", "bloomberg", "finance"],
        "description": "Financial modeling workflow"
    }
]



In [8]:
# Execute the cross-domain tests
print("🔄 Testing cross-domain scenarios...")

if docs_flat:
    for i, test_case in enumerate(cross_domain_tests, 1):
        print(f"\n\n🌐 CROSS-DOMAIN TEST {i}: {test_case['description']}")
        matches = test_documentation_matching(
            test_case["text"], 
            test_case["expected"]
        )
        
        # Analyze category diversity
        matched_categories = [match['category'] for match in matches]
        unique_categories = list(set(matched_categories))
        print(f"\n📊 Category diversity: {len(unique_categories)} unique categories")
        print(f"   Categories found: {unique_categories}")
else:
    print("❌ Cannot run cross-domain tests - no documentation loaded")

🔄 Testing cross-domain scenarios...


🌐 CROSS-DOMAIN TEST 1: Excel + Python automation

🧪 TESTING DOCUMENTATION MATCHING (OpenAI Embeddings)
Response text: You can automate Excel tasks using Python scripts. Libraries like pandas and openpyxl make it easy to manipulate spreadsheets programmatically. VBA is also useful for Excel automation....
Expected categories: ['excel', 'python']

🔍 ANALYSIS:
Extracted keywords: ['automation', 'python', 'ip', 'vba', 'excel', 'script']
Found 5 relevant documentation links

📚 MATCHED DOCUMENTATION:

1. 📖 Excel VBA Reference
   Category: excel
   Similarity: 0.575
   URL: https://learn.microsoft.com/en-us/office/vba/api/overview/
   Description: Complete Excel VBA programming reference
   Keywords: excel, vba, programming, automation, macros...
   Difficulty: intermediate

2. 📖 VBA Language Reference
   Category: excel
   Similarity: 0.491
   URL: https://learn.microsoft.com/en-us/office/vba/api/overview/language-reference
   Description: VBA programmin