## Documentation Quality Analyzer

This script analyzes technical documentation quality using AI and readability metrics. It's designed for content teams to evaluate documentation against key quality dimensions and Microsoft Style Guide principles.

### Technologies Used

1. **Google Gemini API** (`gemini-1.5-flash`)
   - AI-powered analysis of documentation content
   - Evaluates readability, structure, completeness, and style compliance
   - Returns its analysis as structured JSON

2. **Text Processing Libraries**
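   - `markdown`: Converts Markdown to HTML, which is then stripped down to plain text
   - `BeautifulSoup`: HTML content cleaning (imported; tag stripping currently uses a regular expression)
   - `frontmatter`: Handles YAML metadata in Markdown files
   - `textstat`: Readability metrics (imported but not yet used; reserved for a future extension)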

3. **Core Python Modules**
   - `re`: Pattern matching for content cleaning
   - `json`: Structured report generation
   - `os`: File system operations
   - `time`: API rate limiting

### Why This Approach?

The implementation focuses on:
- **Efficiency**: 
  - Uses lightweight Gemini Flash model
  - Combines analysis dimensions in single API call
  - Implements 1-sec rate limiting for free tier
- **Accuracy**:
  - Advanced content preprocessing
  - JSON response validation/repair
  - Context-aware heading markers
- **Actionability**:
  - Structured suggestions for writers
  - Microsoft Style Guide compliance checks
  - Focus on non-technical readability
- **Scalability**:
  - Modular design for new analysis types
  - Automatic filename handling
  - Batch processing ready

### Analysis Dimensions

1. **Readability**: 
   - Jargon usage
   - Sentence complexity
   - Non-technical audience suitability

2. **Structure**:
   - Heading hierarchy
   - Logical flow
   - Paragraph length optimization

3. **Completeness**:
   - Implementation details
   - Example quality
   - Coverage depth

4. **Style Compliance**:
   - Microsoft Voice guidelines
   - Action-oriented language
   - Clarity standards

In [8]:
import os
import re
import json
import textstat
import markdown
import google.generativeai as genai
from bs4 import BeautifulSoup
import frontmatter
from urllib.parse import urlparse
import time

# Prefer the GEMINI_API_KEY environment variable so the real secret never has
# to be committed with the notebook; the placeholder is only a fallback and
# must be replaced (or the variable set) before the API can be called.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "Secret...")
genai.configure(api_key=GEMINI_API_KEY)

# Lightweight model chosen to stay within the free tier
GEMINI_MODEL = "gemini-1.5-flash"

def preprocess_md(file_path):
    """Extract and clean the readable text of a Markdown file.

    Front matter, fenced code blocks and image references are removed, the
    remaining Markdown is rendered to HTML, headings are rewritten as
    ``HEADING_<#...>: <title>`` marker lines (one ``#`` per heading level),
    and all other tags are stripped.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file to read.

    Returns:
        The cleaned plain text, or an empty string if the file could not be
        read or processed (the error is printed, not raised).
    """
    import html  # stdlib; imported locally so this function stays self-contained

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Handle frontmatter if exists
        if content.startswith('---'):
            post = frontmatter.loads(content)
            content = post.content

        # Remove code blocks and images
        content = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
        content = re.sub(r'\!\[.*?\]\(.*?\)', '', content)

        # Render Markdown to HTML
        rendered = markdown.markdown(content)

        # Add structure markers for headings. This has to happen on the
        # <hN> tags: after rendering there are no leading '#' characters left
        # to search for, so matching '#' on the stripped text never fires.
        rendered = re.sub(
            r'<h([1-6])[^>]*>(.*?)</h\1>',
            lambda m: f"\nHEADING_{'#' * int(m.group(1))}: {m.group(2).strip()}\n",
            rendered,
            flags=re.DOTALL | re.IGNORECASE,
        )

        # Clean remaining HTML tags, then decode entities the renderer
        # produced (&amp;, &lt;, ...). Unescaping must come after tag
        # stripping so literal "<...>" text is not mistaken for a tag.
        text = re.sub(r'<[^>]+>', '', rendered)
        text = html.unescape(text)

        # Collapse excessive blank lines
        text = re.sub(r'\n{3,}', '\n\n', text).strip()

        return text

    except Exception as e:
        # Best-effort: callers treat an empty string as "nothing to analyze".
        print(f"Error processing {file_path}: {str(e)}")
        return ""

def url_to_filename(url):
    """Create a filesystem-safe report filename from a URL.

    If the URL path contains ``articles/<digits>``, the numeric article ID is
    used. Otherwise the last path segment is used, falling back to the host
    name and finally to ``"document"`` when the path is empty. Any character
    outside ``A-Z a-z 0-9 . _ -`` in the fallback stem is replaced by ``_`` so
    the result is valid on common filesystems.

    Args:
        url: The document URL.

    Returns:
        A filename of the form ``analysis_<stem>.json``.
    """
    parsed = urlparse(url)
    path = parsed.path.strip('/')

    # Extract article ID if exists
    article_id = re.search(r'articles/(\d+)', path)
    if article_id:
        return f"analysis_{article_id.group(1)}.json"

    # Fallback to last path segment; never produce an empty stem.
    stem = path.rsplit('/', 1)[-1] or parsed.netloc or "document"
    # Replace characters that are unsafe in filenames (':', '%', spaces, ...).
    stem = re.sub(r'[^A-Za-z0-9._-]+', '_', stem)
    return f"analysis_{stem}.json"

class DocumentationAnalyzer:
    """Analyze documentation text with a Gemini model and save JSON reports."""

    # Minimum number of seconds between two API attempts.
    MIN_CALL_INTERVAL = 1.0
    # Maximum number of content characters embedded in the prompt.
    MAX_CONTENT_CHARS = 30000
    # Texts shorter than this are rejected as not worth analyzing.
    MIN_TEXT_LENGTH = 100

    def __init__(self, model=GEMINI_MODEL):
        """Create the model client.

        Args:
            model: Name of the Gemini model to instantiate.
        """
        self.model = genai.GenerativeModel(model)
        # time.time() of the most recent API attempt; 0 means "no call yet".
        self.last_call_time = 0

    def _clean_json_response(self, response_text):
        """Parse the model's reply as JSON, repairing common defects if needed.

        Surrounding whitespace and Markdown code fences are removed first. If
        the text still does not parse, repairs are applied one at a time
        (least invasive first) and parsing is retried after each, so a reply
        that only has a trailing comma is never touched by the key-quoting
        repair.

        Args:
            response_text: Raw text returned by the model.

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: If the text cannot be parsed even after all
                repairs have been applied.
        """
        cleaned = response_text.strip()

        # Remove markdown code block markers if present
        if cleaned.startswith("```"):
            cleaned = re.sub(r'^```(?:json)?', '', cleaned, count=1, flags=re.IGNORECASE)
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError as exc:
            print("Fixing malformed JSON response...")
            last_error = exc

        repairs = (
            # Trailing commas before a closing brace/bracket
            (r',\s*([}\]])', r'\1'),
            # Unquoted keys: only identifiers directly after '{' or ',' are
            # quoted, so "word:" inside string values (e.g. "https://...")
            # is left alone.
            (r'([{,]\s*)([A-Za-z_]\w*)\s*:', r'\1"\2":'),
        )
        for pattern, replacement in repairs:
            cleaned = re.sub(pattern, replacement, cleaned)
            try:
                return json.loads(cleaned)
            except json.JSONDecodeError as exc:
                last_error = exc

        raise last_error

    def _llm_analysis(self, prompt, text):
        """Run the combined Gemini analysis with rate limiting.

        Args:
            prompt: Currently unused; kept for interface compatibility. The
                prompt is built internally so that all four dimensions are
                covered by one API call.
            text: Cleaned document text; only the first ``MAX_CONTENT_CHARS``
                characters are sent.

        Returns:
            The parsed JSON analysis, or ``{"error": <message>}`` if the API
            call or the response parsing failed.
        """
        # Keep consecutive API attempts at least MIN_CALL_INTERVAL apart
        # (1 call/second, i.e. at most 60 requests per minute).
        elapsed = time.time() - self.last_call_time
        if elapsed < self.MIN_CALL_INTERVAL:
            time.sleep(self.MIN_CALL_INTERVAL - elapsed)

        try:
            # Combine analysis types to reduce API calls
            combined_prompt = f"""Analyze documentation content for the following aspects:
1. READABILITY (for non-technical marketers): Jargon usage, sentence complexity
2. STRUCTURE: Heading organization, logical flow, paragraph length
3. COMPLETENESS: Implementation details, example quality
4. STYLE: Microsoft Style Guide compliance (voice, clarity, action-oriented)

Provide JSON output with these keys:
- "readability": {{"assessment": "summary", "suggestions": ["list"]}}
- "structure": {{"assessment": "summary", "suggestions": ["list"]}}
- "completeness": {{"assessment": "summary", "suggestions": ["list"]}}
- "style": {{"assessment": "summary", "suggestions": ["list"]}}

CONTENT:
{text[:self.MAX_CONTENT_CHARS]}"""  # Stay within context limits

            response = self.model.generate_content(
                combined_prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=0.2,
                    max_output_tokens=2000,
                    response_mime_type="application/json"
                )
            )

            return self._clean_json_response(response.text)

        except Exception as e:
            # Best-effort boundary: report the failure inside the result
            # instead of aborting the whole run.
            print(f"Gemini API error: {str(e)}")
            return {"error": str(e)}

        finally:
            # Record the attempt even when it failed, so that a failing call
            # is still followed by the rate-limit pause.
            self.last_call_time = time.time()

    def analyze(self, url, file_path):
        """Preprocess a Markdown file and analyze its text.

        Args:
            url: Source URL of the document; echoed in the result.
            file_path: Path of the Markdown file to analyze.

        Returns:
            ``{"url": ..., "results": ...}`` on success, or
            ``{"url": ..., "error": ...}`` if the file yielded fewer than
            ``MIN_TEXT_LENGTH`` characters of text.
        """
        text = preprocess_md(file_path)
        if not text or len(text) < self.MIN_TEXT_LENGTH:
            # Include the URL so a saved error report still identifies its document.
            return {
                "url": url,
                "error": "Insufficient text content for analysis"
            }

        return {
            "url": url,
            "results": self._llm_analysis(None, text)
        }

    def save_report(self, results, output_file):
        """Write ``results`` to ``output_file`` as indented UTF-8 JSON.

        Args:
            results: JSON-serializable report data.
            output_file: Destination path; an existing file is overwritten.
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"Report saved to: {output_file}")

def main():
    """Analyze the two predefined documents and write one report for each.

    Reports are written to ``./Outputs_Task1`` under a name derived from the
    document URL.
    """
    doc_analyzer = DocumentationAnalyzer()

    # (local Markdown file, source URL) pairs, processed in this order
    documents = (
        (
            "preprocessed_data/preprocessed_web_sdk_overview.md",
            "https://developers.moengage.com/hc/en-us/articles/360061108111-Web-SDK-Overview#h_01H9G1YMFWVN61PKBDN0MGAJWG",
        ),
        (
            "preprocessed_data/preprocessed_getting_started_with_react_native_sdk.md",
            "https://developers.moengage.com/hc/en-us/articles/22105190881044-Getting-Started-with-React-Native-SDK#h_01HEJAHP5W49AASNSHF5614AHP",
        ),
    )

    reports_dir = "./Outputs_Task1"
    os.makedirs(reports_dir, exist_ok=True)

    print("Starting analysis...\n")

    for position, (md_path, source_url) in enumerate(documents):
        # Every document after the first is separated by a blank line.
        lead = "\n" if position else ""
        print(f"{lead}Analyzing {md_path}...")
        report = doc_analyzer.analyze(source_url, md_path)
        target = os.path.join(reports_dir, url_to_filename(source_url))
        doc_analyzer.save_report(report, target)

    print("\nAnalysis complete!")

if __name__ == "__main__":
    main()

Starting analysis...

Analyzing preprocessed_data/preprocessed_web_sdk_overview.md...
Report saved to: ./Outputs_Task1\analysis_360061108111.json

Analyzing preprocessed_data/preprocessed_getting_started_with_react_native_sdk.md...
Report saved to: ./Outputs_Task1\analysis_22105190881044.json

Analysis complete!
