In [None]:
# Signal that this cell executed.
print("Setup complete.")

# Package Refactor Demo

## Learning Objectives
- Use AI to analyze and improve existing code structure
- Implement systematic refactoring with AI assistance
- Create maintainable package architectures
- Build automated code quality assessment tools

## The Challenge: Legacy Code Modernization

This demo shows how to use AI to refactor messy, monolithic code into clean, modular packages. We'll transform a single-file script into a well-structured Python package by working through four steps:
1. **Architecture Analysis** - Understanding current structure
2. **Modular Design** - Breaking code into logical components
3. **Quality Improvement** - Enhancing readability and maintainability
4. **Testing Strategy** - Adding comprehensive test coverage

In [None]:
# Install required packages
!pip install asksageclient pip_system_certs rich ast-decompiler tiktoken

In [None]:
# ================================
# 🔐 Cell 1 — Load secrets (Colab) + pricing + token utils
# ================================
import os, time, csv
from typing import Optional, Dict
import tiktoken

from google.colab import userdata

def _read_secret(name: str) -> Optional[str]:
    """Return the Colab secret *name*, or ``None`` when it is not defined.

    ``userdata.get`` raises ``SecretNotFoundError`` for an undefined secret;
    converting that to ``None`` lets optional secrets stay optional and lets
    the checks below report required ones with a clear message.
    """
    try:
        return userdata.get(name)
    except userdata.SecretNotFoundError:
        return None


ASKSAGE_API_KEY = _read_secret("ASKSAGE_API_KEY")
ASKSAGE_BASE_URL = _read_secret("ASKSAGE_BASE_URL")  # optional
ASKSAGE_EMAIL = _read_secret("ASKSAGE_EMAIL")

# Explicit raises instead of `assert`: assertions are stripped under `-O`.
if not ASKSAGE_API_KEY:
    raise RuntimeError("ASKSAGE_API_KEY not provided.")
if not ASKSAGE_EMAIL:
    raise RuntimeError("ASKSAGE_EMAIL not provided.")

print("✓ Secrets loaded")
print("  • EMAIL:", ASKSAGE_EMAIL)
print("  • BASE URL:", ASKSAGE_BASE_URL or "(default)")

# Price table keyed by model name: USD charged per 1,000,000 tokens,
# with separate rates for input and output tokens.
PRICES_PER_M = {
    "gpt-5": dict(input_per_m=1.25, output_per_m=10.00),
    "gpt-5-mini": dict(input_per_m=0.25, output_per_m=2.00),
}

# Shared tokenizer using the "o200k_base" encoding, created once per session.
enc = tiktoken.get_encoding("o200k_base")


def count_tokens(text: str) -> int:
    """Return the number of tokens ``enc`` produces for *text*.

    Falsy input (``None`` or ``""``) is encoded as the empty string.
    """
    normalized = text if text else ""
    token_ids = enc.encode(normalized)
    return len(token_ids)

def cost_usd(
    model: str,
    input_tokens: int,
    output_tokens: int,
    prices: Optional[Dict[str, Dict[str, float]]] = None,
) -> float:
    """Return the USD cost of a request for the given token counts.

    Args:
        model: Key into the price table.
        input_tokens: Number of prompt tokens.
        output_tokens: Number of completion tokens.
        prices: Optional price table with the same layout as
            ``PRICES_PER_M`` (per-million ``input_per_m`` / ``output_per_m``
            rates per model). Defaults to ``PRICES_PER_M``.

    Returns:
        Input cost plus output cost, in USD.

    Raises:
        ValueError: If *model* is not present in the price table.
    """
    # `is None` (not truthiness) so an explicitly empty table is honoured.
    table = PRICES_PER_M if prices is None else prices
    try:
        rates = table[model]
    except KeyError:
        raise ValueError(f"Unknown model: {model}") from None
    return (input_tokens / 1_000_000) * rates["input_per_m"] + (output_tokens / 1_000_000) * rates["output_per_m"]

In [None]:
# ================================
# 🔧 Cell 2 — Import bootcamp_common and set up AskSage client
# ================================
import sys

# Make the directory three levels up (relative to the working directory)
# importable so that bootcamp_common can be found.
sys.path.append("../../../")

from bootcamp_common.ask_sage import AskSageClient

# Build the AskSage client from the secrets loaded in Cell 1.
client = AskSageClient(api_key=ASKSAGE_API_KEY, base_url=ASKSAGE_BASE_URL)

print("✓ AskSage client initialized")

In [None]:
import ast
import inspect
import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import openai
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from rich.tree import Tree

# Single rich console shared by the cells below for styled output.
console = Console()
print("🔄 Package refactor system loading...")

## Legacy Code Example: Monolithic Data Processor

In [None]:
# Deliberately flawed sample script, kept as a string so the notebook can
# display it and feed it to the analyzer; the cells shown here never execute
# it. The comments inside the string flag the problems the refactor targets,
# so the text must stay exactly as written.
legacy_code = '''
import json
import csv
import requests
import sqlite3
from datetime import datetime
import pandas as pd

def process_data_files():
    # Configuration - should be in config file
    API_URL = "https://api.example.com/data"
    DB_PATH = "data.db"
    
    # Initialize database - mixed concerns
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS processed_data 
                     (id INTEGER PRIMARY KEY, data TEXT, timestamp TEXT)""")
    
    # Fetch data from API - no error handling
    response = requests.get(API_URL)
    api_data = response.json()
    
    # Process CSV files - hardcoded paths
    csv_files = ["data1.csv", "data2.csv", "data3.csv"]
    all_data = []
    
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            # Data cleaning - should be separate function
            df = df.dropna()
            df['processed_date'] = datetime.now().isoformat()
            
            # Transform data - business logic mixed with I/O
            for _, row in df.iterrows():
                processed_row = {
                    'id': row.get('id', 0),
                    'value': float(row.get('value', 0)) * 1.1,  # Magic number
                    'category': row.get('category', 'unknown').upper(),
                    'timestamp': row['processed_date']
                }
                all_data.append(processed_row)
                
                # Save to database - inefficient
                cursor.execute("INSERT INTO processed_data VALUES (?, ?, ?)",
                             (processed_row['id'], 
                              json.dumps(processed_row), 
                              processed_row['timestamp']))
        except Exception as e:
            print(f"Error processing {file}: {e}")  # Poor error handling
    
    # Generate report - should be separate module
    report = {
        'total_records': len(all_data),
        'categories': list(set([d['category'] for d in all_data])),
        'avg_value': sum([d['value'] for d in all_data]) / len(all_data),
        'generated_at': datetime.now().isoformat()
    }
    
    # Save report - hardcoded filename
    with open('report.json', 'w') as f:
        json.dump(report, f, indent=2)
    
    conn.commit()
    conn.close()
    
    return report

if __name__ == "__main__":
    result = process_data_files()
    print(f"Processed {result['total_records']} records")
'''

# Render the legacy source in a red-bordered, line-numbered panel.
console.print("📄 [bold red]Legacy Code Example Loaded[/bold red]")
syntax = Syntax(legacy_code, "python", theme="monokai", line_numbers=True)
legacy_panel = Panel(
    syntax,
    title="Legacy: monolithic_processor.py",
    border_style="red",
)
console.print(legacy_panel)
print("\n🔍 This code has multiple issues that need refactoring!")

## AI-Powered Code Analyzer

In [None]:
@dataclass
class CodeAnalysis:
    """Structured outcome of analyzing one piece of source code.

    The field order defines the positional constructor signature, so it is
    kept stable.
    """

    # Descriptions of the problems found in the code.
    issues: List[str]
    # Complexity rating of the analyzed code.
    complexity_score: float
    # Names of the modules the code should be split into.
    suggested_modules: List[str]
    # Urgency label for the refactor (e.g. "high").
    refactor_priority: str
    # Higher-level design advice accompanying the module breakdown.
    architectural_recommendations: List[str]

class CodeRefactorAgent:
    """AI-powered code analysis and refactoring agent.

    Uses the OpenAI chat-completions API when ``OPENAI_API_KEY`` is set and
    the client can be created. In every other case -- no key, client
    construction failure, a failed request, or an empty reply -- the agent
    substitutes canned mock responses so the demo keeps working.

    Attributes:
        model: Name of the chat model sent with each API request.
        client: The ``openai.OpenAI`` client, or ``None`` in mock mode.
        has_api: Whether ``client`` is available for requests.
        refactor_history: Empty list created at construction; no method of
            this class writes to it.
    """

    DEFAULT_MODEL = "gpt-4"

    def __init__(self, model: str = DEFAULT_MODEL):
        """Create the agent and configure its API client.

        Args:
            model: Chat model to request; defaults to ``"gpt-4"``.
        """
        self.model = model
        self.setup_client()
        self.refactor_history = []

    def setup_client(self):
        """Create the OpenAI client, or switch to mock mode.

        Always sets both ``self.client`` and ``self.has_api`` so neither
        attribute is ever missing, whichever branch is taken.
        """
        # Start from the mock-mode state; upgraded below on success.
        self.client = None
        self.has_api = False

        if not os.getenv('OPENAI_API_KEY'):
            console.print("💡 No API key found, using mock responses")
            return

        try:
            self.client = openai.OpenAI()
        except Exception as e:
            # Best-effort by design: any construction failure means mock mode.
            console.print(f"⚠️ Using mock responses: {e}")
            return

        self.has_api = True
        console.print("✅ OpenAI client configured")

    def _complete(self, prompt: str, max_tokens: int, temperature: float) -> Optional[str]:
        """Send *prompt* as a single user message and return the reply text.

        Returns:
            The reply text, or ``None`` in mock mode, when the request fails,
            or when the reply is empty -- callers then use their mock text.
        """
        if not self.has_api:
            return None
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
            )
        except openai.OpenAIError as e:
            console.print(f"⚠️ API request failed, using mock response: {e}")
            return None
        return response.choices[0].message.content or None

    @staticmethod
    def _parse_complexity(text: str, default: float) -> float:
        """Extract the first ``N/10`` score from *text*.

        Returns *default* when no score is found or it lies outside 1-10.
        """
        match = re.search(r"(\d+(?:\.\d+)?)\s*/\s*10\b", text)
        if match is None:
            return default
        score = float(match.group(1))
        return score if 1.0 <= score <= 10.0 else default

    @staticmethod
    def _parse_priority(text: str, default: str) -> str:
        """Extract the refactor priority (high/medium/low) from *text*.

        Only lines mentioning "priority" are considered, and a line is used
        only when it names exactly one level; a line that merely echoes the
        "high/medium/low" choices is ambiguous and is skipped. Returns
        *default* when no usable line exists.
        """
        for line in text.splitlines():
            lowered = line.lower()
            if "priority" not in lowered:
                continue
            levels = set(re.findall(r"\b(high|medium|low)\b", lowered))
            if len(levels) == 1:
                return levels.pop()
        return default

    def analyze_code_structure(self, code: str) -> "CodeAnalysis":
        """Analyze code and identify refactoring opportunities.

        Args:
            code: Python source to analyze; embedded verbatim in the prompt.

        Returns:
            A ``CodeAnalysis`` whose complexity score and priority are parsed
            from the model (or mock) reply, falling back to 8.0 and "high".
            The issue, module and recommendation lists are fixed baselines:
            free-form replies are not parsed into lists.
        """
        
        analysis_prompt = f"""Analyze this Python code and identify refactoring opportunities:

{code}

Provide analysis for:
1. Code quality issues (mixing concerns, hardcoded values, poor error handling)
2. Suggested module breakdown
3. Architectural improvements
4. Complexity assessment (1-10 scale)
5. Refactoring priority (high/medium/low)

Focus on practical, implementable suggestions."""
        
        analysis_text = self._complete(analysis_prompt, max_tokens=800, temperature=0.3)
        if analysis_text is None:
            # Mock analysis
            analysis_text = """Code Quality Issues:
1. Mixed concerns - database, API, file processing in one function
2. Hardcoded configuration values
3. Poor error handling with generic exception catching
4. Magic numbers (1.1 multiplier)
5. Inefficient database operations

Suggested Modules:
- config: Configuration management
- data_sources: API and file readers
- processors: Data transformation logic
- storage: Database operations
- reports: Report generation

Complexity Score: 8/10 (high)
Priority: High - monolithic function doing too much"""
        
        return CodeAnalysis(
            issues=[
                "Mixed concerns in single function",
                "Hardcoded configuration values",
                "Poor error handling",
                "Magic numbers",
                "Inefficient database operations"
            ],
            complexity_score=self._parse_complexity(analysis_text, default=8.0),
            suggested_modules=["config", "data_sources", "processors", "storage", "reports"],
            refactor_priority=self._parse_priority(analysis_text, default="high"),
            architectural_recommendations=[
                "Separate concerns into distinct modules",
                "Use dependency injection for configuration",
                "Implement proper error handling with custom exceptions",
                "Add data validation layer",
                "Create batch database operations"
            ]
        )
    
    def generate_refactored_module(self, module_name: str, original_code: str, requirements: str) -> str:
        """Generate a specific refactored module.

        Args:
            module_name: Name of the module to produce (e.g. ``"config"``).
            original_code: Source to extract from; only its first 1000
                characters are placed in the prompt.
            requirements: Short description of the module's responsibility.

        Returns:
            The model's reply, or canned module source when no reply is
            available.
        """
        
        refactor_prompt = f"""Create a clean, well-structured Python module named '{module_name}' 
by extracting relevant functionality from this code:

{original_code[:1000]}...

Requirements for {module_name} module:
{requirements}

Generate code that:
1. Follows single responsibility principle
2. Has proper error handling
3. Includes type hints
4. Has clear documentation
5. Is testable and modular

Return only the Python module code."""
        
        module_code = self._complete(refactor_prompt, max_tokens=1000, temperature=0.2)
        if module_code is None:
            module_code = self._mock_module(module_name)
        return module_code

    @staticmethod
    def _mock_module(module_name: str) -> str:
        """Return canned source for *module_name*, used when no reply exists."""
        if module_name == "config":
            # csv_files lists all three inputs the legacy script processes.
            return '''"""Configuration management module."""
from pathlib import Path
from typing import Dict, Any
import yaml

class Config:
    """Application configuration handler."""
    
    def __init__(self, config_path: Path = None):
        self.config_path = config_path or Path("config.yaml")
        self._config = self._load_config()
    
    def _load_config(self) -> Dict[str, Any]:
        """Load configuration from file."""
        if self.config_path.exists():
            with open(self.config_path) as f:
                return yaml.safe_load(f)
        return self._default_config()
    
    def _default_config(self) -> Dict[str, Any]:
        return {
            "api_url": "https://api.example.com/data",
            "db_path": "data.db", 
            "csv_files": ["data1.csv", "data2.csv", "data3.csv"],
            "multiplier": 1.1
        }
    
    def get(self, key: str, default=None):
        return self._config.get(key, default)'''
        if module_name == "processors":
            return '''"""Data processing module."""
from typing import List, Dict, Any
from datetime import datetime
import pandas as pd

class DataProcessor:
    """Handle data transformation and cleaning."""
    
    def __init__(self, multiplier: float = 1.1):
        self.multiplier = multiplier
    
    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean and prepare DataFrame."""
        df = df.dropna()
        df['processed_date'] = datetime.now().isoformat()
        return df
    
    def transform_row(self, row: pd.Series) -> Dict[str, Any]:
        """Transform a single row of data."""
        return {
            'id': row.get('id', 0),
            'value': float(row.get('value', 0)) * self.multiplier,
            'category': row.get('category', 'unknown').upper(),
            'timestamp': row['processed_date']
        }'''
        return f'"""Generated {module_name} module."""\n# Implementation for {module_name}'
    
    def create_package_structure(
        self,
        analysis: "CodeAnalysis",
        original_code: Optional[str] = None,
    ) -> Dict[str, str]:
        """Generate complete refactored package structure.

        Args:
            analysis: Analysis whose ``suggested_modules`` drive generation.
                Suggested modules without a known requirement are skipped.
            original_code: Source to refactor. When omitted, the notebook's
                module-level ``legacy_code`` is used, as before.

        Returns:
            Mapping of file name to generated source, including ``main.py``
            and ``__init__.py``.
        """
        if original_code is None:
            # Backward-compatible default: the notebook-level sample source.
            original_code = legacy_code
        
        package_structure = {}
        
        # Module requirements
        module_requirements = {
            "config": "Configuration management and settings",
            "data_sources": "API clients and file readers",
            "processors": "Data transformation and cleaning logic",
            "storage": "Database operations and persistence",
            "reports": "Report generation and formatting"
        }
        
        console.print("\n🔧 [bold blue]Generating Refactored Package Structure[/bold blue]")
        
        for module in analysis.suggested_modules:
            if module in module_requirements:
                console.print(f"[yellow]Generating {module} module...[/yellow]")
                requirements = module_requirements[module]
                module_code = self.generate_refactored_module(module, original_code, requirements)
                package_structure[f"{module}.py"] = module_code
        
        # Generate main orchestrator
        package_structure["main.py"] = self._generate_main_module(analysis)
        package_structure["__init__.py"] = '"""Data processing package."""\n__version__ = "1.0.0"'
        
        return package_structure
    
    def _generate_main_module(self, analysis: "CodeAnalysis") -> str:
        """Generate main orchestrator module.

        The returned source is a fixed template; *analysis* is accepted for
        interface symmetry but not consulted.
        """
        return '''"""Main application orchestrator."""
from pathlib import Path
from .config import Config
from .processors import DataProcessor
from .data_sources import CSVReader, APIClient
from .storage import DatabaseManager
from .reports import ReportGenerator

class DataProcessingOrchestrator:
    """Orchestrate the data processing workflow."""
    
    def __init__(self, config_path: Path = None):
        self.config = Config(config_path)
        self.processor = DataProcessor(self.config.get("multiplier"))
        self.db = DatabaseManager(self.config.get("db_path"))
        self.report_gen = ReportGenerator()
    
    def run(self):
        """Execute the complete data processing pipeline."""
        # Implementation would orchestrate all modules
        pass'''

# Create the single agent instance that the demo cells below share.
refactor_agent = CodeRefactorAgent()
print("🤖 Code refactor agent ready!")

## Demo: Refactor Legacy Code

In [None]:
# Demo driver: analyze the legacy source, then generate the refactored package.
console.print("\n🔍 [bold blue]Analyzing Legacy Code Structure[/bold blue]")

# --- Step 1: analysis ------------------------------------------------------
analysis = refactor_agent.analyze_code_structure(legacy_code)

# Report the findings section by section.
console.print("\n[red]Issues Identified:[/red]")
for issue in analysis.issues:
    console.print(f"  • {issue}")

console.print(f"\n[yellow]Complexity Score:[/yellow] {analysis.complexity_score}/10")
console.print(f"[yellow]Refactor Priority:[/yellow] {analysis.refactor_priority.upper()}")

console.print("\n[green]Suggested Module Structure:[/green]")
for module in analysis.suggested_modules:
    console.print(f"  📦 {module}.py")

console.print("\n[cyan]Architectural Recommendations:[/cyan]")
for rec in analysis.architectural_recommendations:
    console.print(f"  → {rec}")

# --- Step 2: package generation --------------------------------------------
console.print("\n🏗️ [bold blue]Generating Refactored Package[/bold blue]")
refactored_package = refactor_agent.create_package_structure(analysis)

# Show the generated files as a tree, in alphabetical order.
console.print("\n📁 [bold green]Refactored Package Structure:[/bold green]")
tree = Tree("📦 data_processor/")
for filename in sorted(refactored_package):
    tree.add(f"📄 {filename}")
console.print(tree)

# Preview one generated module when it is part of the package.
if "config.py" in refactored_package:
    console.print("\n[yellow]Sample Refactored Module:[/yellow]")
    config_syntax = Syntax(
        refactored_package["config.py"],
        "python",
        theme="monokai",
        line_numbers=True,
    )
    console.print(Panel(config_syntax, title="config.py", border_style="green"))

print("\n✨ Package refactoring complete!")

## Key Takeaways: AI-Powered Refactoring

### 🎯 **Systematic Refactoring Process**

1. **Analysis First**: Understand current structure and identify issues
2. **Modular Design**: Break monoliths into single-responsibility modules
3. **Dependency Management**: Design clear interfaces between components
4. **Quality Gates**: Maintain or improve code quality during refactoring
5. **Incremental Changes**: Refactor in manageable, testable chunks

### 🔧 **Benefits of AI-Assisted Refactoring**

- **Objective Analysis**: AI identifies issues humans might miss
- **Consistent Patterns**: Applies architectural patterns uniformly
- **Speed**: Rapidly generates structured alternatives
- **Best Practices**: Incorporates modern coding standards
- **Documentation**: Auto-generates documentation and comments

### ⚠️ **Important Considerations**

- **Human Review Required**: AI suggestions need expert validation
- **Test Coverage**: Ensure refactored code maintains functionality
- **Gradual Migration**: Plan incremental rollout strategy
- **Performance Impact**: Validate that refactoring doesn't hurt performance
- **Team Alignment**: Ensure architectural decisions align with team standards

## Next: Golden Test Demo

Ready to see how AI can help create comprehensive test suites for legacy code?