# In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import re
import random
import logging
from urllib.parse import urljoin
import markdown
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from typing import List, Dict, Optional, Tuple

# Root logging setup: every record at INFO or above goes both to a UTF-8
# encoded log file and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler("ai_course_generator.log", encoding='utf-8'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger used throughout this module.
logger = logging.getLogger('AI Course Generator')

class AIContentEnhancer:
    """Generate educational text with a pretrained GPT-2 model.

    On construction the ``gpt2-large`` tokenizer and model are loaded, the
    model is moved to CUDA when available (CPU otherwise), and one marker
    token per supported content type is registered with the tokenizer.
    """

    # (pattern, replacement) pairs applied in order by ``_post_process`` to
    # tidy whitespace and spacing around punctuation.
    _CLEANUP_RULES = (
        (r'\s+', ' '),
        (r' ,', ','),
        (r' \.', '.'),
        (r'\( ', '('),
        (r' \)', ')'),
    )

    def __init__(self):
        self.model_name = "gpt2-large"
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
        self.model = GPT2LMHeadModel.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Marker tokens prepended to a prompt to signal the content type.
        self.special_tokens = {
            'explanation': '<|explanation|>',
            'example': '<|example|>',
            'exercise': '<|exercise|>',
            'summary': '<|summary|>'
        }
        self.tokenizer.add_special_tokens({'additional_special_tokens': list(self.special_tokens.values())})
        # The embedding matrix must grow to cover the newly added tokens.
        self.model.resize_token_embeddings(len(self.tokenizer))

        logger.info("AI Content Enhancer initialized with GPT-2 model")

    def enhance_content(self, prompt: str, content_type: str, max_length: int = 200) -> str:
        """Generate text that continues ``prompt`` for the given content type.

        Args:
            prompt: Instruction or seed text to continue.
            content_type: Key of ``self.special_tokens`` ('explanation',
                'example', 'exercise' or 'summary').
            max_length: Maximum number of tokens to generate on top of the
                prompt. Prompts that would not leave room for this many
                tokens in the model's context window are truncated.

        Returns:
            The decoded, cleaned-up model output (prompt followed by its
            continuation). If anything fails -- including an unknown
            ``content_type`` -- the error is logged and the caller's
            original ``prompt`` is returned unchanged.
        """
        try:
            # Keep ``prompt`` untouched so the fallback below never leaks the
            # marker token into user-facing text.
            tagged_prompt = f"{self.special_tokens[content_type]}{prompt}"
            inputs = self.tokenizer.encode(tagged_prompt, return_tensors="pt")

            # The model can only attend to a fixed number of positions.
            # Reserve room for the requested continuation and cut the prompt
            # to fit; callers routinely pass whole scraped pages.
            context_size = getattr(self.model.config, "n_positions", 1024)
            new_tokens = max(1, min(max_length, context_size - 1))
            inputs = inputs[:, :context_size - new_tokens].to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    # ``max_length`` in ``generate`` counts prompt tokens too.
                    max_length=inputs.shape[1] + new_tokens,
                    num_return_sequences=1,
                    no_repeat_ngram_size=2,
                    top_k=50,
                    top_p=0.95,
                    temperature=0.7,
                    do_sample=True,
                    # GPT-2 defines no pad token; reuse end-of-text.
                    pad_token_id=self.tokenizer.eos_token_id
                )

            generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return self._post_process(generated)
        except Exception as e:
            logger.error(f"Error in AI enhancement: {str(e)}")
            return prompt

    def _post_process(self, text: str) -> str:
        """Trim a dangling sentence fragment and normalize spacing.

        Everything after the last '.', '!' or '?' is dropped. When the text
        contains no sentence terminator at all, it is kept as-is instead of
        being reduced to an empty string.
        """
        trimmed = re.sub(r'[^.!?]+$', '', text)
        if trimmed.strip():
            text = trimmed

        for pattern, repl in self._CLEANUP_RULES:
            text = re.sub(pattern, repl, text)
        return text.strip()

class KnowledgeScraper:
    """Scrape tutorial pages from a fixed set of educational websites.

    Each entry of ``self.sources`` describes one site: its base URL, the
    topic landing pages to start from (with how many linked pages to follow),
    CSS selectors for the main text and code samples, and URL substrings that
    must not be visited. ``self.delays`` holds the (min, max) pause in
    seconds between page requests for each site.
    """
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9"
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

        # Configured sources with updated selectors
        self.sources = {
            "w3schools": {
                "base_url": "https://www.w3schools.com/",
                "topics": {
                    "python": {"url": "python/default.asp", "depth": 3},
                    "javascript": {"url": "js/default.asp", "depth": 3},
                    "sql": {"url": "sql/default.asp", "depth": 2}
                },
                "content_selectors": ["#main", ".w3-example"],
                "code_selectors": [".w3-code"],
                "avoid_urls": ["tryit.asp", "exercise.asp"]
            },
            "geeksforgeeks": {
                "base_url": "https://www.geeksforgeeks.org/",
                "topics": {
                    "python": {"url": "python-programming-language/", "depth": 3},
                    "data structures": {"url": "data-structures/", "depth": 2},
                    "algorithms": {"url": "fundamentals-of-algorithms/", "depth": 2}
                },
                "content_selectors": [".content", "article"],
                "code_selectors": [".code-container pre"],
                "avoid_urls": ["practice/", "quiz/"]
            },
            "realpython": {
                "base_url": "https://realpython.com/",
                "topics": {
                    "python": {"url": "tutorials/", "depth": 2},
                    "django": {"url": "django/", "depth": 2}
                },
                "content_selectors": [".article-body"],
                "code_selectors": [".highlight pre"],
                "avoid_urls": ["/courses/", "/quiz/"]
            }
        }

        self.delays = {
            "w3schools": (1, 2),
            "geeksforgeeks": (2, 3),
            "realpython": (2, 4)
        }

    def scrape_knowledge_base(self, output_file: str = "knowledge_base.json") -> List[Dict]:
        """Scrape every configured topic and save the result as JSON.

        For each topic the landing page is fetched, then up to ``depth``
        links found on it are followed, pausing a random interval between
        requests. Failures are logged and skipped so one bad page or topic
        does not abort the run.

        Args:
            output_file: Path of the JSON file the collected items are
                written to.

        Returns:
            The list of collected knowledge items (also written to disk).
        """
        knowledge = []

        for source_name, config in self.sources.items():
            for topic, topic_config in config["topics"].items():
                logger.info(f"Scraping {topic} from {source_name}")

                try:
                    base_url = urljoin(config["base_url"], topic_config["url"])
                    response = self.session.get(base_url, timeout=15)
                    response.raise_for_status()

                    soup = BeautifulSoup(response.text, "html.parser")

                    # Process main page
                    main_content = self._extract_content(soup, source_name)
                    if main_content:
                        knowledge.append(
                            self._build_item(topic, source_name, base_url, main_content)
                        )

                    # Find and process additional pages
                    page_links = self._find_links(soup, base_url, source_name)
                    for link in page_links[:topic_config["depth"]]:
                        # Be polite: wait before every follow-up request.
                        time.sleep(random.uniform(*self.delays[source_name]))

                        try:
                            page_content = self._scrape_page(link, source_name)
                            if page_content:
                                knowledge.append(
                                    self._build_item(topic, source_name, link, page_content)
                                )
                        except Exception as e:
                            logger.error(f"Error scraping {link}: {str(e)}")

                except Exception as e:
                    logger.error(f"Error scraping {topic} from {source_name}: {str(e)}")

        # Save the knowledge base
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(knowledge, f, indent=2, ensure_ascii=False)

        logger.info(f"Scraped {len(knowledge)} knowledge items")
        return knowledge

    def _build_item(self, topic: str, source_name: str, url: str, page: Dict) -> Dict:
        """Turn an ``_extract_content`` result into a knowledge-base record."""
        return {
            "topic": topic,
            "title": page["title"],
            "content": page["text"],
            "code_examples": page["code"],
            "source": source_name,
            "url": url,
            "level": self._determine_level(topic, source_name)
        }

    def _extract_content(self, soup: "BeautifulSoup", source_name: str) -> Optional[Dict]:
        """Pull the main text, code samples and title out of a parsed page.

        Returns:
            A dict with keys ``text``, ``code`` and ``title``, or ``None``
            when none of the source's content selectors match the page.
        """
        content_selectors = self.sources[source_name]["content_selectors"]
        code_selectors = self.sources[source_name]["code_selectors"]

        # The first selector that matches wins.
        main_content = None
        for selector in content_selectors:
            main_content = soup.select_one(selector)
            if main_content:
                break

        if not main_content:
            return None

        text = self._clean_text(main_content.get_text())

        # Code samples are collected from the whole page; empty ones are dropped.
        code_examples = []
        for selector in code_selectors:
            for code_block in soup.select(selector):
                code = self._clean_code(code_block.get_text())
                if code:
                    code_examples.append(code)

        return {"text": text, "code": code_examples, "title": self._get_title(soup)}

    def _scrape_page(self, url: str, source_name: str) -> Optional[Dict]:
        """Fetch ``url`` and extract its content.

        Returns ``None`` for URLs on the source's avoid list or pages without
        recognisable content. HTTP errors propagate to the caller.
        """
        if any(p in url for p in self.sources[source_name]["avoid_urls"]):
            return None

        response = self.session.get(url, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        return self._extract_content(soup, source_name)

    @staticmethod
    def _bare_host(netloc: str) -> str:
        """Lower-case a network location and drop a leading ``www.``."""
        host = netloc.lower()
        return host[4:] if host.startswith("www.") else host

    def _resolve_link(self, href: str, base_url: str, source_name: str) -> Optional[str]:
        """Resolve one ``href`` to an absolute, crawlable URL.

        Returns ``None`` for in-page anchors, ``javascript:`` pseudo-links,
        non-HTTP(S) schemes (``mailto:``, ``tel:`` ...), links leaving the
        source's own site, and URLs containing an avoided substring. The
        fragment is stripped so ``page#a`` and ``page#b`` count as one page.
        """
        from urllib.parse import urldefrag, urlparse

        href = href.strip()
        if not href or href.startswith("#") or href.lower().startswith("javascript:"):
            return None

        full_url = urldefrag(urljoin(base_url, href))[0]
        parsed = urlparse(full_url)
        if parsed.scheme not in ("http", "https"):
            return None

        source = self.sources[source_name]
        # The content selectors are site-specific, so off-site pages are useless.
        site_host = self._bare_host(urlparse(source["base_url"]).netloc)
        if self._bare_host(parsed.netloc) != site_host:
            return None

        if any(p in full_url for p in source["avoid_urls"]):
            return None
        return full_url

    def _find_links(self, soup: "BeautifulSoup", base_url: str, source_name: str) -> List[str]:
        """Return the unique crawlable links of a page in document order.

        Document order makes the ``[:depth]`` slice taken by the caller
        deterministic across runs. The page itself is never returned.
        """
        from urllib.parse import urldefrag

        seen = {urldefrag(base_url)[0]}
        links = []
        for a in soup.find_all("a", href=True):
            full_url = self._resolve_link(a["href"], base_url, source_name)
            if full_url is None or full_url in seen:
                continue
            seen.add(full_url)
            links.append(full_url)

        return links

    def _clean_text(self, text: str) -> str:
        """Normalize page text to a single line of space-separated words.

        Removes bracketed numeric citations such as ``[3]``, collapses all
        whitespace runs to one space and drops remaining control characters.
        """
        text = re.sub(r'\[\d+\]', '', text)  # Remove citations
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
        # Removing characters above can leave doubled spaces behind.
        return re.sub(r' {2,}', ' ', text).strip()

    def _clean_code(self, code: str) -> str:
        """Strip shell prompts and trailing whitespace from a code sample.

        A leading ``$`` only counts as a shell prompt when it is followed by
        whitespace, so code such as ``$('#id')`` or ``$var`` is left intact.
        """
        code = re.sub(r'^[ \t]*\$[ \t]+', '', code, flags=re.MULTILINE)  # Remove shell prompts
        code = re.sub(r'\s+\n', '\n', code)
        return code.strip()

    def _get_title(self, soup: "BeautifulSoup") -> str:
        """Return the cleaned ``<title>`` text, or "Untitled" when missing or empty."""
        title_tag = soup.title
        # get_text() also copes with a <title> that is empty or has nested
        # markup, where ``.string`` would be None.
        raw_title = title_tag.get_text() if title_tag else ""
        return self._clean_text(raw_title) or "Untitled"

    def _determine_level(self, topic: str, source: str) -> str:
        """Map a topic/source pair to 'beginner', 'intermediate' or 'advanced'."""
        if source == "w3schools":
            return "beginner"
        elif "data structure" in topic.lower() or "algorithm" in topic.lower():
            return "advanced"
        return "intermediate"

class CourseGenerator:
    """Build Markdown courses from the scraped knowledge base.

    Knowledge items are loaded from a JSON file, grouped into modules and
    expanded with text produced by ``AIContentEnhancer``. The result can be
    exported to Markdown or driven from a console menu.
    """

    # Topics whose code samples get their own fenced-code language tag;
    # every other topic falls back to "python".
    _FENCE_LANGUAGES = ("python", "javascript", "sql")

    def __init__(self, knowledge_file: str = "knowledge_base.json"):
        self.knowledge_file = knowledge_file
        self.knowledge = []
        self.ai_enhancer = AIContentEnhancer()
        self.load_knowledge()

        # Course templates with placeholders for AI-enhanced content
        self.templates = {
            'module': {
                'introduction': (
                    "Welcome to this {level} course on {topic}!\n\n"
                    "In this module, we'll cover:\n"
                    "- Core concepts of {topic}\n"
                    "- Practical examples\n"
                    "- Hands-on exercises\n\n"
                    "By the end, you'll be able to:\n"
                    "- Explain key {topic} concepts\n"
                    "- Write basic {topic} code\n"
                    "- Solve simple problems using {topic}"
                ),
                'section': {
                    'explanation': (
                        "### {title}\n\n"
                        "{ai_enhanced_explanation}\n\n"
                        "**Key Concepts:**\n"
                        "{key_points}\n\n"
                        "**Real-world Analogy:**\n"
                        "{analogy}"
                    ),
                    'example': (
                        "#### Example: {title}\n\n"
                        "```{language}\n{code}\n```\n\n"
                        "**How It Works:**\n"
                        "{ai_enhanced_explanation}\n\n"
                        "**Try It Yourself:**\n"
                        "{suggestion}"
                    ),
                    'exercise': (
                        "#### Exercise: {title}\n\n"
                        "**Objective:** {objective}\n\n"
                        "**Requirements:**\n"
                        "{requirements}\n\n"
                        "**Steps to Solve:**\n"
                        "{steps}\n\n"
                        "**Hints:**\n"
                        "{hints}"
                    )
                },
                'summary': (
                    "### Summary\n\n"
                    "Congratulations on completing this {level} module on {topic}!\n\n"
                    "**Key Achievements:**\n"
                    "- Learned core {topic} concepts\n"
                    "- Practiced with real examples\n"
                    "- Completed hands-on exercises\n\n"
                    "**Next Steps:**\n"
                    "- Try building a small project\n"
                    "- Explore more advanced topics\n"
                    "- Join community discussions"
                )
            }
        }

    def load_knowledge(self):
        """Load the knowledge base from ``self.knowledge_file``.

        A missing file only produces a warning. Read/parse errors and a
        payload that is not a JSON list are logged; in all of these cases
        ``self.knowledge`` keeps its previous value.
        """
        try:
            if os.path.exists(self.knowledge_file):
                with open(self.knowledge_file, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
                # The rest of the class iterates over a list of dict items.
                if not isinstance(loaded, list):
                    logger.error(
                        "Error loading knowledge base: expected a list of items, "
                        f"got {type(loaded).__name__}"
                    )
                    return
                self.knowledge = loaded
                logger.info(f"Loaded {len(self.knowledge)} knowledge items")
            else:
                logger.warning("No knowledge base found. Please scrape content first.")
        except Exception as e:
            logger.error(f"Error loading knowledge base: {str(e)}")

    def generate_course(self, topic: str, level: str = "beginner") -> Dict:
        """Generate a complete AI-enhanced course.

        Args:
            topic: Topic name, matched case-insensitively against the
                ``topic`` field of the knowledge items.
            level: Difficulty level, matched case-insensitively against the
                ``level`` field.

        Returns:
            A dict with ``title``, ``description``, ``modules`` and
            ``resources``; or ``{"error": ...}`` when no item matches.
        """
        wanted_topic = topic.lower()
        wanted_level = level.lower()
        # .get() keeps one malformed record from breaking the whole lookup.
        relevant_content = [
            item for item in self.knowledge
            if str(item.get('topic', '')).lower() == wanted_topic
            and str(item.get('level', '')).lower() == wanted_level
        ]

        if not relevant_content:
            return {"error": f"No content available for {topic} at {level} level"}

        course = {
            "title": f"{topic.title()} - {level.title()} Course",
            "description": f"Comprehensive {level} course on {topic} with AI Course Generator content",
            "modules": [],
            "resources": []
        }

        # Organize content into logical modules
        modules = self._organize_content(relevant_content)

        for i, module_content in enumerate(modules, 1):
            module = self._create_module(module_content, i, topic, level)
            course["modules"].append(module)
            course["resources"].extend(module["resources"])

        return course

    def _organize_content(self, content: List[Dict], items_per_module: int = 3) -> List[List[Dict]]:
        """Split ``content`` into consecutive groups of ``items_per_module``.

        The last group may be shorter; an empty input yields an empty list.
        """
        # Simple organization - can be enhanced with clustering
        return [
            content[start:start + items_per_module]
            for start in range(0, len(content), items_per_module)
        ]

    def _create_module(self, content: List[Dict], module_num: int, topic: str, level: str) -> Dict:
        """Create one learning module from a non-empty group of items.

        The module is titled after its first item and contains an
        introduction, one enhanced section per item, the collected resource
        links and a summary.
        """
        module = {
            "title": f"Module {module_num}: {content[0]['title']}",
            "sections": [],
            "resources": []
        }

        # Module introduction
        module["introduction"] = self.templates['module']['introduction'].format(
            topic=topic,
            level=level
        )

        # Add sections for each content item
        for item in content:
            section = {
                "title": item["title"],
                "content": self._enhance_section(item, topic),
                "resources": [{"title": item["title"], "url": item.get("url", "")}]
            }
            module["sections"].append(section)
            module["resources"].extend(section["resources"])

        # Module summary
        module["summary"] = self.templates['module']['summary'].format(
            topic=topic,
            level=level
        )

        return module

    def _code_language(self, topic: str) -> str:
        """Return the fenced-code language tag to use for ``topic``."""
        lowered = topic.lower()
        return lowered if lowered in self._FENCE_LANGUAGES else "python"

    def _enhance_section(self, content: Dict, topic: str) -> Dict:
        """Enhance a content item with AI-generated material.

        Returns:
            A dict with a rendered ``explanation`` string, a list of rendered
            ``examples`` (at most two, one per code sample) and a rendered
            ``exercise`` string.
        """
        # Generate AI-enhanced explanations
        ai_explanation = self.ai_enhancer.enhance_content(
            f"Explain {content['title']} in {content['level']} terms: {content['content']}",
            'explanation'
        )

        # Generate key points
        key_points = self.ai_enhancer.enhance_content(
            f"Extract 3 key points about {content['title']} from: {content['content']}",
            'explanation'
        )

        # Generate analogy
        analogy = self.ai_enhancer.enhance_content(
            f"Create a real-world analogy for {content['title']}",
            'explanation'
        )

        # Prepare examples
        language = self._code_language(topic)
        examples = []
        for i, code in enumerate(content.get("code_examples", [])[:2], 1):
            example_exp = self.ai_enhancer.enhance_content(
                f"Explain this {topic} code example: {code}",
                'example'
            )
            suggestion = self.ai_enhancer.enhance_content(
                f"Suggest a modification for this {topic} code: {code}",
                'example'
            )

            examples.append({
                "title": f"Example {i}",
                "code": code,
                "explanation": example_exp,
                "suggestion": suggestion,
                "language": language
            })

        # Generate exercise
        exercise = {
            "title": f"Practice {content['title']}",
            "objective": self.ai_enhancer.enhance_content(
                f"Create a learning objective for practicing {content['title']}",
                'exercise'
            ),
            "requirements": self.ai_enhancer.enhance_content(
                f"List requirements for an exercise about {content['title']}",
                'exercise'
            ),
            "steps": self.ai_enhancer.enhance_content(
                f"Provide steps to solve an exercise about {content['title']}",
                'exercise'
            ),
            "hints": self.ai_enhancer.enhance_content(
                f"Give helpful hints for solving an exercise about {content['title']}",
                'exercise'
            )
        }

        section_templates = self.templates['module']['section']
        return {
            "explanation": section_templates['explanation'].format(
                title=content["title"],
                ai_enhanced_explanation=ai_explanation,
                key_points=key_points,
                analogy=analogy
            ),
            "examples": [
                section_templates['example'].format(
                    title=ex["title"],
                    code=ex["code"],
                    language=ex["language"],
                    ai_enhanced_explanation=ex["explanation"],
                    suggestion=ex["suggestion"]
                ) for ex in examples
            ],
            "exercise": section_templates['exercise'].format(
                title=exercise["title"],
                objective=exercise["objective"],
                requirements=exercise["requirements"],
                steps=exercise["steps"],
                hints=exercise["hints"]
            )
        }

    def export_markdown(self, course: Dict, filename: Optional[str] = None) -> str:
        """Render ``course`` as Markdown, write it to disk and return it.

        Args:
            course: A course dict as returned by ``generate_course``.
            filename: Target path. When empty or ``None`` the name is derived
                from the course title (spaces become underscores, ``.md``
                appended).

        Returns:
            The complete Markdown document.
        """
        if not filename:
            filename = f"{course['title'].replace(' ', '_')}.md"

        parts = [
            f"# {course['title']}\n\n",
            f"*{course['description']}*\n\n",
        ]

        for module in course["modules"]:
            parts.append(f"## {module['title']}\n\n")
            parts.append(f"{module['introduction']}\n\n")

            for section in module["sections"]:
                body = section['content']
                heading = f"### {section['title']}"
                # The explanation template already opens with this heading;
                # only add it when the explanation text lacks it, so the
                # title is not printed twice.
                if not body['explanation'].lstrip().startswith(heading):
                    parts.append(f"{heading}\n\n")
                parts.append(f"{body['explanation']}\n\n")

                if body['examples']:
                    parts.append("### Examples\n\n")
                    parts.extend(f"{example}\n\n" for example in body['examples'])

                parts.append("### Exercise\n\n")
                parts.append(f"{body['exercise']}\n\n")
                parts.append("---\n\n")

            parts.append(f"{module['summary']}\n\n")

        parts.append("## Course Resources\n\n")
        for resource in course["resources"]:
            if resource["url"]:
                parts.append(f"- [{resource['title']}]({resource['url']})\n")
            else:
                parts.append(f"- {resource['title']}\n")

        md = "".join(parts)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(md)

        logger.info(f"Course saved to {filename}")
        return md

    @staticmethod
    def _parse_selection(raw: str, options: List[str]) -> Optional[str]:
        """Map a 1-based menu entry typed by the user to an option.

        Returns ``None`` for non-numeric input and for numbers outside
        ``1..len(options)``; zero and negative numbers are rejected rather
        than wrapping around to the end of the list.
        """
        try:
            index = int(raw.strip()) - 1
        except ValueError:
            return None
        if 0 <= index < len(options):
            return options[index]
        return None

    def interactive_mode(self):
        """Run the console menu until the user chooses to exit.

        The menu offers creating a course, re-scraping the knowledge base
        and listing the available topics.
        """
        print("\n" + "="*60)
        print("AI - COURSE BUILDER".center(60))
        print("="*60)

        levels = ["beginner", "intermediate", "advanced"]

        while True:
            # Recomputed on every pass so a fresh scrape is reflected at once.
            topics = sorted({item['topic'] for item in self.knowledge})

            print("\n" + "-"*60)
            print("MAIN MENU".center(60))
            print("-"*60)
            print("1. Create New Course")
            print("2. Update Knowledge Base (Scrape)")
            print("3. List Available Topics")
            print("4. Exit")
            print("-"*60)

            choice = input("\nEnter your choice (1-4): ").strip()

            if choice == "4":
                print("\nThank you for using AI Course Generator!")
                break

            if choice == "3":
                print("\nAvailable Topics:")
                for i, name in enumerate(topics, 1):
                    print(f"{i}. {name.title()}")
                continue

            if choice == "2":
                print("\nUpdating knowledge base...")
                scraper = KnowledgeScraper()
                # Write to this generator's own file so later loads see the
                # same data that is now held in memory.
                self.knowledge = scraper.scrape_knowledge_base(self.knowledge_file)
                print("\nKnowledge base updated successfully!")
                continue

            if choice != "1":
                print("\nInvalid choice. Please try again.")
                continue

            print("\nAvailable Topics:")
            for i, name in enumerate(topics, 1):
                print(f"{i}. {name.title()}")

            topic = self._parse_selection(input("\nSelect topic (number): "), topics)
            if topic is None:
                print("\nInvalid selection. Please try again.")
                continue

            print("\nAvailable Levels:")
            for i, name in enumerate(levels, 1):
                print(f"{i}. {name.title()}")

            level = self._parse_selection(input("\nSelect level (number): "), levels)
            if level is None:
                print("\nInvalid selection. Please try again.")
                continue

            print(f"\nGenerating {level} course on {topic}...")
            course = self.generate_course(topic, level)

            if "error" in course:
                print(f"\nError: {course['error']}")
                continue

            filename = input("\nEnter filename to save (or press enter for default): ").strip()
            if not filename:
                filename = None

            md_content = self.export_markdown(course, filename)

            print("\n" + "="*60)
            print("COURSE GENERATED SUCCESSFULLY!".center(60))
            print("="*60)
            print(f"\nPreview (first 20 lines):\n")
            print("\n".join(md_content.split("\n")[:20]))
            print("\n... (full content saved to file)")
            print("="*60)

if __name__ == "__main__":
    # Constructing the generator also tries to load a saved knowledge base.
    generator = CourseGenerator()

    # Nothing loaded yet: scrape the configured sites once, then reload
    # the freshly written file into the generator.
    knowledge_loaded = bool(generator.knowledge)
    if not knowledge_loaded:
        print("\nNo knowledge base found. Scraping educational content first...")
        scraper = KnowledgeScraper()
        scraper.scrape_knowledge_base()
        generator.load_knowledge()

    # Hand control to the console menu.
    generator.interactive_mode()