In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, urlparse, quote
import os
from typing import List, Dict, Set
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class EnhancedIITKDataScraper:
    def __init__(self):
        # Official IITK website URLs
        self.main_urls = [
            "https://www.iitk.ac.in/",
            "https://www.iitk.ac.in/new/",
            "https://www.iitk.ac.in/doaa/",
            "https://www.iitk.ac.in/dora/",
        ]

        # Student organization websites (separate domains)
        self.student_org_urls = [
            "https://voxiitk.com/",
            "https://spo.iitk.ac.in/",
            "https://www.ecelliitk.org/",
            "https://students.iitk.ac.in/gymkhana/",
            "https://www.anciitk.co.in/",
        ]

        # Department URLs
        self.department_urls = [
            "https://www.iitk.ac.in/me/",
            "https://www.iitk.ac.in/me/about-us",
            "https://www.iitk.ac.in/doaa/academic-departments",
            "https://www.iitk.ac.in/doaa/pg-manual",
            "https://www.iitk.ac.in/doaa/convocation",
            "https://cer.iitk.ac.in/",
        ]

        # Content URLs
        self.content_urls = [
            "https://www.iitk.ac.in/new/research-overview",
            "https://www.iitk.ac.in/new/admissions",
            "https://students.iitk.ac.in/",
        ]

        # Department codes to try
        self.dept_codes = [
            'ae', 'bsbe', 'ce', 'che', 'cse', 'ee', 'eco', 'hss',
            'mse', 'math', 'me', 'mth', 'phy', 'stats', 'des', 'doms'
        ]

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.scraped_urls: Set[str] = set()
        self.scraped_data = []

    def clean_text(self, text: str) -> str:
        """Clean and normalize text content"""
        if not text:
            return ""

        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        # Remove only problematic characters, keep useful punctuation
        text = re.sub(r'[^\w\s\.\,\!\?\-\:\;\(\)\[\]\'\"\&\%\$\@\+\=\/\\]', '', text)
        # Remove repeated punctuation
        text = re.sub(r'([.!?])\1+', r'\1', text)

        return text.strip()

    def is_valid_url(self, url: str, base_domain: str = None) -> bool:
        """Check if URL is valid and from allowed domains"""
        try:
            parsed = urlparse(url)

            # List of allowed domains
            allowed_domains = [
                'iitk.ac.in',
                'voxiitk.com',
                'spo.iitk.ac.in',
                'ecelliitk.org',
                'students.iitk.ac.in',
                'anciitk.co.in'
            ]

            # Check if URL is from allowed domains
            is_allowed_domain = any(domain in parsed.netloc for domain in allowed_domains)

            # If base_domain is specified, prioritize same domain
            if base_domain and base_domain in parsed.netloc:
                is_allowed_domain = True

            return (
                parsed.scheme in ['http', 'https'] and
                is_allowed_domain and
                not any(ext in url.lower() for ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.jpg', '.png', '.gif', '.zip', '.mp4', '.mp3'])
            )
        except:
            return False

    def scrape_page(self, url: str) -> Dict:
        """Scrape a single page and extract relevant content"""
        if url in self.scraped_urls:
            return None

        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            # Add to scraped URLs
            self.scraped_urls.add(url)

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract title
            title = soup.find('title')
            title = title.get_text() if title else "No Title"

            # Remove unwanted elements
            for element in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]):
                element.decompose()

            # Try multiple strategies to extract main content
            content = ""

            # Strategy 1: Look for main content containers
            main_selectors = [
                'main', 'article', '.content', '.main-content',
                '.post-content', '.entry-content', '.page-content',
                '.content-area', '.site-content', 'div.content',
                '.container', '.wrapper', '#content', '#main',
                '.main-container', '.page-wrapper', '.post',
                '.blog-post', '.article-content'
            ]

            for selector in main_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    content = content_elem.get_text(separator=' ', strip=True)
                    break

            # Strategy 2: For Vox Populi and other blog-style sites
            if not content or len(content.split()) < 20:
                # Look for blog post content
                blog_selectors = [
                    '.single-post', '.post-entry', '.entry', '.blog-content',
                    '.wp-content', '.post-body', '.article-body'
                ]
                for selector in blog_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        content = content_elem.get_text(separator=' ', strip=True)
                        break

            # Strategy 3: If no main content, look for specific content divs
            if not content or len(content.split()) < 20:
                content_divs = soup.find_all('div', class_=re.compile(r'content|main|article|post|text|body'))
                if content_divs:
                    content = ' '.join([div.get_text(separator=' ', strip=True) for div in content_divs])

            # Strategy 4: Extract from body but filter out navigation
            if not content or len(content.split()) < 20:
                body = soup.find('body')
                if body:
                    # Remove navigation elements
                    for nav_elem in body.find_all(['nav', 'menu', 'sidebar']):
                        nav_elem.decompose()

                    # Remove lists that look like navigation
                    for ul_elem in body.find_all(['ul', 'ol']):
                        if ul_elem.get('class'):
                            nav_classes = ' '.join(ul_elem.get('class', []))
                            if any(nav_word in nav_classes.lower() for nav_word in ['nav', 'menu', 'sidebar', 'breadcrumb']):
                                ul_elem.decompose()

                    content = body.get_text(separator=' ', strip=True)

            # Strategy 5: Get all paragraph content
            if not content or len(content.split()) < 20:
                paragraphs = soup.find_all('p')
                if paragraphs:
                    content = ' '.join([p.get_text(separator=' ', strip=True) for p in paragraphs])

            # Clean the content
            content = self.clean_text(content)

            # Extract headings and structure
            sections = []
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            for heading in headings:
                heading_text = self.clean_text(heading.get_text())
                if heading_text and len(heading_text) > 3:
                    sections.append(heading_text)

            # Extract meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            description = meta_desc.get('content') if meta_desc else ""

            # Extract relevant links for further crawling
            links = []
            base_domain = urlparse(url).netloc

            for link in soup.find_all('a', href=True):
                href = link.get('href')
                if href and not href.startswith('#') and not href.startswith('mailto:'):
                    full_url = urljoin(url, href)
                    if self.is_valid_url(full_url, base_domain) and full_url not in self.scraped_urls:
                        # Prioritize certain types of links
                        link_text = link.get_text().lower()
                        if any(keyword in link_text for keyword in ['about', 'department', 'faculty', 'research', 'academic', 'course', 'program', 'article', 'post', 'news', 'blog']):
                            links.insert(0, full_url)  # Add to front
                        else:
                            links.append(full_url)

            word_count = len(content.split())

            # Only return if we have substantial content
            if word_count < 25:
                return None

            # Determine source type
            source_type = "official"
            if any(domain in url for domain in ['voxiitk.com', 'ecelliitk.org', 'anciitk.co.in']):
                source_type = "student_org"
            elif 'spo.iitk.ac.in' in url:
                source_type = "placement"
            elif 'students.iitk.ac.in' in url:
                source_type = "student_portal"

            return {
                'url': url,
                'title': self.clean_text(title),
                'content': content,
                'description': self.clean_text(description),
                'sections': sections,
                'word_count': word_count,
                'source_type': source_type,
                'links': links[:10]  # Limit to prevent explosion
            }

        except requests.exceptions.RequestException as e:
            logger.error(f"Error scraping {url}: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error scraping {url}: {str(e)}")
            return None

    def scrape_student_organizations(self) -> List[Dict]:
        """Scrape student organization websites"""
        org_data = []

        for base_url in self.student_org_urls:
            logger.info(f"Scraping student organization: {base_url}")

            # Scrape main page
            page_data = self.scrape_page(base_url)
            if page_data:
                org_data.append(page_data)

            # For each organization, try to find additional pages
            additional_pages = []

            if 'voxiitk.com' in base_url:
                # Vox Populi specific pages
                vox_pages = [
                    "https://voxiitk.com/category/all-about-iitk/",
                    "https://voxiitk.com/category/flagship-series/as-we-leave/",
                    "https://voxiitk.com/category/reports-and-investigations/",
                    "https://voxiitk.com/category/administration/",
                    "https://voxiitk.com/about-us/",
                    "https://voxiitk.com/blog/",
                ]
                additional_pages.extend(vox_pages)

            elif 'spo.iitk.ac.in' in base_url:
                # SPO specific pages
                spo_pages = [
                    "https://spo.iitk.ac.in/insights",
                    "https://spo.iitk.ac.in/companies",
                    "https://spo.iitk.ac.in/students",
                    "https://spo.iitk.ac.in/about",
                    "https://spo.iitk.ac.in/statistics",
                ]
                additional_pages.extend(spo_pages)

            elif 'ecelliitk.org' in base_url:
                # E-Cell specific pages
                ecell_pages = [
                    "https://www.ecelliitk.org/about",
                    "https://www.ecelliitk.org/events",
                    "https://www.ecelliitk.org/startups",
                    "https://www.ecelliitk.org/team",
                    "https://www.ecelliitk.org/blog",
                ]
                additional_pages.extend(ecell_pages)

            elif 'students.iitk.ac.in' in base_url:
                # Gymkhana specific pages
                gymkhana_pages = [
                    "https://students.iitk.ac.in/gymkhana/about",
                    "https://students.iitk.ac.in/gymkhana/councils",
                    "https://students.iitk.ac.in/gymkhana/cells",
                    "https://students.iitk.ac.in/gymkhana/festivals",
                ]
                additional_pages.extend(gymkhana_pages)

            elif 'anciitk.co.in' in base_url:
                # AnC Council specific pages
                anc_pages = [
                    "https://www.anciitk.co.in/about",
                    "https://www.anciitk.co.in/team",
                    "https://www.anciitk.co.in/events",
                    "https://www.anciitk.co.in/resources",
                ]
                additional_pages.extend(anc_pages)

            # Scrape additional pages
            for page_url in additional_pages:
                if page_url not in self.scraped_urls:
                    page_data = self.scrape_page(page_url)
                    if page_data:
                        org_data.append(page_data)
                    time.sleep(1)

            time.sleep(2)  # Longer delay between organizations

        return org_data

    def discover_department_urls(self) -> List[str]:
        """Discover working department URLs"""
        department_urls = []

        # Try common department patterns
        for dept in self.dept_codes:
            urls_to_try = [
                f"https://www.iitk.ac.in/{dept}/",
                f"https://www.iitk.ac.in/{dept}/about",
                f"https://www.iitk.ac.in/{dept}/about-us",
                f"https://www.iitk.ac.in/{dept}/faculty",
                f"https://www.iitk.ac.in/{dept}/research",
                f"https://www.iitk.ac.in/{dept}/courses",
            ]

            for url in urls_to_try:
                try:
                    response = self.session.head(url, timeout=10)
                    if response.status_code == 200:
                        department_urls.append(url)
                        logger.info(f"Found working department URL: {url}")
                        break  # Found one for this department, move to next
                except:
                    continue

                time.sleep(0.5)  # Small delay between checks

        return department_urls

    def scrape_department_pages(self) -> List[Dict]:
        """Scrape department pages with discovered URLs"""
        department_data = []

        # First scrape known working URLs
        for url in self.department_urls:
            logger.info(f"Scraping known department URL: {url}")
            page_data = self.scrape_page(url)
            if page_data:
                department_data.append(page_data)
            time.sleep(1)

        # Then discover and scrape additional department URLs
        logger.info("Discovering additional department URLs...")
        discovered_urls = self.discover_department_urls()

        for url in discovered_urls:
            if url not in self.scraped_urls:
                logger.info(f"Scraping discovered department URL: {url}")
                page_data = self.scrape_page(url)
                if page_data:
                    department_data.append(page_data)
                time.sleep(1)

        return department_data

    def scrape_from_links(self, start_urls: List[str], max_depth: int = 2) -> List[Dict]:
        """Scrape following links from initial pages"""
        all_data = []
        urls_to_visit = start_urls.copy()
        depth = 0

        while urls_to_visit and depth < max_depth:
            current_level_urls = urls_to_visit.copy()
            urls_to_visit = []

            for url in current_level_urls:
                if url in self.scraped_urls:
                    continue

                logger.info(f"Scraping (depth {depth}): {url}")
                page_data = self.scrape_page(url)

                if page_data:
                    all_data.append(page_data)

                    # Add links from this page for next level
                    for link in page_data.get('links', []):
                        if link not in self.scraped_urls:
                            urls_to_visit.append(link)

                time.sleep(1)

                # Limit pages per depth level
                if len(all_data) > 60:
                    break

            depth += 1

            # Limit total URLs for next level
            urls_to_visit = urls_to_visit[:25]

        return all_data

    def scrape_all_sources(self) -> List[Dict]:
        """Scrape all sources with improved strategy"""
        all_data = []

        # 1. Scrape student organizations first (most valuable content)
        logger.info("=== Scraping Student Organizations ===")
        org_data = self.scrape_student_organizations()
        all_data.extend(org_data)
        logger.info(f"Scraped {len(org_data)} pages from student organizations")

        # 2. Scrape main pages
        logger.info("=== Scraping Main Pages ===")
        for url in self.main_urls:
            page_data = self.scrape_page(url)
            if page_data:
                all_data.append(page_data)
            time.sleep(1)

        # 3. Scrape known content URLs
        logger.info("=== Scraping Content Pages ===")
        for url in self.content_urls:
            page_data = self.scrape_page(url)
            if page_data:
                all_data.append(page_data)
            time.sleep(1)

        # 4. Scrape department pages
        logger.info("=== Scraping Department Pages ===")
        dept_data = self.scrape_department_pages()
        all_data.extend(dept_data)

        # 5. Follow links from main pages (limited depth)
        logger.info("=== Following Links from Main Pages ===")
        link_data = self.scrape_from_links(self.main_urls, max_depth=2)
        all_data.extend(link_data)

        # 6. Follow links from student organizations
        logger.info("=== Following Links from Student Organizations ===")
        org_link_data = self.scrape_from_links(self.student_org_urls, max_depth=2)
        all_data.extend(org_link_data)

        return all_data

    def save_data(self, data: List[Dict], filename: str = "enhanced_iitk_data.json"):
        """Save scraped data to JSON file with better filtering and organization"""
        # Filter out empty or very short content
        filtered_data = []
        seen_content = set()

        for item in data:
            if (item and
                item.get('content') and
                len(item['content'].split()) > 30 and
                len(item['content']) > 250 and
                item['content'] not in seen_content):  # Remove duplicates

                seen_content.add(item['content'])
                filtered_data.append(item)

        # Sort by source type and word count
        def sort_key(x):
            source_priority = {
                'student_org': 1,
                'placement': 2,
                'student_portal': 3,
                'official': 4
            }
            return (source_priority.get(x.get('source_type', 'official'), 5), -x.get('word_count', 0))

        filtered_data.sort(key=sort_key)

        # Save to JSON
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(filtered_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Saved {len(filtered_data)} items to {filename}")

        # Also save a simple text version for easy reading
        text_filename = filename.replace('.json', '_text.txt')
        with open(text_filename, 'w', encoding='utf-8') as f:
            for i, item in enumerate(filtered_data):
                f.write(f"=== PAGE {i+1}: {item['title']} ===\n")
                f.write(f"URL: {item['url']}\n")
                f.write(f"Source Type: {item.get('source_type', 'unknown')}\n")
                f.write(f"Words: {item['word_count']}\n")
                f.write(f"Content:\n{item['content']}\n\n")

        # Create a separate file for each source type
        source_types = {}
        for item in filtered_data:
            source_type = item.get('source_type', 'unknown')
            if source_type not in source_types:
                source_types[source_type] = []
            source_types[source_type].append(item)

        for source_type, items in source_types.items():
            source_filename = f"{source_type}_data.json"
            with open(source_filename, 'w', encoding='utf-8') as f:
                json.dump(items, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved {len(items)} {source_type} items to {source_filename}")

        # Print detailed statistics
        total_words = sum(item.get('word_count', 0) for item in filtered_data)
        avg_words = total_words / len(filtered_data) if filtered_data else 0

        print(f"\n=== ENHANCED DATA COLLECTION SUMMARY ===")
        print(f"Total pages scraped: {len(filtered_data)}")
        print(f"Total words: {total_words:,}")
        print(f"Average words per page: {avg_words:.1f}")
        print(f"Unique URLs visited: {len(self.scraped_urls)}")

        print(f"\n=== BREAKDOWN BY SOURCE TYPE ===")
        for source_type, items in source_types.items():
            type_words = sum(item.get('word_count', 0) for item in items)
            print(f"{source_type.replace('_', ' ').title()}: {len(items)} pages, {type_words:,} words")

        print(f"\n=== FILES CREATED ===")
        print(f"Main JSON: {filename}")
        print(f"Text version: {text_filename}")
        for source_type in source_types:
            print(f"{source_type.replace('_', ' ').title()} JSON: {source_type}_data.json")

        return filename

def main():
    scraper = EnhancedIITKDataScraper()

    print("Starting Enhanced IITK Data Scraping...")
    print("This will scrape:")
    print("- Official IITK websites")
    print("- Student organizations: Vox Populi, E-Cell, Gymkhana, AnC Council")
    print("- Student Placement Office (SPO)")
    print("- Department pages")
    print("- And follow relevant links from all sources")

    # Scrape all sources
    data = scraper.scrape_all_sources()

    # Save to file
    filename = scraper.save_data(data)

    print(f"\nScraping completed successfully!")

    # Show sample of scraped data
    if data:
        print(f"\nSample of scraped content by source type:")
        source_samples = {}
        for item in data:
            source_type = item.get('source_type', 'unknown')
            if source_type not in source_samples:
                source_samples[source_type] = item

        for source_type, item in source_samples.items():
            print(f"\n{source_type.replace('_', ' ').title()}:")
            print(f"  Title: {item['title']}")
            print(f"  URL: {item['url']}")
            print(f"  Words: {item['word_count']}")
            print(f"  Preview: {item['content'][:150]}...")

if __name__ == "__main__":
    main()

Starting Enhanced IITK Data Scraping...
This will scrape:
- Official IITK websites
- Student organizations: Vox Populi, E-Cell, Gymkhana, AnC Council
- Student Placement Office (SPO)
- Department pages
- And follow relevant links from all sources


ERROR:__main__:Error scraping https://voxiitk.com/category/administration/: 404 Client Error: Not Found for url: https://voxiitk.com/category/administration/
ERROR:__main__:Error scraping https://voxiitk.com/about-us/: 404 Client Error: Not Found for url: https://voxiitk.com/about-us/
ERROR:__main__:Error scraping https://voxiitk.com/blog/: 404 Client Error: Not Found for url: https://voxiitk.com/blog/
ERROR:__main__:Error scraping https://spo.iitk.ac.in/statistics: 404 Client Error: Not Found for url: https://spo.iitk.ac.in/statistics
ERROR:__main__:Error scraping https://www.ecelliitk.org/about: 404 Client Error: Not Found for url: https://www.ecelliitk.org/about
ERROR:__main__:Error scraping https://www.ecelliitk.org/events: 404 Client Error: Not Found for url: https://www.ecelliitk.org/events
ERROR:__main__:Error scraping https://www.ecelliitk.org/startups: 404 Client Error: Not Found for url: https://www.ecelliitk.org/startups
ERROR:__main__:Error scraping https://www.ecelliitk.or


=== ENHANCED DATA COLLECTION SUMMARY ===
Total pages scraped: 31
Total words: 51,799
Average words per page: 1670.9
Unique URLs visited: 34

=== BREAKDOWN BY SOURCE TYPE ===
Student Org: 5 pages, 2,678 words
Placement: 5 pages, 7,843 words
Official: 21 pages, 41,278 words

=== FILES CREATED ===
Main JSON: enhanced_iitk_data.json
Text version: enhanced_iitk_data_text.txt
Student Org JSON: student_org_data.json
Placement JSON: placement_data.json
Official JSON: official_data.json

Scraping completed successfully!

Sample of scraped content by source type:

Student Org:
  Title: All about IITK  Vox Populi
  URL: https://voxiitk.com/category/all-about-iitk/
  Words: 200
  Preview: Disclaimer: Vox Populi, IIT Kanpur, is the exclusive owner of the information on this website. No part of this content Disclaimer: Vox Populi, IIT Kan...

Placement:
  Title: Students' Placement Office, IIT Kanpur
  URL: https://spo.iitk.ac.in/
  Words: 864
  Preview: About IITK For companies For students Samvar

In [2]:
!pip install streamlit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m115.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
!pip install streamlit pyngrok -q

app_code = '''
import streamlit as st
import json
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import numpy as np
from sentence_transformers import SentenceTransformer
import re
from typing import List, Dict, Tuple
import logging
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class IITKChatbot:
    def __init__(self, data_file: str = "enhanced_iitk_data.json"):
        self.data_file = data_file
        self.documents = []
        self.embeddings = None
        self.embedding_model = None
        self.qa_pipeline = None
        self.tokenizer = None
        self.qa_model = None

        # Initialize the chatbot
        self.initialize_chatbot()

    def initialize_chatbot(self):
        """Initialize the complete chatbot system"""
        try:
            self.load_data()
            self.initialize_models()
            self.create_embeddings()
            logger.info("Chatbot initialization completed successfully")
        except Exception as e:
            logger.error(f"Error during chatbot initialization: {str(e)}")
            st.error(f"Failed to initialize chatbot: {str(e)}")

    def load_data(self):
        """Load scraped data from JSON file"""
        try:
            with open(self.data_file, 'r', encoding='utf-8') as f:
                raw_data = json.load(f)

            # Process documents
            for item in raw_data:
                if item.get('content') and len(item['content'].split()) > 20:
                    # Split long content into chunks
                    chunks = self.split_into_chunks(item['content'])
                    for chunk in chunks:
                        self.documents.append({
                            'title': item.get('title', 'No Title'),
                            'content': chunk,
                            'url': item.get('url', ''),
                            'source_type': item.get('source_type', 'unknown'),
                            'sections': item.get('sections', [])
                        })

            logger.info(f"Loaded {len(self.documents)} document chunks")

        except FileNotFoundError:
            logger.warning(f"Data file {self.data_file} not found! Using sample data.")
            # Create comprehensive sample data for demonstration
            self.documents = [
                {
                    'title': 'IIT Kanpur Overview',
                    'content': 'Indian Institute of Technology Kanpur (IIT Kanpur) is one of the premier engineering institutions in India. Established in 1959, it was the first IIT to be set up with foreign assistance. The institute offers undergraduate, postgraduate, and doctoral programs in engineering, science, design, and management. IIT Kanpur is located in Kanpur, Uttar Pradesh, and spans over 1055 acres. It is consistently ranked among the top engineering institutions in India.',
                    'url': 'https://www.iitk.ac.in/',
                    'source_type': 'official',
                    'sections': ['About', 'History']
                },
                {
                    'title': 'Academic Programs at IIT Kanpur',
                    'content': 'IIT Kanpur offers various academic programs including Bachelor of Technology (B.Tech), Master of Technology (M.Tech), Master of Science (M.S.), and Doctor of Philosophy (Ph.D.). The institute has 16 academic departments covering engineering disciplines like Computer Science, Mechanical Engineering, Electrical Engineering, Civil Engineering, Chemical Engineering, and Aerospace Engineering. It also offers programs in Mathematics, Physics, Chemistry, Humanities and Social Sciences, and Management.',
                    'url': 'https://www.iitk.ac.in/academics',
                    'source_type': 'official',
                    'sections': ['Programs', 'Departments']
                },
                {
                    'title': 'Student Life at IIT Kanpur',
                    'content': 'Student life at IIT Kanpur is vibrant and diverse. The campus has 12 halls of residence (hostels) accommodating over 8000 students. The institute has numerous student clubs and societies including technical clubs, cultural clubs, and sports clubs. Major festivals include Antaragni (cultural festival), Techkriti (technical festival), and Udghosh (sports festival). The Students Gymkhana is the student government body that organizes various activities and represents student interests.',
                    'url': 'https://students.iitk.ac.in/',
                    'source_type': 'student_portal',
                    'sections': ['Hostels', 'Clubs', 'Festivals']
                },
                {
                    'title': 'Research and Innovation',
                    'content': 'IIT Kanpur is renowned for its research contributions in various fields. The institute has established several centers of excellence including the National Centre for Flexible Electronics, Advanced Centre for Materials Science, and the National Wind Tunnel Facility. Faculty and students engage in cutting-edge research in areas like artificial intelligence, robotics, nanotechnology, biotechnology, and renewable energy. The institute has strong industry partnerships and encourages innovation and entrepreneurship.',
                    'url': 'https://www.iitk.ac.in/research',
                    'source_type': 'official',
                    'sections': ['Research Areas', 'Centers', 'Innovation']
                },
                {
                    'title': 'Placement and Career Services',
                    'content': 'The Student Placement Office (SPO) at IIT Kanpur facilitates campus placements for students. Top companies from various sectors including IT, consulting, finance, and core engineering visit the campus for recruitment. The average package for B.Tech students is around 15-20 LPA, while for M.Tech and Ph.D. students, it varies based on specialization. The institute has a strong alumni network working in top positions across industries globally.',
                    'url': 'https://spo.iitk.ac.in/',
                    'source_type': 'placement',
                    'sections': ['Placements', 'Companies', 'Statistics']
                },
                {
                    'title': 'Campus Facilities',
                    'content': 'IIT Kanpur campus provides excellent facilities including modern laboratories, libraries, sports facilities, and recreational areas. The P.K. Kelkar Library is one of the largest technical libraries in India. The campus has a health center, guest house, shopping complex, and multiple dining facilities. Sports facilities include swimming pool, gymnasium, tennis courts, football ground, and cricket ground. The campus is Wi-Fi enabled and provides 24/7 internet connectivity.',
                    'url': 'https://www.iitk.ac.in/facilities',
                    'source_type': 'official',
                    'sections': ['Library', 'Sports', 'Health', 'Dining']
                }
            ]

    def split_into_chunks(self, text: str, max_length: int = 400) -> List[str]:
        """Split text into manageable chunks for better processing"""
        # First try to split by sentences
        sentences = re.split(r'[.!?]+', text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Check if adding this sentence would exceed max_length
            if len(current_chunk.split()) + len(sentence.split()) <= max_length:
                current_chunk += sentence + ". "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "

        # Add the last chunk if it exists
        if current_chunk:
            chunks.append(current_chunk.strip())

        # If no chunks were created (very long sentences), split by words
        if not chunks:
            words = text.split()
            for i in range(0, len(words), max_length):
                chunk = ' '.join(words[i:i + max_length])
                chunks.append(chunk)

        return chunks

    def initialize_models(self):
        """Initialize transformer models with better error handling"""
        try:
            # Initialize sentence transformer for embeddings
            st.info("Loading sentence transformer model...")
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

            # Initialize QA pipeline with a lightweight model
            st.info("Loading question-answering model...")
            model_name = "distilbert-base-cased-distilled-squad"
            self.qa_pipeline = pipeline(
                "question-answering",
                model=model_name,
                tokenizer=model_name,
                device=-1  # Force CPU usage
            )

            # Also load tokenizer and model separately for more control
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)

            logger.info("Models initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            st.error(f"Failed to initialize models: {str(e)}")
            raise

    def create_embeddings(self):
        """Create embeddings for all documents using sentence transformers"""
        if not self.embedding_model or not self.documents:
            logger.warning("Cannot create embeddings: missing model or documents")
            return

        try:
            st.info("Creating document embeddings...")

            # Create embeddings for all document contents
            texts = [doc['content'] for doc in self.documents]

            # Create embeddings in batches to avoid memory issues
            batch_size = 32
            all_embeddings = []

            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]
                batch_embeddings = self.embedding_model.encode(
                    batch_texts,
                    convert_to_tensor=False,
                    show_progress_bar=False
                )
                all_embeddings.extend(batch_embeddings)

            self.embeddings = np.array(all_embeddings)
            logger.info(f"Created embeddings for {len(self.documents)} documents")

        except Exception as e:
            logger.error(f"Error creating embeddings: {str(e)}")
            st.error(f"Failed to create embeddings: {str(e)}")

    def find_relevant_documents(self, query: str, top_k: int = 5) -> List[Dict]:
        """Find most relevant documents for a query using cosine similarity"""
        if not self.embedding_model or self.embeddings is None:
            logger.warning("Using fallback document selection")
            return self.documents[:top_k]

        try:
            # Encode query
            query_embedding = self.embedding_model.encode([query])

            # Calculate cosine similarity
            similarities = cosine_similarity(query_embedding, self.embeddings)[0]

            # Get top-k most similar documents
            top_indices = np.argsort(similarities)[::-1][:top_k]

            # Return relevant documents with scores
            relevant_docs = []
            for idx in top_indices:
                if idx < len(self.documents):
                    doc = self.documents[idx].copy()
                    doc['relevance_score'] = float(similarities[idx])
                    relevant_docs.append(doc)

            return relevant_docs

        except Exception as e:
            logger.error(f"Error finding relevant documents: {str(e)}")
            # Fallback to simple keyword matching
            return self.simple_keyword_search(query, top_k)

    def simple_keyword_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Fallback keyword-based search"""
        query_words = set(query.lower().split())
        scored_docs = []

        for doc in self.documents:
            content_words = set(doc['content'].lower().split())
            title_words = set(doc['title'].lower().split())

            # Calculate simple overlap score
            content_score = len(query_words.intersection(content_words))
            title_score = len(query_words.intersection(title_words)) * 2  # Weight title matches more

            total_score = content_score + title_score

            if total_score > 0:
                doc_copy = doc.copy()
                doc_copy['relevance_score'] = total_score
                scored_docs.append(doc_copy)

        # Sort by score and return top-k
        scored_docs.sort(key=lambda x: x['relevance_score'], reverse=True)
        return scored_docs[:top_k]

    def answer_question(self, question: str) -> Dict:
        """Answer a question using the QA pipeline"""
        if not self.qa_pipeline:
            return {
                'answer': "Sorry, the QA model is not available.",
                'confidence': 0.0,
                'context': "",
                'sources': []
            }

        try:
            # Find relevant documents
            relevant_docs = self.find_relevant_documents(question, top_k=3)

            if not relevant_docs:
                return {
                    'answer': "I couldn't find relevant information to answer your question about IIT Kanpur.",
                    'confidence': 0.0,
                    'context': "",
                    'sources': []
                }

            # Combine contexts from relevant documents
            contexts = []
            for doc in relevant_docs:
                contexts.append(doc['content'])

            # Try each context separately and pick the best answer
            best_answer = None
            best_confidence = 0.0
            best_context = ""

            for context in contexts:
                # Truncate context if too long for the model
                if len(context) > 2000:
                    context = context[:2000]

                try:
                    result = self.qa_pipeline(question=question, context=context)

                    if result['score'] > best_confidence:
                        best_answer = result['answer']
                        best_confidence = result['score']
                        best_context = context

                except Exception as e:
                    logger.warning(f"Error processing context: {str(e)}")
                    continue

            # If no answer found, try with combined context
            if not best_answer:
                combined_context = " ".join(contexts)
                if len(combined_context) > 2000:
                    combined_context = combined_context[:2000]

                try:
                    result = self.qa_pipeline(question=question, context=combined_context)
                    best_answer = result['answer']
                    best_confidence = result['score']
                    best_context = combined_context
                except Exception as e:
                    logger.error(f"Error with combined context: {str(e)}")
                    best_answer = "I found some information but couldn't extract a specific answer."
                    best_confidence = 0.1

            # Extract sources
            sources = []
            for doc in relevant_docs:
                sources.append({
                    'title': doc['title'],
                    'url': doc['url'],
                    'source_type': doc.get('source_type', 'unknown'),
                    'relevance': doc.get('relevance_score', 0.0)
                })

            return {
                'answer': best_answer,
                'confidence': best_confidence,
                'context': best_context,
                'sources': sources
            }

        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return {
                'answer': f"I encountered an error while processing your question. Please try rephrasing it.",
                'confidence': 0.0,
                'context': "",
                'sources': []
            }

def main():
    st.set_page_config(
        page_title="IIT Kanpur Chatbot",
        page_icon="🤖",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Custom CSS for better styling
    st.markdown("""
    <style>
    .main-header {
        background: linear-gradient(90deg, #1e3c72 0%, #2a5298 100%);
        padding: 1rem;
        border-radius: 10px;
        margin-bottom: 2rem;
    }
    .main-header h1 {
        color: white;
        margin: 0;
        text-align: center;
    }
    .main-header p {
        color: #e0e0e0;
        margin: 0;
        text-align: center;
    }
    .chat-container {
        border: 1px solid #ddd;
        border-radius: 10px;
        padding: 1rem;
        margin-bottom: 1rem;
    }
    .source-box {
        background-color: #f8f9fa;
        border-left: 4px solid #007bff;
        padding: 0.5rem;
        margin: 0.5rem 0;
        border-radius: 0 5px 5px 0;
    }
    </style>
    """, unsafe_allow_html=True)

    # Header
    st.markdown("""
    <div class="main-header">
        <h1>IIT Kanpur Chatbot</h1>
        <p>An AI-powered chatbot to answer questions about IIT Kanpur</p>
    </div>
    """, unsafe_allow_html=True)

    # Initialize chatbot
    if 'chatbot' not in st.session_state:
        with st.spinner("Initializing PULPNET chatbot... This may take a moment."):
            try:
                st.session_state.chatbot = IITKChatbot()
                st.success("PULPNET is ready to help!")
            except Exception as e:
                st.error(f"Failed to initialize chatbot: {str(e)}")
                st.stop()

    # Sidebar
    with st.sidebar:
        st.header("About IITK ChatBot")
        st.info(
            "PULPNET is an AI-powered chatbot designed to answer questions about IIT Kanpur. "
            "It uses advanced transformer models to provide accurate and helpful responses based on "
            "official and student-led information sources."
        )

        st.header("Technical Details")
        st.write("**Embedding Model:** all-MiniLM-L6-v2")
        st.write("**QA Model:** DistilBERT-base-cased")
        st.write("**Search Method:** Cosine Similarity")

        # Statistics
        if hasattr(st.session_state.chatbot, 'documents'):
            st.header("Dataset Statistics")
            st.metric("Total Documents", len(st.session_state.chatbot.documents))

            # Show source types
            source_types = {}
            for doc in st.session_state.chatbot.documents:
                source_type = doc.get('source_type', 'unknown')
                source_types[source_type] = source_types.get(source_type, 0) + 1

            st.write("**Sources:**")
            for source_type, count in source_types.items():
                st.write(f"• {source_type.replace('_', ' ').title()}: {count}")

        # Sample questions
        st.header("Sample Questions")
        sample_questions = [
            "What is IIT Kanpur?",
            "What academic programs are offered?",
            "Tell me about student life",
            "What research areas are there?",
            "How is the placement scenario?",
            "What facilities are available on campus?",
            "Tell me about the hostels",
            "What are the major festivals?"
        ]

        for question in sample_questions:
            if st.button(question, key=f"sample_{question}", use_container_width=True):
                st.session_state.sample_question = question
                st.rerun()

    # Main chat interface
    st.subheader("Ask IITK ChatBot")

    # Input methods
    col1, col2 = st.columns([3, 1])
    with col1:
        user_question = st.text_input(
            "Enter your question about IIT Kanpur:",
            placeholder="e.g., What are the academic programs at IIT Kanpur?",
            key="user_input"
        )
    with col2:
        ask_button = st.button("Ask Question", type="primary", use_container_width=True)

    # Handle sample question
    if 'sample_question' in st.session_state:
        user_question = st.session_state.sample_question
        ask_button = True
        del st.session_state.sample_question

    # Process question
    if ask_button and user_question:
        with st.spinner("IITK ChatBot is thinking..."):
            response = st.session_state.chatbot.answer_question(user_question)

        # Display results
        st.markdown("---")

        # Answer section
        st.subheader("📝 Answer")

        # Show confidence level with color coding
        confidence = response['confidence']
        if confidence > 0.7:
            confidence_color = "green"
            confidence_text = "High"
        elif confidence > 0.4:
            confidence_color = "orange"
            confidence_text = "Medium"
        else:
            confidence_color = "red"
            confidence_text = "Low"

        col1, col2 = st.columns([3, 1])
        with col1:
            st.markdown(f"**{response['answer']}**")
        with col2:
            st.markdown(f"**Confidence:** <span style='color:{confidence_color}'>{confidence_text} ({confidence:.2%})</span>", unsafe_allow_html=True)

        # Sources section
        if response['sources']:
            st.subheader("Sources")
            for i, source in enumerate(response['sources']):
                st.markdown(f"""
                <div class="source-box">
                    <strong>{source['title']}</strong><br>
                    <small>Type: {source['source_type'].replace('_', ' ').title()} |
                    Relevance: {source['relevance']:.2f}</small><br>
                    <a href="{source['url']}" target="_blank">{source['url']}</a>
                </div>
                """, unsafe_allow_html=True)

        # Context section (expandable)
        if response['context']:
            with st.expander("Context Used (Click to expand)"):
                st.text_area("Context", response['context'], height=200, disabled=True)

    elif ask_button and not user_question:
        st.warning("Please enter a question before clicking 'Ask Question'.")

    # Footer
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: #666;">
        <p>IIT Kanpur Chatbot | Built with using Streamlit and Transformers</p>
        <p><small>For the best experience, ask specific questions about IIT Kanpur academics, facilities, student life, or research.</small></p>
    </div>
    """, unsafe_allow_html=True)

if __name__ == "__main__":
    main()
'''

with open("app.py", "w") as f:
    f.write(app_code)

print("app.py has been created successfully.")

app.py has been created successfully.


In [8]:
from pyngrok import ngrok

# Get your ngrok authtoken from https://dashboard.ngrok.com/get-started/your-authtoken
ngrok.set_auth_token("2zTWsXhD47dWMWJv4jHORDOoZia_5Qmo6pxYw18yokxW1bvC2")

!nohup streamlit run app.py --server.port 8501 &
public_url = ngrok.connect(8501)
print(f"IITK ChatBot Streamlit app is live! Click here: {public_url}")

nohup: appending output to 'nohup.out'
IITK ChatBot Streamlit app is live! Click here: NgrokTunnel: "https://420e-34-63-80-175.ngrok-free.app" -> "http://localhost:8501"
