In [3]:
import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox, filedialog
import threading
import json
import os
import re
import time
import math
import pickle
import numpy as np
from datetime import datetime
from collections import defaultdict, Counter
from urllib.parse import urljoin
from pathlib import Path
import webbrowser

In [None]:
# Optional crawler dependencies: Selenium drives the browser and BeautifulSoup
# parses the rendered HTML. All three imports share one try block, so
# SELENIUM_AVAILABLE ends up False if *either* package is missing; the crawler
# class further down is only defined when the flag is True.
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from bs4 import BeautifulSoup
    SELENIUM_AVAILABLE = True
except ImportError:
    SELENIUM_AVAILABLE = False
    print("Warning: Selenium/BeautifulSoup not available. Crawler functionality will be limited.")


: 

In [None]:
# ==================== TEXT PREPROCESSING ====================

# English function words that carry no search value; tokens found here are
# discarded by TextPreprocessor.remove_stopwords.
STOP_WORDS = set(
    "a an and are as at be by for from has "
    "he in is it its of on or that the to "
    "was will with this but they have had "
    "what when where who why how all each every "
    "both few more most other some such no nor "
    "not only same so than too very can just "
    "should now".split()
)

class TextPreprocessor:
    """Stateless text-normalisation helpers shared by the index and the clusterer."""

    @staticmethod
    def preprocess(text):
        """Return ``text`` lower-cased with every non-word, non-whitespace character removed."""
        lowered = text.lower()
        return re.sub(r'[^\w\s]', '', lowered)

    @staticmethod
    def tokenize(text):
        """Break ``text`` into a list of whitespace-separated tokens."""
        return text.split()

    @staticmethod
    def remove_stopwords(tokens):
        """Return the tokens that are not stop words and are longer than two characters."""
        kept = []
        for token in tokens:
            if token in STOP_WORDS or len(token) <= 2:
                continue
            kept.append(token)
        return kept

In [None]:
# ==================== INVERTED INDEX ====================

class AdvancedInvertedIndex:
    """Field-weighted inverted index over publication records.

    ``index`` maps each token to a list of postings ``(doc_id, weighted_freq,
    field)``. A document contributes one posting per field in which the token
    occurs, so ``search`` can tell *which* fields matched (needed for the title
    boost and the per-field bonus). ``documents`` maps ``doc_id`` to the
    original record and ``doc_count`` mirrors ``len(documents)``.
    """

    # Multiplier applied to the raw term count of each searchable field.
    # Dict order is also the order in which fields are indexed.
    FIELD_WEIGHTS = {
        'title': 3.0,
        'authors': 2.5,
        'year': 1.5,
        'abstract': 1.0,
        'keywords': 2.0,
    }

    def __init__(self):
        self.index = defaultdict(list)  # token -> [(doc_id, weighted_freq, field)]
        self.documents = {}  # doc_id -> full document data
        self.doc_count = 0

    @staticmethod
    def _field_text(value):
        """Flatten a field value (string, number, sequence or None) into plain text."""
        if value is None:
            # str(None) would otherwise index the literal token "none".
            return ''
        if isinstance(value, (list, tuple, set)):
            return ' '.join(str(item) for item in value)
        return str(value)

    def _remove_postings(self, doc_id):
        """Drop every posting that belongs to ``doc_id`` (used when re-indexing)."""
        for token in list(self.index):
            remaining = [posting for posting in self.index[token] if posting[0] != doc_id]
            if remaining:
                self.index[token] = remaining
            else:
                del self.index[token]

    def add_document(self, doc_id, doc_data):
        """Index ``doc_data`` under ``doc_id``, making all known fields searchable.

        Args:
            doc_id: Hashable identifier of the document.
            doc_data: Mapping that may contain 'title', 'authors', 'year',
                'abstract' and 'keywords'; missing or empty fields are skipped.

        Adding an id that is already present replaces the earlier postings
        instead of accumulating their frequencies a second time.
        """
        if doc_id in self.documents:
            self._remove_postings(doc_id)

        self.documents[doc_id] = doc_data
        self.doc_count = len(self.documents)

        for field, weight in self.FIELD_WEIGHTS.items():
            text = self._field_text(doc_data.get(field))
            if not text:
                continue

            processed = TextPreprocessor.preprocess(text)
            tokens = TextPreprocessor.remove_stopwords(TextPreprocessor.tokenize(processed))

            # One posting per (token, field): keeping the field tag intact is
            # what lets search() apply the title boost reliably.
            for token, count in Counter(tokens).items():
                self.index[token].append((doc_id, count * weight, field))

    def search(self, query):
        """Rank indexed documents against ``query``.

        Each query token contributes ``weighted_freq * idf`` for every posting,
        with ``idf = log(N / df + 1)``. Documents matched in the title get a
        1.5x boost, and every distinct matched field adds 5 points.

        Returns:
            List of ``(doc_id, doc_data, score)`` sorted by descending score;
            empty when the query has no usable tokens or nothing is indexed.
        """
        processed = TextPreprocessor.preprocess(query)
        tokens = TextPreprocessor.remove_stopwords(TextPreprocessor.tokenize(processed))

        if not tokens or not self.documents:
            return []

        total_docs = len(self.documents)
        scores = defaultdict(float)
        matched_fields = defaultdict(set)

        for token in tokens:
            postings = self.index.get(token)
            if not postings:
                continue

            # Document frequency counts distinct documents, not postings.
            doc_freq = len({posting[0] for posting in postings})
            idf = math.log(total_docs / doc_freq + 1)

            for doc_id, freq, field in postings:
                scores[doc_id] += freq * idf
                matched_fields[doc_id].add(field)

        # Boost documents with title matches, then reward breadth of matching fields.
        for doc_id in scores:
            if 'title' in matched_fields[doc_id]:
                scores[doc_id] *= 1.5
            scores[doc_id] += len(matched_fields[doc_id]) * 5

        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return [(doc_id, self.documents[doc_id], score)
                for doc_id, score in ranked if doc_id in self.documents]

    def save(self, filepath):
        """Pickle the postings and documents to ``filepath``.

        Raises:
            OSError: If the file cannot be written.
        """
        with open(filepath, 'wb') as f:
            pickle.dump({
                'index': dict(self.index),
                'documents': self.documents
            }, f)

    def load(self, filepath):
        """Replace the current contents with the index stored at ``filepath``.

        Returns:
            True on success; False if the file is missing, unreadable or not in
            the expected format (the current index is then left untouched).

        SECURITY: ``pickle.load`` can execute arbitrary code; only load index
        files this application wrote itself.
        """
        try:
            with open(filepath, 'rb') as f:
                data = pickle.load(f)
            # Read both parts before assigning so a malformed file cannot
            # leave the index half-replaced.
            loaded_index = defaultdict(list, data['index'])
            loaded_documents = data['documents']
        except Exception:
            # Best-effort load: any failure is reported as False, but unlike a
            # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
            return False

        self.index = loaded_index
        self.documents = loaded_documents
        self.doc_count = len(self.documents)
        return True

In [None]:

# ==================== WEB CRAWLER ====================

if SELENIUM_AVAILABLE:
    class ImprovedSeleniumCrawler:
        """Headless-Chrome crawler that harvests publications from author profiles.

        ``crawl_department`` loads a department page, collects links to author
        profile pages on ``base_domain`` and parses the publication entries
        found on each profile. Progress messages go to ``callback`` (if given)
        and to stdout.
        """

        # href shapes treated as links to a person/author profile.
        AUTHOR_LINK_PATTERNS = (r'/en/persons/[\w-]+', r'/persons/[\w-]+')

        def __init__(self, callback=None):
            self.callback = callback          # optional callable(msg) for progress output
            self.visited_urls = set()         # author profile URLs already fetched
            self.publications = []            # accumulated publication dicts
            self.driver = None                # Selenium WebDriver, created lazily
            self.base_domain = 'pureportal.coventry.ac.uk'

        def log(self, msg):
            """Send ``msg`` to the callback (when set) and print it."""
            if self.callback:
                self.callback(msg)
            print(msg)

        def init_driver(self):
            """Initialize a headless Chrome WebDriver with a 20 s page-load timeout."""
            chrome_options = Options()
            chrome_options.add_argument('--disable-blink-features=AutomationControlled')
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            chrome_options.add_experimental_option('useAutomationExtension', False)
            chrome_options.add_argument('--disable-images')
            chrome_options.add_argument('--headless')
            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.set_page_load_timeout(20)

        def close_driver(self):
            """Quit the browser, if one is running, and forget the handle."""
            if self.driver:
                try:
                    self.driver.quit()
                finally:
                    # Clearing the handle makes repeated close calls harmless.
                    self.driver = None

        def crawl_department(self, base_url, max_pages=100):
            """Crawl a department page and extract publications from its authors.

            Args:
                base_url: URL of the department/organisation page.
                max_pages: Maximum number of author profiles to visit.

            Returns:
                ``self.publications`` (list of dicts as built by
                ``parse_publication``), including results of earlier crawls
                made with this instance.

            The browser is always shut down, even if the crawl fails.
            """
            self.log("Initializing crawler...")
            self.init_driver()

            try:
                self.log(f"Fetching department page: {base_url}")
                self.driver.get(base_url)
                time.sleep(3)  # give client-side rendering time to finish

                soup = BeautifulSoup(self.driver.page_source, 'html.parser')

                # No cap here: max_pages alone decides how many profiles are visited.
                author_links = self.extract_author_links(soup, base_url, limit=None)
                self.log(f"Found {len(author_links)} author profiles to crawl")

                targets = author_links[:max_pages]
                for idx, author_link in enumerate(targets, 1):
                    self.log(f"\n[{idx}/{len(targets)}] Crawling author: {author_link}")
                    time.sleep(2)  # Polite crawling - 2 second delay

                    try:
                        self.driver.get(author_link)
                        time.sleep(2)
                        self.visited_urls.add(author_link)

                        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                        author_name = self.extract_author_name(soup)
                        pubs = self.extract_publications_from_profile(soup, author_link, author_name)

                        if pubs:
                            self.publications.extend(pubs)
                            self.log(f"  ✓ Extracted {len(pubs)} publications from {author_name}")
                        else:
                            self.log("  → No publications found on this page")
                    except Exception as e:
                        # One broken profile must not abort the whole crawl.
                        self.log(f"  ✗ Error: {str(e)}")
                        continue

                self.log(f"\n✓ Crawling completed. Found {len(self.publications)} total publications")
                return self.publications

            finally:
                self.close_driver()

        def extract_author_links(self, soup, base_url, limit=50):
            """Collect person/author profile links found in ``soup``.

            Args:
                soup: Parsed page to scan for ``<a href>`` elements.
                base_url: URL used to resolve relative hrefs.
                limit: Maximum number of links to return; ``None`` returns all.

            Returns:
                Sorted list of unique absolute URLs containing ``base_domain``.
                Sorting makes the crawl order reproducible between runs.
            """
            author_links = set()

            for link in soup.find_all('a', href=True):
                href = link['href']
                if not any(re.search(pattern, href) for pattern in self.AUTHOR_LINK_PATTERNS):
                    continue
                full_url = urljoin(base_url, href)
                if self.base_domain in full_url:
                    author_links.add(full_url)

            ordered = sorted(author_links)
            return ordered if limit is None else ordered[:limit]

        def extract_author_name(self, soup):
            """Return the profile's name from the first h1, h2 or 'name' span, else 'Unknown Author'."""
            name_elem = soup.find('h1') or soup.find('h2') or soup.find('span', class_=re.compile('name', re.I))
            if name_elem:
                return name_elem.get_text(strip=True)
            return 'Unknown Author'

        def extract_publications_from_profile(self, soup, profile_link, author_name):
            """Parse every publication container on a profile page.

            Containers are ``<article>`` elements, or failing that ``<div>`` /
            ``<li>`` elements whose class contains 'publication'. Entries
            without a title, or that fail to parse, are skipped.
            """
            publications = []
            pub_containers = (soup.find_all('article') or
                              soup.find_all('div', class_=re.compile('publication', re.I)) or
                              soup.find_all('li', class_=re.compile('publication', re.I)))

            for container in pub_containers:
                try:
                    pub_data = self.parse_publication(container, profile_link, author_name)
                except Exception as e:
                    self.log(f"  Skipped unparsable publication entry: {str(e)}")
                    continue
                if pub_data and pub_data['title']:
                    publications.append(pub_data)

            return publications

        def parse_publication(self, container, profile_link, author_name):
            """Build a publication dict from one container element.

            Returns:
                Dict with keys 'title', 'authors', 'year', 'abstract',
                'keywords', 'publication_link', 'profile_link',
                'author_profile_name' and 'crawled_at'. Fields that cannot be
                found keep their defaults ('' / [] / 'N/A').
            """
            pub_data = {
                'title': '',
                'authors': [],
                'year': 'N/A',
                'abstract': '',
                'keywords': [],
                'publication_link': '',
                'profile_link': profile_link,
                'author_profile_name': author_name,
                'crawled_at': datetime.now().isoformat()
            }

            # Title: first heading or link inside the container
            title_elem = container.find(['h3', 'h4', 'h2', 'a'])
            if title_elem:
                pub_data['title'] = title_elem.get_text(strip=True)

            # Year: first standalone 19xx/20xx number; the word boundaries keep
            # it from matching inside longer digit runs such as identifiers.
            full_text = container.get_text(' ')
            year_match = re.search(r'\b(19|20)\d{2}\b', full_text)
            if year_match:
                pub_data['year'] = year_match.group()

            # Only the profile owner is recorded as author
            pub_data['authors'] = [author_name]

            # Abstract: a regex passed positionally to find() is matched against
            # *tag names*, so look it up by CSS class first and keep the
            # tag-name lookup only as a fallback.
            abstract_pattern = re.compile('abstract|description', re.I)
            abstract_elem = (container.find(class_=abstract_pattern) or
                             container.find(abstract_pattern))
            if abstract_elem:
                pub_data['abstract'] = abstract_elem.get_text(strip=True)[:500]

            # Link to the publication's own page
            link_elem = container.find('a', href=re.compile(r'/en/publications/', re.I))
            if link_elem and link_elem.get('href'):
                pub_data['publication_link'] = urljoin(profile_link, link_elem['href'])

            return pub_data

In [None]:

# ==================== K-MEANS CLUSTERING ====================

class KMeansClustering:
    """K-means clustering of text documents represented as TF-IDF vectors.

    Args:
        k: Number of clusters.
        max_iterations: Upper bound on centroid-update rounds.
        random_state: Optional seed for the initial centroid choice. When
            ``None`` the global ``np.random`` state is used, as before.

    Attributes:
        vocabulary: Sorted list of terms; its order defines the vector layout
            and therefore must be preserved across save/load.
        idf_values: term -> inverse document frequency.
        centroids: ``(k, len(vocabulary))`` array after ``fit``.
        clusters: List of ``k`` lists of document indices after ``fit``.
    """

    def __init__(self, k=3, max_iterations=100, random_state=None):
        self.k = k
        self.max_iterations = max_iterations
        self.random_state = random_state
        self.centroids = None
        self.clusters = None
        self.vocabulary = []
        self.idf_values = {}

    def preprocess_document(self, text):
        """Normalise ``text`` and return its tokens without stop words."""
        processed = TextPreprocessor.preprocess(text)
        tokens = TextPreprocessor.tokenize(processed)
        return TextPreprocessor.remove_stopwords(tokens)

    def build_vocabulary(self, documents):
        """Set ``self.vocabulary`` to the sorted list of all terms in ``documents``."""
        terms = set()
        for doc in documents:
            terms.update(self.preprocess_document(doc))
        self.vocabulary = sorted(terms)

    def calculate_idf(self, documents):
        """Compute ``idf = log(N / (df + 1))`` for every vocabulary term.

        With this smoothing a term that occurs in every document receives a
        slightly negative weight.
        """
        doc_count = len(documents)
        term_doc_count = defaultdict(int)

        for doc in documents:
            for token in set(self.preprocess_document(doc)):
                term_doc_count[token] += 1

        # Rebuilt from scratch so terms from an earlier fit do not linger.
        self.idf_values = {
            term: math.log(doc_count / (term_doc_count.get(term, 0) + 1))
            for term in self.vocabulary
        }

    def vectorize_document(self, text):
        """Return the TF-IDF vector of ``text`` laid out in vocabulary order."""
        term_freq = Counter(self.preprocess_document(text))
        return np.array([term_freq.get(term, 0) * self.idf_values.get(term, 0)
                         for term in self.vocabulary])

    def cosine_similarity(self, v1, v2):
        """Return the cosine similarity of two vectors (0 if either is all zeros)."""
        norm_v1 = np.linalg.norm(v1)
        norm_v2 = np.linalg.norm(v2)

        if norm_v1 == 0 or norm_v2 == 0:
            return 0

        return np.dot(v1, v2) / (norm_v1 * norm_v2)

    def euclidean_distance(self, v1, v2):
        """Return the Euclidean distance between two vectors."""
        return np.linalg.norm(v1 - v2)

    def _nearest_centroid(self, vector):
        """Return the index of the centroid closest to ``vector``."""
        distances = [self.euclidean_distance(vector, centroid)
                     for centroid in self.centroids]
        return int(np.argmin(distances))

    def _assign_clusters(self, vectors):
        """Group document indices by their nearest current centroid."""
        clusters = [[] for _ in range(self.k)]
        for idx, vector in enumerate(vectors):
            clusters[self._nearest_centroid(vector)].append(idx)
        return clusters

    def fit(self, documents):
        """Cluster ``documents`` (an iterable of strings) into ``k`` groups.

        Returns:
            List of ``k`` lists holding the indices of the documents in each
            cluster; also stored in ``self.clusters``. The assignment always
            corresponds to the final ``self.centroids``.

        Raises:
            ValueError: If ``k`` is not positive or there are fewer documents
                than clusters.
        """
        documents = list(documents)
        if self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k}")
        if len(documents) < self.k:
            raise ValueError(
                f"Need at least k={self.k} documents to cluster, got {len(documents)}")

        # Build vocabulary and calculate IDF
        self.build_vocabulary(documents)
        self.calculate_idf(documents)

        # Vectorize all documents
        vectors = np.array([self.vectorize_document(doc) for doc in documents])

        # Pick k distinct documents as starting centroids. A seeded generator
        # is used only when requested so existing np.random.seed() users keep
        # their behaviour.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        seed_indices = rng.choice(len(vectors), self.k, replace=False)
        self.centroids = vectors[seed_indices]

        clusters = self._assign_clusters(vectors)
        for _ in range(self.max_iterations):
            # An empty cluster keeps its previous centroid.
            new_centroids = np.array([
                vectors[members].mean(axis=0) if members else self.centroids[cluster_idx]
                for cluster_idx, members in enumerate(clusters)
            ])

            # Check convergence
            if np.allclose(self.centroids, new_centroids):
                break

            self.centroids = new_centroids
            clusters = self._assign_clusters(vectors)

        self.clusters = clusters
        return clusters

    def predict(self, document):
        """Return the index of the cluster whose centroid is nearest to ``document``."""
        return self._nearest_centroid(self.vectorize_document(document))

    def save_model(self, filepath):
        """Pickle k, centroids, vocabulary, IDF values and clusters to ``filepath``."""
        model_data = {
            'k': self.k,
            'centroids': self.centroids,
            'vocabulary': list(self.vocabulary),
            'idf_values': self.idf_values,
            'clusters': self.clusters
        }
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)

    def load_model(self, filepath):
        """Restore a model written by ``save_model``.

        Returns:
            True on success; False if the file is missing, unreadable or lacks
            an expected key (the current model is then left untouched).

        SECURITY: ``pickle.load`` can execute arbitrary code; only load model
        files this application wrote itself.
        """
        try:
            with open(filepath, 'rb') as f:
                model_data = pickle.load(f)
            k = model_data['k']
            centroids = model_data['centroids']
            # Must stay an ordered list: vector positions are vocabulary
            # positions, so a set would misalign vectors and centroids.
            vocabulary = list(model_data['vocabulary'])
            idf_values = model_data['idf_values']
            clusters = model_data['clusters']
        except Exception:
            # Best-effort load; KeyboardInterrupt/SystemExit still propagate.
            return False

        self.k = k
        self.centroids = centroids
        self.vocabulary = vocabulary
        self.idf_values = idf_values
        self.clusters = clusters
        return True

In [None]:
# ==================== MODERN GUI ====================

class ModernSearchEngineGUI:
    def __init__(self, root):
        self.root = root
        self.root.title("Coventry University Research Portal")
        self.root.geometry("1400x850")
        
        # Color scheme
        self.colors = {
            'primary': '#1a73e8',
            'secondary': '#34a853',
            'background': '#f8f9fa',
            'surface': '#ffffff',
            'text': '#202124',
            'text_secondary': '#5f6368',
            'border': '#dadce0',
            'hover': '#e8f0fe',
            'accent': '#fbbc04'
        }
        
        self.root.configure(bg=self.colors['background'])
        
        # Initialize components
        self.index = AdvancedInvertedIndex()
        self.clustering_model = KMeansClustering(k=3)
        self.crawler = None
        self.index_file = "search_index.pkl"
        self.data_file = "publications.json"
        self.cluster_file = "clustering_model.pkl"
        self.documents_file = "clustered_documents.json"
        self.current_results = []
        self.clustered_documents = {'Business': [], 'Entertainment': [], 'Health': []}
        
        self.setup_styles()
        self.setup_ui()
        self.load_index()
        
    def setup_styles(self):
        """Configure custom styles"""
        style = ttk.Style()
        style.theme_use('clam')
        
        style.configure('TFrame', background=self.colors['background'])
        style.configure('Surface.TFrame', background=self.colors['surface'])
        
        style.configure('TNotebook', background=self.colors['background'], borderwidth=0)
        style.configure('TNotebook.Tab', padding=[20, 10], font=('Segoe UI', 10),
                       background=self.colors['surface'])
        style.map('TNotebook.Tab',
                 background=[('selected', self.colors['primary'])],
                 foreground=[('selected', 'white'), ('!selected', self.colors['text'])])
        
        style.configure('Primary.TButton', font=('Segoe UI', 10, 'bold'),
                       background=self.colors['primary'], foreground='white',
                       borderwidth=0, padding=[20, 10])
        style.map('Primary.TButton', background=[('active', '#1557b0')])
        
        style.configure('Secondary.TButton', font=('Segoe UI', 10),
                       background=self.colors['surface'], foreground=self.colors['text'],
                       borderwidth=1, padding=[15, 8])
        
        style.configure('Title.TLabel', font=('Segoe UI', 24, 'bold'),
                       background=self.colors['surface'], foreground=self.colors['text'])
        
        style.configure('Subtitle.TLabel', font=('Segoe UI', 11),
                       background=self.colors['surface'], foreground=self.colors['text_secondary'])
    
    def setup_ui(self):
        """Setup main UI"""
        self.create_header()
        
        content_frame = ttk.Frame(self.root, style='TFrame')
        content_frame.pack(fill=tk.BOTH, expand=True, padx=20, pady=(0, 20))
        
        self.notebook = ttk.Notebook(content_frame)
        self.notebook.pack(fill=tk.BOTH, expand=True)
        
        self.create_search_tab()
        self.create_crawler_tab()
        self.create_clustering_tab()
        self.create_stats_tab()
    
    def create_header(self):
        """Create header"""
        header = ttk.Frame(self.root, style='Surface.TFrame', height=120)
        header.pack(fill=tk.X)
        header.pack_propagate(False)
        
        header_content = ttk.Frame(header, style='Surface.TFrame')
        header_content.place(relx=0.5, rely=0.5, anchor=tk.CENTER)
        
        logo_frame = ttk.Frame(header_content, style='Surface.TFrame', width=60, height=60)
        logo_frame.pack(side=tk.LEFT, padx=(0, 15))
        logo_frame.pack_propagate(False)
        
        logo_canvas = tk.Canvas(logo_frame, width=60, height=60, 
                               bg=self.colors['primary'], highlightthickness=0)
        logo_canvas.pack()
        logo_canvas.create_text(30, 30, text="CU", font=('Segoe UI', 20, 'bold'), fill='white')
        
        text_frame = ttk.Frame(header_content, style='Surface.TFrame')
        text_frame.pack(side=tk.LEFT)
        
        ttk.Label(text_frame, text="Research Publications Portal",
                 style='Title.TLabel').pack(anchor=tk.W)
        ttk.Label(text_frame, text="Centre for Computational Science and Mathematical Modelling",
                 style='Subtitle.TLabel').pack(anchor=tk.W, pady=(2, 0))
        
        separator = ttk.Separator(self.root, orient=tk.HORIZONTAL)
        separator.pack(fill=tk.X)
    
    def create_search_tab(self):
        """Create search interface"""
        search_frame = ttk.Frame(self.notebook, style='TFrame', padding=20)
        self.notebook.add(search_frame, text="üîç Search")
        
        # Search container
        search_container = ttk.Frame(search_frame, style='Surface.TFrame')
        search_container.pack(fill=tk.X, pady=(0, 20))
        
        search_box_frame = ttk.Frame(search_container, style='Surface.TFrame')
        search_box_frame.pack(fill=tk.X, padx=20, pady=20)
        
        ttk.Label(search_box_frame, text="Search Publications",
                 font=('Segoe UI', 14, 'bold'), background=self.colors['surface'],
                 foreground=self.colors['text']).pack(anchor=tk.W, pady=(0, 10))
        
        search_input_frame = ttk.Frame(search_box_frame, style='Surface.TFrame')
        search_input_frame.pack(fill=tk.X)
        
        search_icon = ttk.Label(search_input_frame, text="üîç", font=('Segoe UI', 14),
                               background='white', foreground=self.colors['text_secondary'])
        search_icon.pack(side=tk.LEFT, padx=(5, 0))
        
        self.search_entry = ttk.Entry(search_input_frame, font=('Segoe UI', 12))
        self.search_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10, ipady=8)
        self.search_entry.bind('<Return>', lambda e: self.perform_search())
        
        search_btn = ttk.Button(search_input_frame, text="Search",
                               style='Primary.TButton', command=self.perform_search)
        search_btn.pack(side=tk.LEFT, padx=(0, 5))
        
        self.results_label = ttk.Label(search_box_frame, text="Enter keywords to search publications",
                                      font=('Segoe UI', 9), background=self.colors['surface'],
                                      foreground=self.colors['text_secondary'])
        self.results_label.pack(anchor=tk.W, pady=(10, 0))
        
        # Results tree
        results_card = ttk.LabelFrame(search_frame, text="Search Results", padding=15)
        results_card.pack(fill=tk.BOTH, expand=True)
        
        tree_frame = ttk.Frame(results_card)
        tree_frame.pack(fill=tk.BOTH, expand=True)
        
        columns = ('Title', 'Authors', 'Year', 'Relevance')
        self.results_tree = ttk.Treeview(tree_frame, columns=columns, 
                                        show='tree headings', height=12, selectmode='browse')
        
        self.results_tree.column('#0', width=30, stretch=tk.NO)
        self.results_tree.column('Title', width=500, anchor=tk.W)
        self.results_tree.column('Authors', width=300, anchor=tk.W)
        self.results_tree.column('Year', width=80, anchor=tk.CENTER)
        self.results_tree.column('Relevance', width=100, anchor=tk.CENTER)
        
        self.results_tree.heading('#0', text='#')
        self.results_tree.heading('Title', text='üìÑ Title')
        self.results_tree.heading('Authors', text='üë§ Authors')
        self.results_tree.heading('Year', text='üìÖ Year')
        self.results_tree.heading('Relevance', text='‚≠ê Score')
        
        vsb = ttk.Scrollbar(tree_frame, orient="vertical", command=self.results_tree.yview)
        hsb = ttk.Scrollbar(tree_frame, orient="horizontal", command=self.results_tree.xview)
        self.results_tree.configure(yscrollcommand=vsb.set, xscrollcommand=hsb.set)
        
        self.results_tree.grid(row=0, column=0, sticky='nsew')
        vsb.grid(row=0, column=1, sticky='ns')
        hsb.grid(row=1, column=0, sticky='ew')
        
        tree_frame.grid_rowconfigure(0, weight=1)
        tree_frame.grid_columnconfigure(0, weight=1)
        
        self.results_tree.bind('<Double-1>', self.show_publication_details)
        self.results_tree.bind('<<TreeviewSelect>>', self.on_result_select)
        
        # Details panel
        details_card = ttk.LabelFrame(search_frame, text="Publication Details", padding=15)
        details_card.pack(fill=tk.BOTH, expand=True, pady=(10, 0))
        
        self.details_text = scrolledtext.ScrolledText(details_card, height=8, wrap=tk.WORD,
                                                      font=('Segoe UI', 10), bg='white')
        self.details_text.pack(fill=tk.BOTH, expand=True)
        
        self.details_text.tag_config('title', font=('Segoe UI', 11, 'bold'))
        self.details_text.tag_config('label', font=('Segoe UI', 9, 'bold'))
        self.details_text.tag_config('link', foreground=self.colors['primary'], underline=True)
        
        self.details_text.tag_bind('link', '<Button-1>', self.open_link)
        self.details_text.tag_bind('link', '<Enter>', 
                                  lambda e: self.details_text.config(cursor='hand2'))
        self.details_text.tag_bind('link', '<Leave>', 
                                  lambda e: self.details_text.config(cursor=''))
        
        self.details_text.insert('1.0', 'Select a publication to view details...')
        self.details_text.config(state=tk.DISABLED)
    

    def create_crawler_tab(self):
        """Create crawler interface"""
        crawler_frame = ttk.Frame(self.notebook, style='TFrame', padding=20)
        self.notebook.add(crawler_frame, text="üï∑Ô∏è Crawler")
        
        settings_card = ttk.LabelFrame(crawler_frame, text="Crawler Settings", padding=20)
        settings_card.pack(fill=tk.X, pady=(0, 15))
        
        ttk.Label(settings_card, text="Target URL:", font=('Segoe UI', 10, 'bold')).pack(anchor=tk.W, pady=(0, 5))
        
        self.url_entry = ttk.Entry(settings_card, font=('Segoe UI', 10))
        self.url_entry.pack(fill=tk.X, ipady=6)
        self.url_entry.insert(0, "https://pureportal.coventry.ac.uk/en/organisations/ics-research-centre-for-computational-science-and-mathematical-mo")
        
        max_frame = ttk.Frame(settings_card, style='TFrame')
        max_frame.pack(fill=tk.X, pady=(15, 15))
        
        ttk.Label(max_frame, text="Maximum Authors to Crawl:", font=('Segoe UI', 10, 'bold')).pack(side=tk.LEFT)
        
        self.max_pages = ttk.Spinbox(max_frame, from_=5, to=500, width=15, font=('Segoe UI', 10))
        self.max_pages.set(30)
        self.max_pages.pack(side=tk.LEFT, padx=10)
        
        button_frame = ttk.Frame(settings_card, style='TFrame')
        button_frame.pack(fill=tk.X)
        
        self.crawl_btn = ttk.Button(button_frame, text="‚ñ∂ Start Crawling",
                                    style='Primary.TButton', command=self.start_crawling)
        self.crawl_btn.pack(side=tk.LEFT, padx=(0, 10))
        
        ttk.Button(button_frame, text="üì¶ Load Sample Data",
                style='Secondary.TButton', command=self.load_sample_data).pack(side=tk.LEFT)
        
        ttk.Button(button_frame, text="üíæ Save Index",
                style='Secondary.TButton', command=self.save_index).pack(side=tk.LEFT, padx=(10, 0))
        
        # Progress section
        progress_card = ttk.LabelFrame(crawler_frame, text="Crawling Progress", padding=20)
        progress_card.pack(fill=tk.BOTH, expand=True)
        
        self.progress_var = tk.DoubleVar()
        self.progress_bar = ttk.Progressbar(progress_card, variable=self.progress_var,
                                        mode='indeterminate', length=300)
        self.progress_bar.pack(fill=tk.X, pady=(0, 10))
        
        self.status_label = ttk.Label(progress_card, text="Ready to crawl",
                                    font=('Segoe UI', 9), 
                                    foreground=self.colors['text_secondary'])
        self.status_label.pack(anchor=tk.W, pady=(0, 10))
        
        # Log section
        log_container = ttk.Frame(progress_card, style='TFrame')
        log_container.pack(fill=tk.BOTH, expand=True)
        
        self.log_text = scrolledtext.ScrolledText(log_container, height=15,
                                                font=('Consolas', 9), bg='#f5f5f5',
                                                fg=self.colors['text'], relief=tk.FLAT, 
                                                borderwidth=1)
        self.log_text.pack(fill=tk.BOTH, expand=True)
        
        # Add initial message
        self.log_text.insert('1.0', 'Crawler Ready\n')
        self.log_text.insert(tk.END, '='*60 + '\n')
        self.log_text.insert(tk.END, 'Instructions:\n')
        self.log_text.insert(tk.END, '1. Enter target URL or use default\n')
        self.log_text.insert(tk.END, '2. Set maximum authors to crawl (recommended: 20-50)\n')
        self.log_text.insert(tk.END, '3. Click "Start Crawling" or "Load Sample Data"\n')
        self.log_text.insert(tk.END, '4. Wait for completion and check results in Search tab\n')
        self.log_text.insert(tk.END, '='*60 + '\n\n')
        self.log_text.config(state=tk.DISABLED)

    #Helper method for saving index
    def save_index(self):
        """Save current index to file"""
        try:
            self.index.save(self.index_file)
            messagebox.showinfo("Success", "Index saved successfully!")
        except Exception as e:
            messagebox.showerror("Error", f"Failed to save index: {str(e)}")

    def create_clustering_tab(self):
        """Build the clustering tab and add it to ``self.notebook``.

        The tab contains two cards, created top to bottom:

        * a training card with a hint label, the "load documents" and
          "train model" buttons and the ``self.cluster_status`` line;
        * a prediction card with the ``self.cluster_input`` text box, the
          "predict" button and the ``self.cluster_result`` label.
        """
        palette = self.colors
        body_font = ('Segoe UI', 10)

        tab = ttk.Frame(self.notebook, style='TFrame', padding=20)
        self.notebook.add(tab, text="üìä Clustering")

        # ---- Card 1: training controls ----
        training = ttk.LabelFrame(tab, text="Train Clustering Model", padding=20)
        training.pack(fill=tk.X, pady=(0, 15))

        hint = ttk.Label(
            training,
            text="Load documents for clustering (Business, Entertainment, Health)",
            font=body_font,
        )
        hint.pack(anchor=tk.W, pady=(0, 10))

        button_row = ttk.Frame(training)
        button_row.pack(fill=tk.X)

        load_button = ttk.Button(
            button_row,
            text="üìÅ Load Documents",
            style='Secondary.TButton',
            command=self.load_documents_for_clustering,
        )
        load_button.pack(side=tk.LEFT, padx=(0, 10))

        train_button = ttk.Button(
            button_row,
            text="üéØ Train Model",
            style='Primary.TButton',
            command=self.train_clustering_model,
        )
        train_button.pack(side=tk.LEFT)

        self.cluster_status = ttk.Label(
            training,
            text="No model trained",
            font=('Segoe UI', 9),
            foreground=palette['text_secondary'],
        )
        self.cluster_status.pack(anchor=tk.W, pady=(10, 0))

        # ---- Card 2: classify a new document ----
        prediction = ttk.LabelFrame(tab, text="Classify New Document", padding=20)
        prediction.pack(fill=tk.BOTH, expand=True)

        prompt = ttk.Label(prediction, text="Enter document text:",
                           font=('Segoe UI', 10, 'bold'))
        prompt.pack(anchor=tk.W, pady=(0, 5))

        self.cluster_input = scrolledtext.ScrolledText(
            prediction, height=6, font=body_font, wrap=tk.WORD
        )
        self.cluster_input.pack(fill=tk.BOTH, expand=True, pady=(0, 10))

        predict_button = ttk.Button(
            prediction,
            text="üîÆ Predict Cluster",
            style='Primary.TButton',
            command=self.predict_cluster,
        )
        predict_button.pack(anchor=tk.W)

        self.cluster_result = ttk.Label(
            prediction,
            text="",
            font=('Segoe UI', 12, 'bold'),
            foreground=palette['primary'],
        )
        self.cluster_result.pack(anchor=tk.W, pady=(10, 0))

    def create_stats_tab(self):
        """Build the statistics tab and add it to ``self.notebook``.

        Creates three headline cards via ``create_stat_card`` (their value labels
        are kept as ``self.stat_pubs``, ``self.stat_authors`` and
        ``self.stat_terms``), the ``self.stats_text`` detail area and a refresh
        button wired to ``update_statistics``.
        """
        tab = ttk.Frame(self.notebook, style='TFrame', padding=20)
        self.notebook.add(tab, text="üìà Statistics")

        # Row of headline counters
        card_row = ttk.Frame(tab, style='TFrame')
        card_row.pack(fill=tk.X, pady=(0, 20))

        make_card = self.create_stat_card
        self.stat_pubs = make_card(card_row, "üìö", "Total Publications", "0", 0)
        self.stat_authors = make_card(card_row, "üë•", "Unique Authors", "0", 1)
        self.stat_terms = make_card(card_row, "üî§", "Indexed Terms", "0", 2)

        # Free-form detail area
        details = ttk.LabelFrame(tab, text="Index Information", padding=20)
        details.pack(fill=tk.BOTH, expand=True)

        self.stats_text = scrolledtext.ScrolledText(
            details, height=15, font=('Consolas', 9), wrap=tk.WORD
        )
        self.stats_text.pack(fill=tk.BOTH, expand=True)

        refresh = ttk.Button(
            tab,
            text="üîÑ Refresh Statistics",
            style='Secondary.TButton',
            command=self.update_statistics,
        )
        refresh.pack(pady=(10, 0))
        
    def create_stat_card(self, parent, icon, label, value, col):
        """Create one headline statistics card inside ``parent``.

        The card is gridded in row 0 at column ``col`` (the column is given
        weight 1 so the cards share the width) and stacks three labels: the
        icon, the value and the caption.

        Returns:
            The value label, so the caller can update its text later.
        """
        palette = self.colors

        card = ttk.Frame(parent, style='Surface.TFrame', relief=tk.RAISED, borderwidth=1)
        card.grid(row=0, column=col, padx=10, sticky='ew')
        parent.grid_columnconfigure(col, weight=1)

        inner = ttk.Frame(card, style='Surface.TFrame')
        inner.pack(padx=20, pady=20)

        icon_label = ttk.Label(inner, text=icon, font=('Segoe UI', 32),
                               background=palette['surface'])
        icon_label.pack()

        value_label = ttk.Label(
            inner,
            text=value,
            font=('Segoe UI', 24, 'bold'),
            background=palette['surface'],
            foreground=palette['primary'],
        )
        value_label.pack()

        caption = ttk.Label(
            inner,
            text=label,
            font=('Segoe UI', 10),
            background=palette['surface'],
            foreground=palette['text_secondary'],
        )
        caption.pack()

        return value_label
    
    # Search functionality
    def perform_search(self):
        """Run the query from the search box and list the hits in the results tree.

        A blank query only triggers a warning dialog. Otherwise the hits
        returned by ``self.index.search`` are stored in ``self.current_results``,
        the tree is emptied and refilled with one row per hit (rank, title,
        authors, year, score), and ``self.results_label`` is updated with the
        hit count.

        At most three authors are shown per row and the rest are summarised as
        "+N more". The suffix is only computed for a list of authors: when the
        field is a plain string its length is a character count, not an author
        count, so the string is shown unchanged.
        """
        query = self.search_entry.get().strip()
        if not query:
            messagebox.showwarning("Empty Query", "Please enter a search query")
            return

        results = self.index.search(query)
        self.current_results = results

        # Clear tree
        for item in self.results_tree.get_children():
            self.results_tree.delete(item)

        max_shown = 3  # authors listed in full before collapsing to "+N more"

        # Populate results; the row text is the 1-based rank, which
        # on_result_select converts back into an index into current_results.
        for idx, (doc_id, doc_data, score) in enumerate(results, 1):
            authors = doc_data.get('authors', [])
            if isinstance(authors, list):
                # str() guards against non-string entries in the author list
                authors_str = ', '.join(str(name) for name in authors[:max_shown])
                hidden = len(authors) - max_shown
                if hidden > 0:
                    authors_str += f" +{hidden} more"
            else:
                authors_str = str(authors)

            self.results_tree.insert('', tk.END, text=str(idx),
                                    values=(doc_data.get('title', 'N/A'),
                                           authors_str,
                                           doc_data.get('year', 'N/A'),
                                           f"{score:.2f}"))

        self.results_label.config(text=f"Found {len(results)} result(s) for '{query}'")

    def on_result_select(self, event):
        """Show the details of the row currently selected in the results tree.

        The row's text holds its 1-based rank; it is mapped back to an entry of
        ``self.current_results``. Nothing happens when no row is selected or
        the rank falls outside the stored results.
        """
        picked = self.results_tree.selection()
        if picked:
            rank_text = self.results_tree.item(picked[0])['text']
            position = int(rank_text) - 1
            if position >= 0 and position < len(self.current_results):
                _doc_id, record, _score = self.current_results[position]
                self.display_details(record)

    def display_details(self, doc_data):
        """Render one publication record into the details pane.

        The pane is unlocked, cleared, filled with title, authors, year and,
        when present, abstract, publication link and profile link, then locked
        again. Each link is inserted with two tags: ``'link'`` and the URL
        itself.
        """
        pane = self.details_text
        pane.config(state=tk.NORMAL)
        pane.delete('1.0', tk.END)

        # Title
        pane.insert(tk.END, doc_data.get('title', 'N/A'), 'title')
        pane.insert(tk.END, '\n\n')

        # Authors: a list is joined with commas, anything else is stringified
        pane.insert(tk.END, 'üë§ Authors: ', 'label')
        authors = doc_data.get('authors', [])
        authors_text = ', '.join(authors) if isinstance(authors, list) else str(authors)
        pane.insert(tk.END, authors_text + '\n\n')

        # Year
        pane.insert(tk.END, 'üìÖ Year: ', 'label')
        year_text = str(doc_data.get('year', 'N/A'))
        pane.insert(tk.END, year_text + '\n\n')

        # Abstract (omitted when empty)
        abstract = doc_data.get('abstract', '')
        if abstract:
            pane.insert(tk.END, 'üìù Abstract: ', 'label')
            pane.insert(tk.END, abstract + '\n\n')

        # Links (each omitted when empty)
        pub_link = doc_data.get('publication_link', '')
        if pub_link:
            pane.insert(tk.END, 'üîó Publication: ', 'label')
            pane.insert(tk.END, pub_link, ('link', pub_link))
            pane.insert(tk.END, '\n\n')

        profile_link = doc_data.get('profile_link', '')
        if profile_link:
            pane.insert(tk.END, 'üë§ Profile: ', 'label')
            pane.insert(tk.END, profile_link, ('link', profile_link))

        pane.config(state=tk.DISABLED)

    def open_link(self, event):
        """Open the URL under the mouse pointer in the web browser.

        Looks at the tags on the character at ``tk.CURRENT`` in the details
        pane and opens the first one that starts with ``'http'``; does nothing
        when no such tag is present.
        """
        position = self.details_text.index(tk.CURRENT)
        tag_list = self.details_text.tag_names(position)
        url = next((name for name in tag_list if name.startswith('http')), None)
        if url is not None:
            webbrowser.open(url)
    
    def show_publication_details(self, event):
        """Double-click handler for a result row.

        Intentionally does nothing: the selection handler already displays the
        details.
        """
        return None

    # Crawler functionality
    def start_crawling(self):
        """Validate the crawl settings and run the crawler on a background thread.

        Preconditions are checked first, each reported with a dialog and an
        early return: Selenium must be importable, the URL field must be
        non-empty, and the page/author limit must be a positive integer.

        While the worker runs, the crawl button is disabled and the progress
        bar animates. The worker crawls, writes the publications to
        ``self.data_file`` as JSON, builds a fresh index, saves it to
        ``self.index_file`` and schedules a success dialog and a statistics
        refresh through ``self.root.after``. Any exception is shown in an error
        dialog. The button and progress bar are always restored.
        """
        if not SELENIUM_AVAILABLE:
            messagebox.showerror("Error", "Selenium is not installed. Please install selenium and beautifulsoup4.")
            return

        url = self.url_entry.get().strip()
        if not url:
            messagebox.showwarning("Empty URL", "Please enter a target URL")
            return

        # The limit comes from a free-text widget/variable, so it may be blank
        # or non-numeric; TclError covers numeric Tk variables holding junk.
        try:
            max_pages = int(self.max_pages.get())
        except (TypeError, ValueError, tk.TclError):
            max_pages = 0
        if max_pages < 1:
            messagebox.showwarning(
                "Invalid Limit",
                "Please enter a positive whole number for the maximum authors to crawl")
            return

        self.crawl_btn.config(state=tk.DISABLED)
        self.progress_bar.start()

        # The log widget is kept read-only (state=DISABLED), and a disabled Text
        # ignores delete(); unlock it just long enough to clear the old log.
        self.log_text.config(state=tk.NORMAL)
        self.log_text.delete('1.0', tk.END)
        self.log_text.config(state=tk.DISABLED)

        def crawl_thread():
            try:
                crawler = ImprovedSeleniumCrawler(callback=self.log_message)
                publications = crawler.crawl_department(url, max_pages)

                # Save publications
                with open(self.data_file, 'w', encoding='utf-8') as f:
                    json.dump(publications, f, indent=2, ensure_ascii=False)

                # Build the index locally and publish it only once complete, so
                # a search issued during the crawl never sees a half-built index.
                new_index = AdvancedInvertedIndex()
                for idx, pub in enumerate(publications):
                    new_index.add_document(f"pub_{idx}", pub)
                self.index = new_index

                self.index.save(self.index_file)

                crawled = len(publications)
                self.root.after(0, lambda: messagebox.showinfo("Success",
                    f"Crawled {crawled} publications successfully!"))
                self.root.after(0, self.update_statistics)

            except Exception as e:
                # Python unbinds `e` when the except block ends, but the
                # callback runs later on the Tk loop; capture the text now.
                error_text = str(e)
                self.root.after(0, lambda: messagebox.showerror("Error", error_text))
            finally:
                self.root.after(0, lambda: self.crawl_btn.config(state=tk.NORMAL))
                self.root.after(0, self.progress_bar.stop)

        threading.Thread(target=crawl_thread, daemon=True).start()
    
    def log_message(self, msg):
        """Append a crawler message to the log pane and mirror it in the status line.

        The update is scheduled with ``self.root.after`` rather than done
        directly. The log widget is kept read-only (``state=DISABLED``) and a
        disabled Text widget silently ignores ``insert``, so it is unlocked for
        the append and locked again afterwards.

        Args:
            msg: Message to log; converted with ``str()``. The status line shows
                at most its first 100 characters.
        """
        text = str(msg)

        def update():
            self.log_text.config(state=tk.NORMAL)
            try:
                self.log_text.insert(tk.END, text + '\n')
                self.log_text.see(tk.END)
            finally:
                # Re-lock even if the insert fails so the pane stays read-only.
                self.log_text.config(state=tk.DISABLED)
            self.status_label.config(text=text[:100])

        self.root.after(0, update)
    
    def load_sample_data(self):
        """Install two built-in demo publications as the current data set.

        The records are written to ``self.data_file`` as JSON, indexed into a
        fresh ``AdvancedInvertedIndex`` under the ids ``pub_0`` and ``pub_1``,
        saved to ``self.index_file``, and the statistics are refreshed before a
        success dialog is shown.
        """
        healthcare_paper = dict(
            title='Machine Learning Applications in Healthcare',
            authors=['Dr. John Smith', 'Prof. Jane Doe'],
            year='2023',
            abstract='This paper explores machine learning applications in medical diagnosis...',
            keywords=['machine learning', 'healthcare', 'AI'],
            publication_link='https://example.com/pub1',
            profile_link='https://example.com/profile1',
        )
        climate_paper = dict(
            title='Computational Modeling of Climate Systems',
            authors=['Prof. Alice Johnson'],
            year='2022',
            abstract='A comprehensive study on climate modeling using computational methods...',
            keywords=['climate', 'modeling', 'simulation'],
            publication_link='https://example.com/pub2',
            profile_link='https://example.com/profile2',
        )
        sample_data = [healthcare_paper, climate_paper]

        # Persist the raw records
        with open(self.data_file, 'w', encoding='utf-8') as out_file:
            json.dump(sample_data, out_file, indent=2)

        # Rebuild the index from scratch
        self.index = AdvancedInvertedIndex()
        position = 0
        for record in sample_data:
            self.index.add_document(f"pub_{position}", record)
            position += 1

        self.index.save(self.index_file)
        self.update_statistics()
        messagebox.showinfo("Success", "Sample data loaded successfully!")
    
    # Clustering functionality
    def load_documents_for_clustering(self):
        """Ask for a JSON file of categorised documents and load it for training.

        The file must hold a JSON object mapping each category name to a list
        of documents. It is parsed and validated *before* being stored, so a
        malformed file leaves the previously loaded
        ``self.clustered_documents`` untouched. Storing it first would leave a
        non-dict value behind that later breaks ``train_clustering_model``.

        Cancelling the file dialog does nothing. Failures (unreadable file,
        invalid JSON, wrong structure) are reported in an error dialog.
        """
        filepath = filedialog.askopenfilename(
            title="Select Documents File",
            filetypes=[("JSON files", "*.json"), ("All files", "*.*")]
        )

        if not filepath:
            return

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                loaded = json.load(f)

            # train_clustering_model iterates .items() and extends a list with
            # each value, so anything other than {category: [docs...]} is unusable.
            if not isinstance(loaded, dict) or not all(
                    isinstance(docs, list) for docs in loaded.values()):
                raise ValueError(
                    "expected a JSON object mapping each category to a list of documents")

            total = sum(len(docs) for docs in loaded.values())

            # Only publish the data once it is known to be well-formed.
            self.clustered_documents = loaded
            self.cluster_status.config(
                text=f"Loaded {total} documents across {len(self.clustered_documents)} categories"
            )
            messagebox.showinfo("Success", f"Loaded {total} documents successfully!")
        except Exception as e:
            messagebox.showerror("Error", f"Failed to load documents: {str(e)}")
    
    def train_clustering_model(self):
        """Fit a 3-cluster K-means model on every loaded document.

        Documents from all categories in ``self.clustered_documents`` are
        pooled. Fewer than 10 documents only triggers a warning dialog.
        Otherwise a new ``KMeansClustering(k=3)`` is fitted, saved to
        ``self.cluster_file`` and the status line updated; any exception
        during that work is shown in an error dialog.
        """
        # Flatten {category: [docs]} into a single list; the category is unused.
        all_docs = [
            doc
            for docs in self.clustered_documents.values()
            for doc in docs
        ]

        if not len(all_docs) >= 10:
            messagebox.showwarning("Insufficient Data",
                "Please load at least 10 documents for training")
            return

        try:
            model = KMeansClustering(k=3)
            self.clustering_model = model
            model.fit(all_docs)
            model.save_model(self.cluster_file)

            status_line = f"Model trained on {len(all_docs)} documents with 3 clusters"
            self.cluster_status.config(text=status_line,
                                       foreground=self.colors['secondary'])
            messagebox.showinfo("Success", "Clustering model trained successfully!")
        except Exception as err:
            messagebox.showerror("Error", f"Training failed: {err}")
    
    def predict_cluster(self):
        """Classify the text in the input box and display the predicted category.

        Warns and returns when the box is empty or the model has no centroids
        yet. Otherwise the cluster id returned by the model is mapped to
        Business / Entertainment / Health (ids beyond the list are shown as
        "Cluster <id>") and written to ``self.cluster_result``. Exceptions
        during prediction are shown in an error dialog.
        """
        text = self.cluster_input.get('1.0', tk.END).strip()

        if text == '':
            messagebox.showwarning("Empty Input", "Please enter some text")
            return

        if self.clustering_model.centroids is None:
            messagebox.showwarning("No Model", "Please train the model first")
            return

        try:
            cluster_id = self.clustering_model.predict(text)
            cluster_names = ['Business', 'Entertainment', 'Health']
            if cluster_id < len(cluster_names):
                cluster_name = cluster_names[cluster_id]
            else:
                cluster_name = f"Cluster {cluster_id}"

            self.cluster_result.config(text=f"‚úì Predicted Category: {cluster_name}")
        except Exception as err:
            messagebox.showerror("Error", f"Prediction failed: {err}")
    
    # Utility functions
    def load_index(self):
        """Load the saved index from ``self.index_file`` if that file exists.

        The statistics are refreshed only when ``self.index.load`` returns a
        truthy value.
        """
        if not os.path.exists(self.index_file):
            return
        loaded = self.index.load(self.index_file)
        if loaded:
            self.update_statistics()
    
    def update_statistics(self):
        """Refresh the headline counters and the detailed statistics text.

        Counts documents, indexed terms and unique authors from ``self.index``,
        writes them to the three stat labels and rewrites ``self.stats_text``.
        When documents exist, a per-year publication count is appended (top ten
        year labels in descending string order).

        Robustness notes:
          * Years are normalised to strings (missing, ``None`` or empty becomes
            ``'N/A'``) before counting. Sorting a mix of ``str``, ``int`` and
            ``None`` keys raises ``TypeError``, and ``2023`` / ``'2023'`` would
            otherwise be counted as different years.
          * A missing, ``None`` or empty ``authors`` field contributes no
            author, instead of adding the text ``'None'`` or ``''`` to the set.
        """
        documents = self.index.documents
        num_docs = len(documents)
        num_terms = len(self.index.index)

        # Count unique authors
        all_authors = set()
        for doc in documents.values():
            authors = doc.get('authors', [])
            if not authors:
                continue
            if isinstance(authors, list):
                all_authors.update(authors)
            else:
                all_authors.add(str(authors))

        self.stat_pubs.config(text=str(num_docs))
        self.stat_authors.config(text=str(len(all_authors)))
        self.stat_terms.config(text=str(num_terms))

        # Detailed stats
        self.stats_text.delete('1.0', tk.END)
        self.stats_text.insert(tk.END, f"Total Publications: {num_docs}\n")
        self.stats_text.insert(tk.END, f"Unique Authors: {len(all_authors)}\n")
        self.stats_text.insert(tk.END, f"Indexed Terms: {num_terms}\n\n")

        if num_docs > 0:
            year_counts = Counter()
            for doc in documents.values():
                year = doc.get('year', 'N/A')
                # Normalise so every key is a comparable string.
                year_counts['N/A' if year in (None, '') else str(year)] += 1

            self.stats_text.insert(tk.END, "Publications by Year:\n")
            # Keys are unique strings, so sorting on the key alone is total.
            ranked = sorted(year_counts.items(), key=lambda pair: pair[0], reverse=True)
            for year, count in ranked[:10]:
                self.stats_text.insert(tk.END, f"  {year}: {count}\n")






In [None]:
# ================ MAIN ====================

def main():
    """Create the Tk root window, attach the search-engine GUI and run the event loop."""
    window = tk.Tk()
    gui = ModernSearchEngineGUI(window)
    window.mainloop()


if __name__ == "__main__":
    main()