In [1]:
#Imports and NLTK Downloads
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import tkinter as tk
from tkinter import ttk
import webbrowser
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import schedule
import time
import threading

# Ensure the NLTK data packages used by this notebook are installed locally.
# NOTE(review): preprocess() tokenizes with a regex and no visible code calls an
# NLTK tokenizer, so 'punkt' may be unnecessary — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')  # read by stopwords.words('english')
nltk.download('wordnet')    # read by wordnet.synsets() during query expansion


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xVEXx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xVEXx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xVEXx\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#Preprocessing Functions

# Initialize NLTK resources once at module level; preprocess() reads both of
# these globals on every call.
ps = PorterStemmer()
# Stored as a set so the stop-word filter is a constant-time membership test.
stop_words = set(stopwords.words('english'))

# Preprocessing functions
def preprocess(text):
    """Turn raw text into a list of stemmed index terms.

    The text is lower-cased and split on runs of non-word characters. Fragments
    that are empty or not purely alphanumeric are discarded, English stop words
    are removed, and each surviving token is reduced to its Porter stem.

    Args:
        text: The string to normalize.

    Returns:
        A list of stemmed tokens, in the order they appear in ``text``.
    """
    stems = []
    for fragment in re.split(r'\W+', text.lower()):
        # isalnum() is False for '' (produced at string edges by the split)
        # and for tokens containing '_' (which \W does not split on).
        if not fragment.isalnum():
            continue
        if fragment in stop_words:
            continue
        stems.append(ps.stem(fragment))
    return stems

def preprocess_query(query):
    """Normalize a user query with the same pipeline used for documents.

    Args:
        query: The raw query string.

    Returns:
        The list of stemmed tokens produced by ``preprocess``.
    """
    query_tokens = preprocess(query)
    return query_tokens

def expand_query(query):
    """Expand a query with WordNet synonyms and return it as one string.

    The stemmed query terms are always kept. Synonyms are looked up using the
    *unstemmed* query words, because WordNet is keyed by dictionary words and
    a Porter stem such as 'comput' has no synsets (or, worse, collides with an
    unrelated word). Multi-word lemma names use '_' as the separator; it is
    replaced by a space so that each word survives the later ``preprocess``
    pass instead of being discarded as a non-alphanumeric token.

    Args:
        query: The raw query string typed by the user.

    Returns:
        A space-separated string of the stemmed query terms plus all synonym
        words, sorted so the result is deterministic.
    """
    expanded_terms = set(preprocess_query(query))

    # Same tokenization and stop-word filter as preprocess(), minus stemming.
    # Hyphenated words are already split here because '-' matches \W.
    raw_terms = [
        token for token in re.split(r'\W+', query.lower())
        if token.isalnum() and token not in stop_words
    ]

    for term in raw_terms:
        for syn in wordnet.synsets(term):
            for lemma in syn.lemmas():
                expanded_terms.add(lemma.name().replace('_', ' '))

    return ' '.join(sorted(expanded_terms))


In [3]:
# Web crawling functions
def polite_crawl(url, delay=5, timeout=10):
    """Fetch ``url`` and pause afterwards to avoid hammering the server.

    Args:
        url: Address of the page to download.
        delay: Seconds to sleep after a successful (HTTP 200) response.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would freeze the caller (including the
            background scheduler thread).

    Returns:
        The ``requests.Response`` when the status code is 200, otherwise
        ``None``. Failures are reported with ``print`` and never raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            time.sleep(delay)
            return response
        else:
            print(f"Failed to crawl {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        # Deliberately broad: crawling is best-effort and callers rely on
        # getting None rather than an exception.
        print(f"Error crawling {url}: {e}")
        return None

def _parse_publication(pub):
    """Build one publication record from a 'result-container' element."""
    title_tag = pub.find('h3', class_='title')
    title = title_tag.text.strip() if title_tag else 'No title'
    anchor = title_tag.find('a') if title_tag else None
    # .get() avoids a KeyError when the anchor has no href attribute.
    link = anchor.get('href', 'No link') if anchor is not None else 'No link'

    link_authors = []
    author_links = []
    for author_tag in pub.find_all('a', class_='link person'):
        link_authors.append(author_tag.text.strip())
        href = author_tag.get('href')
        if href:
            author_links.append(href)

    # Authors without a profile page are taken from class-less <span> elements.
    unlink_authors = [
        span.text.strip() for span in pub.find_all('span')
        if "class" not in span.attrs
    ]

    # Some of those spans can carry the title text itself; blank it out.
    if any(title.lower() in author.lower() for author in unlink_authors):
        unlink_authors = [author.replace(title, "").strip() for author in unlink_authors]

    # De-duplicate while keeping the page's author order, and drop the empty
    # strings left behind when a span contained nothing but the title.
    author_names = list(dict.fromkeys(
        name for name in link_authors + unlink_authors if name
    ))
    authors = ', '.join(author_names) if author_names else 'No authors'

    date_tag = pub.find('span', class_='date')
    publication_date = date_tag.text.strip() if date_tag else 'No date'

    return {
        'title': title,
        'link': link,
        'authors': authors,
        'author_links': author_links,
        'publication_date': publication_date,
        'content': f"{title} {authors} {publication_date}"
    }


def crawl_and_parse(url):
    """Download a publication listing page and extract its entries.

    Args:
        url: Address of the listing page.

    Returns:
        A list with one dict per 'result-container' element, with the keys
        'title', 'link', 'authors' (comma-separated string), 'author_links'
        (list of hrefs), 'publication_date' and 'content' (the text that gets
        indexed). Returns an empty list when the page could not be fetched.
    """
    response = polite_crawl(url)
    if not response:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    publications = soup.find_all('div', class_='result-container')
    return [_parse_publication(pub) for pub in publications]


In [4]:
# Inverted index functions
def create_inverted_index(documents):
    """Map every stemmed term to the ids of the documents that contain it.

    Args:
        documents: Sequence of dicts with a 'content' string; a document's id
            is its position in this sequence.

    Returns:
        A ``defaultdict(list)`` from term to a list of document ids in
        ascending order, each id appearing at most once per term.
    """
    inverted_index = defaultdict(list)
    for doc_id, doc in enumerate(documents):
        # dict.fromkeys() de-duplicates the document's terms up front while
        # keeping first-occurrence order, which replaces a linear
        # "doc_id not in postings" scan for every single token.
        for word in dict.fromkeys(preprocess(doc['content'])):
            inverted_index[word].append(doc_id)
    return inverted_index

# TF-IDF functions
def calculate_tf_idf(documents):
    """Fit a TF-IDF model on the documents' 'content' strings.

    Args:
        documents: Non-empty sequence of dicts with a 'content' string.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the document-term matrix with
        one row per document, and the fitted ``TfidfVectorizer`` needed to
        project queries into the same space.

    Raises:
        ValueError: If ``documents`` is empty (for example because the crawl
            failed), since no vocabulary can be learned from nothing.
    """
    corpus = [doc['content'] for doc in documents]
    if not corpus:
        raise ValueError("Cannot build a TF-IDF model from an empty document list")
    # token_pattern=None states explicitly that the custom tokenizer replaces
    # the default regex, which silences scikit-learn's unused-parameter warning.
    vectorizer = TfidfVectorizer(tokenizer=preprocess, token_pattern=None)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer


In [5]:
# Search function
def search(query, inverted_index, documents, tfidf_matrix, vectorizer):
    """Return the documents matching ``query``, best match first.

    Candidate documents are those that contain at least one stemmed query
    term according to ``inverted_index``. Looking the stems up in the index
    (which is built from the same stems) replaces an earlier whole-word regex
    match of stems against the raw text, which could never match inflected
    words: the stem 'comput' is not a whole word inside 'computing'.
    Candidates are then ranked by cosine similarity between the
    synonym-expanded query and each document's TF-IDF vector.

    Args:
        query: Raw query string.
        inverted_index: Mapping from stemmed term to a list of document ids.
        documents: Sequence of document dicts, indexed by document id.
        tfidf_matrix: TF-IDF matrix with one row per document.
        vectorizer: The fitted vectorizer that produced ``tfidf_matrix``.

    Returns:
        A list of ``(document, score)`` tuples with ``score > 0``, sorted by
        descending score; ties keep ascending document-id order.
    """
    query_terms = set(preprocess(query))
    if not query_terms or not documents:
        return []

    candidate_ids = set()
    for term in query_terms:
        # .get() so that a lookup never inserts a key into the defaultdict.
        candidate_ids.update(inverted_index.get(term, ()))
    if not candidate_ids:
        return []

    expanded_query = expand_query(query)
    query_vector = vectorizer.transform([expanded_query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Ignore ids that fall outside either structure, in case the index and
    # the document list were captured at slightly different moments.
    id_limit = min(len(documents), len(cosine_similarities))
    scored = [
        (doc_id, cosine_similarities[doc_id])
        for doc_id in sorted(candidate_ids)
        if doc_id < id_limit and cosine_similarities[doc_id] > 0
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return [(documents[doc_id], score) for doc_id, score in scored]


In [6]:
# Search and display results function
def search_and_display_results():
    """Run the query from the entry box and show the hits in the result table.

    Reads the module-level search structures, replaces all rows of
    ``result_tree``, and shows a single placeholder row when nothing matches.
    """
    hits = search(search_entry.get(), inverted_index, documents, tfidf_matrix, vectorizer)

    # Remove the rows left over from the previous search.
    for stale_row in result_tree.get_children():
        result_tree.delete(stale_row)

    if not hits:
        result_tree.insert("", "end", values=("No results found", "", "", "", "", ""))
        return

    for doc, relevance in hits:
        row_values = (
            doc['title'],
            doc['link'],
            doc['authors'],
            doc['publication_date'],
            ', '.join(doc['author_links']),  # one cell, links separated by ', '
            f"{relevance:.2f}",
        )
        result_tree.insert("", "end", values=row_values)


In [7]:
# Clear function
def clear_fields():
    """Reset the UI: empty the query box and remove every result row."""
    for shown_row in result_tree.get_children():
        result_tree.delete(shown_row)
    search_entry.delete(0, tk.END)


In [8]:
# Treeview click event function
def on_treeview_click(event):
    """Open the link(s) in the double-clicked cell of the result table.

    A double-click in column 2 ("Link") opens the publication page; column 5
    ("Author Links") opens every author profile listed in the cell. Clicks
    outside any row, and empty or 'No link' cells (such as those of the
    "No results found" placeholder row), are ignored instead of launching the
    browser with an empty URL.

    Args:
        event: The Tk event carrying the pointer coordinates.
    """
    item = result_tree.identify_row(event.y)
    if not item:
        # Heading or empty area below the last row: nothing to open.
        return

    column = result_tree.identify_column(event.x)
    values = result_tree.item(item, "values")

    if column == '#2':
        links = [values[1]] if len(values) > 1 else []
    elif column == '#5':
        links = values[4].split(', ') if len(values) > 4 else []
    else:
        return

    for link in links:
        link = link.strip()
        if link and link != 'No link':
            webbrowser.open(link)


In [9]:
# Scheduled crawl function
def scheduled_crawl():
    """Re-crawl the publication listing and refresh the global search state.

    Only publications not already known (same title and link) are added, so
    repeated runs do not pile up duplicate results. The new index and TF-IDF
    model are built on a separate list and the four globals are rebound
    together at the end, so a search running in the GUI thread does not see a
    document list that is longer than the matrix it is scored against.
    """
    global documents, inverted_index, tfidf_matrix, vectorizer
    url = 'https://pureportal.coventry.ac.uk/en/organisations/eec-school-of-computing-mathematics-and-data-sciences-cmds/publications/'
    new_documents = crawl_and_parse(url)
    if new_documents:
        known = {(doc['title'], doc['link']) for doc in documents}
        merged = list(documents)
        for doc in new_documents:
            key = (doc['title'], doc['link'])
            if key not in known:
                known.add(key)
                merged.append(doc)

        # Skip the rebuild entirely when the crawl found nothing new.
        if len(merged) > len(documents):
            new_index = create_inverted_index(merged)
            new_matrix, new_vectorizer = calculate_tf_idf(merged)
            documents, inverted_index, tfidf_matrix, vectorizer = (
                merged, new_index, new_matrix, new_vectorizer
            )
    print("Scheduled crawl completed")


In [10]:
# Run scheduled tasks function
def run_scheduled_tasks():
    """Poll the scheduler forever, running whatever jobs are due.

    This function never returns; it checks for pending jobs once per second.
    """
    poll_seconds = 1
    while True:
        schedule.run_pending()
        time.sleep(poll_seconds)


In [11]:
# Initial crawl: fetch the listing once and build both search structures from it
url = 'https://pureportal.coventry.ac.uk/en/organisations/eec-school-of-computing-mathematics-and-data-sciences-cmds/publications/'
documents = crawl_and_parse(url)
inverted_index = create_inverted_index(documents)
tfidf_matrix, vectorizer = calculate_tf_idf(documents)

# Register the weekly re-crawl, then poll the scheduler on a daemon thread so
# it neither blocks the notebook nor keeps the process alive on exit
schedule.every().week.do(scheduled_crawl)
thread = threading.Thread(target=run_scheduled_tasks, daemon=True)
thread.start()




In [None]:
#GUI

# Define dark mode colors
# Both palettes share the same keys; apply_styles() reads them by name.
dark_mode = {
    "background_color": "#1e1e1e",
    "foreground_color": "#e8e8e8",
    "button_color": "#3c6e71",
    "button_hover_color": "#284b63",  # also used as the Treeview row hover color
    "entry_background_color": "#2d2d2d",
    # NOTE(review): black text paired with a dark entry background looks
    # unintended — confirm how the entry actually renders in dark mode.
    "entry_foreground_color": "#000000",
    "treeview_background_color": "#2d2d2d",
    "treeview_foreground_color": "#e8e8e8",
    "treeview_heading_background": "#3c6e71",
    "treeview_heading_foreground": "#000000"
}

# Define light mode colors
light_mode = {
    "background_color": "#ffffff",
    "foreground_color": "#000000",
    "button_color": "#dcdcdc",
    "button_hover_color": "#b0b0b0",  # also used as the Treeview row hover color
    "entry_background_color": "#f0f0f0",
    "entry_foreground_color": "#000000",
    "treeview_background_color": "#ffffff",
    "treeview_foreground_color": "#000000",
    "treeview_heading_background": "#dcdcdc",
    "treeview_heading_foreground": "#000000"
}

# Current mode: the palette in effect; the application starts in dark mode and
# toggle_mode() rebinds this name.
current_mode = dark_mode


# Tkinter GUI: create the main window and paint it with the active palette
root = tk.Tk()
root.title("CMDS Publication Search Engine")
root.configure(bg=current_mode["background_color"])

# Styles: a single ttk.Style object, configured by apply_styles()
style = ttk.Style()
style.theme_use("clam")

def apply_styles(mode):
    """Apply a color palette to the root window and to every ttk style in use.

    Args:
        mode: A palette dict with the keys defined by ``dark_mode`` and
            ``light_mode``.
    """
    root.configure(bg=mode["background_color"])
    style.configure("TFrame", background=mode["background_color"])
    style.configure("TLabel", background=mode["background_color"], foreground=mode["foreground_color"], font=("Arial", 12))
    style.configure("TEntry", background=mode["entry_background_color"], foreground=mode["entry_foreground_color"], font=("Arial", 12))
    style.configure("TButton", background=mode["button_color"], foreground=mode["foreground_color"], font=("Arial", 12, "bold"))
    style.map("TButton", background=[('active', mode["button_hover_color"])], foreground=[('active', mode["foreground_color"])])
    # 'background' only colors the rows; 'fieldbackground' colors the area of
    # the widget not covered by rows, which otherwise keeps the theme default
    # and shows up as a bright block in dark mode.
    style.configure(
        "Treeview",
        background=mode["treeview_background_color"],
        fieldbackground=mode["treeview_background_color"],
        foreground=mode["treeview_foreground_color"],
        font=("Arial", 10),
        rowheight=25,
    )
    style.configure("Treeview.Heading", background=mode["treeview_heading_background"], foreground=mode["treeview_heading_foreground"], font=("Arial", 12, "bold"))

apply_styles(current_mode)

def toggle_mode():
    """Switch between the dark and light palettes and restyle the UI.

    The toggle button is relabeled with the mode the *next* click will select.
    """
    global current_mode
    leaving_dark = current_mode == dark_mode
    current_mode = light_mode if leaving_dark else dark_mode
    toggle_button.config(text="Dark Mode" if leaving_dark else "Light Mode")
    apply_styles(current_mode)


# Search frame: top row holding the prompt, the query box and three buttons
search_frame = ttk.Frame(root, padding="10")
search_frame.grid(row=0, column=0, pady=(20, 10), padx=20, sticky=(tk.W, tk.E))

search_label = ttk.Label(search_frame, text="Enter your search query:")
search_label.grid(row=0, column=0, sticky=tk.W, padx=(0, 10))

# Query box; read by search_and_display_results() and emptied by clear_fields()
search_entry = ttk.Entry(search_frame, width=50, style="TEntry")
search_entry.grid(row=0, column=1, sticky=(tk.W, tk.E), padx=(0, 10))

search_button = ttk.Button(search_frame, text="Search", command=search_and_display_results, style="TButton")
search_button.grid(row=0, column=2, sticky=(tk.W, tk.E), padx=(0, 10))

clear_button = ttk.Button(search_frame, text="Clear", command=clear_fields, style="TButton")
clear_button.grid(row=0, column=3, sticky=(tk.W, tk.E), padx=(0, 10))

# The label names the mode a click switches TO; the app starts in dark mode
toggle_button = ttk.Button(search_frame, text="Light Mode", command=toggle_mode, style="TButton")
toggle_button.grid(row=0, column=4, sticky=(tk.W, tk.E), padx=(10, 0))

# Bind Enter key to search (the lambda drops the event argument Tk passes in)
search_entry.bind("<Return>", lambda event: search_and_display_results())


# Result frame
result_frame = ttk.Frame(root, padding="10")
result_frame.grid(row=1, column=0, pady=(10, 20), padx=20, sticky=(tk.W, tk.E, tk.N, tk.S))

# Column name -> initial width in pixels. The insertion order is the display
# order, and the click handler relies on it ('#2' is Link, '#5' is Author Links).
column_widths = {
    "Title": 250,
    "Link": 250,
    "Authors": 200,
    "Publication Date": 150,
    "Author Links": 250,
    "Relevance Score": 150,
}
columns = tuple(column_widths)
result_tree = ttk.Treeview(result_frame, columns=columns, show='headings', selectmode='browse')
for column_name, column_width in column_widths.items():
    result_tree.heading(column_name, text=column_name)
    result_tree.column(column_name, width=column_width)
result_tree.pack(expand=True, fill='both')
result_tree.bind("<Double-1>", on_treeview_click)

# Adding hover effect to Treeview items
def on_motion(event):
    """Highlight the result row currently under the mouse pointer.

    Does nothing while the pointer is not over a row, so the last highlighted
    row stays highlighted in that case.

    Args:
        event: The Tk motion event carrying the pointer coordinates.
    """
    hovered_row = result_tree.identify_row(event.y)
    if not hovered_row:
        return

    # Reconfigured on every event so the color follows the active palette.
    result_tree.tag_configure('hover', background=current_mode["button_hover_color"])
    for shown_row in result_tree.get_children():
        result_tree.item(shown_row, tags=())
    result_tree.item(hovered_row, tags=('hover',))

result_tree.bind('<Motion>', on_motion)

# Grid configuration to make Treeview expand: give all extra space to the
# result row (row 1) and to the single column when the window is resized
root.grid_rowconfigure(1, weight=1)
root.grid_columnconfigure(0, weight=1)

# Run the application; this call blocks until the window is closed
root.mainloop()
