<a href="https://colab.research.google.com/github/Manojgutta19/Projects/blob/main/vertical%20search%20engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from html import escape
from urllib.parse import quote_plus

import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display

# Publications listing on pureportal.coventry.ac.uk; crawled once at start-up
# and also used as the base address when building search URLs.
url = 'https://pureportal.coventry.ac.uk/en/publications/'
# Value of the HTML class attribute that identifies author anchors inside a
# search result.
class_name = 'link person'

# Crawler that scrapes publication data from a results page
def crawl(url, timeout=10):
    """Scrape publication records from a results page.

    Downloads *url*, parses it as HTML and builds one record for every
    ``<div class="result-container">`` element found in the page.

    Args:
        url: Address of the results page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            notebook indefinitely.

    Returns:
        A list of dicts with the keys ``'title'``, ``'authors'``,
        ``'author link'``, ``'publication'`` and ``'page'``. A field that
        cannot be located in the markup holds a placeholder string
        (``"Unknown"``, or ``"unknown"`` for ``'page'``).

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when *timeout* expires.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    data = []
    for container in soup.find_all("div", class_="result-container"):
        # A result without a title heading must not abort the whole crawl.
        title_tag = container.find("h3", class_="title")
        title = title_tag.text.strip() if title_tag is not None else "Unknown"

        author_tags = container.find_all("a", class_=class_name)
        names = [tag.text.strip() for tag in author_tags]
        authors = ', '.join(names) if names else "Unknown"

        date_tag = container.find("span", class_="date")
        publication = date_tag.text.strip() if date_tag is not None else "Unknown"

        link = container.find("a", class_="link")
        if link is not None and 'href' in link.attrs:
            page = link["href"]
        else:
            page = "unknown"

        # Only the first author's profile is exposed, because the value is
        # later rendered as a single hyperlink target.
        first_author = author_tags[0] if author_tags else None
        if first_author is not None and 'href' in first_author.attrs:
            aut_link = first_author["href"]
        else:
            aut_link = "Unknown"

        data.append({
            'title': title,
            'authors': authors,
            'author link': aut_link,
            'publication': publication,
            'page': page,
        })
    return data
# An inverted index is built to speed up lookups in the vertical search engine
def index(data):
    """Build an inverted index from title words to record positions.

    Each title is lower-cased and split on whitespace; every resulting word
    maps to the positions (within *data*) of the records whose title
    contains it.

    Args:
        data: Sequence of record dicts, each with a ``'title'`` string.

    Returns:
        A dict mapping each word to a list of record positions in ascending
        order, with each position listed at most once per word.
    """
    inverted_index = {}
    for position, item in enumerate(data):
        # dict.fromkeys drops repeated words while keeping first-seen order,
        # so a word occurring twice in one title lists that record only once.
        for word in dict.fromkeys(item['title'].lower().split()):
            inverted_index.setdefault(word, []).append(position)
    return inverted_index

# Query handling: live portal search first, local title match as a fallback
def search(query, data, inverted_index):
    """Find publications matching *query*.

    The portal's own search page is crawled first. Only when that yields no
    records does the function fall back to a case-insensitive substring
    match against the titles in *data*.

    Args:
        query: Free-text search string entered by the user.
        data: Previously crawled record dicts (each with a ``'title'`` key)
            used for the local fallback.
        inverted_index: Accepted to keep the existing call signature; it is
            not consulted by this function.

    Returns:
        A list of record dicts; empty when nothing matches.
    """
    # quote_plus still turns spaces into '+', and additionally percent-encodes
    # characters such as '&', '#', '=' and '+' that would otherwise break or
    # silently alter the query string.
    search_url = (
        url
        + f'?search={quote_plus(query)}'
        + '&organisations=coventry-university%2Fcentre-for-global-learning'
    )
    remote_hits = crawl(search_url)
    if remote_hits:
        return remote_hits

    needle = query.lower()
    return [item for item in data if needle in item['title'].lower()]


def result(results):
    """Render search results as HTML in the active output area.

    Args:
        results: Sequence of record dicts with the keys ``'title'``,
            ``'authors'``, ``'publication'``, ``'author link'`` and
            ``'page'``.

    Prints ``"No results found."`` when *results* is empty.
    """
    if not results:
        print("No results found.")
        return

    for position, record in enumerate(results, start=1):
        # Every value was scraped from a web page, so it is escaped before
        # being placed into markup; escape() also encodes both quote
        # characters, which keeps the single-quoted href attributes intact.
        title = escape(str(record['title']))
        authors = escape(str(record['authors']))
        publication = escape(str(record['publication']))
        author_link = escape(str(record['author link']))
        page = escape(str(record['page']))

        display(HTML(f"<h3>Result #{position}</h3>"))
        display(HTML(f"<b>Title:</b> {title}"))
        display(HTML(f"<b>Authors:</b> {authors}"))
        display(HTML(f"<b>Publication Data:</b> {publication}"))

        # Clickable links for the author profile and the publication page
        display(HTML(f"<b>Author Link:</b> <a href='{author_link}'>{author_link}</a>"))
        display(HTML(f"<b>Publication link:</b> <a href='{page}'>{page}</a>"))

        display(HTML("<hr>"))

# Search UI: a text box for the query, a button that triggers the search,
# and an Output widget into which the results are rendered.
query_input = widgets.Text(description='Publication Search:')
search_button = widgets.Button(description='Search')
output = widgets.Output()

def button_click(b):
    """Handle a click on the search button.

    Clears the output area, then runs a search for the text currently in the
    query box and renders the matches inside that area.

    Args:
        b: The button instance passed by the click callback; unused.
    """
    output.clear_output()
    with output:
        result(search(query_input.value, data, inverted_index))

# Run button_click every time the search button is pressed.
search_button.on_click(button_click)

# Display the widgets and search results
display(query_input)
display(search_button)
display(output)

# Initial data retrieval and indexing. Both names are module-level globals
# that button_click reads when it calls search().
data = crawl(url)
inverted_index = index(data)


Text(value='', description='Publication Search:')

Button(description='Search', style=ButtonStyle())

Output()