In [4]:
import os
import whoosh
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin 
from flask import Flask, request, render_template

# Create a folder for the index to be saved
# Check if an index exists and open if possible


def spider(index_path, website):
    """
    Crawls the given website and sub-pages for html content
    Creates an index at the given path

    Params:
    index_path: path of the directory where the index will be saved
    website: string http-link of the searched website
    """
    #Schema for index creation
    schema = Schema(title=TEXT(stored=True), content=TEXT)

    # Create a folder for the index to be saved
    # Check if an index exists and open if possible
    if not os.path.exists(index_path):
        os.mkdir(index_path)
        ix = whoosh.index.create_in(index_path, schema)
    else:
        try:
            ix = whoosh.index.open_dir(index_path)
        except:
            ix = whoosh.index.create_in(index_path, schema)

    # Create the writer for adding docs to the index
    writer = ix.writer()
    queue = [website]
    visited_links = set()

    while queue:
        # Getting the next URL to search through
        current_url = queue.pop(0)
        
        # But only the ones we haven't searched yet
        if current_url not in visited_links:

            request = requests.get(current_url, timeout=4).text
            soup = BeautifulSoup(request, 'html.parser')
            # We don't need the meta data of the html, only content related text
            words = soup.get_text()
            # Update already visited list
            visited_links.add(current_url)

            # Add the URL and all included words to the writer of the index

            writer.add_document(title = soup.title, content = words)
            writer.commit()

            # Update our stack of URLS
            # find the anchor elements in the html used to create hyperlinks
            for link in soup.find_all('a'):
                # retrieving the URL that the anchor points to
                href = link.get('href')

                # fusing the main URL with the new href part of the link
                if href:
                    absolute_url = urljoin(current_url, href)

                    if absolute_url.startswith("https://vm009.rz.uos.de/crawl/") and absolute_url not in visited_links:
                        queue.append(absolute_url)

spider("indexdir","https://vm009.rz.uos.de/crawl/")

# Search function
def search(index_path, query):
    """
    Searches the given whoosh index for the given words

    Params:
    index_path: path where the whoosh index is saved
    query: words string that will be searched

    """
   
    ix = whoosh.index.open_dir(index_path)
    with ix.searcher() as searcher: 
        # find entries with the words in query
        query = QueryParser("content", ix.schema).parse(query)
        results = searcher.search(query)
    
    return results 

search("indexdir", "platypus first last" )

#-------------------------------------------- FLASK PART ------------------------------------------

# WHat does this do exactly? 
app = Flask(__name__)

# creates the first view, a start page where user can input query
@app.route("/", methods=["GET"])
def home():
    return render_template("home.html")

# creates the second view, a result page with the corresponding matches to query
@app.route("/search", methods=["GET"])
def search():
    # safe the query from start view
    query = request.args.get('q')
    if query:
        # get the matching websites to the query 
        matches = search2(query.split())
        return render_template("search.html", matches=matches, query=query)
    else:
        return "Please enter a query."


LockError: 

In [None]:
import os
import whoosh
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin 
from flask import Flask, request, render_template

# Create a folder for the index to be saved
# Check if an index exists and open if possible


def spider(index_path, website):
    """
    Crawls the given website and sub-pages for html content
    Creates an index at the given path

    Params:
    index_path: path of the directory where the index will be saved
    website: string http-link of the searched website
    """
    #Schema for index creation
    schema = Schema(title=TEXT(stored=True), content=TEXT)

    # Create a folder for the index to be saved
    # Check if an index exists and open if possible
    if not os.path.exists(index_path):
        os.mkdir(index_path)
        ix = whoosh.index.create_in(index_path, schema)
    else:
        try:
            ix = whoosh.index.open_dir(index_path)
        except:
            ix = whoosh.index.create_in(index_path, schema)

    # Create the writer for adding docs to the index
    writer = ix.writer()
    queue = [website]
    visited_links = set()

    while queue:
        # Getting the next URL to search through
        current_url = queue.pop(0)
        
        # But only the ones we haven't searched yet
        if current_url not in visited_links:

            request = requests.get(current_url, timeout=4).text
            soup = BeautifulSoup(request, 'html.parser')
            # We don't need the meta data of the html, only content related text
            words = soup.get_text()
            # Update already visited list
            visited_links.add(current_url)

            # Add the URL and all included words to the writer of the index

            writer.add_document(title = soup.title, content = words)
            writer.commit()

            # Update our stack of URLS
            # find the anchor elements in the html used to create hyperlinks
            for link in soup.find_all('a'):
                # retrieving the URL that the anchor points to
                href = link.get('href')

                # fusing the main URL with the new href part of the link
                if href:
                    absolute_url = urljoin(current_url, href)

                    if absolute_url.startswith("https://vm009.rz.uos.de/crawl/") and absolute_url not in visited_links:
                        queue.append(absolute_url)

spider("indexdir","https://vm009.rz.uos.de/crawl/")

# Search function
def search(index_path, query):
    """
    Searches the given whoosh index for the given words

    Params:
    index_path: path where the whoosh index is saved
    query: words string that will be searched

    """
   
    ix = whoosh.index.open_dir(index_path)
    with ix.searcher() as searcher: 
        # find entries with the words in query
        query = QueryParser("content", ix.schema).parse(query)
        results = searcher.search(query)
    
    return results 

search("indexdir", "platypus first last" )

#-------------------------------------------- FLASK PART ------------------------------------------

# WHat does this do exactly? 
app = Flask(__name__)

# creates the first view, a start page where user can input query
@app.route("/", methods=["GET"])
def home():
    return render_template("home.html")

# creates the second view, a result page with the corresponding matches to query
@app.route("/search", methods=["GET"])
def search():
    # safe the query from start view
    query = request.args.get('q')
    if query:
        # get the matching websites to the query 
        matches = search2(query.split())
        return render_template("search.html", matches=matches, query=query)
    else:
        return "Please enter a query."
