In [2]:
import nltk
# Fetch the NLTK data packages the rest of the notebook relies on:
# tokenizer models (punkt / punkt_tab), the stopword lists, and WordNet
# for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Muhammad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Muhammad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Muhammad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Muhammad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
pip install nltk requests scikit-learn numpy Flask

^C
Note: you may need to restart the kernel to use updated packages.






In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [4]:
# English stopwords held in a set, since preprocess() tests membership per token.
stop_words = set(stopwords.words('english'))
# Single shared lemmatizer instance reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise raw text into a list of lemmatised content tokens.

    The text is lower-cased, punctuation and underscores are removed, and the
    result is tokenised with NLTK's ``word_tokenize``. Tokens that are
    stopwords or not purely alphabetic are dropped; the rest are lemmatised
    with the module-level ``lemmatizer``.

    Args:
        text: Input string (a document or a query).

    Returns:
        list[str]: Lemmatised tokens in their original order.
    """
    # ``\w`` also matches "_", so underscores must be stripped explicitly.
    # Otherwise a word wrapped in underscores (plain-text emphasis such as
    # "_very_") stays "_very_", fails ``isalpha()`` and is silently dropped.
    text = re.sub(r'[^\w\s]|_', '', text.lower())
    tokens = word_tokenize(text)
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and token.isalpha()
    ]

In [5]:
# ------------------------------ 2. Load Gutenberg Books ------------------------------
# Plain-text books to index. The position of a URL in this list is the
# document id used everywhere else (docs[i] is built from book_links[i]).
book_links = [
    "https://www.gutenberg.org/files/1342/1342-0.txt",  # Pride and Prejudice
    "https://www.gutenberg.org/files/11/11-0.txt",      # Alice in Wonderland
    "https://www.gutenberg.org/files/98/98-0.txt"       # A Tale of Two Cities
]

def load_gutenberg_book(url, timeout=30):
    """Download a plain-text book and return its content decoded as UTF-8.

    Args:
        url: Address of the text file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of indexing an HTML error page as if it were a book.
    response.raise_for_status()
    response.encoding = "utf-8"
    return response.text

def clean_gutenberg_text(text):
    """Strip the Project Gutenberg header and licence footer from a book.

    The body is delimited by marker lines of the form
    ``*** START OF THE PROJECT GUTENBERG EBOOK <title> ***`` and
    ``*** END OF THE PROJECT GUTENBERG EBOOK <title> ***``. Both the current
    wording ("THE") and the older wording ("THIS") are recognised.

    Args:
        text: Full text of the downloaded file.

    Returns:
        str: The text between the two marker lines with surrounding
        whitespace removed. If either marker is missing (or they appear in
        the wrong order) the input is returned unchanged.
    """
    marker = r"\*\*\*\s*{} OF (?:THE|THIS) PROJECT GUTENBERG EBOOK"
    # Consume the rest of the START line too, so the title and trailing
    # "***" do not leak into the body.
    start_match = re.search(marker.format("START") + r"[^\n]*\n?", text)
    end_match = re.search(marker.format("END"), text)

    if start_match is None or end_match is None:
        return text
    if end_match.start() < start_match.end():
        return text
    return text[start_match.end():end_match.start()].strip()

# Download each book and keep only its cleaned text; docs[i] belongs to book_links[i].
docs = [clean_gutenberg_text(load_gutenberg_book(url)) for url in book_links]

In [6]:
# ------------------------------ 3. Build Positional Index ------------------------------
def build_index(docs):
    """Build a positional inverted index over a list of documents.

    Each document is run through ``preprocess``; a position is the offset of
    a token within that preprocessed token list.

    Args:
        docs: Sequence of document strings; a document's id is its offset in
            this sequence.

    Returns:
        dict: ``{term: {doc_id: [positions, ...]}}`` with positions in
        increasing order.
    """
    postings = {}
    for doc_number, raw_text in enumerate(docs):
        for offset, term in enumerate(preprocess(raw_text)):
            postings.setdefault(term, {}).setdefault(doc_number, []).append(offset)
    return postings

# Build the positional index once; the search functions below all read it.
index = build_index(docs)
print("Index built for Gutenberg books!")

Index built for Gutenberg books!


In [7]:
# ------------------------------ 4. Boolean + Phrase Search ------------------------------
def boolean_search(query, index):
    """Find documents containing all query terms, as a phrase when multi-term.

    The query is preprocessed like the documents. A single-term query returns
    every document containing that term. A multi-term query is first
    intersected over the terms (boolean AND) and then filtered to documents
    where the terms occur at consecutive positions of the preprocessed token
    stream, in query order.

    Args:
        query: Raw query string.
        index: Positional index ``{term: {doc_id: [positions]}}``.

    Returns:
        list: Sorted ids of the matching documents; empty if the query has no
        usable tokens or any term is absent from the index.
    """
    q_tokens = preprocess(query)
    if not q_tokens:
        return []
    if any(term not in index for term in q_tokens):
        return []

    # Boolean AND: keep documents that contain every query term.
    result_docs = set(index[q_tokens[0]])
    for term in q_tokens[1:]:
        result_docs &= set(index[term])

    if len(q_tokens) > 1:
        phrase_docs = []
        for doc_id in result_docs:
            # Positions of the 2nd, 3rd, ... term as sets for O(1) lookups.
            later_positions = [set(index[term][doc_id]) for term in q_tokens[1:]]
            # The phrase occurs if, for some occurrence of the first term at
            # position p, the k-th following term occurs exactly at p + k.
            if any(
                all(start + shift in positions
                    for shift, positions in enumerate(later_positions, start=1))
                for start in index[q_tokens[0]][doc_id]
            ):
                phrase_docs.append(doc_id)
        result_docs = phrase_docs

    return sorted(result_docs)

In [8]:
# ------------------------------ 5. Edit Distance Search ------------------------------
def edit_distance(s1, s2):
    """Return the Levenshtein distance between two sequences.

    Insertions, deletions and substitutions each cost 1. Only the previous
    row of the dynamic-programming table is kept while the next is computed.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        int: Minimum number of single-character edits turning s1 into s2.
    """
    # Row 0: transforming the empty prefix of s1 into each prefix of s2.
    previous = list(range(len(s2) + 1))
    for row, left_char in enumerate(s1, start=1):
        current = [row]
        for col, top_char in enumerate(s2, start=1):
            if left_char == top_char:
                current.append(previous[col - 1])
            else:
                cheapest = min(previous[col - 1], previous[col], current[col - 1])
                current.append(1 + cheapest)
        previous = current
    return previous[len(s2)]

def fuzzy_search(query, index, max_dist=2):
    """Find documents containing a term close to any query term.

    Every indexed term is compared against every preprocessed query token
    using ``edit_distance``.

    Args:
        query: Raw query string.
        index: Positional index ``{term: {doc_id: [positions]}}``.
        max_dist: Largest edit distance still counted as a match.

    Returns:
        list: Ids of documents holding at least one matching term.
    """
    query_terms = preprocess(query)
    hits = set()
    for vocab_term, postings in index.items():
        if any(edit_distance(vocab_term, wanted) <= max_dist for wanted in query_terms):
            hits.update(postings.keys())
    return list(hits)


In [9]:
# ------------------------------ 6. Soundex Search ------------------------------
def soundex(word):
    """Return the four-character American Soundex code of a word.

    Rules implemented:
      * the first letter is kept as-is (upper-cased);
      * the remaining consonants are mapped to digits 1-6;
      * adjacent letters with the same digit are coded once, and this also
        applies between the first letter and the letter following it;
      * H and W are skipped and do not separate equal digits;
      * vowels (and Y) are not coded but do separate equal digits;
      * the code is padded with zeros / truncated to four characters.

    Args:
        word: Word to encode.

    Returns:
        str: The Soundex code, or an empty string for empty input.
    """
    word = word.upper()
    if not word:
        return ""

    codes = {"B": "1", "F": "1", "P": "1", "V": "1",
             "C": "2", "G": "2", "J": "2", "K": "2",
             "Q": "2", "S": "2", "X": "2", "Z": "2",
             "D": "3", "T": "3",
             "L": "4",
             "M": "5", "N": "5",
             "R": "6"}

    encoded = word[0]
    # Track the digit of the previous letter (not the letter itself) so a
    # consonant sharing the first letter's digit is suppressed as well.
    previous_code = codes.get(word[0], "")
    for char in word[1:]:
        if char in "HW":
            # H/W are transparent: they neither add a digit nor reset the run.
            continue
        code = codes.get(char, "")
        if code and code != previous_code:
            encoded += code
        # Uncoded letters (vowels, Y) reset the run, so a repeated digit
        # after them is emitted again.
        previous_code = code

    return (encoded + "000")[:4]

def soundex_search(query, index):
    """Find documents containing a term that sounds like a query term.

    Two terms are considered alike when ``soundex`` gives them the same code.

    Args:
        query: Raw query string.
        index: Positional index ``{term: {doc_id: [positions]}}``.

    Returns:
        list: Ids of documents holding at least one phonetically matching term.
    """
    wanted_codes = [soundex(token) for token in preprocess(query)]
    hits = set()
    for vocab_term, postings in index.items():
        if soundex(vocab_term) in wanted_codes:
            hits.update(postings.keys())
    return list(hits)

In [10]:
# ------------------------------ 7. TF-IDF Ranking ------------------------------
# Fit TF-IDF on the raw book texts. The vectorizer does its own tokenisation
# and stopword removal; it does not go through preprocess().
vectorizer = TfidfVectorizer(stop_words='english')
# One row per entry of `docs`; rank_documents() reads this module-level matrix.
tfidf_matrix = vectorizer.fit_transform(docs)

def rank_documents(query, docs_list, doc_ids):
    """Rank candidate documents by TF-IDF score against the query.

    A binary query vector is built over the vocabulary of the module-level
    ``vectorizer`` (1 for each preprocessed query token present in it), and
    each document's score is the dot product of its row in the module-level
    ``tfidf_matrix`` with that vector.

    Args:
        query: Raw query string.
        docs_list: Document texts, indexed by document id.
        doc_ids: Ids of the documents eligible for the result.

    Returns:
        list[tuple]: ``(doc_id, document_text, score)`` in descending score
        order, restricted to ``doc_ids`` and to scores greater than zero.
        Empty if ``doc_ids`` is empty.
    """
    if not doc_ids:
        return []

    vocabulary_size = len(vectorizer.get_feature_names_out())
    query_vec = np.zeros((1, vocabulary_size))
    for term in preprocess(query):
        column = vectorizer.vocabulary_.get(term)
        if column is not None:
            query_vec[0, column] = 1

    scores = np.dot(tfidf_matrix.toarray(), query_vec.T).flatten()

    ranked = []
    # argsort is ascending; reverse it to walk from best to worst score.
    for doc_idx in np.argsort(scores)[::-1]:
        if doc_idx in doc_ids and scores[doc_idx] > 0:
            ranked.append((doc_idx, docs_list[doc_idx], scores[doc_idx]))
    return ranked

In [11]:
class SearchEngine:
    """Facade bundling the corpus, its index and the search functions.

    NOTE: ``vectorizer`` and ``tfidf_matrix`` are stored on the instance, but
    ``ranked_search`` delegates to the module-level ``rank_documents``, which
    reads the module-level ``vectorizer`` / ``tfidf_matrix`` rather than
    these attributes.
    """

    def __init__(self, docs, index, vectorizer, tfidf_matrix, book_links):
        """Store the corpus artefacts; no computation happens here."""
        self.book_links = book_links
        self.tfidf_matrix = tfidf_matrix
        self.vectorizer = vectorizer
        self.index = index
        self.docs = docs

    def boolean_search(self, query):
        """Run the module-level ``boolean_search`` against this engine's index."""
        return boolean_search(query, self.index)

    def fuzzy_search(self, query, max_dist=2):
        """Run the module-level ``fuzzy_search`` against this engine's index."""
        return fuzzy_search(query, self.index, max_dist)

    def soundex_search(self, query):
        """Run the module-level ``soundex_search`` against this engine's index."""
        return soundex_search(query, self.index)

    def _candidate_ids(self, query, search_type):
        """Return the document ids to be ranked for the given search type."""
        if search_type == 'boolean':
            return self.boolean_search(query)
        if search_type == 'fuzzy':
            return self.fuzzy_search(query)
        if search_type == 'soundex':
            return self.soundex_search(query)
        if search_type != 'tfidf':
            # Unknown types fall back to ranking the whole corpus.
            print("Invalid search type. Using TF-IDF for ranking all documents.")
        return list(range(len(self.docs)))

    def ranked_search(self, query, search_type='tfidf'):
        """Select candidates with ``search_type`` and rank them by TF-IDF.

        Args:
            query: Raw query string.
            search_type: ``'boolean'``, ``'fuzzy'``, ``'soundex'`` or
                ``'tfidf'`` (all documents are candidates). Any other value
                prints a notice and behaves like ``'tfidf'``.

        Returns:
            list[tuple]: Output of ``rank_documents`` for the candidates, or
            an empty list when there are none.
        """
        candidates = self._candidate_ids(query, search_type)
        if not candidates:
            return []
        return rank_documents(query, self.docs, candidates)

print("SearchEngine class defined.")

SearchEngine class defined.


In [12]:
# NOTE(review): this refits a TF-IDF model with the same settings and the same
# `docs` as the ranking cell above, rebinding the module-level names that
# rank_documents() reads. The second fit looks redundant; confirm and drop it.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(docs)

# Single engine object shared by the Flask route defined later.
search_engine_instance = SearchEngine(docs, index, vectorizer, tfidf_matrix, book_links)
print("SearchEngine instance initialized.")

SearchEngine instance initialized.


In [16]:
from flask import Flask, request, render_template

app = Flask(__name__)

# The route below reads the module-level `search_engine_instance`, so the cell
# that creates it must have been executed before this app serves a request.

@app.route('/', methods=['GET', 'POST'])
def search():
    """Render the search page and, on POST, the TF-IDF ranked results.

    GET renders ``index.html`` with no query and no results. POST reads the
    ``query`` form field; a non-empty query is ranked over all documents with
    ``search_engine_instance.ranked_search`` and each hit is passed to the
    template as a dict with ``doc_id``, ``score`` (4 decimal places) and
    ``link`` (the book's source URL).
    """
    if request.method != 'POST':
        return render_template('index.html', query=None, results=[])

    query = request.form['query']
    results = []
    if query:
        # The document text in each ranked tuple is not needed by the template.
        hits = search_engine_instance.ranked_search(query, search_type='tfidf')
        for doc_id, _, score in hits:
            results.append({
                "doc_id": doc_id,
                "score": f"{score:.4f}",
                "link": search_engine_instance.book_links[doc_id]
            })
    return render_template('index.html', query=query, results=results)


# Start the development server on port 5000. app.run() blocks the cell until
# the server is stopped.
# NOTE(review): debug=True is meant for local development only; confirm it is
# switched off before this server is reachable by anyone else.
if __name__ == '__main__':
    print("Flask app is running. Open a browser to http://127.0.0.1:5000/")
    app.run(debug=True, port=5000, use_reloader=False) # use_reloader=False to avoid running twice in some environments

Flask app is running. Open a browser to http://127.0.0.1:5000/
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [23/Dec/2025 22:25:12] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [23/Dec/2025 22:25:13] "GET /static/style.css HTTP/1.1" 304 -
127.0.0.1 - - [23/Dec/2025 22:25:30] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [23/Dec/2025 22:25:30] "GET /static/style.css HTTP/1.1" 304 -
