In [7]:
import hashlib
import os
import re
from urllib.parse import urlsplit

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DocumentCrawler(CrawlSpider):
    """Crawl one seed domain and save every followed page as an HTML file.

    Spider arguments (keyword arguments, e.g. ``process.crawl(..., domain=...)``):
        domain: Seed domain such as ``'example.com'``. Required; it is used
            both as the only allowed domain and to build the start URL.
        output_dir: Directory the pages are written to. Defaults to
            ``'html_documents'``, the folder the indexing cell reads from.

    Raises:
        ValueError: If ``domain`` is missing or empty.
    """

    name = "document_crawler"
    custom_settings = {
        'DEPTH_LIMIT': 3,    # Maximum depth of crawl (max_depth)
        'CLOSESPIDER_PAGECOUNT': 100,  # Max pages to crawl
        'AUTOTHROTTLE_ENABLED': True,  # Enable AutoThrottle
        'HTTPCACHE_ENABLED': True,     # Enable caching
        'ROBOTSTXT_OBEY': True
    }

    # Follow every extracted link and hand each fetched page to parse_item.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def __init__(self, *a, **kw):
        domain = kw.get('domain')
        if not domain:
            # Without a domain the spider would be configured with
            # allowed_domains=[None] and the start URL "http://None".
            raise ValueError(
                "DocumentCrawler requires a 'domain' argument, e.g. domain='example.com'"
            )
        super(DocumentCrawler, self).__init__(*a, **kw)
        self.allowed_domains = [domain]  # Set the allowed domain from seed domain
        self.start_urls = [f"http://{domain}"]  # Set the start URL
        self.output_dir = kw.get('output_dir', 'html_documents')

    @staticmethod
    def _filename_for(url):
        """Return a filesystem-safe, per-URL-unique ``.html`` file name."""
        parts = urlsplit(url)
        # Readable prefix built from host + path; anything outside a
        # conservative character set collapses to "_".
        slug = re.sub(r'[^A-Za-z0-9._-]+', '_', parts.netloc + parts.path).strip('_')
        # The digest covers the full URL (query string included), so two
        # distinct URLs never share a file even when their slugs collide.
        digest = hashlib.sha1(url.encode('utf-8')).hexdigest()[:10]
        return f"{(slug or 'index')[:100]}-{digest}.html"

    def parse_item(self, response):
        """Write the raw response body to ``output_dir`` under a unique name."""
        os.makedirs(self.output_dir, exist_ok=True)
        filename = os.path.join(self.output_dir, self._filename_for(response.url))
        with open(filename, 'wb') as f:
            f.write(response.body)

# Crawl driver: builds a crawler process from the project settings and runs
# DocumentCrawler against a single seed domain.
if __name__ == "__main__":
    domain = 'example.com'  # Seed domain, forwarded to the spider as the `domain` keyword argument
    process = CrawlerProcess(get_project_settings())
    process.crawl(DocumentCrawler, domain=domain)
    # NOTE(review): start() is expected to block until the crawl finishes, and
    # the underlying Twisted reactor presumably cannot be restarted, so
    # re-running this cell in the same kernel will likely fail - confirm.
    process.start()


ModuleNotFoundError: No module named 'scrapy'

In [8]:
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import glob

class Indexer:
    """Build, persist and query a TF-IDF index over a folder of HTML files.

    Attributes:
        documents: Raw text of every loaded file, in load order.
        document_paths: Path of each loaded file; ``document_paths[i]`` is the
            file behind ``documents[i]`` and behind row ``i`` of the matrix,
            so indices returned by ``query`` can be mapped back to files.
        tfidf_matrix: TF-IDF matrix produced by ``create_index`` (``None``
            until then).
        vectorizer: The ``TfidfVectorizer`` fitted by ``create_index``.
    """

    def __init__(self):
        self.documents = []
        self.document_paths = []
        self.tfidf_matrix = None
        self.vectorizer = TfidfVectorizer()

    def add_documents(self, folder_path):
        """Load every ``*.html`` file directly inside ``folder_path``.

        Files are read in sorted path order so document indices are stable
        across runs and platforms (``glob`` itself returns arbitrary order).
        """
        for file_path in sorted(glob.glob(f"{folder_path}/*.html")):
            # Saved pages are raw response bytes and are not guaranteed to be
            # UTF-8; substitute undecodable bytes instead of aborting the load.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                text = file.read()
            self.documents.append(text)
            self.document_paths.append(file_path)

    def create_index(self):
        """Fit the vectorizer on the loaded documents and build the matrix.

        Raises:
            ValueError: If no documents have been loaded.
        """
        if not self.documents:
            raise ValueError(
                "No documents loaded; call add_documents() with a folder "
                "containing .html files before create_index()."
            )
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

    def save_index(self, filename='index.pkl'):
        """Pickle ``(vectorizer, tfidf_matrix)`` to ``filename``.

        Raises:
            RuntimeError: If the index has not been created yet.
        """
        if self.tfidf_matrix is None:
            raise RuntimeError("Index not built; call create_index() before save_index().")
        with open(filename, 'wb') as f:
            pickle.dump((self.vectorizer, self.tfidf_matrix), f)

    def query(self, text, top_k=5):
        """Return the ``top_k`` best-matching documents for ``text``.

        Returns:
            A ``(indices, scores)`` pair of arrays, ordered from highest to
            lowest cosine similarity; ``indices`` index into ``documents``.

        Raises:
            RuntimeError: If the index has not been created yet.
        """
        if self.tfidf_matrix is None:
            raise RuntimeError("Index not built; call create_index() before query().")
        query_vec = self.vectorizer.transform([text])
        scores = cosine_similarity(query_vec, self.tfidf_matrix)[0]
        top_indices = np.argsort(scores)[::-1][:top_k]
        # Reuse the ranking instead of sorting the scores a second time.
        return top_indices, scores[top_indices]

# Build the index end to end: load the HTML files, fit the TF-IDF matrix,
# then pickle the result to save_index()'s default file name ('index.pkl').
indexer = Indexer()
indexer.add_documents('./html_documents')  # Folder containing HTML documents
indexer.create_index()
indexer.save_index()


ModuleNotFoundError: No module named 'sklearn'

In [9]:
from flask import Flask, request, jsonify
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Flask application object; route handlers are registered on it below.
app = Flask(__name__)

# Load the pre-computed index
# The pickle is unpacked as a 2-tuple: (vectorizer, tfidf_matrix).
# NOTE(review): pickle.load can execute arbitrary code from the file; this
# presumably only ever reads an index.pkl produced locally - confirm.
with open('index.pkl', 'rb') as f:
    vectorizer, tfidf_matrix = pickle.load(f)

@app.route('/query', methods=['POST'])
def query():
    content = request.json
    try:
        text = content['query']
        top_k = int(content.get('top_k', 5))
        query_vec = vectorizer.transform([text])
        scores = cosine_similarity(query_vec, tfidf_matrix)[0]
        top_indices = scores.argsort()[-top_k:][::-1]
        result_scores = scores[top_indices]
        results = {'documents': list(top


SyntaxError: incomplete input (4041855115.py, line 22)