In [10]:
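# file: scrapy_crawler/spiders/document_spider.py  (must live inside the package named in SPIDER_MODULES so `scrapy crawl` can find it)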
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class DocumentSpider(scrapy.Spider):
    """Crawl outward from a seed URL and save each fetched page under ``documents/``.

    Args:
        seed_url: URL the crawl starts from.
        max_pages: Maximum number of pages to save. Accepts an int or a
            numeric string (``scrapy crawl -a`` passes every value as a string).
        max_depth: Maximum link depth to follow from the seed (the seed is
            depth 0). Accepts an int or a numeric string.

    Raises:
        ValueError: If ``max_pages`` or ``max_depth`` is not a valid integer.
    """
    name = "document_spider"

    def __init__(self, seed_url, max_pages, max_depth, *args, **kwargs):
        super(DocumentSpider, self).__init__(*args, **kwargs)
        self.start_urls = [seed_url]  # Seed URL
        # Command-line spider arguments are strings; without the int()
        # conversion `self.count < self.max_pages` raises TypeError.
        self.max_pages = int(max_pages)    # Max pages to crawl
        self.max_depth = int(max_depth)    # Max depth of crawling
        self.count = 0

    @staticmethod
    def _slug(url):
        """Return a filesystem-safe, length-capped name derived from ``url``."""
        tail = url.split("://", 1)[-1].strip("/")
        slug = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in tail)
        return slug[:100] or "index"

    def parse(self, response):
        """Save the page body to disk and follow its links within the limits."""
        import os
        from scrapy.exceptions import CloseSpider

        if self.count >= self.max_pages:
            # Stop the whole crawl instead of downloading pages we will discard.
            raise CloseSpider("max_pages reached")

        if not isinstance(response, scrapy.http.TextResponse):
            # Binary responses (images, PDFs, ...) have no CSS selectors and
            # are of no use to a text index.
            return

        os.makedirs("documents", exist_ok=True)
        # The counter prefix keeps names unique even when two URLs produce
        # the same slug.
        filename = f'documents/{self.count:05d}_{self._slug(response.url)}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.count += 1

        # Scrapy's depth middleware records the depth in response.meta['depth'].
        depth = response.meta.get("depth", 0)
        if self.count < self.max_pages and depth < self.max_depth:
            yield from response.follow_all(css='a::attr(href)', callback=self.parse)

# settings.py

# --- Project identity and spider discovery ---
BOT_NAME = 'scrapy_crawler'
NEWSPIDER_MODULE = 'scrapy_crawler.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# --- Crawl limits ---
DEPTH_LIMIT = 3  # Max depth set here

# --- Politeness: let AutoThrottle adapt the download delay ---
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY, AUTOTHROTTLE_MAX_DELAY = 5, 60

# command to run: scrapy crawl document_spider -a seed_url=http://example.com -a max_pages=100 -a max_depth=3


ModuleNotFoundError: No module named 'scrapy'

In [11]:
# file: indexer.py
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class Indexer:
    """TF-IDF index over text documents with cosine-similarity search.

    Documents are identified by their position in ``self.documents`` (the
    order in which ``add_document`` was called); ``query`` returns those
    positions. Only the vectorizer and matrix are persisted by ``save_index``,
    so callers need their own stable mapping from position to source file.
    """

    def __init__(self):
        self.documents = []
        self.vectorizer = TfidfVectorizer()
        # Set by create_index() or load_index(); None means "no index yet".
        self.matrix = None

    def add_document(self, file_path):
        """Read ``file_path`` as text and append its contents to the corpus."""
        # errors='replace' keeps one badly-encoded page from aborting the
        # whole indexing run; undecodable bytes become U+FFFD.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            self.documents.append(file.read())

    def create_index(self):
        """Fit the vectorizer on all added documents and build the matrix.

        Raises:
            ValueError: If no documents have been added.
        """
        if not self.documents:
            raise ValueError("No documents to index; call add_document() first")
        self.matrix = self.vectorizer.fit_transform(self.documents)

    def save_index(self, path='index.pkl'):
        """Pickle ``(vectorizer, matrix)`` to ``path``.

        Raises:
            RuntimeError: If no index has been created or loaded yet.
        """
        if getattr(self, 'matrix', None) is None:
            raise RuntimeError("No index to save; call create_index() first")
        with open(path, 'wb') as file:
            pickle.dump((self.vectorizer, self.matrix), file)

    def load_index(self, path='index.pkl'):
        """Load ``(vectorizer, matrix)`` previously written by ``save_index``."""
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load index files that this application wrote itself.
        with open(path, 'rb') as file:
            self.vectorizer, self.matrix = pickle.load(file)

    def query(self, text, top_k=5):
        """Return positions of the ``top_k`` most similar documents, best first.

        Args:
            text: Query string.
            top_k: Maximum number of results; values <= 0 yield no results.

        Returns:
            Array of document positions ordered by descending similarity.

        Raises:
            RuntimeError: If no index has been created or loaded yet.
        """
        matrix = getattr(self, 'matrix', None)
        if matrix is None:
            raise RuntimeError("No index available; call create_index() or load_index() first")
        query_vec = self.vectorizer.transform([text])
        scores = cosine_similarity(query_vec, matrix)
        ranked = scores.argsort()[0][::-1]
        # Slicing from the front avoids the `[-0:]` pitfall, where top_k=0
        # would select the entire array instead of nothing.
        return ranked[:max(int(top_k), 0)]

# Usage
# Guarded so that `from indexer import Indexer` (as app.py does) does not
# rebuild the index as an import side effect. In a notebook cell or when the
# file is run directly, __name__ is '__main__' and the build still runs.
if __name__ == '__main__':
    indexer = Indexer()
    # os.listdir returns entries in arbitrary order; sorting makes the
    # document positions returned by Indexer.query reproducible across runs.
    for document_file in sorted(os.listdir('documents')):
        document_path = os.path.join('documents', document_file)
        if os.path.isfile(document_path):  # skip subdirectories
            indexer.add_document(document_path)
    indexer.create_index()
    indexer.save_index()


ModuleNotFoundError: No module named 'sklearn'

In [12]:
# file: app.py
from flask import Flask, request, jsonify
from indexer import Indexer

# Flask application object; the route below is registered on it.
app = Flask(__name__)
# One module-level Indexer shared by every request. load_index() is called
# with no arguments, so it reads the Indexer's default path and must succeed
# at import time for the app to start.
indexer = Indexer()
indexer.load_index()

@app.route('/query', methods=['POST'])
def query():
    """Search the index.

    Expects a JSON object body ``{"query": <str>, "top_k": <int, optional>}``
    and responds with ``{"results": [<document position>, ...]}`` ordered from
    most to least similar. Responds with HTTP 400 and an ``error`` message
    when the body is not a JSON object, the query is missing or blank, or
    ``top_k`` is not a positive integer.
    """
    # silent=True yields None for a missing or malformed JSON body, so all
    # bad input is reported through the same 400 path below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    query_text = data.get('query')
    top_k = data.get('top_k', 5)
    if not isinstance(query_text, str) or not query_text.strip():
        return jsonify({'error': 'No query provided'}), 400
    # bool is a subclass of int, so reject it explicitly.
    if isinstance(top_k, bool) or not isinstance(top_k, int) or top_k < 1:
        return jsonify({'error': 'top_k must be a positive integer'}), 400

    results = indexer.query(query_text, top_k)
    # The indexer returns numpy integers, which jsonify cannot serialise;
    # convert to plain Python ints.
    return jsonify({'results': [int(doc_id) for doc_id in results]})

if __name__ == '__main__':
    import os

    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it off unless it
    # is explicitly requested, e.g. `FLASK_DEBUG=1 python app.py`.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled)

# Run this Flask app and use POST requests with JSON body {"query": "example search", "top_k": 5} to /query


ModuleNotFoundError: No module named 'flask'