# Set Up Environment
Install and import the necessary libraries, including Python 3.10+, sklearn 1.2+, Scrapy 2.11+, and Flask 2.2+.

In [3]:
# Import necessary libraries
import sklearn
import scrapy
from flask import Flask

# Create Scrapy Crawler
Create a Scrapy based Crawler for downloading web documents in html format.

In [4]:
# Import necessary Scrapy components
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# Define the Spider class
class MySpider(CrawlSpider):
    name = 'my_spider'
    allowed_domains = ['seed_domain.com']  # Replace with your seed domain
    start_urls = ['http://www.seed_domain.com']  # Replace with your seed URL

    # Define the rules for crawling
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    # Define the maximum depth for crawling
    max_depth = 2  # Replace with your max depth

    # Define the method for parsing items
    def parse_item(self, response):
        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)

# Initialize the CrawlerProcess
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'DOWNLOAD_MAXSIZE': 1000000,  # Limit the size of the downloaded content
    'CONCURRENT_REQUESTS': 10,  # Adjust this for concurrent crawling
    'DEPTH_LIMIT': MySpider.max_depth,  # Set the depth limit for crawling
})

# Start the crawling process
process.crawl(MySpider)
process.start()

2024-04-19 20:52:31 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
2024-04-19 20:52:31 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.12.6, cssselect 1.2.0, parsel 1.9.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.12.2 (main, Mar 12 2024, 11:22:28) [GCC 10.2.1 20210110], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Linux-6.2.0-1019-azure-x86_64-with-glibc2.31
2024-04-19 20:52:31 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-04-19 20:52:31 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-04-19 20:52:31 [scrapy.extensions.telnet] INFO: Telnet Password: e4d6e7db4d7d2eba
2024-04-19 20:52:31 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusag

2024-04-19 20:52:31 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2024-04-19 20:52:31 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.ref

# Configure Crawler Settings
Initialize the crawler using seed URL/Domain, Max Pages, Max Depth. Optionally, configure Concurrent crawling (AutoThrottle), Distributed crawling (scrapyd).

In [11]:
from scrapy.spiders import CrawlSpider
from scrapy.crawler import CrawlerRunner  # Import the CrawlerRunner class
from twisted.internet import reactor  # Import the reactor module

# Define the Spider class
class MySpider(CrawlSpider):
    name = 'my_spider'
    allowed_domains = ['new_seed_domain.com']  # Replace with your new seed domain
    start_urls = ['http://www.new_seed_domain.com']  # Replace with your new seed URL
    max_depth = 3  # Replace with your new max depth

# Update the CrawlerProcess settings
process.settings.set('CONCURRENT_REQUESTS', 20)  # Adjust this for concurrent crawling
process.settings.set('DEPTH_LIMIT', MySpider.max_depth)  # Set the new depth limit for crawling

# Define the function to start the crawling process
def start_crawling():
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())

# Start the new crawling process
start_crawling()


2024-04-19 21:14:39 [scrapy.addons] INFO: Enabled addons:
[]
2024-04-19 21:14:39 [scrapy.extensions.telnet] INFO: Telnet Password: c4f7a21896e90b70
2024-04-19 21:14:39 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2024-04-19 21:14:39 [scrapy.crawler] INFO: Overridden settings:
{}
2024-04-19 21:14:39 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrap

# Create Scikit-Learn Indexer
Create a Scikit-Learn based Indexer for constructing an inverted index in pickle format.

In [10]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle

# Define the list of documents
documents = []

# Read the downloaded HTML files
# Check if documents contain any stop words
if not any(documents):
    raise ValueError("Documents are empty or only contain stop words")

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Save the tfidf_matrix and cosine_sim_matrix as pickle files
with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

with open('cosine_sim_matrix.pkl', 'wb') as f:
    pickle.dump(cosine_sim_matrix, f)

ValueError: empty vocabulary; perhaps the documents only contain stop words

# Configure Indexer Settings
Configure the indexer to use TF-IDF score/weight representation, Cosine similarity. Optionally, configure Vector embedding representation (word2vec), Neural/Semantic search kNN similarity (FAISS).

In [None]:
# Optional: Configure Vector embedding representation (word2vec)
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

# Prepare corpus for Word2Vec
corpus = [doc.split(' ') for doc in documents]

# Train Word2Vec model
model = Word2Vec(corpus, min_count=1)

# Save model
model.save('model.bin')

# Load model
model = Word2Vec.load('model.bin')

# Fit a 2D PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# Optional: Configure Neural/Semantic search kNN similarity (FAISS)
import numpy as np
import faiss

# Prepare data for FAISS
data = np.array(result).astype('float32')

# Build the index
index = faiss.IndexFlatL2(data.shape[1])

# Add vectors to the index
index.add(data)

# Save the index
faiss.write_index(index, 'vector.index')

# Create Flask Processor
Create a Flask based Processor for handling free text queries in json format.

In [None]:
# Import necessary libraries
from flask import Flask, request, jsonify
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import nltk
from nltk.corpus import wordnet

# Load the tfidf_matrix and cosine_sim_matrix
with open('tfidf_matrix.pkl', 'rb') as f:
    tfidf_matrix = pickle.load(f)

with open('cosine_sim_matrix.pkl', 'rb') as f:
    cosine_sim_matrix = pickle.load(f)

# Create a Flask app
app = Flask(__name__)

# Define the route for handling free text queries
@app.route('/query', methods=['POST'])
def handle_query():
    # Get the query from the request
    query = request.json.get('query', '')

    # Validate the query
    if not query:
        return jsonify({'error': 'Invalid query'}), 400

    # Optional: Query spelling-correction/suggestion
    corrected_query = ' '.join([nltk.corpus.wordnet.morphy(word) if nltk.corpus.wordnet.morphy(word) else word for word in query.split()])

    # Transform the query using the TfidfVectorizer
    query_vector = tfidf_matrix.transform([corrected_query])

    # Calculate the cosine similarity between the query and the documents
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the top-K ranked results
    top_k = sim_scores.argsort()[-10:][::-1]

    # Return the results
    return jsonify({'results': top_k.tolist()})

# Run the Flask app
if __name__ == '__main__':
    app.run(port=5000)

# Configure Processor Settings
Configure the processor to handle Query validation/error-checking, Top-K ranked results. Optionally, configure Query spelling-correction/suggestion (NLTK), query expansion (WordNet).

In [None]:
# Import necessary libraries
from flask import Flask, request, jsonify
import nltk
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Load the tfidf_matrix
with open('tfidf_matrix.pkl', 'rb') as f:
    tfidf_matrix = pickle.load(f)

# Create a Flask app
app = Flask(__name__)

# Define the route for handling free text queries
@app.route('/query', methods=['POST'])
def handle_query():
    # Get the query from the request
    query = request.json.get('query', '')

    # Validate the query
    if not query:
        return jsonify({'error': 'Invalid query'}), 400

    # Optional: Query spelling-correction/suggestion (NLTK)
    corrected_query = ' '.join([nltk.corpus.wordnet.morphy(word) if nltk.corpus.wordnet.morphy(word) else word for word in query.split()])

    # Transform the query using the TfidfVectorizer
    vectorizer = TfidfVectorizer()
    query_vector = vectorizer.transform([corrected_query])

    # Calculate the cosine similarity between the query and the documents
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the top-K ranked results
    top_k = sim_scores.argsort()[-10:][::-1]

    # Return the results
    return jsonify({'results': top_k.tolist()})

# Optional: Query expansion (WordNet)
@app.route('/expand', methods=['POST'])
def expand_query():
    # Get the query from the request
    query = request.json.get('query', '')

    # Validate the query
    if not query:
        return jsonify({'error': 'Invalid query'}), 400

    # Expand the query using WordNet
    expanded_query = ' '.join([syn.lemmas()[0].name() for syn in wordnet.synsets(query)])

    # Return the expanded query
    return jsonify({'expanded_query': expanded_query})

# Run the Flask app
if __name__ == '__main__':
    app.run(port=5000)

# Test the System
Run tests to ensure the system is working as expected.

In [None]:
# Import necessary libraries
import requests
import json

# Define the base URL for the Flask app
base_url = 'http://localhost:5000'

# Define a list of test queries
test_queries = ['test query 1', 'test query 2', 'test query 3']

# Loop through the test queries
for query in test_queries:
    # Send a POST request to the /query endpoint
    response = requests.post(f'{base_url}/query', json={'query': query})

    # Check if the request was successful
    if response.status_code == 200:
        # Print the results
        print(f'Results for "{query}":', response.json()['results'])
    else:
        # Print the error
        print(f'Error for "{query}":', response.json()['error'])

    # Send a POST request to the /expand endpoint
    response = requests.post(f'{base_url}/expand', json={'query': query})

    # Check if the request was successful
    if response.status_code == 200:
        # Print the expanded query
        print(f'Expanded query for "{query}":', response.json()['expanded_query'])
    else:
        # Print the error
        print(f'Error for "{query}":', response.json()['error'])