# Set Up Environment
Install and import the required libraries: scikit-learn 1.2+, Scrapy 2.11+, and Flask 2.2+. The notebook itself requires Python 3.10 or newer (Python is provided by the kernel and cannot be installed with pip).

In [3]:
# Install specific versions of libraries if not already installed
!pip install python==3.10
!pip install scikit-learn==1.2
!pip install Scrapy==2.11
!pip install Flask==2.2

# Import necessary libraries
!pip install scikit-learn
import sklearn
import scrapy
from flask import Flask

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement python==3.10 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for python==3.10[0m[31m
[0mDefaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn==1.2
  Using cached scikit-learn-1.2.0.tar.gz (7.2 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[33 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/usr/local/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 353, in <module>
  [31m   [0m     main()
  [31m   [0m   File "/u

ImportError: cannot import name 'url_quote' from 'werkzeug.urls' (/home/vscode/.local/lib/python3.12/site-packages/werkzeug/urls.py)

# Create Scrapy Crawler
Create a Scrapy-based crawler that downloads web documents in HTML format.

In [None]:
# Import necessary Scrapy components
import hashlib
import re
from urllib.parse import urlsplit

from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Define the Spider class
class MySpider(CrawlSpider):
    """Crawl a seed site and save each page handled by ``parse_item`` as an ``.html`` file.

    Class attributes:
        name: Spider name.
        allowed_domains: Domains the crawl is meant to stay on.
        start_urls: URLs the crawl starts from.
        rules: A single rule that follows every extracted link and passes each
            fetched page to ``parse_item``.
        max_depth: Maximum crawl depth; read by the notebook when it builds the
            ``DEPTH_LIMIT`` setting.
    """

    name = 'my_spider'
    allowed_domains = ['seed_domain.com']  # Replace with your seed domain
    start_urls = ['http://www.seed_domain.com']  # Replace with your seed URL

    # Define the rules for crawling
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    # Define the maximum depth for crawling
    max_depth = 2  # Replace with your max depth

    @staticmethod
    def _filename_for(url):
        """Return a filesystem-safe, per-URL unique ``.html`` filename.

        The previous scheme (second-to-last ``/``-separated URL segment) mapped
        many different URLs to the same name -- e.g. every top-level page became
        ``<domain>.html`` -- so pages silently overwrote each other. The name is
        now a readable slug of host + path plus a short hash of the full URL.
        """
        parts = urlsplit(url)
        slug = re.sub(r'[^A-Za-z0-9._-]+', '_', parts.netloc + parts.path).strip('_')
        # Hash the full URL (including the query string) so distinct URLs never collide.
        digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:10]
        # Truncate the slug to keep the filename well within filesystem limits.
        return f"{slug[:80] or 'page'}-{digest}.html"

    # Define the method for parsing items
    def parse_item(self, response):
        """Write the raw response body to an ``.html`` file in the working directory."""
        filename = self._filename_for(response.url)
        with open(filename, 'wb') as f:
            f.write(response.body)

# Crawl-wide settings: browser-like user agent, a cap on response size,
# the request concurrency, and the depth limit taken from the spider class.
crawler_settings = dict(
    USER_AGENT='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    DOWNLOAD_MAXSIZE=1000000,  # Limit the size of the downloaded content
    CONCURRENT_REQUESTS=10,  # Adjust this for concurrent crawling
    DEPTH_LIMIT=MySpider.max_depth,  # Set the depth limit for crawling
)

# Build the process from those settings, schedule the spider, then run it.
process = CrawlerProcess(settings=crawler_settings)
process.crawl(MySpider)
process.start()

# Configure Crawler Settings
Initialize the crawler with a seed URL/domain, a maximum page count, and a maximum depth. Optionally, configure concurrent crawling (AutoThrottle) and distributed crawling (Scrapyd).

In [None]:
# Configure Crawler Settings
# Re-point the spider at a different site by overwriting the class attributes
# of MySpider; the class object itself is reused for the second crawl.
MySpider.allowed_domains = ['new_seed_domain.com']  # Replace with your new seed domain
MySpider.start_urls = ['http://www.new_seed_domain.com']  # Replace with your new seed URL
MySpider.max_depth = 3  # Replace with your new max depth

# Optional: Configure AutoThrottle for concurrent crawling
# These values are written onto the settings object of the existing `process`.
process.settings.set('AUTOTHROTTLE_ENABLED', True)
process.settings.set('AUTOTHROTTLE_START_DELAY', 5)
process.settings.set('AUTOTHROTTLE_MAX_DELAY', 60)
process.settings.set('AUTOTHROTTLE_TARGET_CONCURRENCY', 1.0)

# Optional: Configure Scrapyd for distributed crawling
# Note: You need to have Scrapyd server running and accessible
# NOTE(review): nothing in this notebook reads SCRAPYD_SERVER / SCRAPYD_PROJECT,
# and they do not look like built-in Scrapy settings -- confirm they have any
# effect (Scrapyd deployments are presumably configured outside these settings).
process.settings.set('SCRAPYD_SERVER', 'localhost:6800')  # Replace with your Scrapyd server
process.settings.set('SCRAPYD_PROJECT', 'my_project')  # Replace with your Scrapyd project

# Update the CrawlerProcess settings
process.settings.set('CONCURRENT_REQUESTS', 20)  # Adjust this for concurrent crawling
process.settings.set('DEPTH_LIMIT', MySpider.max_depth)  # Set the new depth limit for crawling

# Start the new crawling process
# NOTE(review): `process.start()` has already been called on this same `process`
# object in the crawler cell. A Twisted reactor generally cannot be started twice
# in one Python process, so this second call will likely raise
# ReactorNotRestartable -- confirm; a kernel restart or one process per crawl
# may be required.
process.crawl(MySpider)
process.start()  # the script will block here until the crawling is finished

# Create Scikit-Learn Indexer
Create a scikit-learn-based indexer that constructs an inverted index in pickle format.

In [None]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle

# Collect the crawled pages in a stable (sorted) order so that row i of the
# TF-IDF matrix always refers to document_files[i]; os.listdir() alone returns
# entries in arbitrary order.
document_files = sorted(name for name in os.listdir() if name.endswith(".html"))

# Define the list of documents
documents = []

# Read the downloaded HTML files
for file in document_files:
    # The crawler wrote raw response bytes, so decode explicitly as UTF-8 and
    # replace undecodable bytes rather than depending on the platform default
    # encoding (which can raise UnicodeDecodeError on some pages).
    with open(file, 'r', encoding='utf-8', errors='replace') as f:
        documents.append(f.read())

# Fail early with an actionable message instead of the vectorizer's
# "empty vocabulary" error when nothing has been crawled yet.
if not documents:
    raise ValueError(
        "No .html files found in the current directory; run the crawler first."
    )

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Calculate the cosine similarity matrix (document-to-document)
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Save the tfidf_matrix and cosine_sim_matrix as pickle files
with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

with open('cosine_sim_matrix.pkl', 'wb') as f:
    pickle.dump(cosine_sim_matrix, f)

# Also persist the fitted vectorizer: the matrix alone carries no vocabulary,
# so a query cannot be projected into the same feature space without it.
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Configure Indexer Settings
Configure the indexer to use a TF-IDF score/weight representation with cosine similarity. Optionally, configure a vector-embedding representation (word2vec) and neural/semantic kNN similarity search (FAISS).

In [None]:
# Optional: Configure Vector embedding representation (word2vec)
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

# Prepare corpus for Word2Vec: one token list per document.
# str.split() with no argument splits on any run of whitespace, so repeated
# spaces no longer yield empty-string tokens and newlines no longer glue two
# words into one token (both happened with split(' ')).
corpus = [doc.split() for doc in documents]

# Train Word2Vec model (min_count=1 keeps every token, even ones seen once)
model = Word2Vec(corpus, min_count=1)

# Save model
model.save('model.bin')

# Load model (round-trip through disk, as a downstream consumer would)
model = Word2Vec.load('model.bin')

# Fit a 2D PCA model to the vectors.
# `model[model.wv.vocab]` relied on APIs that were removed in gensim 4
# (subscripting the model and `wv.vocab`); the embedding matrix is read
# directly from the keyed vectors instead, one row per vocabulary word.
X = model.wv.vectors
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# Optional: Configure Neural/Semantic search kNN similarity (FAISS)
import numpy as np
import faiss

# Copy the PCA output into a float32 array for FAISS.
data = np.array(result, dtype='float32')

# Exact (brute-force) L2 index sized to the vector width, filled with every row.
index = faiss.IndexFlatL2(data.shape[1])
index.add(data)

# Persist the populated index to disk.
faiss.write_index(index, 'vector.index')

# Create Flask Processor
Create a Flask-based processor that handles free-text queries in JSON format.

In [None]:
# Import necessary libraries
from flask import Flask, request, jsonify
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import nltk
from nltk.corpus import wordnet

def _load_pickle(path):
    """Read one pickled object from ``path``.

    pickle can execute arbitrary code while loading, so only files written by
    the indexer cell of this notebook should be passed in.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the artefacts produced by the indexer.
tfidf_matrix = _load_pickle('tfidf_matrix.pkl')
cosine_sim_matrix = _load_pickle('cosine_sim_matrix.pkl')

# Flask application that the route handlers below attach to.
app = Flask(__name__)

# Define the route for handling free text queries
@app.route('/query', methods=['POST'])
def handle_query():
    """Rank the indexed documents against a free-text query.

    Expects a JSON object body of the form ``{"query": "<text>"}``.

    Returns:
        ``{"results": [row indices]}`` -- up to 10 row indices of
        ``tfidf_matrix``, ordered by descending cosine similarity -- or
        ``({"error": "Invalid query"}, 400)`` when the body is not a JSON
        object or the query is missing, blank, or not a string.

    Relies on the module-level ``vectorizer`` (the TfidfVectorizer fitted in
    the indexer cell) and ``tfidf_matrix``.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # aborting the request; non-object JSON (e.g. a list) is rejected too.
    payload = request.get_json(silent=True)
    query = payload.get('query', '') if isinstance(payload, dict) else ''

    # Validate the query
    if not isinstance(query, str) or not query.strip():
        return jsonify({'error': 'Invalid query'}), 400

    # Optional: Query spelling-correction/suggestion.
    # morphy() returns a base form or None; it is looked up once per word and
    # the original word is kept when there is no base form.
    try:
        normalized_words = []
        for word in query.split():
            base_form = wordnet.morphy(word)
            normalized_words.append(base_form if base_form else word)
        corrected_query = ' '.join(normalized_words)
    except LookupError:
        # The WordNet corpus is not installed; this step is optional, so fall
        # back to the raw query rather than failing the request.
        corrected_query = query

    # Project the query into the documents' feature space. This must use the
    # fitted vectorizer: the TF-IDF matrix itself has no transform() method.
    query_vector = vectorizer.transform([corrected_query])

    # Calculate the cosine similarity between the query and the documents
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the top-K ranked results (K = 10), best match first
    top_k = sim_scores.argsort()[-10:][::-1]

    # Return the results
    return jsonify({'results': top_k.tolist()})

# Run the Flask app
# Starts Flask's built-in server on port 5000 when this code runs as the main module.
# NOTE(review): inside a notebook kernel this guard is presumably true, and
# app.run() would then block this cell until interrupted, so later cells would
# not execute -- confirm how the server is meant to be started.
if __name__ == '__main__':
    app.run(port=5000)

# Configure Processor Settings
Configure the processor to handle query validation/error checking and top-K ranked results. Optionally, configure query spelling correction/suggestion (NLTK) and query expansion (WordNet).

In [None]:
# Import necessary libraries
from flask import Flask, request, jsonify
import nltk
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Restore the document/term TF-IDF matrix written by the indexer.
# (pickle can execute code while loading -- only load files this notebook produced.)
with open('tfidf_matrix.pkl', mode='rb') as matrix_file:
    tfidf_matrix = pickle.load(matrix_file)

# Fresh Flask application; the routes defined below are registered on it.
app = Flask(__name__)

# Define the route for handling free text queries
@app.route('/query', methods=['POST'])
def handle_query():
    """Rank the indexed documents against a free-text query.

    Expects a JSON object body of the form ``{"query": "<text>"}``.

    Returns:
        ``{"results": [row indices]}`` -- up to 10 row indices of
        ``tfidf_matrix``, ordered by descending cosine similarity -- or
        ``({"error": "Invalid query"}, 400)`` when the body is not a JSON
        object or the query is missing, blank, or not a string.

    Relies on names defined by the indexer cell: the fitted ``vectorizer``
    and ``cosine_similarity`` (this cell does not import the latter itself).
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # aborting the request; non-object JSON (e.g. a list) is rejected too.
    payload = request.get_json(silent=True)
    query = payload.get('query', '') if isinstance(payload, dict) else ''

    # Validate the query
    if not isinstance(query, str) or not query.strip():
        return jsonify({'error': 'Invalid query'}), 400

    # Optional: Query spelling-correction/suggestion (NLTK).
    # morphy() returns a base form or None; it is looked up once per word and
    # the original word is kept when there is no base form.
    try:
        normalized_words = []
        for word in query.split():
            base_form = wordnet.morphy(word)
            normalized_words.append(base_form if base_form else word)
        corrected_query = ' '.join(normalized_words)
    except LookupError:
        # The WordNet corpus is not installed; this step is optional, so fall
        # back to the raw query rather than failing the request.
        corrected_query = query

    # Transform the query with the vectorizer that was fitted on the documents.
    # A newly constructed TfidfVectorizer() has no vocabulary, so it can neither
    # transform text nor produce columns that line up with tfidf_matrix.
    query_vector = vectorizer.transform([corrected_query])

    # Calculate the cosine similarity between the query and the documents
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get the top-K ranked results (K = 10), best match first
    top_k = sim_scores.argsort()[-10:][::-1]

    # Return the results
    return jsonify({'results': top_k.tolist()})

# Optional: Query expansion (WordNet)
@app.route('/expand', methods=['POST'])
def expand_query():
    """Expand a free-text query with WordNet terms.

    Expects a JSON object body of the form ``{"query": "<text>"}``.

    Each whitespace-separated word is looked up on its own; previously the
    whole query string was passed to ``wordnet.synsets`` as a single lemma,
    which finds nothing for multi-word queries. For every word, the word
    itself is kept, followed by the first lemma name of each of its synsets.
    Repeated terms are dropped (case-insensitively), keeping first-seen order.

    Returns:
        ``{"expanded_query": "<space-joined terms>"}``, or
        ``({"error": "Invalid query"}, 400)`` when the body is not a JSON
        object or the query is missing, blank, or not a string.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # aborting the request; non-object JSON (e.g. a list) is rejected too.
    payload = request.get_json(silent=True)
    query = payload.get('query', '') if isinstance(payload, dict) else ''

    # Validate the query
    if not isinstance(query, str) or not query.strip():
        return jsonify({'error': 'Invalid query'}), 400

    # Expand the query using WordNet, one word at a time
    expanded_terms = []
    seen = set()
    for word in query.split():
        candidates = [word]
        candidates.extend(syn.lemmas()[0].name() for syn in wordnet.synsets(word))
        for term in candidates:
            key = term.lower()
            if key not in seen:
                seen.add(key)
                expanded_terms.append(term)
    expanded_query = ' '.join(expanded_terms)

    # Return the expanded query
    return jsonify({'expanded_query': expanded_query})

# Run the Flask app
# Starts Flask's built-in server on port 5000, serving the /query and /expand
# routes registered on `app` in this cell, when this code runs as the main module.
# NOTE(review): inside a notebook kernel this guard is presumably true, and
# app.run() would then block this cell until interrupted, so the test cell
# could not run in the same kernel -- confirm how the server is meant to be started.
if __name__ == '__main__':
    app.run(port=5000)

# Test the System
Run tests to ensure the system is working as expected.

In [None]:
# Import necessary libraries
import requests
import json

# Define the base URL for the Flask app
base_url = 'http://localhost:5000'

# Define a list of test queries
test_queries = ['test query 1', 'test query 2', 'test query 3']

# One entry per endpoint: (path, key holding the result, label used when printing).
# Driving both endpoints from this table replaces two copy-pasted request blocks.
endpoint_checks = [
    ('/query', 'results', 'Results for'),
    ('/expand', 'expanded_query', 'Expanded query for'),
]

# Loop through the test queries and exercise every endpoint with each one
for query in test_queries:
    for path, result_key, label in endpoint_checks:
        try:
            # A timeout keeps the cell from hanging forever on an unresponsive server.
            response = requests.post(f'{base_url}{path}', json={'query': query}, timeout=10)
        except requests.exceptions.RequestException as exc:
            # Server not running / connection problems: report and move on.
            print(f'Error for "{query}":', exc)
            continue

        # Error responses are not guaranteed to carry a JSON body (e.g. an HTML
        # 500 page), so decode defensively instead of indexing ['error'] blindly.
        try:
            payload = response.json()
        except ValueError:
            payload = None

        if response.status_code == 200 and isinstance(payload, dict):
            # Print the results
            print(f'{label} "{query}":', payload[result_key])
        else:
            # Print the error, falling back to the HTTP status when no message is available
            message = payload.get('error') if isinstance(payload, dict) else None
            print(f'Error for "{query}":', message or f'HTTP {response.status_code}')