In [2]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import collections


In [3]:
class Crawler:

    def __init__(self):
        self.to_visit = list()
        self.visited = set()

    def fetch(self, url):
        print('Fetching:', url)
        try:
            res = requests.get(url)
            res.raise_for_status()
            return res.content
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return None

    def get_current_url(self):
        while self.to_visit:
            url = self.to_visit.pop()
            if url not in self.visited:
                return url
        return None

    def get_links(self, content):
        soup = BeautifulSoup(content, 'html.parser')
        urls = [link.get('href') for link in soup.find_all('a', href=True)]
        for url in urls:
            if url.startswith('http'):
                self.to_visit.append(url)

    def crawl(self, url, depth=10):
        self.to_visit.append(url)
        while len(self.visited) < depth:
            current_url = self.get_current_url()
            if current_url is None:
                break
            content = self.fetch(current_url)
            if content:
                self.visited.add(current_url)
                self.get_links(content)
        return list(self.visited)

In [4]:
class Preprocessor:

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def clean_html(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator=' ')
        return text

    def preprocess_text(self, text):
        tokens = word_tokenize(text)
        tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in self.stop_words]
        stemmed_tokens = [self.stemmer.stem(word) for word in tokens]
        return ' '.join(stemmed_tokens), tokens


In [5]:
class Indexer:

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.inverted_index = collections.defaultdict(list)
        self.positional_index = collections.defaultdict(lambda: collections.defaultdict(list))

    def create_index(self, documents):
        tfidf_matrix = self.vectorizer.fit_transform(documents)
        feature_names = self.vectorizer.get_feature_names_out()

        for doc_id, doc in enumerate(documents):
            terms = doc.split()
            for pos, term in enumerate(terms):
                self.inverted_index[term].append(doc_id)
                self.positional_index[term][doc_id].append(pos)

        return tfidf_matrix, feature_names


In [6]:
class Searcher1:

    def __init__(self, tfidf_matrix, feature_names, vectorizer, inverted_index, positional_index):
        self.tfidf_matrix = tfidf_matrix
        self.feature_names = feature_names
        self.vectorizer = vectorizer
        self.inverted_index = inverted_index
        self.positional_index = positional_index

    def search_cosine(self, query, top_n=10):
        query_vector = self.vectorizer.transform([query])
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        related_docs_indices = cosine_similarities.argsort()[:-top_n-1:-1]
        return related_docs_indices, cosine_similarities[related_docs_indices]

    def search_boolean(self, query, operator='AND'):
        terms = query.split()
        result_sets = [set(self.inverted_index[term]) for term in terms if term in self.inverted_index]
        
        if operator == 'AND':
            result = set.intersection(*result_sets) if result_sets else set()
        elif operator == 'OR':
            result = set.union(*result_sets) if result_sets else set()
        elif operator == 'NOT':
            result = set(range(len(self.tfidf_matrix))) - set.union(*result_sets) if result_sets else set(range(len(self.tfidf_matrix)))
        
        return list(result)


In [8]:
class TolerantRetrieval1:

    def __init__(self, terms):
        self.terms = terms

    def edit_distance(self, word1, word2):
        dp = [[0] * (len(word2) + 1) for _ in range(len(word1) + 1)]
        for i in range(len(word1) + 1):
            for j in range(len(word2) + 1):
                if i == 0:
                    dp[i][j] = j
                elif j == 0:
                    dp[i][j] = i
                elif word1[i - 1] == word2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1]
                else:
                    dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
        return dp[-1][-1]
    def correct_spelling(self, query):
        corrections = []
        for word in query.split():
            min_dist = float('inf')
            corrected_word = word
            for term in self.terms:
                dist = self.edit_distance(word, term)
                if dist < min_dist:
                    min_dist = dist
                    corrected_word = term
            corrections.append(corrected_word)
        return ' '.join(corrections)



In [9]:
crawler = Crawler()
preprocessor = Preprocessor()
indexer = Indexer()

In [10]:
urls = crawler.crawl('https://en.wikipedia.org/wiki/Natural_language_processing', depth=10)

Fetching: https://en.wikipedia.org/wiki/Natural_language_processing
Fetching: https://www.mediawiki.org/
Fetching: https://wikimediafoundation.org/
Fetching: https://automattic.com/privacy-notice/
Fetching: https://subscribe.wordpress.com/
Fetching: https://automattic.com/work-with-us/
Fetching: https://wordpress.com/read/blogs/54117/posts/238666
Fetching: https://wordpress.com/abuse/?report_url=https://automattic.com/work-with-us/
Fetching: https://automattic.com/
Fetching: https://wordpress.com/read/blogs/54117/posts/39806


In [11]:
documents = []
raw_documents = []
for url in urls:
    content = crawler.fetch(url)
    if content:
        cleaned_text = preprocessor.clean_html(content)
        preprocessed_text, _ = preprocessor.preprocess_text(cleaned_text)
        documents.append(preprocessed_text)
        raw_documents.append(cleaned_text)

Fetching: https://wordpress.com/read/blogs/54117/posts/238666
Fetching: https://automattic.com/privacy-notice/
Fetching: https://www.mediawiki.org/
Fetching: https://automattic.com/work-with-us/
Fetching: https://automattic.com/
Fetching: https://en.wikipedia.org/wiki/Natural_language_processing
Fetching: https://wikimediafoundation.org/
Fetching: https://subscribe.wordpress.com/
Fetching: https://wordpress.com/read/blogs/54117/posts/39806
Fetching: https://wordpress.com/abuse/?report_url=https://automattic.com/work-with-us/


In [12]:
documents

['pleas enabl javascript browser enjoy',
 'privaci notic visitor user site automatt automatt home us news work us privaci notic visitor user site hi privaci notic explain automatt process inform visitor user websit connect servic provid jetpack includ woocommerc ship tax intensedeb akismet read privaci notic cover let talk first automatt privaci notic cover folk behind varieti product servic design allow blogger small busi owner creat publish manag websit offer design featur support bring websit life jetpack websit owner host websit elsewher connect websit featur tool avail woocommerc ship tax crowdsign help site owner creat quizz survey poll fit brand vision intens debat give site owner tool manag comment websit akismet help keep spam control filter spam million everi day keep thing simpl privaci notic refer user servic provid product websit administr contributor author user refer user websit visitor site read publish content interact site featur comment like respons follow put togeth

In [48]:
tfidf_matrix, feature_names = indexer.create_index(documents)

In [81]:
print("Inverted Index:")
for term, doc_ids in indexer.inverted_index.items():
    print(f"Term: {term}")
    print("Document IDs:", doc_ids)
    print()

Inverted Index:
Term: pleas
Document IDs: [0, 1, 1, 1, 1, 1, 5, 8, 9, 0, 1, 1, 1, 1, 1, 5, 8, 9]

Term: enabl
Document IDs: [0, 1, 1, 5, 5, 8, 0, 1, 1, 5, 5, 8]

Term: javascript
Document IDs: [0, 8, 0, 8]

Term: browser
Document IDs: [0, 1, 1, 1, 1, 1, 1, 8, 0, 1, 1, 1, 1, 1, 1, 8]

Term: enjoy
Document IDs: [0, 8, 0, 8]

Term: privaci
Document IDs: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 9, 9, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 9, 9, 9]

Term: notic
Document IDs: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 7, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 7, 9]

Term: visitor
Document IDs: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [82]:
print("Positional Index:")
for term, positions in indexer.positional_index.items():
    print(f"Term: {term}")
    for doc_id, pos_list in positions.items():
           print(f"Document ID: {doc_id}, Positions: {pos_list}")
    print()


Positional Index:
Term: pleas
Document ID: 0, Positions: [0, 0]
Document ID: 1, Positions: [606, 679, 873, 1140, 1227, 606, 679, 873, 1140, 1227]
Document ID: 5, Positions: [280, 280]
Document ID: 8, Positions: [0, 0]
Document ID: 9, Positions: [43, 43]

Term: enabl
Document ID: 0, Positions: [1, 1]
Document ID: 1, Positions: [656, 660, 656, 660]
Document ID: 5, Positions: [2613, 2634, 2613, 2634]
Document ID: 8, Positions: [1, 1]

Term: javascript
Document ID: 0, Positions: [2, 2]
Document ID: 8, Positions: [2, 2]

Term: browser
Document ID: 0, Positions: [3, 3]
Document ID: 1, Positions: [424, 435, 551, 622, 1064, 1175, 424, 435, 551, 622, 1064, 1175]
Document ID: 8, Positions: [3, 3]

Term: enjoy
Document ID: 0, Positions: [4, 4]
Document ID: 8, Positions: [4, 4]

Term: privaci
Document ID: 1, Positions: [0, 12, 18, 38, 45, 117, 146, 162, 174, 185, 609, 749, 816, 851, 950, 966, 1142, 1245, 1249, 1285, 0, 12, 18, 38, 45, 117, 146, 162, 174, 185, 609, 749, 816, 851, 950, 966, 1142, 12

In [17]:
searcher = Searcher1(tfidf_matrix, feature_names, indexer.vectorizer, indexer.inverted_index, indexer.positional_index)
tolerant_retrieval = TolerantRetrieval1(indexer.inverted_index.keys())

In [69]:
query = " cool"

In [70]:
corrected_query = tolerant_retrieval.correct_spelling(query)

In [71]:
preprocessed_query, _ = preprocessor.preprocess_text(corrected_query)

In [72]:
preprocessed_query

'tool'

In [73]:
indices, scores = searcher.search_cosine(preprocessed_query)
print("Cosine Similarity Search Results:")
for index, score in zip(indices, scores):
    print(f"Document: {urls[index]}, Score: {score}")

Cosine Similarity Search Results:
Document: https://automattic.com/, Score: 0.06869120588596025
Document: https://www.mediawiki.org/, Score: 0.06749311136672842
Document: https://automattic.com/privacy-notice/, Score: 0.02921349182012214
Document: https://wikimediafoundation.org/, Score: 0.011407022629637671
Document: https://en.wikipedia.org/wiki/Natural_language_processing, Score: 0.011119258672387002
Document: https://wordpress.com/abuse/?report_url=https://automattic.com/work-with-us/, Score: 0.0
Document: https://wordpress.com/read/blogs/54117/posts/39806, Score: 0.0
Document: https://subscribe.wordpress.com/, Score: 0.0
Document: https://automattic.com/work-with-us/, Score: 0.0
Document: https://wordpress.com/read/blogs/54117/posts/238666, Score: 0.0


In [77]:
boolean_query = "cool"
preprocessed_boolean_query, _ = preprocessor.preprocess_text(boolean_query)
boolean_results = searcher.search_boolean(preprocessed_boolean_query, operator='AND')

In [78]:
boolean_results

[]

In [79]:
print("\nBoolean Search Results:")
for index in boolean_results:
    print(f"Document: {urls[index]}")



Boolean Search Results:
