<a href="https://colab.research.google.com/github/JamesMungai254/Information-Retrieval-System/blob/main/Information_Retrieval_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step 1
> Preprocess the documents: lowercase, tokenize, drop stop words and non-alphanumeric tokens, then stem

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') in a later
# cell reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Single Porter stemmer instance, reused for every token in preprocess().
stemmer = PorterStemmer()
# English stop words held in a set so membership checks are cheap.
stop_words = {*stopwords.words('english')}

In [4]:
def preprocess(text):
  """Turn raw text into a list of stemmed, filtered tokens.

  The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
  are not purely alphanumeric, or that appear in ``stop_words``, are
  discarded; every remaining token is reduced with ``stemmer.stem``.

  Args:
    text: The string to process.

  Returns:
    The surviving stemmed tokens, in their original order.
  """
  stemmed = []
  for word in nltk.word_tokenize(text.lower()):
    # Skip punctuation / mixed-symbol tokens and stop words before stemming.
    if not word.isalnum() or word in stop_words:
      continue
    stemmed.append(stemmer.stem(word))
  return stemmed

In [6]:
# Fetch the 'punkt_tab' tokenizer data. preprocess() calls nltk.word_tokenize,
# which (in recent NLTK releases) needs this resource, so run this cell before
# the first preprocess() call.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Step 2
> Build an Inverted Index

In [7]:
from collections import defaultdict

# Toy corpus: document id -> raw text.
corpus = {
    'doc1': 'the quick brown fox',
    'doc2': 'the lazy dog jumps over the brown fox'
}

# Inverted index: stemmed term -> {document id: occurrences in that document}.
inverted_index = defaultdict(dict)

for doc_id, text in corpus.items():
  tokens = preprocess(text)
  for token in tokens:
    # A missing posting counts as 0, so the first sighting stores 1.
    inverted_index[token][doc_id] = inverted_index[token].get(doc_id, 0) + 1

print(inverted_index)

defaultdict(<class 'dict'>, {'quick': {'doc1': 1}, 'brown': {'doc1': 1, 'doc2': 1}, 'fox': {'doc1': 1, 'doc2': 1}, 'lazi': {'doc2': 1}, 'dog': {'doc2': 1}, 'jump': {'doc2': 1}})


## Step 3
> Compute TF-IDF scores

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Token lists for every document, in corpus order: row i of the matrix below
# belongs to the i-th entry of `corpus`.
documents = [preprocess(doc) for doc in corpus.values()]

# The text is already tokenized, filtered and stemmed by preprocess(), so the
# vectorizer only needs to split the re-joined tokens on whitespace. The
# default analyzer would re-tokenize with a pattern that only keeps tokens of
# two or more characters, silently dropping one-character tokens (e.g. "5" or
# "x") that preprocess() keeps and leaving the TF-IDF vocabulary out of sync
# with the inverted index.
vectorizer = TfidfVectorizer(analyzer=str.split)
tfidf_matrix = vectorizer.fit_transform([' '.join(doc) for doc in documents])
print(tfidf_matrix.toarray())


[[0.50154891 0.         0.50154891 0.         0.         0.70490949]
 [0.35520009 0.49922133 0.35520009 0.49922133 0.49922133 0.        ]]


## Step 4
> Query processing and ranking

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
query = 'brown fox'
# Run the query through the same preprocessing as the documents so its terms
# line up with the stemmed vocabulary.
query_tokens = preprocess(query)

# Project the query into the TF-IDF space fitted on the corpus
# (transform only -- the vectorizer is not re-fitted here).
query_vec = vectorizer.transform([' '.join(query_tokens)])

# Cosine similarity between the query and every document: one row for the
# query, one column per document in corpus order.
scores = cosine_similarity(query_vec, tfidf_matrix)
print(scores)

[[0.70929727 0.50232878]]


## Semantic Search using Sentence-BERT

In [11]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model; its files are fetched from the
# Hugging Face Hub the first time this runs.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the raw (un-stemmed) document texts, in corpus order.
doc_embeddings = model.encode(list(corpus.values()))

# Embed the same `query` used for the TF-IDF ranking instead of repeating the
# literal, so both rankings always answer the same question.
query_embeddings = model.encode([query])
querry_embeddings = query_embeddings  # alias kept for the original, misspelled name

# NOTE: this replaces the TF-IDF `scores` computed in Step 4; cells below that
# read `scores` rank by semantic similarity.
scores = cosine_similarity(query_embeddings, doc_embeddings)
print(scores)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[0.8616927 0.5332887]]


In [18]:


import numpy as np

# Pair every document id with its similarity to the query (row 0 of `scores`).
doc_scores = dict(zip(corpus, scores[0]))

# The best match is the document id carrying the largest score.
highest_scoring_doc = max(doc_scores, key=lambda doc_id: doc_scores[doc_id])

print(f"The document with the highest score is: {highest_scoring_doc}")
print(f"Its score is: {doc_scores[highest_scoring_doc]}")


The document with the highest score is: doc1
Its score is: 0.8616927266120911


In [20]:

# Look up the best-scoring document id again and show its original text.
highest_scoring_doc = max(doc_scores, key=lambda doc_id: doc_scores[doc_id])

print(corpus[highest_scoring_doc])


the quick brown fox
