## Solution 1

In [2]:
##### Importing necessary libraries

import pandas as pd
import numpy as np
from datasets import load_dataset
from preprocessor import Preprocessor
from indexer import Indexer
from linkedlist import LinkedList
from collections import OrderedDict
import linkedlist
import inspect as inspector
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marianivethaantonypushparaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [136]:
##### Make sure huggingface_hub datasets in up-to-date

!pip install --upgrade huggingface_hub datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# Fetch the dataset and keep only the first 2000 rows of its 'train' split.
data = load_dataset("harshi321/netflix-movies_shows")['train'][:2000]

In [21]:
##### Checking the data

# Preview a few entries only; printing the whole column floods the cell output.
print("Data: ", data['description'][:3])
print("Type of text data: ",type(data['description']))
print("Length of the dataset: ",len( data['description']))


Type of text data:  <class 'list'>
Length of the dataset:  2000


In [22]:
class Run_query:
    """Index a corpus and answer conjunctive (AND) queries over it.

    The class wires together a ``Preprocessor`` (tokenization) and an
    ``Indexer`` (inverted index, skip pointers, TF-IDF weights), and
    intersects linked-list posting lists with or without skip pointers.
    """

    def __init__(self):
        # Initialize preprocessor and indexer objects
        self.preprocessor = Preprocessor()
        self.indexer = Indexer()

    @staticmethod
    def _advance(node, other_value, skip):
        """Return the node to visit after ``node`` during an intersection.

        With ``skip`` enabled, the skip pointer is followed only when the node
        has one and its target value is still strictly below ``other_value``;
        in every other case the regular ``next`` pointer is followed.
        """
        if skip and node.skip and node.skip.value < other_value:
            return node.skip
        return node.next

    def _merge(self, posting_list_1, posting_list_2, skip):
        """Intersect two posting lists.

        Args:
            posting_list_1, posting_list_2: linked lists exposing ``start_node``
                whose nodes have ``value``, ``next`` and ``skip``.
                NOTE(review): the walk assumes values are sorted ascending --
                confirm against the indexer.
            skip: when true, use skip pointers while advancing and add skip
                connections to the result list.

        Returns:
            ``(result, comparisons_count)`` where ``result`` is a new
            ``LinkedList`` of the common values.
        """
        result = LinkedList()  # Result linked list to store merged values
        comparisons_count = 0  # Counter to track number of comparisons

        ptr1 = posting_list_1.start_node
        ptr2 = posting_list_2.start_node
        while ptr1 is not None and ptr2 is not None:
            if ptr1.value == ptr2.value:
                # A match is recorded but not counted, and only ptr1 moves;
                # ptr2 catches up (and is counted) on the next iteration.
                result.insert_at_end(ptr1.value)
                ptr1 = ptr1.next
            elif ptr1.value < ptr2.value:
                ptr1 = self._advance(ptr1, ptr2.value, skip)
                comparisons_count += 1
            else:
                ptr2 = self._advance(ptr2, ptr1.value, skip)
                comparisons_count += 1

        if skip:
            result.add_skip_connections()  # Add skip links to result list

        return result, comparisons_count  # Return merged list and comparison count

    def _daat_and(self, query_terms, skip=False, tf_idf=False):
        """Run a document-at-a-time AND over ``query_terms``.

        Args:
            query_terms: tokenized query terms.
            skip: forward to ``_merge`` to enable skip pointers.
            tf_idf: when true, order the matching documents by the largest
                ``indexer.tf_idf[term][doc_id]`` over the query terms,
                highest first.

        Returns:
            ``(top_results, comparisons_count, total_matches)`` where
            ``top_results`` holds at most five document ids. A query with no
            terms yields ``([], 0, 0)``.
        """
        index = self.indexer.get_index()
        posting_lists = []
        for query_term in query_terms:
            try:
                posting_lists.append(index[query_term])  # Get posting list for each term
            except KeyError:
                posting_lists.append(LinkedList())  # If term not found, use an empty list

        # A query that tokenizes to nothing (e.g. only stop words) has no
        # posting list to start from; report an empty result instead of
        # failing on the [0] lookup below.
        if not posting_lists:
            return [], 0, 0

        # Sort posting lists by their lengths (smallest list first for efficient merging)
        sorted_posting_lists = sorted(posting_lists, key=lambda x: x.length)
        result = sorted_posting_lists[0]  # Start with the smallest list
        comparisons_count = 0

        # Merge posting lists
        for posting_list in sorted_posting_lists[1:]:
            result, temp_comparisons_count = self._merge(result, posting_list, skip)
            comparisons_count += temp_comparisons_count

        result = result.traverse_list()  # Traverse merged result list
        if tf_idf:
            # Score each matching document by its best TF-IDF weight across the query terms
            tf_idf_max = {
                doc_id: max(self.indexer.tf_idf[query_term][doc_id] for query_term in query_terms)
                for doc_id in result
            }

            # Sort documents by their highest TF-IDF score in descending order
            result = sorted(result, key=lambda x: tf_idf_max[x], reverse=True)

        return result[:5], comparisons_count, len(result)  # Return top 5 results, comparison count, and total results

    def _get_postings(self, term):
        """Return the posting list stored for ``term``; raises ``KeyError`` if the term is not indexed."""
        ps_list = self.indexer.get_index()[term]
        return ps_list

    def _output_formatter(self, op):
        """Convert ``op`` to a list of ints and return it with its length; ``None`` or empty gives ``([], 0)``."""
        if op is None or len(op) == 0:
            return [], 0
        op_no_score = [int(i) for i in op]  # Convert to list of integers (document IDs)
        results_cnt = len(op_no_score)
        return op_no_score, results_cnt  # Return formatted result and count of documents

    def run_indexer(self, corpus):
        """Tokenize every document in ``corpus`` and build the index.

        Document ids are the positions of the documents in ``corpus``.
        """
        # Tokenize everything first, then feed the indexer
        docs = {doc_id: self.preprocessor.tokenizer(text) for doc_id, text in enumerate(corpus)}

        # Generate inverted index for each document
        for doc_id, tokens in docs.items():
            self.indexer.generate_inverted_index(doc_id, tokens)

        # Sort terms, add skip connections, and calculate TF-IDF
        self.indexer.sort_terms()
        self.indexer.add_skip_connections()
        self.indexer.calculate_tf_idf()

    def run_queries(self, query_list):
        """Run each query and collect postings and AND results.

        Returns:
            A dict with four sections: ``'postingsList'`` and
            ``'postingsListSkip'`` keyed by term, and ``'daatAndTfIdf'`` and
            ``'daatAndSkipTfIdf'`` keyed by the stripped query string. Each
            AND entry holds ``'results'``, ``'num_docs'`` and
            ``'num_comparisons'``.
        """
        output_dict = {
            'postingsList': {},
            'postingsListSkip': {},
            'daatAndTfIdf': {},
            'daatAndSkipTfIdf': {}
        }

        # Process each query in the query_list
        for query in tqdm(query_list):
            input_term_arr = self.preprocessor.tokenizer(query)  # Tokenized query

            # For each query term, get the postings list and skip postings list
            for term in input_term_arr:
                try:
                    ps_list = self._get_postings(term)
                    postings = ps_list.traverse_list()  # Get list of document IDs for term
                    skip_postings = ps_list.traverse_skips()  # Get list with skip pointers
                except KeyError:
                    # If no postings found for the term, return empty lists
                    postings, skip_postings = [], []
                output_dict['postingsList'][term] = postings
                output_dict['postingsListSkip'][term] = skip_postings

            # Perform AND operations with and without skip and with TF-IDF
            and_op_no_skip_sorted, and_comparisons_no_skip_sorted, and_results_cnt_no_skip_sorted = self._daat_and(input_term_arr, tf_idf=True)
            and_op_no_score_skip_sorted, and_comparisons_skip_sorted, and_results_cnt_skip_sorted = self._daat_and(input_term_arr, skip=True, tf_idf=True)

            # Store results of AND operations in output dictionary
            output_dict['daatAndTfIdf'][query.strip()] = {
                'results': and_op_no_skip_sorted,
                'num_docs': and_results_cnt_no_skip_sorted,
                'num_comparisons': and_comparisons_no_skip_sorted
            }

            output_dict['daatAndSkipTfIdf'][query.strip()] = {
                'results': and_op_no_score_skip_sorted,
                'num_docs': and_results_cnt_skip_sorted,
                'num_comparisons': and_comparisons_skip_sorted
            }

        return output_dict  # Return all results for the queries


In [30]:
# Build the index over all descriptions once, then query it.
runner = Run_query()
runner.run_indexer(data['description'])

# Edit this list to try other queries.
queries = ['romance']

output_dict = runner.run_queries(queries)
print(output_dict['daatAndSkipTfIdf'])


##### Titles of the documents returned by the skip-pointer AND search

for query, documents in output_dict['daatAndSkipTfIdf'].items():
    print(f"Query: {query}")
    print("Retrieved Documents with skip pointers:")
    # Result entries are positions in the corpus, so they index the title column directly.
    for doc_id in documents['results']:
        print(data['title'][doc_id])
    print("-" * 40)


100%|██████████| 1/1 [00:00<00:00, 6364.65it/s]

{'romance': {'results': [798, 402, 492, 1038, 1269], 'num_docs': 42, 'num_comparisons': 0}}
Query: romance
Retrieved Documents with skip pointers:
Love Jones
The Last Letter From Your Lover
Midnight Sun
Dancing Angels
Geez & Ann
----------------------------------------





## Solution 2

In [34]:
##### Importing necessary libraries

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [43]:
class QueryProcessor:
    """Rank documents against a free-text query by TF-IDF cosine similarity.

    The TF-IDF matrix of ``descriptions`` is fitted on first use and cached, so
    repeated queries do not refit the vectorizer. Call
    ``compute_tfidf_vectors()`` again after changing ``descriptions`` to
    refresh the cache.
    """

    def __init__(self, titles, descriptions):
        self.vectorizer = TfidfVectorizer()
        self.titles = titles
        self.descriptions = descriptions
        self._tfidf_matrix = None  # filled by compute_tfidf_vectors()

    # Convert documents to TF-IDF vectors using descriptions
    def compute_tfidf_vectors(self):
        """Fit the vectorizer on the descriptions, cache and return the resulting matrix."""
        self._tfidf_matrix = self.vectorizer.fit_transform(self.descriptions)
        return self._tfidf_matrix

    # Compute cosine similarity between the query and documents
    def compute_cosine_similarity(self, tfidf_matrix, query):
        """Return a flat array with one cosine similarity per row of ``tfidf_matrix``."""
        query_tfidf = self.vectorizer.transform([query])
        cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
        return cosine_similarities

    # Retrieve top N document indices based on cosine similarity
    def get_top_n_results(self, cosine_similarities, top_n=5):
        """Return the indices of the ``top_n`` largest similarities, best first."""
        top_n_indices = np.argsort(cosine_similarities)[::-1][:top_n]
        return top_n_indices

    def process_query(self, query, top_n=5):
        """Print the ``top_n`` best-matching documents for ``query``."""
        # Reuse the fitted matrix when available; fitting is independent of the query.
        # getattr keeps instances created without __init__ working.
        tfidf_matrix = getattr(self, "_tfidf_matrix", None)
        if tfidf_matrix is None:
            tfidf_matrix = self.compute_tfidf_vectors()

        cosine_similarities = self.compute_cosine_similarity(tfidf_matrix, query)
        top_n_indices = self.get_top_n_results(cosine_similarities, top_n)

        # Print top N results (Title and Description)
        print(f"Top {top_n} Results for the Query '{query}':")
        for idx in top_n_indices:
            print(f"Document {idx + 1}: Title: {self.titles[idx]} | Description: {self.descriptions[idx]} (Cosine Similarity: {cosine_similarities[idx]:.4f})")


In [47]:
# Rank every description in the loaded data against a free-text query.
titles, descriptions = data['title'], data['description']
query = "I need thriller movies"

query_processor = QueryProcessor(titles, descriptions)
query_processor.process_query(query, top_n=5)

Top 5 Results for the Query 'I need thriller movies':
Document 201: Title: Krishna Cottage | Description: True love is put to the test when another woman comes between a pair of star-crossed young lovers in this thriller. (Cosine Similarity: 0.2157)
Document 766: Title: Xtreme | Description: In this fast-paced and action-packed thriller, a retired hitman — along with his sister and a troubled teen — takes revenge on his lethal stepbrother. (Cosine Similarity: 0.1880)
Document 1679: Title: Raman Raghav 2.0 | Description: A corrupt cop and a serial killer obsessed with a psychopath from the '60s get caught up in a ruthless cat-and-mouse game in this Indian thriller. (Cosine Similarity: 0.1865)
Document 802: Title: Never Back Down 2: The Beatdown | Description: A group of mixed martial arts fighters stars in this action thriller that follows a quartet of brawlers as they prepare for a major underground event. (Cosine Similarity: 0.1844)
Document 902: Title: Deadly Switch | Description: In

## Solution 3

In [16]:
pip install huggingface-hub==0.25.2

Collecting huggingface-hub==0.25.2
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.25.2-py3-none-any.whl (436 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.29.1
    Uninstalling huggingface-hub-0.29.1:
      Successfully uninstalled huggingface-hub-0.29.1
Successfully installed huggingface-hub-0.25.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [49]:
##### Importing necessary libraries

from sentence_transformers import SentenceTransformer, CrossEncoder
from sentence_transformers.util import cos_sim

In [50]:
class EncoderSearch:
    """Embedding-based search: rank descriptions by cosine similarity to a query.

    ``model`` must provide ``encode(texts, convert_to_numpy=True)``. The
    descriptions are embedded once, when the searcher is constructed.
    """

    def __init__(self, titles, descriptions, model):
        self.model = model
        self.titles = titles
        self.descriptions = descriptions
        # One embedding row per description, computed up front.
        self.corpus_embeddings = self.model.encode(descriptions, convert_to_numpy=True)

    def search(self, query, top_n=5):
        """Print the ``top_n`` descriptions most similar to ``query``, best first."""
        query_vec = self.model.encode(query, convert_to_numpy=True)

        # Cosine similarity = dot product divided by the product of the norms.
        dot_products = np.dot(self.corpus_embeddings, query_vec)
        norm_products = np.linalg.norm(self.corpus_embeddings, axis=1) * np.linalg.norm(query_vec)
        similarity_scores = dot_products / norm_products

        # Ascending argsort reversed gives best-first order; keep the first top_n.
        ranked = np.argsort(similarity_scores)[::-1]
        best = ranked[:top_n]

        # Report rank, title, description and score for each hit.
        print("\nTop Results:")
        for rank, doc_idx in enumerate(best, start=1):
            print(f"{rank}. Title: {self.titles[doc_idx]} | Description: {self.descriptions[doc_idx]} (Score: {similarity_scores[doc_idx]:.4f})")

In [54]:
# Only the first 100 entries are embedded here.
titles, descriptions = data['title'][:100], data['description'][:100]

# Swap in a different checkpoint name to use another model.
binary_model = SentenceTransformer('all-MiniLM-L6-v2')
searcher = EncoderSearch(titles, descriptions, binary_model)

query = "I need crime movies"
searcher.search(query, top_n=5)


Top Results:
1. Title: The Women and the Murderer | Description: This documentary traces the capture of serial killer Guy Georges through the tireless work of two women: a police chief and a victim's mother. (Score: 0.4125)
2. Title: Crime Stories: India Detectives | Description: Cameras following Bengaluru police on the job offer a rare glimpse into the complex and challenging inner workings of four major crime investigations. (Score: 0.3945)
3. Title: Omo Ghetto: the Saga | Description: Twins are reunited as a good-hearted female gangster and her uptight rich sister take on family, crime, cops and all of the trouble that follows them. (Score: 0.3187)
4. Title: Show Dogs | Description: A rough and tough police dog must go undercover with an FBI agent as a prim and proper pet at a dog show to save a baby panda from an illegal sale. (Score: 0.3049)
5. Title: Vendetta: Truth, Lies and The Mafia | Description: Sicily boasts a bold "Anti-Mafia" coalition. But what happens when those tryin