In [1]:
def cur_decomposition(X, rank):
    """Perform CUR decomposition on a matrix X with a specified rank."""
    col_norms = np.sum(X**2, axis=0)
    row_norms = np.sum(X**2, axis=1)
    prob_cols = col_norms / np.sum(col_norms)
    prob_rows = row_norms / np.sum(row_norms)

    selected_cols = np.random.choice(X.shape[1], rank, replace=False, p=prob_cols)
    selected_rows = np.random.choice(X.shape[0], rank, replace=False, p=prob_rows)

    C = X[:, selected_cols]
    R = X[selected_rows, :]

    W = X[np.ix_(selected_rows, selected_cols)]
    U = np.linalg.pinv(W)

    return C, U, R

In [2]:
def lda_gibbs_sampling_with_cur(docs, num_topics, num_iter=1000, alpha=0.1, beta=0.1, rank=4, verbose=False):
    # Step 1: Create Term-Document Matrix
    vocab = list(set(word for doc in docs for word in doc.split()))
    term_doc_matrix = np.zeros((len(docs), len(vocab)))

    for i, doc in enumerate(docs):
        for word in doc.split():
            term_doc_matrix[i, vocab.index(word)] += 1

    # Step 2: Apply CUR decomposition
    C, U, R = cur_decomposition(term_doc_matrix, rank)
    reduced_matrix = C @ U @ R

    # Step 3: Use reduced_matrix for LDA
    num_docs, vocab_size = reduced_matrix.shape
    topic_assignments = np.random.randint(0, num_topics, size=(num_docs, vocab_size))

    doc_topic_counts = np.zeros((num_docs, num_topics))
    topic_word_counts = np.zeros((num_topics, vocab_size))
    topic_counts = np.zeros(num_topics)

    for d in range(num_docs):
        for w in range(vocab_size):
            topic = topic_assignments[d, w]
            doc_topic_counts[d, topic] += reduced_matrix[d, w]
            topic_word_counts[topic, w] += reduced_matrix[d, w]
            topic_counts[topic] += reduced_matrix[d, w]

    for iteration in range(num_iter):
        if verbose:
            print(f"Iteration {iteration + 1}/{num_iter}")

        for d in range(num_docs):
            if verbose and d % 10 == 0:  # Print progress for every 10 documents
                print(f"Processing document {d + 1}/{num_docs}")

            for w in range(vocab_size):
                word_count = reduced_matrix[d, w]
                if word_count == 0:
                    continue

                topic = topic_assignments[d, w]
                doc_topic_counts[d, topic] -= word_count
                topic_word_counts[topic, w] -= word_count
                topic_counts[topic] -= word_count

                topic_probs = (
                    (doc_topic_counts[d, :] + alpha)
                    * (topic_word_counts[:, w] + beta)
                    / (topic_counts + beta * vocab_size)
                )
                topic_probs = np.maximum(topic_probs, 0)  # Ensure non-negative
                prob_sum = topic_probs.sum()
                if prob_sum > 0:
                    topic_probs /= prob_sum
                else:
                    topic_probs = np.ones(num_topics) / num_topics

                new_topic = np.random.choice(num_topics, p=topic_probs)
                topic_assignments[d, w] = new_topic
                doc_topic_counts[d, new_topic] += word_count
                topic_word_counts[new_topic, w] += word_count
                topic_counts[new_topic] += word_count

    doc_topic_dist = (doc_topic_counts + alpha) / (doc_topic_counts.sum(axis=1, keepdims=True) + num_topics * alpha)
    topic_word_dist = (topic_word_counts + beta) / (topic_word_counts.sum(axis=1, keepdims=True) + vocab_size * beta)

    return doc_topic_dist, topic_word_dist, vocab

In [3]:
def dynamic_query_expansion(query, documents, model, top_n=3, original_weight=0.7, expanded_weight=0.3):
    unique_terms = set(' '.join(documents).split())
    term_embeddings = {term: model.encode(term) for term in unique_terms}

    query_terms = query.split()
    query_embeddings = [model.encode(term) for term in query_terms]

    query_embedding = np.mean(query_embeddings, axis=0)
    similarities = {
        term: cosine_similarity([query_embedding], [embedding])[0][0]
        for term, embedding in term_embeddings.items()
    }
    expanded_terms = sorted(similarities, key=similarities.get, reverse=True)[:top_n]

    expanded_embeddings = [term_embeddings[term] for term in expanded_terms]
    combined_embedding = (
        original_weight * np.mean(query_embeddings, axis=0) +
        expanded_weight * np.mean(expanded_embeddings, axis=0)
    )

    expanded_query_terms = set(query_terms).union(expanded_terms)
    return combined_embedding, ' '.join(expanded_query_terms)

In [4]:
def retrieve_documents_with_cur(query, documents, model, lda_topics, lda_vocab, rank=10, top_n=5):
    query_embedding, expanded_query = dynamic_query_expansion(query, documents, model)
    print(f"Expanded Query: '{expanded_query}'\n")
    document_embeddings = model.encode(documents)

    print("Applying CUR Decomposition to reduce document embeddings dimensionality...")
    C, U, R = cur_decomposition(document_embeddings, rank=rank)
    reduced_document_embeddings = C @ U
    reduced_query_embedding = query_embedding @ (R).T

    similarities = cosine_similarity([reduced_query_embedding], reduced_document_embeddings).flatten()
    top_indices = np.argsort(-similarities)[:top_n]

    print("LDA Topics Distribution for Top Documents:")
    for idx in top_indices:
        doc_topics = lda_topics[idx]
        print(f"Document: {documents[idx]}\nTopic Distribution: {doc_topics}\n")

    return top_indices

In [5]:
import pandas as pd
collection_data_df=pd.read_csv('collection_data_df_5000.csv')


In [6]:
collection_data_df

Unnamed: 0.1,Unnamed: 0,document_id,document_text,processed_document
0,0,1,The Manhattan Project and its atomic bomb help...,manhattan project atomic bomb helped bring end...
1,1,2,Essay on The Manhattan Project - The Manhattan...,essay manhattan project manhattan project manh...
2,2,3,The Manhattan Project was the name for a proje...,manhattan project name project conduct world w...
3,3,4,versions of each volume as well as complementa...,version volume well complementary website firs...
4,4,5,The Manhattan Project. This once classified ph...,manhattan project classify photograph feature ...
...,...,...,...,...
4995,4995,4996,For a 2 micron cell diameter (a relatively lar...,micron cell diameter relatively large bacteriu...
4996,4996,4997,Assuming that the potential changes linearly o...,assume potential change linearly entire thickn...
4997,4997,4998,Using Franklinâs own dimensions for the size...,use franklinâs dimension size oil slick ie â½ ...
4998,4998,4999,R-value of about 19.5. The ICC-ES report for H...,rvalue icce report home foam insulthane esr is...


In [7]:
import os
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

# Set the proxy environment variables
os.environ['HTTP_PROXY'] = 'socks5h://127.0.0.1:1080'
os.environ['HTTPS_PROXY'] = 'socks5h://127.0.0.1:1080'

# Explicitly download the model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Wrap it with SentenceTransformer for compatibility
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded successfully.")


Model loaded successfully.


In [8]:
# Use the processed text for LDA and CUR
documents = collection_data_df['processed_document'].tolist()

In [9]:
import numpy as np

In [10]:
#from sentence_transformers import SentenceTransformer

# Load a SentenceTransformer model
#model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


# Generate document embeddings
document_embeddings = sentence_model.encode(documents)

In [11]:
print(document_embeddings)
print(document_embeddings.shape)

[[ 0.00369316  0.12447743 -0.03030166 ... -0.0348379  -0.04860682
   0.0244041 ]
 [-0.04892228  0.07967986  0.00721633 ... -0.00510206 -0.04808084
   0.00623703]
 [-0.0628251   0.03445498 -0.0747632  ...  0.01157742 -0.0669866
   0.02865066]
 ...
 [-0.01500327 -0.08697923  0.02888349 ...  0.07529388 -0.05132559
  -0.02862591]
 [ 0.03148302 -0.02373952 -0.04966878 ... -0.00221377  0.03664904
   0.04657203]
 [ 0.00851863  0.00096832  0.07085533 ... -0.05709013 -0.06484171
   0.0143047 ]]
(5000, 384)


CUR:

In [12]:
# Apply CUR decomposition
rank = 200  # You can tune this based on the dataset size
C, U, R = cur_decomposition(document_embeddings, rank)

print("C matrix shape:", C.shape)
print("U matrix shape:", U.shape)
print("R matrix shape:", R.shape)


C matrix shape: (5000, 200)
U matrix shape: (200, 200)
R matrix shape: (200, 384)


LDA:

In [13]:
# Perform LDA using the Gibbs sampling function
num_topics = 20  # Number of topics to extract
num_iter = 1000  # Number of iterations for Gibbs sampling

doc_topic_dist, topic_word_dist, vocab = lda_gibbs_sampling_with_cur(documents, num_topics, num_iter,verbose=True,rank=rank)

print("Document-Topic Distribution Shape:", doc_topic_dist.shape)
print("Topic-Word Distribution Shape:", topic_word_dist.shape)
print("Vocabulary Size:", len(vocab))

Iteration 1/1000
Processing document 1/5000
Processing document 11/5000
Processing document 21/5000
Processing document 31/5000
Processing document 41/5000
Processing document 51/5000
Processing document 61/5000
Processing document 71/5000
Processing document 81/5000
Processing document 91/5000
Processing document 101/5000
Processing document 111/5000
Processing document 121/5000
Processing document 131/5000
Processing document 141/5000
Processing document 151/5000
Processing document 161/5000
Processing document 171/5000
Processing document 181/5000
Processing document 191/5000
Processing document 201/5000
Processing document 211/5000
Processing document 221/5000
Processing document 231/5000
Processing document 241/5000
Processing document 251/5000
Processing document 261/5000
Processing document 271/5000
Processing document 281/5000
Processing document 291/5000
Processing document 301/5000
Processing document 311/5000
Processing document 321/5000
Processing document 331/5000
Processi

KeyboardInterrupt: 

In [None]:
import pickle

In [None]:
lda_model_5000 = {
    'doc_topic_dist': doc_topic_dist,
    'topic_word_dist': topic_word_dist,
    'vocab': vocab
}

In [None]:
with open('lda_model_5000.pkl', 'wb') as file:
    pickle.dump(lda_model_5000, file)

In [None]:
with open('lda_model_5000.pkl', 'rb') as file:
    loaded_lda_model = pickle.load(file)

# Access the components
doc_topic_dist = loaded_lda_model['doc_topic_dist']
topic_word_dist = loaded_lda_model['topic_word_dist']
vocab = loaded_lda_model['vocab']

# Verify the contents
print("Loaded Document-Topic Distribution Shape:",doc_topic_dist.shape)
print("Loaded Topic-Word Distribution Shape:",topic_word_dist.shape)
print("Loaded Vocabulary Size:", len(vocab))

Loaded Document-Topic Distribution Shape: (1000, 5)
Loaded Topic-Word Distribution Shape: (5, 6396)
Loaded Vocabulary Size: 6396


In [None]:
print(doc_topic_dist)

[[0.2        0.2        0.2        0.2        0.2       ]
 [0.2        0.2        0.2        0.2        0.2       ]
 [0.2        0.2        0.2        0.2        0.2       ]
 ...
 [0.00699301 0.00699301 0.00699301 0.97202797 0.00699301]
 [0.00153846 0.00153846 0.62461538 0.00153846 0.37076923]
 [0.00699301 0.00699301 0.00699301 0.97202797 0.00699301]]


In [None]:
print(topic_word_dist)

[[ 3.05957272e-05  3.05957272e-05  3.05957272e-05 ...  3.05957272e-05
   3.05957272e-05  3.05957272e-05]
 [ 2.70237527e-05  2.70237527e-05  2.70237527e-05 ...  2.70237527e-05
  -1.40993493e-05  2.70237527e-05]
 [ 4.42620134e-05  4.42620134e-05  8.89666469e-03 ...  4.42620134e-05
   1.24592757e-02  4.42620134e-05]
 [ 2.36071284e-05  2.36071284e-05  2.36071284e-05 ...  2.36071284e-05
  -7.93661394e-04  2.36071284e-05]
 [ 3.38483704e-05  3.38483704e-05  3.38483704e-05 ...  3.38483704e-05
   9.82338576e-05  3.38483704e-05]]


In [None]:
noisy_queries=pd.read_csv('/home/student/vishaka/noisy_queries_df_5000.csv')

In [None]:
noisy_queries

Unnamed: 0.1,Unnamed: 0,query_id,query
0,0,312651,how much does an average person make for tutoring
1,1,852302,what is the unit for pulse
2,2,900736,what teas are good for what
3,3,608727,what county is lodi california
4,4,738038,what is defamation harm
5,5,645590,what does physical medicine do
6,6,852919,what is the vehicle height on rv
7,7,637313,what does extreme obesity mean
8,8,850072,what is the temp in amsterdam
9,9,80385,can you use a calculator on the compass test


In [None]:
# Dynamic Query Expansion with Weighting
def dynamic_query_expansion(query, documents, model, top_n=3, original_weight=0.82, expanded_weight=0.18):
    unique_terms = set(' '.join(documents).split())
    term_embeddings = {term: sentence_model.encode(term) for term in unique_terms}

    query_terms = query.split()
    query_embeddings = [sentence_model.encode(term) for term in query_terms]

    query_embedding = np.mean(query_embeddings, axis=0)
    similarities = {
        term: cosine_similarity([query_embedding], [embedding])[0][0]
        for term, embedding in term_embeddings.items()
    }
    expanded_terms = sorted(similarities, key=similarities.get, reverse=True)[:top_n]

    expanded_embeddings = [term_embeddings[term] for term in expanded_terms]
    combined_embedding = (
        original_weight * np.mean(query_embeddings, axis=0) +
        expanded_weight * np.mean(expanded_embeddings, axis=0)
    )

    expanded_query_terms = set(query_terms).union(expanded_terms)
    return combined_embedding, ' '.join(expanded_query_terms)


In [None]:
def retrieve_documents_with_cur(query, documents, C, U, R, model, lda_topics, lda_vocab, top_n=5):
    """
    Retrieve documents using CUR-reduced embeddings and query expansion.
    """
    # Approximate the embeddings using CUR
    reduced_embeddings = C@U@R

    # Expand the query dynamically
    query_embedding, expanded_query = dynamic_query_expansion(query, documents, model)
    print(f"Original Query: '{query}'")
    print(f"Expanded Query: '{expanded_query}'")

    # Compute similarity with CUR-reduced embeddings
    similarities = cosine_similarity([query_embedding], reduced_embeddings).flatten()
    top_indices = np.argsort(-similarities)[:top_n]

    print("\nLDA Topics Distribution for Top Documents:")
    for idx in top_indices:
        doc_topics = lda_topics[idx]
        print(f"Document: {documents[idx]}\nTopic Distribution: {doc_topics}\n")

    return top_indices


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
top_indices = retrieve_documents_with_cur(queries['query'][0], documents, C, U, R, model, loaded_doc_topic_dist, vocab)

print("\nTop Matching Documents (with CUR):")
for idx in top_indices:
    print(f"Document: {documents[idx]}")

Original Query: 'how much does an average person make for tutoring'
Expanded Query: 'person does one make much an making tutoring how for average'

LDA Topics Distribution for Top Documents:
Document: researchers university rochester new york show whether people engage video game healthy way consequence whether certain basic need feeling competence autonomy relatedness meet livesn randomise control trial clinically depressed sample adult positive influence video game show include reduction tension anger depression fatigue increase vigour
Topic Distribution: [-0.17346725 -0.0070754   1.16576225  0.00326788  0.01151252]

Document: make someone happy sake one rewarding feeling world brighten personâs day whether person good friend waiter bring good karma make day bright turnake someone happy sake one rewarding feeling world brighten personâs day whether person good friend waiter bring good karma make day bright turn
Topic Distribution: [0.98854415 0.00286396 0.00286396 0.00286396 0.002863

In [None]:
expanded_query_embeddings = []
expanded_queries = []

for query in noisy_queries['query']:
    expanded_embedding, expanded_query = dynamic_query_expansion(
        query=query,
        documents=documents,
        model=sentence_model,
        top_n=3,
        original_weight=0.82,
        expanded_weight=0.18
    )
    expanded_query_embeddings.append(expanded_embedding)
    expanded_queries.append(expanded_query)

In [None]:
noisy_queries['expanded_query'] = expanded_queries
expanded_query_embeddings = np.array(expanded_query_embeddings)

In [None]:
lda_word_dist = loaded_lda_model['topic_word_dist']
vocab = loaded_lda_model['vocab']
lda_topics = doc_topic_dist 

In [None]:
document_embeddings = sentence_model.encode(documents)

In [None]:
'''def infer_query_topic_distribution(query, lda_word_dist, vocab, alpha=0.1):
    query_terms = query.split()
    query_indices = [vocab.index(word) for word in query_terms if word in vocab]

    if not query_indices:
        raise ValueError("No query terms match the vocabulary of the LDA model.")

    topic_probs = np.zeros(lda_word_dist.shape[0])  # Number of topics
    for topic_idx in range(lda_word_dist.shape[0]):
        for word_idx in query_indices:
            topic_probs[topic_idx] += lda_word_dist[topic_idx, word_idx]

    topic_probs = (topic_probs + alpha) / (np.sum(topic_probs + alpha))
    return topic_probs'''

In [None]:
'''def retrieve_documents_for_all_queries(queries, documents, lda_topics, lda_word_dist, vocab, model, top_n=5):
    results = []
    for idx, query in enumerate(queries):
        expanded_embedding, expanded_query = dynamic_query_expansion(
            query=query,
            documents=documents,
            model=model,
            top_n=3,
            original_weight=0.7,
            expanded_weight=0.3
        )
        
        try:
            query_topic_dist = infer_query_topic_distribution(expanded_query, lda_word_dist, vocab)
        except ValueError as e:
            print(f"Error for query {idx + 1}: {e}")
            query_topic_dist = np.zeros(lda_topics.shape[1])
        
        similarities_lda = cosine_similarity([query_topic_dist], lda_topics).flatten()
        top_indices_lda = np.argsort(-similarities_lda)[:top_n]

        C, U, R = cur_decomposition(document_embeddings, rank=4)
        reduced_document_embeddings = C @ U
        reduced_query_embedding = expanded_embedding @ (R).T

        similarities_direct = cosine_similarity([reduced_query_embedding], reduced_document_embeddings).flatten()
        top_indices_direct = np.argsort(-similarities_direct)[:top_n]

        # Store results
        results.append({
            'query_index': idx + 1,
            'original_query': query,
            'expanded_query': expanded_query,
            'lda_top_documents': [documents[i] for i in top_indices_lda],
            'direct_top_documents': [documents[i] for i in top_indices_direct],
            'lda_similarities': similarities_lda[top_indices_lda].tolist(),
            'direct_similarities': similarities_direct[top_indices_direct].tolist()
        })
        
        # Optional: Print results for each query
        print(f"Query {idx + 1}:")
        print(f"Original Query: '{query}'")
        print(f"Expanded Query: '{expanded_query}'")
        print(f"Expanded Query Topic Distribution: {query_topic_dist}")
        print("\nTop Matching Documents (Expanded Query with LDA):")
        for i, doc_idx in enumerate(top_indices_lda):
            print(f"Document {i + 1}: {documents[doc_idx]} - Similarity: {similarities_lda[doc_idx]}")
        print("\nTop Matching Documents (Expanded Query with Direct Embedding Similarity):")
        for i, doc_idx in enumerate(top_indices_direct):
            print(f"Document {i + 1}: {documents[doc_idx]} - Similarity: {similarities_direct[doc_idx]}")
        print("="*50)

    return results'''

In [None]:
C, U, R = cur_decomposition(document_embeddings, rank=100)

In [None]:
all_queries = noisy_queries['query'].tolist()
all_expanded_queries = noisy_queries['expanded_query'].tolist()
all_expanded_embeddings = expanded_query_embeddings 

In [None]:
def infer_query_topic_distribution(query, lda_word_dist, vocab, alpha=0.1):
    query_terms = query.split()
    query_indices = [vocab.index(word) for word in query_terms if word in vocab]

    if not query_indices:
        raise ValueError("No query terms match the vocabulary of the LDA model.")

    topic_probs = np.zeros(lda_word_dist.shape[0])  # Number of topics
    for topic_idx in range(lda_word_dist.shape[0]):
        for word_idx in query_indices:
            topic_probs[topic_idx] += lda_word_dist[topic_idx, word_idx]

    topic_probs = (topic_probs + alpha) / (np.sum(topic_probs + alpha))
    return topic_probs

In [None]:
'''retrieval_results = retrieve_documents_for_all_queries(
    queries=all_queries,
    documents=documents,
    lda_topics=lda_topics,
    lda_word_dist=lda_word_dist,
    vocab=vocab,
    model=model,
    top_n=5
)'''

Query 1:
Original Query: 'how much does an average person make for tutoring'
Expanded Query: 'person does one make much an making tutoring how for average'
Expanded Query Topic Distribution: [0.24071379 0.17456687 0.21950725 0.17729228 0.18791981]

Top Matching Documents (Expanded Query with LDA):
Document 1: answer thibaut descarte gavrilo princip assassinate archduke franz ferdinand sarejevo gavrilo princip member black hand serbian nationalist group gavrilo princip assassinate archduke franz ferdinand sarejevo gavrilo princip member black hand serbian nationalist group - Similarity: 0.9917371494595331
Document 2: gavrilo princip assassin affiliate serbian black hand terrorist organization kill archduke franz ferdinand - Similarity: 0.9917371494595025
Document 3: good answer though gavrilo princip kill franz ferdinand pawn huge assasination plan black hand serbian freedom fighter group hope eliminate archduke due fact archduke plan give concession south slavs therefore make unified g

In [None]:
'''import json
with open('retrieval_results.json', 'w') as f:
    json.dump(retrieval_results, f, indent=4)

# Optionally, convert to DataFrame and save as CSV
results_df = pd.DataFrame(retrieval_results)
results_df.to_csv('retrieval_results.csv', index=False)'''

In [None]:
def retrieve_documents_for_all_queries(queries, expanded_queries, expanded_query_embeddings, documents, lda_topics, lda_word_dist, vocab, model, C, U, R, top_n=5):
    results = []
    reduced_document_embeddings = C @ U  # Precompute reduced document embeddings
    for idx, (original_query, expanded_query, expanded_embedding) in enumerate(zip(queries, expanded_queries, expanded_query_embeddings)):
        try:
            query_topic_dist = infer_query_topic_distribution(expanded_query, lda_word_dist, vocab)
        except ValueError as e:
            print(f"Error for query {idx + 1}: {e}")
            query_topic_dist = np.zeros(lda_topics.shape[1])
        
        similarities_lda = cosine_similarity([query_topic_dist], lda_topics).flatten()
        top_indices_lda = np.argsort(-similarities_lda)[:top_n]

        reduced_query_embedding = expanded_embedding @ R.T
        similarities_direct = cosine_similarity([reduced_query_embedding], reduced_document_embeddings).flatten()
        top_indices_direct = np.argsort(-similarities_direct)[:top_n]

        # Store results
        results.append({
            'query_index': idx + 1,
            'original_query': original_query,
            'expanded_query': expanded_query,
            'lda_top_documents': [documents[i] for i in top_indices_lda],
            'direct_top_documents': [documents[i] for i in top_indices_direct],
            'lda_similarities': similarities_lda[top_indices_lda].tolist(),
            'direct_similarities': similarities_direct[top_indices_direct].tolist()
        })
        
        # Optional: Print results for each query
        print(f"Query {idx + 1}:")
        print(f"Original Query: '{original_query}'")
        print(f"Expanded Query: '{expanded_query}'")
        print(f"Expanded Query Topic Distribution: {query_topic_dist}")
        print("\nTop Matching Documents (Expanded Query with LDA):")
        for i, doc_idx in enumerate(top_indices_lda):
            print(f"Document {i + 1}: {documents[doc_idx]} - Similarity: {similarities_lda[doc_idx]}")
        print("\nTop Matching Documents (Expanded Query with Direct Embedding Similarity):")
        for i, doc_idx in enumerate(top_indices_direct):
            print(f"Document {i + 1}: {documents[doc_idx]} - Similarity: {similarities_direct[doc_idx]}")
        print("="*50)

    return results

In [None]:
retrieval_results = retrieve_documents_for_all_queries(
    queries=all_queries,
    expanded_queries=all_expanded_queries,
    expanded_query_embeddings=all_expanded_embeddings,
    documents=documents,
    lda_topics=lda_topics,
    lda_word_dist=lda_word_dist,
    vocab=vocab,
    model=sentence_model,
    C=C,
    U=U,
    R=R,
    top_n=5
)

Query 1:
Original Query: 'how much does an average person make for tutoring'
Expanded Query: 'person does one make much an making tutoring how for average'
Expanded Query Topic Distribution: [0.24071379 0.17456687 0.21950725 0.17729228 0.18791981]

Top Matching Documents (Expanded Query with LDA):
Document 1: answer thibaut descarte gavrilo princip assassinate archduke franz ferdinand sarejevo gavrilo princip member black hand serbian nationalist group gavrilo princip assassinate archduke franz ferdinand sarejevo gavrilo princip member black hand serbian nationalist group - Similarity: 0.9917371494595331
Document 2: gavrilo princip assassin affiliate serbian black hand terrorist organization kill archduke franz ferdinand - Similarity: 0.9917371494595025
Document 3: good answer though gavrilo princip kill franz ferdinand pawn huge assasination plan black hand serbian freedom fighter group hope eliminate archduke due fact archduke plan give concession south slavs therefore make unified g

In [None]:
import json
with open('retrieval_results1.json', 'w') as f:
    json.dump(retrieval_results, f, indent=4)

# Optionally, convert to DataFrame and save as CSV
results_df = pd.DataFrame(retrieval_results)
results_df.to_csv('retrieval_results2.csv', index=False)

In [None]:
retrieval_results2=pd.read_csv("/home/student/vishaka/retrieval_results2.csv")
retrieval_results2.head()

Unnamed: 0,query_index,original_query,expanded_query,lda_top_documents,direct_top_documents,lda_similarities,direct_similarities
0,1,how much does an average person make for tutoring,person does one make much an making tutoring h...,['answer thibaut descarte gavrilo princip assa...,['carmen episode replace aimee garcia play ric...,"[0.9917371494595331, 0.9917371494595025, 0.991...","[0.9836052656173706, 0.9707045555114746, 0.961..."
1,2,what is the unit for pulse,â pulse the what for is unit,['answer thibaut descarte gavrilo princip assa...,['xylem vessel water travel bulk flow rather c...,"[0.9994532912390769, 0.9994532912390729, 0.999...","[0.9861881732940674, 0.9806166887283325, 0.977..."
2,3,what teas are good for what,teas are what good something for,['guideline apply guideline apply scotts halt ...,['abnormal pupillary reflex mean do not happen...,"[0.9999209327885186, 0.9999209327885161, 0.999...","[0.92095547914505, 0.9096858501434326, 0.88800..."
3,4,what county is lodi california,lodi what california county is,['white sox would end may high note sweep bost...,['lodi can population race total population lo...,"[0.9999012276219352, 0.9999012276219345, 0.999...","[0.9908603429794312, 0.8959419131278992, 0.887..."
4,5,what is defamation harm,harm is defamation what,['white sox would end may high note sweep bost...,['oregon bottle bill carbonate beverage contai...,"[0.9999010936447079, 0.9999010936447073, 0.999...","[0.9438936114311218, 0.9308743476867676, 0.916..."


In [None]:
qrels_train_df = pd.read_csv('/home/student/vishaka/data/qrels.train.tsv', sep='\t')

In [None]:
collection_data_df=pd.read_csv('/home/student/vishaka/collection_data_df_5000.csv')

In [None]:
qrels_train_df=pd.read_csv("/home/student/vishaka/qrels_train_df.csv")

In [None]:
doc_text_to_id = collection_data_df.set_index('processed_document')['document_id'].to_dict()

In [None]:
relevant_docs = {}
for query_id in noisy_queries:
    relevant = qrels_train_df[qrels_train_df['query_id'] == query_id]['document_id'].tolist()
    relevant_docs[query_id] = relevant

In [None]:
def calculate_precision_recall(retrieved, relevant, k):
    retrieved_top_k = retrieved[:k]
    relevant_set = set(relevant)
    retrieved_set = set(retrieved_top_k)
    intersection = retrieved_set.intersection(relevant_set)
    precision = len(intersection) / k
    recall = len(intersection) / len(relevant_set)
    return precision, recall

In [None]:
import json

# Load the JSON file directly into a list of dictionaries
with open('retrieval_results2.json', 'r') as f:
    retrieval_results2 = json.load(f)

In [None]:
k = 5  # Define your k
precision_scores = []
recall_scores = []

for result in retrieval_results1:
    query_index = result['query_index']
    retrieved_documents = result['lda_top_documents']
    retrieved_doc_ids = [doc_text_to_id.get(doc, -1) for doc in retrieved_documents]
    retrieved_doc_ids = [doc_id for doc_id in retrieved_doc_ids if doc_id != -1]
    
    # Retrieve query_id based on query_index
    query_id = query_index_to_id.get(query_index, None)
    if query_id is None:
        print(f"Skipping query index {query_index} due to missing query_id.")
        continue
    
    # Get relevant document_ids for this query
    relevant = relevant_docs.get(query_id, [])
    
    # Calculate Precision@k and Recall@k
    precision, recall = calculate_precision_recall(retrieved_doc_ids, relevant, k)
    precision_scores.append(precision)
    recall_scores.append(recall)

TypeError: string indices must be integers, not 'str'

In [None]:
avg_precision = sum(precision_scores) / len(precision_scores)
avg_recall = sum(recall_scores) / len(recall_scores)

print(f"Average Precision@{k}: {avg_precision:.4f}")
print(f"Average Recall@{k}: {avg_recall:.4f}")

In [None]:
print(retrieval_results)

In [None]:
qrels_train_df['relevance_label'].value_counts()

In [None]:
def calculate_precision_recall(retrieved, k):
    retrieved_top_k = retrieved[:k]
    precision = len(retrieved_top_k) / k
    recall = len(retrieved_top_k) / len(retrieved)  # Assuming all retrieved docs are relevant
    return precision, recall

k = 5  # Define your k
precision_scores = []
recall_scores = []

for result in retrieval_results1:
    query_index = result['query_index']
    retrieved_documents = result['lda_top_documents']
    retrieved_doc_ids = [doc_text_to_id.get(doc, -1) for doc in retrieved_documents]
    retrieved_doc_ids = [doc_id for doc_id in retrieved_doc_ids if doc_id != -1]

    # Retrieve query_id based on query_index
    query_id = query_index_to_id.get(query_index, None)
    if query_id is None:
        print(f"Skipping query index {query_index} due to missing query_id.")
        continue

    # Calculate Precision@k and Recall@k
    precision, recall = calculate_precision_recall(retrieved_doc_ids, k)
    precision_scores.append(precision)
    recall_scores.append(recall)

In [None]:
avg_precision = sum(precision_scores) / len(precision_scores)
avg_recall = sum(recall_scores) / len(recall_scores)

print(f"Average Precision@{k}: {avg_precision:.4f}")
print(f"Average Recall@{k}: {avg_recall:.4f}")

In [None]:
def calculate_map(qrels_df, retrieval_results, k):
    map_scores = []

    for query_id in qrels_df['query_id'].unique():
        if query_id not in retrieval_results:
            continue

        relevant_docs = set(qrels_df[qrels_df['query_id'] == query_id]['document_id'].tolist())
        retrieved_docs = set(retrieval_results[query_id])

        k = min(len(retrieved_docs), k)
        precision_at_k = []

        for i in range(1, k+1):
            if retrieved_docs[:i].intersection(relevant_docs):
                precision_at_k.append(len(retrieved_docs[:i].intersection(relevant_docs)) / i)
            else:
                precision_at_k.append(0)

        if len(relevant_docs) > 0:  # Check if relevant documents exist
            ap = sum(precision_at_k) / len(relevant_docs)
            map_scores.append(ap)

    if len(map_scores) > 0:
        map_score = sum(map_scores) / len(map_scores)
    else:
        map_score = 0  # Or handle it differently

    return map_score

In [None]:
calculate_map(qrels_train_df, retrieval_results1, k)

In [None]:
def calculate_map(qrels_df, retrieval_results, k):
    map_scores = []

    for query_id in qrels_df['query_id'].unique():
        if query_id not in retrieval_results:
            continue

        relevant_docs = set(qrels_df[qrels_df['query_id'] == query_id]['document_id'].tolist())
        retrieved_docs = set(retrieval_results[query_id][:k])

        if not relevant_docs:  # Handle queries with no relevant documents
            continue

        precision_at_k = []
        for i in range(1, k+1):
            if retrieved_docs[:i].intersection(relevant_docs):
                precision_at_k.append(len(retrieved_docs[:i].intersection(relevant_docs)) / i)
            else:
                precision_at_k.append(0)

        ap = sum(precision_at_k) / len(relevant_docs)
        map_scores.append(ap)

    if len(map_scores) > 0:
        map_score = sum(map_scores) / len(map_scores)
    else:
        map_score = 0

    return map_score

In [None]:
calculate_map(qrels_train_df, retrieval_results1, k)