In [1]:
def cur_decomposition(X, rank):
    """Perform CUR decomposition on a matrix X with a specified rank."""
    col_norms = np.sum(X**2, axis=0)
    row_norms = np.sum(X**2, axis=1)
    prob_cols = col_norms / np.sum(col_norms)
    prob_rows = row_norms / np.sum(row_norms)

    selected_cols = np.random.choice(X.shape[1], rank, replace=False, p=prob_cols)
    selected_rows = np.random.choice(X.shape[0], rank, replace=False, p=prob_rows)

    C = X[:, selected_cols]
    R = X[selected_rows, :]

    W = X[np.ix_(selected_rows, selected_cols)]
    U = np.linalg.pinv(W)

    return C, U, R

In [None]:
def lda_gibbs_sampling_with_cur(docs, num_topics, num_iter=1000, alpha=0.1, beta=0.1, rank=4, verbose=False):
    # Step 1: Create Term-Document Matrix
    vocab = list(set(word for doc in docs for word in doc.split()))
    term_doc_matrix = np.zeros((len(docs), len(vocab)))

    for i, doc in enumerate(docs):
        for word in doc.split():
            term_doc_matrix[i, vocab.index(word)] += 1

    # Step 2: Apply CUR decomposition
    C, U, R = cur_decomposition(term_doc_matrix, rank)
    reduced_matrix = C @ U @ R

    # Step 3: Use reduced_matrix for LDA
    num_docs, vocab_size = reduced_matrix.shape
    topic_assignments = np.random.randint(0, num_topics, size=(num_docs, vocab_size))

    doc_topic_counts = np.zeros((num_docs, num_topics))
    topic_word_counts = np.zeros((num_topics, vocab_size))
    topic_counts = np.zeros(num_topics)

    for d in range(num_docs):
        for w in range(vocab_size):
            topic = topic_assignments[d, w]
            doc_topic_counts[d, topic] += reduced_matrix[d, w]
            topic_word_counts[topic, w] += reduced_matrix[d, w]
            topic_counts[topic] += reduced_matrix[d, w]

    for iteration in range(num_iter):
        if verbose:
            print(f"Iteration {iteration + 1}/{num_iter}")

        for d in range(num_docs):
            if verbose and d % 10 == 0:  # Print progress for every 10 documents
                print(f"Processing document {d + 1}/{num_docs}")

            for w in range(vocab_size):
                word_count = reduced_matrix[d, w]
                if word_count == 0:
                    continue

                topic = topic_assignments[d, w]
                doc_topic_counts[d, topic] -= word_count
                topic_word_counts[topic, w] -= word_count
                topic_counts[topic] -= word_count

                topic_probs = (
                    (doc_topic_counts[d, :] + alpha)
                    * (topic_word_counts[:, w] + beta)
                    / (topic_counts + beta * vocab_size)
                )
                topic_probs /= topic_probs.sum()

                new_topic = np.random.choice(num_topics, p=topic_probs)
                topic_assignments[d, w] = new_topic
                doc_topic_counts[d, new_topic] += word_count
                topic_word_counts[new_topic, w] += word_count
                topic_counts[new_topic] += word_count

    doc_topic_dist = (doc_topic_counts + alpha) / (doc_topic_counts.sum(axis=1, keepdims=True) + num_topics * alpha)
    topic_word_dist = (topic_word_counts + beta) / (topic_word_counts.sum(axis=1, keepdims=True) + vocab_size * beta)

    return doc_topic_dist, topic_word_dist, vocab

In [3]:
def dynamic_query_expansion(query, documents, model, top_n=3, original_weight=0.7, expanded_weight=0.3):
    unique_terms = set(' '.join(documents).split())
    term_embeddings = {term: model.encode(term) for term in unique_terms}

    query_terms = query.split()
    query_embeddings = [model.encode(term) for term in query_terms]

    query_embedding = np.mean(query_embeddings, axis=0)
    similarities = {
        term: cosine_similarity([query_embedding], [embedding])[0][0]
        for term, embedding in term_embeddings.items()
    }
    expanded_terms = sorted(similarities, key=similarities.get, reverse=True)[:top_n]

    expanded_embeddings = [term_embeddings[term] for term in expanded_terms]
    combined_embedding = (
        original_weight * np.mean(query_embeddings, axis=0) +
        expanded_weight * np.mean(expanded_embeddings, axis=0)
    )

    expanded_query_terms = set(query_terms).union(expanded_terms)
    return combined_embedding, ' '.join(expanded_query_terms)

In [4]:
def retrieve_documents_with_cur(query, documents, model, lda_topics, lda_vocab, rank=10, top_n=5):
    query_embedding, expanded_query = dynamic_query_expansion(query, documents, model)
    print(f"Expanded Query: '{expanded_query}'\n")
    document_embeddings = model.encode(documents)

    print("Applying CUR Decomposition to reduce document embeddings dimensionality...")
    C, U, R = cur_decomposition(document_embeddings, rank=rank)
    reduced_document_embeddings = C @ U
    reduced_query_embedding = query_embedding @ (R).T

    similarities = cosine_similarity([reduced_query_embedding], reduced_document_embeddings).flatten()
    top_indices = np.argsort(-similarities)[:top_n]

    print("LDA Topics Distribution for Top Documents:")
    for idx in top_indices:
        doc_topics = lda_topics[idx]
        print(f"Document: {documents[idx]}\nTopic Distribution: {doc_topics}\n")

    return top_indices

In [5]:
import pandas as pd
collection_data_df=pd.read_csv('collection_data_df_3000.csv')


In [6]:
collection_data_df

Unnamed: 0.1,Unnamed: 0,document_id,document_text,processed_document
0,0,1,The Manhattan Project and its atomic bomb help...,manhattan project atomic bomb helped bring end...
1,1,2,Essay on The Manhattan Project - The Manhattan...,essay manhattan project manhattan project manh...
2,2,3,The Manhattan Project was the name for a proje...,manhattan project name project conduct world w...
3,3,4,versions of each volume as well as complementa...,version volume well complementary website firs...
4,4,5,The Manhattan Project. This once classified ph...,manhattan project classify photograph feature ...
...,...,...,...,...
2995,2995,2996,"By Matt Rosenberg. Near the equator, from abou...",matt rosenberg near equator â north â south no...
2996,2996,2997,The Intertropical Convergence Zone has been ca...,intertropical convergence zone call doldrum sa...
2997,2997,2998,In the seamen's speech the zone is referred to...,seamen speech zone refer doldrum erratic weath...
2998,2998,2999,The Tropical Climate Control. The most importa...,tropical climate control important climate con...


In [7]:
import os
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

# Set the proxy environment variables
os.environ['HTTP_PROXY'] = 'socks5h://127.0.0.1:1080'
os.environ['HTTPS_PROXY'] = 'socks5h://127.0.0.1:1080'

# Explicitly download the model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Wrap it with SentenceTransformer for compatibility
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded successfully.")


Model loaded successfully.


In [8]:
# Use the processed text for LDA and CUR
documents = collection_data_df['processed_document'].tolist()

In [9]:
import numpy as np

In [10]:
#from sentence_transformers import SentenceTransformer

# Load a SentenceTransformer model
#model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


# Generate document embeddings
document_embeddings = sentence_model.encode(documents)

In [11]:
print(document_embeddings)
print(document_embeddings.shape)

[[ 0.00369316  0.12447743 -0.03030166 ... -0.0348379  -0.04860682
   0.0244041 ]
 [-0.04892228  0.07967986  0.00721633 ... -0.00510206 -0.04808084
   0.00623703]
 [-0.0628251   0.03445498 -0.0747632  ...  0.01157742 -0.0669866
   0.02865066]
 ...
 [ 0.05595329 -0.0701858   0.03365416 ... -0.00026667 -0.07590848
  -0.01522881]
 [ 0.04044628 -0.0114108   0.0430193  ...  0.02154456 -0.12004092
   0.00237025]
 [-0.00127448 -0.08036597 -0.01599905 ... -0.03733192 -0.09562884
  -0.02541723]]
(3000, 384)


CUR:

In [12]:
# Apply CUR decomposition
rank = 100  # You can tune this based on the dataset size
C, U, R = cur_decomposition(document_embeddings, rank)

print("C matrix shape:", C.shape)
print("U matrix shape:", U.shape)
print("R matrix shape:", R.shape)


C matrix shape: (3000, 100)
U matrix shape: (100, 100)
R matrix shape: (100, 384)


LDA:

In [13]:
# Perform LDA using the Gibbs sampling function
num_topics = 5  # Number of topics to extract
num_iter = 500  # Number of iterations for Gibbs sampling

doc_topic_dist, topic_word_dist, vocab = lda_gibbs_sampling_with_cur(documents, num_topics, num_iter,verbose=True,rank=rank)

print("Document-Topic Distribution Shape:", doc_topic_dist.shape)
print("Topic-Word Distribution Shape:", topic_word_dist.shape)
print("Vocabulary Size:", len(vocab))

Iteration 1/500
Processing document 1/3000


ValueError: probabilities are not non-negative

In [None]:
import pickle

In [None]:
lda_model_3000 = {
    'doc_topic_dist': doc_topic_dist,
    'topic_word_dist': topic_word_dist,
    'vocab': vocab
}

In [None]:
with open('lda_model_3000.pkl', 'wb') as file:
    pickle.dump(lda_model_3000, file)

In [None]:
with open('lda_model_3000.pkl', 'rb') as file:
    loaded_lda_model = pickle.load(file)

# Access the components
doc_topic_dist = loaded_lda_model['doc_topic_dist']
topic_word_dist = loaded_lda_model['topic_word_dist']
vocab = loaded_lda_model['vocab']

# Verify the contents
print("Loaded Document-Topic Distribution Shape:",doc_topic_dist.shape)
print("Loaded Topic-Word Distribution Shape:",topic_word_dist.shape)
print("Loaded Vocabulary Size:", len(vocab))

Loaded Document-Topic Distribution Shape: (3000, 5)
Loaded Topic-Word Distribution Shape: (5, 13001)
Loaded Vocabulary Size: 13001


In [None]:
print(doc_topic_dist)

[[ 0.2         0.2         0.2         0.2         0.2       ]
 [-0.35222973  0.2720542   0.00321556 -0.00238597  1.07934594]
 [-0.04399411  0.77853118  0.65619221 -0.10638485 -0.28434443]
 ...
 [-0.05476308  0.00603818 -0.15745474 -0.00293658  1.20911622]
 [ 0.374967    0.23980763  0.44821701 -0.05987136 -0.00312028]
 [ 0.2         0.2         0.2         0.2         0.2       ]]


In [None]:
print(topic_word_dist)

[[-8.05757388e-04  4.72946355e-06  5.10782635e-03 ...  4.72946355e-06
   4.72946355e-06 -1.56420159e-04]
 [ 5.16961465e-05  6.15560284e-06  3.42723364e-03 ...  6.15560284e-06
   6.15560284e-06 -4.05723745e-04]
 [ 5.16668603e-03  2.24799909e-05 -1.93137249e-02 ...  2.24799909e-05
   2.24799909e-05 -1.16244021e-05]
 [-1.98928670e-05  1.78097202e-05 -3.54415682e-03 ...  1.78097202e-05
   1.78097202e-05  2.78267765e-03]
 [ 2.36039603e-05  7.95907220e-06 -4.22975745e-04 ...  7.95907220e-06
   7.95907220e-06 -7.89976275e-04]]


In [7]:
noisy_queries=pd.read_csv('/home/student/vishaka/noisy_queries_df_3000.csv')

In [8]:
noisy_queries

Unnamed: 0.1,Unnamed: 0,query_id,query
0,0,312651,how much does an average person make for tutoring
1,1,484187,put yourself on child support in texas
2,2,585440,what causes arm and shoulder pain
3,3,892800,what season does november fall in
4,4,455279,mode of acquisition
...,...,...,...
115,115,994926,where is the cecum located in the body
116,116,1028406,who hired comey as fbi director
117,117,932391,what's the meaning of wifi
118,118,401029,is a static stretch safe


In [9]:
# Dynamic Query Expansion with Weighting
def dynamic_query_expansion(query, documents, model, top_n=3, original_weight=0.82, expanded_weight=0.18):
    unique_terms = set(' '.join(documents).split())
    term_embeddings = {term: sentence_model.encode(term) for term in unique_terms}

    query_terms = query.split()
    query_embeddings = [sentence_model.encode(term) for term in query_terms]

    query_embedding = np.mean(query_embeddings, axis=0)
    similarities = {
        term: cosine_similarity([query_embedding], [embedding])[0][0]
        for term, embedding in term_embeddings.items()
    }
    expanded_terms = sorted(similarities, key=similarities.get, reverse=True)[:top_n]

    expanded_embeddings = [term_embeddings[term] for term in expanded_terms]
    combined_embedding = (
        original_weight * np.mean(query_embeddings, axis=0) +
        expanded_weight * np.mean(expanded_embeddings, axis=0)
    )

    expanded_query_terms = set(query_terms).union(expanded_terms)
    return combined_embedding, ' '.join(expanded_query_terms)


In [None]:
def retrieve_documents_with_cur(query, documents, C, U, R, model, lda_topics, lda_vocab, top_n=5):
    """
    Retrieve documents using CUR-reduced embeddings and query expansion.
    """
    # Approximate the embeddings using CUR
    reduced_embeddings = C@U@R

    # Expand the query dynamically
    query_embedding, expanded_query = dynamic_query_expansion(query, documents, model)
    print(f"Original Query: '{query}'")
    print(f"Expanded Query: '{expanded_query}'")

    # Compute similarity with CUR-reduced embeddings
    similarities = cosine_similarity([query_embedding], reduced_embeddings).flatten()
    top_indices = np.argsort(-similarities)[:top_n]

    print("\nLDA Topics Distribution for Top Documents:")
    for idx in top_indices:
        doc_topics = lda_topics[idx]
        print(f"Document: {documents[idx]}\nTopic Distribution: {doc_topics}\n")

    return top_indices


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
#top_indices = retrieve_documents_with_cur(queries['query'][0], documents, C, U, R, model, loaded_doc_topic_dist, vocab)

#print("\nTop Matching Documents (with CUR):")
#for idx in top_indices:
#    print(f"Document: {documents[idx]}")

In [None]:
expanded_query_embeddings = []
expanded_queries = []

for query in noisy_queries['query']:
    expanded_embedding, expanded_query = dynamic_query_expansion(
        query=query,
        documents=documents,
        model=sentence_model,
        top_n=3,
        original_weight=0.82,
        expanded_weight=0.18
    )
    expanded_query_embeddings.append(expanded_embedding)
    expanded_queries.append(expanded_query)

In [None]:
noisy_queries['expanded_query'] = expanded_queries
expanded_query_embeddings = np.array(expanded_query_embeddings)

In [None]:
lda_word_dist = loaded_lda_model['topic_word_dist']
vocab = loaded_lda_model['vocab']
lda_topics = doc_topic_dist 

In [None]:
document_embeddings = sentence_model.encode(documents)

In [None]:
'''def infer_query_topic_distribution(query, lda_word_dist, vocab, alpha=0.1):
    query_terms = query.split()
    query_indices = [vocab.index(word) for word in query_terms if word in vocab]

    if not query_indices:
        raise ValueError("No query terms match the vocabulary of the LDA model.")

    topic_probs = np.zeros(lda_word_dist.shape[0])  # Number of topics
    for topic_idx in range(lda_word_dist.shape[0]):
        for word_idx in query_indices:
            topic_probs[topic_idx] += lda_word_dist[topic_idx, word_idx]

    topic_probs = (topic_probs + alpha) / (np.sum(topic_probs + alpha))
    return topic_probs'''

In [None]:
'''def retrieve_documents_for_all_queries(queries, documents, lda_topics, lda_word_dist, vocab, model, top_n=5):
    results = []
    for idx, query in enumerate(queries):
        expanded_embedding, expanded_query = dynamic_query_expansion(
            query=query,
            documents=documents,
            model=model,
            top_n=3,
            original_weight=0.7,
            expanded_weight=0.3
        )
        
        try:
            query_topic_dist = infer_query_topic_distribution(expanded_query, lda_word_dist, vocab)
        except ValueError as e:
            print(f"Error for query {idx + 1}: {e}")
            query_topic_dist = np.zeros(lda_topics.shape[1])
        
        similarities_lda = cosine_similarity([query_topic_dist], lda_topics).flatten()
        top_indices_lda = np.argsort(-similarities_lda)[:top_n]

        C, U, R = cur_decomposition(document_embeddings, rank=4)
        reduced_document_embeddings = C @ U
        reduced_query_embedding = expanded_embedding @ (R).T

        similarities_direct = cosine_similarity([reduced_query_embedding], reduced_document_embeddings).flatten()
        top_indices_direct = np.argsort(-similarities_direct)[:top_n]

        # Store results
        results.append({
            'query_index': idx + 1,
            'original_query': query,
            'expanded_query': expanded_query,
            'lda_top_documents': [documents[i] for i in top_indices_lda],
            'direct_top_documents': [documents[i] for i in top_indices_direct],
            'lda_similarities': similarities_lda[top_indices_lda].tolist(),
            'direct_similarities': similarities_direct[top_indices_direct].tolist()
        })
        
        # Optional: Print results for each query
        print(f"Query {idx + 1}:")
        print(f"Original Query: '{query}'")
        print(f"Expanded Query: '{expanded_query}'")
        print(f"Expanded Query Topic Distribution: {query_topic_dist}")
        print("\nTop Matching Documents (Expanded Query with LDA):")
        for i, doc_idx in enumerate(top_indices_lda):
            print(f"Document {i + 1}: {documents[doc_idx]} - Similarity: {similarities_lda[doc_idx]}")
        print("\nTop Matching Documents (Expanded Query with Direct Embedding Similarity):")
        for i, doc_idx in enumerate(top_indices_direct):
            print(f"Document {i + 1}: {documents[doc_idx]} - Similarity: {similarities_direct[doc_idx]}")
        print("="*50)

    return results'''

In [None]:
C, U, R = cur_decomposition(document_embeddings, rank=100)

In [None]:
all_queries = noisy_queries['query'].tolist()
all_expanded_queries = noisy_queries['expanded_query'].tolist()
all_expanded_embeddings = expanded_query_embeddings 

In [None]:
def infer_query_topic_distribution(query, lda_word_dist, vocab, alpha=0.1):
    query_terms = query.split()
    query_indices = [vocab.index(word) for word in query_terms if word in vocab]

    if not query_indices:
        raise ValueError("No query terms match the vocabulary of the LDA model.")

    topic_probs = np.zeros(lda_word_dist.shape[0])  # Number of topics
    for topic_idx in range(lda_word_dist.shape[0]):
        for word_idx in query_indices:
            topic_probs[topic_idx] += lda_word_dist[topic_idx, word_idx]

    topic_probs = (topic_probs + alpha) / (np.sum(topic_probs + alpha))
    return topic_probs

In [None]:
'''retrieval_results = retrieve_documents_for_all_queries(
    queries=all_queries,
    documents=documents,
    lda_topics=lda_topics,
    lda_word_dist=lda_word_dist,
    vocab=vocab,
    model=model,
    top_n=5
)'''

Query 1:
Original Query: 'how much does an average person make for tutoring'
Expanded Query: 'person does one make much an making tutoring how for average'
Expanded Query Topic Distribution: [0.24071379 0.17456687 0.21950725 0.17729228 0.18791981]

Top Matching Documents (Expanded Query with LDA):
Document 1: answer thibaut descarte gavrilo princip assassinate archduke franz ferdinand sarejevo gavrilo princip member black hand serbian nationalist group gavrilo princip assassinate archduke franz ferdinand sarejevo gavrilo princip member black hand serbian nationalist group - Similarity: 0.9917371494595331
Document 2: gavrilo princip assassin affiliate serbian black hand terrorist organization kill archduke franz ferdinand - Similarity: 0.9917371494595025
Document 3: good answer though gavrilo princip kill franz ferdinand pawn huge assasination plan black hand serbian freedom fighter group hope eliminate archduke due fact archduke plan give concession south slavs therefore make unified g

In [None]:
'''import json
with open('retrieval_results.json', 'w') as f:
    json.dump(retrieval_results, f, indent=4)

# Optionally, convert to DataFrame and save as CSV
results_df = pd.DataFrame(retrieval_results)
results_df.to_csv('retrieval_results.csv', index=False)'''

In [None]:
def retrieve_documents_for_all_queries(queries, expanded_queries, expanded_query_embeddings, documents, lda_topics, lda_word_dist, vocab, model, C, U, R, top_n=5):
    results = []
    reduced_document_embeddings = C @ U  # Precompute reduced document embeddings
    for idx, (original_query, expanded_query, expanded_embedding) in enumerate(zip(queries, expanded_queries, expanded_query_embeddings)):
        try:
            query_topic_dist = infer_query_topic_distribution(expanded_query, lda_word_dist, vocab)
        except ValueError as e:
            print(f"Error for query {idx + 1}: {e}")
            query_topic_dist = np.zeros(lda_topics.shape[1])
        
        similarities_lda = cosine_similarity([query_topic_dist], lda_topics).flatten()
        top_indices_lda = np.argsort(-similarities_lda)[:top_n]

        reduced_query_embedding = expanded_embedding @ R.T
        similarities_direct = cosine_similarity([reduced_query_embedding], reduced_document_embeddings).flatten()
        top_indices_direct = np.argsort(-similarities_direct)[:top_n]

        # Store results
        results.append({
            'query_index': idx + 1,
            'original_query': original_query,
            'expanded_query': expanded_query,
            'lda_top_documents': [documents[i] for i in top_indices_lda],
            'direct_top_documents': [documents[i] for i in top_indices_direct],
            'lda_similarities': similarities_lda[top_indices_lda].tolist(),
            'direct_similarities': similarities_direct[top_indices_direct].tolist()
        })
        
        # Optional: Print results for each query
        print(f"Query {idx + 1}:")
        print(f"Original Query: '{original_query}'")
        print(f"Expanded Query: '{expanded_query}'")
        print(f"Expanded Query Topic Distribution: {query_topic_dist}")
        print("\nTop Matching Documents (Expanded Query with LDA):")
        for i, doc_idx in enumerate(top_indices_lda):
            print(f"Document {i + 1}: {documents[doc_idx]} - Similarity: {similarities_lda[doc_idx]}")
        print("\nTop Matching Documents (Expanded Query with Direct Embedding Similarity):")
        for i, doc_idx in enumerate(top_indices_direct):
            print(f"Document {i + 1}: {documents[doc_idx]} - Similarity: {similarities_direct[doc_idx]}")
        print("="*50)

    return results

In [None]:
retrieval_results = retrieve_documents_for_all_queries(
    queries=all_queries,
    expanded_queries=all_expanded_queries,
    expanded_query_embeddings=all_expanded_embeddings,
    documents=documents,
    lda_topics=lda_topics,
    lda_word_dist=lda_word_dist,
    vocab=vocab,
    model=sentence_model,
    C=C,
    U=U,
    R=R,
    top_n=5
)

Query 1:
Original Query: 'how much does an average person make for tutoring'
Expanded Query: 'much tutoring making for make how does person one an average'
Expanded Query Topic Distribution: [0.15982071 0.15770034 0.24776134 0.28887831 0.14583929]

Top Matching Documents (Expanded Query with LDA):
Document 1: shock wave sonic boom doppler effect observe whenever speed source move slow speed wave source actually move speed fast wave move different phenomenon observe - Similarity: 0.9610997789903493
Document 2: move thing mind easily â chance learn believe itâs true learn concentrate enhance brain function â train mind move object art visualization major key success - Similarity: 0.9610997789891714
Document 3: august th denver international airport twice size manhattan second large public work project world second chunnel link england france land area large us airport third large world one hundred ten million cubic yard ydâ³ earth move building project equivalent one third amount earth m

In [None]:
import json
with open('retrieval_results3000.json', 'w') as f:
    json.dump(retrieval_results, f, indent=4)

# Optionally, convert to DataFrame and save as CSV
results_df = pd.DataFrame(retrieval_results)
results_df.to_csv('retrieval_results3000.csv', index=False)

In [None]:
retrieval_results1=pd.read_csv("/home/student/vishaka/retrieval_results3000.csv")
retrieval_results1.head()

Unnamed: 0,query_index,original_query,expanded_query,lda_top_documents,direct_top_documents,lda_similarities,direct_similarities
0,1,how much does an average person make for tutoring,much tutoring making for make how does person ...,['shock wave sonic boom doppler effect observe...,['think well help clarify question pun intend ...,"[0.9610997789903493, 0.9610997789891714, 0.961...","[0.29303720593452454, 0.23435001075267792, 0.2..."
1,2,put yourself on child support in texas,involved on in child yourself put texas support,['cause mineral always certain color part chem...,['think well help clarify question pun intend ...,"[0.9859564615580424, 0.9859564615579699, 0.985...","[0.2840292453765869, 0.242509126663208, 0.2146..."
2,3,what causes arm and shoulder pain,shoulder what causes arm and pain,['shock wave sonic boom doppler effect observe...,['chivalrous pervert speedy gonzales friend ev...,"[0.9991056543649082, 0.9991056543648408, 0.999...","[0.22930587828159332, 0.22601383924484253, 0.2..."
3,4,what season does november fall in,what does in season fall november,['shock wave sonic boom doppler effect observe...,['lloyd klein bear th february wednesday lloyd...,"[0.9997220996718843, 0.9997220996718271, 0.999...","[0.2702176868915558, 0.23884712159633636, 0.23..."
4,5,mode of acquisition,of mode acquire acquisition,['shock wave sonic boom doppler effect observe...,['bio andrew âandyâ lambros web graphic design...,"[0.9958582487896412, 0.995858248789307, 0.9958...","[0.22947575151920319, 0.2189749926328659, 0.20..."


In [None]:
qrels_train_df = pd.read_csv('/home/student/vishaka/data/qrels.train.tsv', sep='\t')
qrels_train_df

Unnamed: 0,1185869,0,0.1,1
0,1185868,0,16,1
1,597651,0,49,1
2,403613,0,60,1
3,1183785,0,389,1
4,312651,0,616,1
...,...,...,...,...
532755,19285,0,8841362,1
532756,558837,0,4989159,1
532757,559149,0,8841547,1
532758,706678,0,8841643,1


In [2]:
import pandas as pd

In [3]:
collection_data_df=pd.read_csv('/home/student/vishaka/collection_data_df_3000.csv')

In [4]:
qrels_train_df=pd.read_csv("/home/student/vishaka/qrels_train_df.csv")

In [5]:
doc_text_to_id = collection_data_df.set_index('processed_document')['document_id'].to_dict()

In [10]:
relevant_docs = {}
for query_id in noisy_queries['query_id']:
    #print(query_id)
    relevant = qrels_train_df[qrels_train_df['query_id'] == query_id]['document_id'].tolist()
    #print(relevant)
    relevant_docs[query_id] = relevant
    #print(relevant_docs)

In [11]:
print(noisy_queries)

     Unnamed: 0  query_id                                              query
0             0    312651  how much does an average person make for tutoring
1             1    484187             put yourself on child support in texas
2             2    585440                  what causes arm and shoulder pain
3             3    892800                  what season does november fall in
4             4    455279                                mode of acquisition
..          ...       ...                                                ...
115         115    994926             where is the cecum located in the body
116         116   1028406                    who hired comey as fbi director
117         117    932391                         what's the meaning of wifi
118         118    401029                           is a static stretch safe
119         119    852919                   what is the vehicle height on rv

[120 rows x 3 columns]


In [12]:
print(relevant_docs)

{312651: [616], 484187: [1822], 585440: [830633, 1335], 892800: [2817], 455279: [2599], 457407: [1172], 852302: [769], 1032812: [1210], 208108: [2704], 900736: [827, 5397227, 5397228, 820], 1029043: [1849], 248994: [2147], 947970: [2817], 608727: [816], 738038: [680], 645590: [944], 649720: [1306], 944600: [2315], 881523: [1566], 1170036: [1258], 994926: [1307], 852919: [592], 704038: [1067], 637313: [646], 850072: [905], 759692: [2549], 182487: [2277], 80385: [723], 829755: [2782], 654557: [2272], 1152312: [2210], 320418: [2989], 1028406: [1211], 827277: [1757], 696762: [778], 187504: [2313], 966347: [2670], 695572: [448], 787960: [2179], 955259: [2314517, 1900], 139022: [1067], 435796: [2085], 1164799: [1713], 608175: [656], 945951: [2817], 500517: [2507], 683408: [1605], 277756: [2507], 1168119: [905], 645337: [1054], 584690: [1838], 313342: [975], 84453: [81], 240132: [2313], 1177180: [1160], 733739: [2816], 403613: [60], 401029: [1967], 1185868: [16], 608730: [813], 441383: [1389]

In [13]:
def calculate_precision_recall(retrieved, relevant, k):
    retrieved_top_k = retrieved[:k]
    relevant_set = set(relevant)
    print(relevant_set)
    retrieved_set = set(retrieved_top_k)
    print(retrieved_set)
    intersection = retrieved_set.intersection(relevant_set)
    print(intersection)
    precision = len(intersection) / k if k!=0 else 0.0
    recall = len(intersection) / len(relevant_set) if len(relevant_set) != 0 else 0.0
    return precision, recall

In [14]:
import json

# Load the JSON file directly into a list of dictionaries
with open('retrieval_results3000.json', 'r') as f:
    retrieval_results3000 = json.load(f)

In [15]:
# Verify the data structure
print(type(retrieval_results3000))  # Should be <class 'list'>
print(type(retrieval_results3000[0]))  # Should be <class 'dict'>
print(retrieval_results3000[0].keys())  # Should include 'query_index', 'lda_top_documents', etc.

<class 'list'>
<class 'dict'>
dict_keys(['query_index', 'original_query', 'expanded_query', 'lda_top_documents', 'direct_top_documents', 'lda_similarities', 'direct_similarities'])


In [16]:
retrieval_results3000

[{'query_index': 1,
  'original_query': 'how much does an average person make for tutoring',
  'expanded_query': 'much tutoring making for make how does person one an average',
  'lda_top_documents': ['shock wave sonic boom doppler effect observe whenever speed source move slow speed wave source actually move speed fast wave move different phenomenon observe',
   'move thing mind easily â chance learn believe itâs true learn concentrate enhance brain function â train mind move object art visualization major key success',
   'august th denver international airport twice size manhattan second large public work project world second chunnel link england france land area large us airport third large world one hundred ten million cubic yard ydâ³ earth move building project equivalent one third amount earth move build panama canal',
   'process move acceptance easy describe understand however challenge experience acceptance order experience acceptance need change attitude order change attitud

In [17]:
def query_index_to_id(query_index, retrieval_results, noisy_queries):
    original_query = next(
        (result['original_query'] for result in retrieval_results if result['query_index'] == query_index),
        None,
    )
    if not original_query:
        return None

    # Match the original query to the query in noisy_queries
    query_id_row = noisy_queries[noisy_queries['query'] == original_query]
    if not query_id_row.empty:
        return query_id_row.iloc[0]['query_id']
    
    return None

In [19]:
k = 4  # Define your k
precision_scores = []
recall_scores = []

for result in retrieval_results3000:
    query_index = result['query_index']
    retrieved_documents = result['lda_top_documents']
    retrieved_doc_ids = [doc_text_to_id.get(doc, -1) for doc in retrieved_documents]
    retrieved_doc_ids = [doc_id for doc_id in retrieved_doc_ids if doc_id != -1]
    
    # Retrieve query_id based on query_index
    query_id = query_index_to_id(query_index, retrieval_results3000, noisy_queries)
    if query_id is None:
        print(f"Skipping query index {query_index} due to missing query_id.")
        continue
    
    # Get relevant document_ids for this query
    relevant = relevant_docs.get(query_id, [])
    
    # Calculate Precision@k and Recall@k
    precision, recall = calculate_precision_recall(retrieved_doc_ids, relevant,k)
    precision_scores.append(precision)
    recall_scores.append(recall)

{616}
{850, 700, 1196, 2090}
set()
{1822}
{876, 598, 1364, 2582}
set()
{830633, 1335}
{850, 700, 1196, 853}
set()
{2817}
{850, 700, 1196, 2090}
set()
{2599}
{850, 700, 853, 2090}
set()
{1172}
{850, 1196, 2090, 853}
set()
{769}
{850, 700, 1196, 2090}
set()
{1210}
{850, 1196, 2090, 853}
set()
{2704}
{850, 700, 1196, 853}
set()
{827, 5397228, 820, 5397227}
{595, 555, 1364, 2582}
set()
{1849}
{2894, 555, 1364, 2582}
set()
{2147}
{555, 1364, 2582, 2711}
set()
{2817}
{2705, 2707, 1364, 2582}
set()
{816}
{25, 850, 700, 1716}
set()
{680}
{850, 1196, 2090, 853}
set()
{944}
{598, 2898, 1364, 2582}
set()
{1306}
{598, 2898, 1364, 2582}
set()
{2315}
{598, 2898, 1364, 2582}
set()
{1566}
{850, 700, 1196, 853}
set()
{1258}
{850, 1196, 2090, 853}
set()
{1307}
{2894, 555, 1364, 2582}
set()
{592}
{850, 700, 1196, 853}
set()
{1067}
{850, 700, 853, 2090}
set()
{646}
{850, 700, 1196, 2090}
set()
{905}
{598, 2898, 1364, 2582}
set()
{2549}
{850, 700, 2090, 1196}
set()
{2277}
{555, 1364, 2582, 2711}
set()
{723

In [20]:
avg_precision = sum(precision_scores) / len(precision_scores)
avg_recall = sum(recall_scores) / len(recall_scores)

print(f"Average Precision@{k}: {avg_precision:.4f}")
print(f"Average Recall@{k}: {avg_recall:.4f}")

Average Precision@4: 0.0000
Average Recall@4: 0.0000


In [None]:
print(retrieval_results)



In [None]:
qrels_train_df=pd.read_csv("/home/student/vishaka/qrels_train_df.csv")
qrels_train_df

Unnamed: 0.1,Unnamed: 0,query_id,iteration,document_id,relevance_label
0,0,1185868,0,16,1
1,1,597651,0,49,1
2,2,403613,0,60,1
3,3,1183785,0,389,1
4,4,312651,0,616,1
...,...,...,...,...,...
532755,532755,19285,0,8841362,1
532756,532756,558837,0,4989159,1
532757,532757,559149,0,8841547,1
532758,532758,706678,0,8841643,1


In [None]:
qrels_train_df['relevance_label'].value_counts()

relevance_label
1    532760
Name: count, dtype: int64

In [23]:
def calculate_precision_recall(retrieved, k):
    retrieved_top_k = retrieved[:min(k, len(retrieved))]
    precision = len(retrieved_top_k) / k
    recall = len(retrieved_top_k) / len(retrieved)  # Assuming all retrieved docs are relevant
    return precision, recall

k = 5  # Define your k
precision_scores = []
recall_scores = []

for result in retrieval_results3000:
    query_index = result['query_index']
    retrieved_documents = result['lda_top_documents']
    retrieved_doc_ids = [doc_text_to_id.get(doc, -1) for doc in retrieved_documents]
    retrieved_doc_ids = [doc_id for doc_id in retrieved_doc_ids if doc_id != -1]

    # Retrieve query_id based on query_index
    query_id = query_index_to_id(query_index, retrieval_results3000, noisy_queries)  # Correctly call the function
    if query_id is None:
        print(f"Skipping query index {query_index} due to missing query_id.")
        continue

    # Calculate Precision@k and Recall@k
    precision, recall = calculate_precision_recall(retrieved_doc_ids, k)
    precision_scores.append(precision)
    recall_scores.append(recall)

In [None]:
avg_precision = sum(precision_scores) / (len(precision_scores))
avg_recall = sum(recall_scores) / (len(recall_scores))

print(f"Average Precision@{k}: {avg_precision:.4f}")
print(f"Average Recall@{k}: {avg_recall:.4f}")

Average Precision@5: 0.5970
Average Recall@5: 0.6818


In [None]:
def calculate_precision_recall(retrieved, k):
    retrieved_top_k =retrieved_top_k = retrieved[:min(k, len(retrieved))]
    relevant_doc_ids = query_to_relevant_docs.get(query_id, [])
    retrieved_relevant_docs = set(retrieved_doc_ids) & set(relevant_doc_ids)  
    precision = len(retrieved_top_k) / k
    print(precision)
    recall = len(retrieved_relevant_docs) / len(relevant_doc_ids)  # Assuming all retrieved docs are relevant
    print(recall)
    return precision, recall

k = 5  # Define your k
precision_scores = []
recall_scores = []

for result in retrieval_results1:
    query_index = result['query_index']
    retrieved_documents = result['lda_top_documents']
    retrieved_doc_ids = [doc_text_to_id.get(doc, -1) for doc in retrieved_documents]
    retrieved_doc_ids = [doc_id for doc_id in retrieved_doc_ids if doc_id != -1]

    # Retrieve query_id based on query_index
    query_id = query_index_to_id.get(query_index, None)
    if query_id is None:
        print(f"Skipping query index {query_index} due to missing query_id.")
        continue

    # Calculate Precision@k and Recall@k
    precision, recall = calculate_precision_recall(retrieved_doc_ids, k)
    precision_scores.append(precision)
    recall_scores.append(recall)

AttributeError: 'function' object has no attribute 'get'

In [None]:
def calculate_map(qrels_df, retrieval_results, k):
    map_scores = []

    for query_id in qrels_df['query_id'].unique():
        if query_id not in retrieval_results:
            continue

        relevant_docs = set(qrels_df[qrels_df['query_id'] == query_id]['document_id'].tolist())
        retrieved_docs = set(retrieval_results[query_id])

        k = min(len(retrieved_docs), k)
        precision_at_k = []

        for i in range(1, k+1):
            if retrieved_docs[:i].intersection(relevant_docs):
                precision_at_k.append(len(retrieved_docs[:i].intersection(relevant_docs)) / i)
            else:
                precision_at_k.append(0)

        if len(relevant_docs) > 0:  # Check if relevant documents exist
            ap = sum(precision_at_k) / len(relevant_docs)
            map_scores.append(ap)

    if len(map_scores) > 0:
        map_score = sum(map_scores) / len(map_scores)
    else:
        map_score = 0  # Or handle it differently

    return map_score

In [None]:
calculate_map(qrels_train_df, retrieval_results1, k)

0

In [None]:
def calculate_map(qrels_df, retrieval_results, k):
    map_scores = []

    for query_id in qrels_df['query_id'].unique():
        if query_id not in retrieval_results:
            continue

        relevant_docs = set(qrels_df[qrels_df['query_id'] == query_id]['document_id'].tolist())
        retrieved_docs = set(retrieval_results[query_id][:k])

        if not relevant_docs:  # Handle queries with no relevant documents
            continue

        precision_at_k = []
        for i in range(1, k+1):
            if retrieved_docs[:i].intersection(relevant_docs):
                precision_at_k.append(len(retrieved_docs[:i].intersection(relevant_docs)) / i)
            else:
                precision_at_k.append(0)

        ap = sum(precision_at_k) / len(relevant_docs)
        map_scores.append(ap)

    if len(map_scores) > 0:
        map_score = sum(map_scores) / len(map_scores)
    else:
        map_score = 0

    return map_score

In [None]:
calculate_map(qrels_train_df, retrieval_results1, k)

0

In [None]:
documents1=pd.DataFrame(documents)

In [None]:
vocab1 = {word: idx for idx, word in enumerate(vocab)}
def tokenize_document(text, vocab):
    text = str(text)
    return [vocab[word] for word in text.split() if word in vocab]
tokenized_documents = collection_data_df['processed_document'].apply(lambda x: tokenize_document(x, vocab1)).tolist()

print("Vocabulary:", vocab)
print("Tokenized Documents:", tokenized_documents)

Tokenized Documents: [[10301, 741, 5868, 5292, 10145, 2295, 5589, 4278, 6837, 5375, 8791, 6270, 2459, 5868, 5480, 9895, 2984, 6325, 8511], [9386, 10301, 741, 10301, 741, 10301, 741, 10387, 281, 5868, 5292, 3523, 5491, 741, 1570, 8434, 7204, 4278, 8434, 281, 12272, 10197, 7250, 7151], [10301, 741, 2893, 741, 2621, 4278, 6837, 5375, 5957, 8786, 5868, 5292, 2534, 4306, 9435, 741, 3748, 12030, 6226, 5553, 5479, 4980, 9224, 538, 4697, 5096, 1211], [8851, 11193, 480, 8781, 8409, 8786, 11472, 10301, 741, 5721, 1713, 74, 5406, 6325, 7200, 5831, 8409, 12675, 5698, 5406, 6325, 7200, 5831, 5450, 2967, 9376], [10301, 741, 12772, 12830, 2287, 8786, 5868, 5292, 3748, 3564, 5868, 386, 11617, 2097, 2967, 2221, 9203, 8869, 5680, 11007, 2434, 10808], [7884, 5279, 12748, 6123, 9843, 5868, 5292, 5589, 4278, 6837, 5375, 3553, 7884, 7905, 10864, 11105, 10301, 741], [10301, 741, 10301, 741, 7863, 11105, 3829, 4278, 6837, 5375, 5141, 8786, 2967, 3564, 12120, 11328, 7507, 12419, 11328, 529, 2419, 741, 3019, 10

In [None]:
import math
def compute_perplexity(lda_word_dist, lda_topic_dist, tokenized_documents):
    total_log_prob = 0
    total_words = 0

    for doc_idx, doc in enumerate(tokenized_documents):
        for word_idx in doc:
            word_prob = np.dot(lda_topic_dist[doc_idx], lda_word_dist[:, word_idx])
            if word_prob > 0:
                total_log_prob += math.log(word_prob)
            total_words += 1

    perplexity = math.exp(-total_log_prob / total_words)
    return perplexity

In [None]:
perplexity = compute_perplexity(lda_word_dist,lda_topics,tokenized_documents)
print("Perplexity:", perplexity)

Perplexity: 1365.2826451915291


In [None]:
print(lda_word_dist)

[[-8.05757388e-04  4.72946355e-06  5.10782635e-03 ...  4.72946355e-06
   4.72946355e-06 -1.56420159e-04]
 [ 5.16961465e-05  6.15560284e-06  3.42723364e-03 ...  6.15560284e-06
   6.15560284e-06 -4.05723745e-04]
 [ 5.16668603e-03  2.24799909e-05 -1.93137249e-02 ...  2.24799909e-05
   2.24799909e-05 -1.16244021e-05]
 [-1.98928670e-05  1.78097202e-05 -3.54415682e-03 ...  1.78097202e-05
   1.78097202e-05  2.78267765e-03]
 [ 2.36039603e-05  7.95907220e-06 -4.22975745e-04 ...  7.95907220e-06
   7.95907220e-06 -7.89976275e-04]]


In [None]:
print(lda_topics)

[[ 0.2         0.2         0.2         0.2         0.2       ]
 [-0.35222973  0.2720542   0.00321556 -0.00238597  1.07934594]
 [-0.04399411  0.77853118  0.65619221 -0.10638485 -0.28434443]
 ...
 [-0.05476308  0.00603818 -0.15745474 -0.00293658  1.20911622]
 [ 0.374967    0.23980763  0.44821701 -0.05987136 -0.00312028]
 [ 0.2         0.2         0.2         0.2         0.2       ]]


In [None]:
from collections import Counter

def compute_coherence(lda_word_dist, vocab, tokenized_documents, top_n=4):
    num_topics, vocab_size = lda_word_dist.shape
    coherence_scores = []

    # Get word co-occurrences from documents
    word_count = Counter()
    for doc in tokenized_documents:
        word_count.update(doc)

    # Compute coherence for each topic
    for topic_idx in range(num_topics):
        # Get top N words for the topic
        top_word_indices = lda_word_dist[topic_idx].argsort()[-top_n:][::-1]
        top_words = [vocab[i] for i in top_word_indices]

        # Calculate pairwise coherence for top words
        coherence = 0
        for i, word1 in enumerate(top_words):
            for j, word2 in enumerate(top_words):
                if i < j:
                    word1_idx, word2_idx = vocab.index(word1), vocab.index(word2)
                    # Co-occurrence approximation
                    word1_count = word_count[word1_idx]
                    word2_count = word_count[word2_idx]
                    pair_count = sum(1 for doc in tokenized_documents if word1_idx in doc and word2_idx in doc)

                    # Compute log coherence score
                    if pair_count > 0:
                        coherence += math.log((pair_count + 1) / word2_count)

        coherence_scores.append(coherence)

    # Average coherence score over topics
    avg_coherence = np.mean(coherence_scores)
    return avg_coherence

coherence_score = compute_coherence(lda_word_dist, vocab, tokenized_documents)
print("Coherence Score:", coherence_score)


Coherence Score: -10.13600316228496
