In [1]:
def cur_decomposition(X, rank):
    """Build a randomized CUR factorization of the 2-D array ``X``.

    Columns and rows are drawn without replacement, each with probability
    proportional to its squared Euclidean norm (columns are drawn first,
    then rows, both from NumPy's global random state).

    Parameters
    ----------
    X : numpy.ndarray
        Two-dimensional matrix to factorize.
    rank : int
        Number of columns and number of rows to sample.

    Returns
    -------
    tuple
        ``(C, U, R)`` where ``C`` holds the sampled columns of ``X``, ``R``
        holds the sampled rows of ``X`` and ``U`` is the pseudo-inverse of the
        sub-matrix of ``X`` at the intersection of those rows and columns.
    """
    squared_entries = X ** 2

    # Sampling weights: share of the total squared mass carried by each
    # column / row.
    col_energy = squared_entries.sum(axis=0)
    col_weights = col_energy / col_energy.sum()
    row_energy = squared_entries.sum(axis=1)
    row_weights = row_energy / row_energy.sum()

    col_idx = np.random.choice(X.shape[1], rank, replace=False, p=col_weights)
    row_idx = np.random.choice(X.shape[0], rank, replace=False, p=row_weights)

    # The linking matrix is the pseudo-inverse of the row/column intersection.
    intersection = X[np.ix_(row_idx, col_idx)]

    return X[:, col_idx], np.linalg.pinv(intersection), X[row_idx, :]

In [2]:
def lda_gibbs_sampling(docs, num_topics, num_iter=1000, alpha=0.1, beta=0.1):
    """Fit a simplified LDA topic model with a Gibbs-style sampler.

    Each distinct (document, word) pair carries a single topic assignment, so
    all occurrences of a word inside one document move between topics
    together, weighted by their count.

    Parameters
    ----------
    docs : sequence of str
        Whitespace-tokenised documents.
    num_topics : int
        Number of topics.
    num_iter : int, optional
        Number of full sweeps over the non-zero (document, word) cells.
    alpha : float, optional
        Smoothing added to the document-topic counts.
    beta : float, optional
        Smoothing added to the topic-word counts.

    Returns
    -------
    tuple
        ``(doc_topic_dist, topic_word_dist, vocab)``: an array of shape
        ``(len(docs), num_topics)``, an array of shape
        ``(num_topics, len(vocab))`` and the vocabulary list whose positions
        index the columns of ``topic_word_dist``. The vocabulary is sorted so
        the column order is reproducible across interpreter runs.
    """
    tokenized_docs = [doc.split() for doc in docs]

    # Sorted vocabulary + dict lookup: deterministic column order and O(1)
    # index resolution (list.index would scan the vocabulary for every token).
    vocab = sorted({word for tokens in tokenized_docs for word in tokens})
    word_to_index = {word: idx for idx, word in enumerate(vocab)}

    term_doc_matrix = np.zeros((len(docs), len(vocab)))
    for i, tokens in enumerate(tokenized_docs):
        for word in tokens:
            term_doc_matrix[i, word_to_index[word]] += 1

    num_docs, vocab_size = term_doc_matrix.shape
    topic_assignments = np.random.randint(0, num_topics, size=(num_docs, vocab_size))

    doc_topic_counts = np.zeros((num_docs, num_topics))
    topic_word_counts = np.zeros((num_topics, vocab_size))
    topic_counts = np.zeros(num_topics)

    # Only cells with a positive count take part in sampling. Enumerate them
    # once (row-major order) instead of re-scanning the whole matrix on every
    # sweep.
    doc_idx, word_idx = np.nonzero(term_doc_matrix)
    active_cells = list(
        zip(doc_idx.tolist(), word_idx.tolist(), term_doc_matrix[doc_idx, word_idx].tolist())
    )

    for d, w, word_count in active_cells:
        topic = topic_assignments[d, w]
        doc_topic_counts[d, topic] += word_count
        topic_word_counts[topic, w] += word_count
        topic_counts[topic] += word_count

    for _ in range(num_iter):
        for d, w, word_count in active_cells:
            # Remove the cell's current contribution before resampling it.
            topic = topic_assignments[d, w]
            doc_topic_counts[d, topic] -= word_count
            topic_word_counts[topic, w] -= word_count
            topic_counts[topic] -= word_count

            topic_probs = (
                (doc_topic_counts[d, :] + alpha)
                * (topic_word_counts[:, w] + beta)
                / (topic_counts + beta * vocab_size)
            )
            topic_probs /= topic_probs.sum()

            new_topic = np.random.choice(num_topics, p=topic_probs)
            topic_assignments[d, w] = new_topic
            doc_topic_counts[d, new_topic] += word_count
            topic_word_counts[new_topic, w] += word_count
            topic_counts[new_topic] += word_count

    doc_topic_dist = (doc_topic_counts + alpha) / (
        doc_topic_counts.sum(axis=1, keepdims=True) + num_topics * alpha
    )
    topic_word_dist = (topic_word_counts + beta) / (
        topic_word_counts.sum(axis=1, keepdims=True) + vocab_size * beta
    )

    return doc_topic_dist, topic_word_dist, vocab

In [3]:
def dynamic_query_expansion(query, documents, model, top_n=3, original_weight=0.7, expanded_weight=0.3):
    """Expand a query with the corpus terms closest to it in embedding space.

    Parameters
    ----------
    query : str
        Whitespace-separated query terms.
    documents : sequence of str
        Whitespace-tokenised documents; their distinct terms are the
        expansion candidates.
    model : object
        Embedding model exposing ``encode``. It is called with a list of
        strings and must return one embedding per string.
    top_n : int, optional
        Number of candidate terms to add.
    original_weight : float, optional
        Weight of the mean query-term embedding in the combined embedding.
    expanded_weight : float, optional
        Weight of the mean expansion-term embedding in the combined embedding.

    Returns
    -------
    tuple
        ``(combined_embedding, expanded_query)``. ``expanded_query`` lists the
        query terms in their original order followed by the new expansion
        terms, without duplicates. When there is nothing to expand with (no
        candidate terms, or ``top_n`` selects none) the plain mean query
        embedding is returned.

    Raises
    ------
    ValueError
        If ``query`` contains no terms.
    """
    query_terms = query.split()
    if not query_terms:
        # Averaging zero embeddings would silently yield NaNs downstream.
        raise ValueError("query must contain at least one term")

    query_embedding = np.mean(np.asarray(model.encode(query_terms)), axis=0)

    # Sorted so that ties in similarity are broken reproducibly.
    candidate_terms = sorted(set(' '.join(documents).split()))
    if not candidate_terms:
        return query_embedding, ' '.join(dict.fromkeys(query_terms))

    # One batched call instead of one model invocation per vocabulary term.
    candidate_matrix = np.asarray(model.encode(candidate_terms))

    # Cosine similarity; zero-length vectors get similarity 0 rather than NaN.
    candidate_norms = np.linalg.norm(candidate_matrix, axis=1)
    candidate_norms = np.where(candidate_norms == 0, 1.0, candidate_norms)
    query_norm = np.linalg.norm(query_embedding)
    if query_norm == 0:
        query_norm = 1.0
    similarities = (candidate_matrix @ query_embedding) / (candidate_norms * query_norm)

    best = np.argsort(-similarities, kind="stable")[:top_n]
    expanded_terms = [candidate_terms[i] for i in best]
    if not expanded_terms:
        return query_embedding, ' '.join(dict.fromkeys(query_terms))

    combined_embedding = (
        original_weight * query_embedding +
        expanded_weight * np.mean(candidate_matrix[best], axis=0)
    )

    expanded_query_terms = dict.fromkeys(query_terms + expanded_terms)
    return combined_embedding, ' '.join(expanded_query_terms)

In [None]:
def lda_gibbs_sampling_with_cur(docs, num_topics, num_iter=1000, alpha=0.1, beta=0.1, rank=4, verbose=False):
    """Run the simplified Gibbs-style LDA sampler on a CUR approximation.

    The term-document count matrix is replaced by its rank-``rank`` CUR
    reconstruction ``C @ U @ R`` (clipped at zero), whose entries are then
    used as fractional pseudo-counts. Each (document, word) cell carries a
    single topic assignment.

    Parameters
    ----------
    docs : sequence of str
        Whitespace-tokenised documents.
    num_topics : int
        Number of topics.
    num_iter : int, optional
        Number of full sweeps over the non-zero cells.
    alpha : float, optional
        Smoothing added to the document-topic counts.
    beta : float, optional
        Smoothing added to the topic-word counts.
    rank : int, optional
        Number of rows/columns sampled by ``cur_decomposition``.
    verbose : bool, optional
        When true, print progress per iteration and every 10th document.

    Returns
    -------
    tuple
        ``(doc_topic_dist, topic_word_dist, vocab)``; ``vocab`` is sorted and
        its positions index the columns of ``topic_word_dist``.
    """
    # Step 1: Create Term-Document Matrix
    tokenized_docs = [doc.split() for doc in docs]
    # Sorted vocabulary + dict lookup: reproducible column order and O(1)
    # index resolution instead of a list scan per token.
    vocab = sorted({word for tokens in tokenized_docs for word in tokens})
    word_to_index = {word: idx for idx, word in enumerate(vocab)}
    term_doc_matrix = np.zeros((len(docs), len(vocab)))

    for i, tokens in enumerate(tokenized_docs):
        for word in tokens:
            term_doc_matrix[i, word_to_index[word]] += 1

    # Step 2: Apply CUR decomposition
    C, U, R = cur_decomposition(term_doc_matrix, rank)
    # A low-rank reconstruction of a count matrix may contain negative
    # entries; they are not valid pseudo-counts and would otherwise leak into
    # the returned distributions as negative "probabilities".
    reduced_matrix = np.maximum(C @ U @ R, 0.0)

    # Step 3: Use reduced_matrix for LDA
    num_docs, vocab_size = reduced_matrix.shape
    topic_assignments = np.random.randint(0, num_topics, size=(num_docs, vocab_size))

    doc_topic_counts = np.zeros((num_docs, num_topics))
    topic_word_counts = np.zeros((num_topics, vocab_size))
    topic_counts = np.zeros(num_topics)

    # Columns with a non-zero pseudo-count, per document; invariant across
    # sweeps, so computed once.
    active_words = [np.flatnonzero(row) for row in reduced_matrix]

    for d in range(num_docs):
        for w in active_words[d]:
            topic = topic_assignments[d, w]
            doc_topic_counts[d, topic] += reduced_matrix[d, w]
            topic_word_counts[topic, w] += reduced_matrix[d, w]
            topic_counts[topic] += reduced_matrix[d, w]

    for iteration in range(num_iter):
        if verbose:
            print(f"Iteration {iteration + 1}/{num_iter}")

        for d in range(num_docs):
            if verbose and d % 10 == 0:  # Print progress for every 10 documents
                print(f"Processing document {d + 1}/{num_docs}")

            for w in active_words[d]:
                word_count = reduced_matrix[d, w]

                # Remove the cell's current contribution before resampling.
                topic = topic_assignments[d, w]
                doc_topic_counts[d, topic] -= word_count
                topic_word_counts[topic, w] -= word_count
                topic_counts[topic] -= word_count

                topic_probs = (
                    (doc_topic_counts[d, :] + alpha)
                    * (topic_word_counts[:, w] + beta)
                    / (topic_counts + beta * vocab_size)
                )
                # Floating-point subtraction of fractional counts can leave
                # tiny negative residues; guard the sampling weights.
                topic_probs = np.maximum(topic_probs, 0)  # Ensure non-negative
                prob_sum = topic_probs.sum()
                if prob_sum > 0:
                    topic_probs /= prob_sum
                else:
                    topic_probs = np.ones(num_topics) / num_topics

                new_topic = np.random.choice(num_topics, p=topic_probs)
                topic_assignments[d, w] = new_topic
                doc_topic_counts[d, new_topic] += word_count
                topic_word_counts[new_topic, w] += word_count
                topic_counts[new_topic] += word_count

    doc_topic_dist = (doc_topic_counts + alpha) / (doc_topic_counts.sum(axis=1, keepdims=True) + num_topics * alpha)
    topic_word_dist = (topic_word_counts + beta) / (topic_word_counts.sum(axis=1, keepdims=True) + vocab_size * beta)

    return doc_topic_dist, topic_word_dist, vocab

In [4]:
def retrieve_documents_with_cur(query, documents, model, lda_topics, lda_vocab, rank=10, top_n=5):
    """Rank documents against an expanded query in a CUR-reduced space.

    The query is expanded with ``dynamic_query_expansion``, the documents are
    embedded with ``model.encode`` and factorised with ``cur_decomposition``.
    Documents are represented by ``C @ U`` (their coefficients with respect to
    the sampled rows ``R``); the query is mapped to the same coefficient space
    with the pseudo-inverse of ``R`` before cosine similarities are computed.

    Parameters
    ----------
    query : str
        Whitespace-separated query terms.
    documents : sequence of str
        Documents to search.
    model : object
        Embedding model exposing ``encode``.
    lda_topics : indexable
        Per-document topic distributions, indexed by document position;
        printed for the top results.
    lda_vocab : object
        Accepted for interface compatibility; not used by this function.
    rank : int, optional
        Number of rows/columns sampled by the CUR decomposition.
    top_n : int, optional
        Number of document indices to return.

    Returns
    -------
    numpy.ndarray
        Indices of the ``top_n`` most similar documents, best first.
    """
    query_embedding, expanded_query = dynamic_query_expansion(query, documents, model)
    print(f"Expanded Query: '{expanded_query}'\n")
    document_embeddings = model.encode(documents)

    print("Applying CUR Decomposition to reduce document embeddings dimensionality...")
    C, U, R = cur_decomposition(document_embeddings, rank=rank)
    reduced_document_embeddings = C @ U
    # Since X ~= (C @ U) @ R, a row x of X has coefficients x @ pinv(R) in the
    # basis formed by the rows of R. Multiplying by R.T instead would give
    # inner products, which live in a different coordinate system than C @ U.
    reduced_query_embedding = query_embedding @ np.linalg.pinv(R)

    # Cosine similarity; zero-length vectors get similarity 0 rather than NaN.
    doc_norms = np.linalg.norm(reduced_document_embeddings, axis=1)
    doc_norms = np.where(doc_norms == 0, 1.0, doc_norms)
    query_norm = np.linalg.norm(reduced_query_embedding)
    if query_norm == 0:
        query_norm = 1.0
    similarities = (reduced_document_embeddings @ reduced_query_embedding) / (doc_norms * query_norm)
    top_indices = np.argsort(-similarities)[:top_n]

    print("LDA Topics Distribution for Top Documents:")
    for idx in top_indices:
        doc_topics = lda_topics[idx]
        print(f"Document: {documents[idx]}\nTopic Distribution: {doc_topics}\n")

    return top_indices

In [5]:
import pandas as pd
# Load the document collection from a CSV file in the working directory.
# A later cell reads its 'processed_document' column.
collection_data_df=pd.read_csv('collection_data_df.csv')

In [6]:
import os
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

# Route HTTP(S) traffic through the local SOCKS5 proxy before any download.
PROXY_URL = 'socks5h://127.0.0.1:1080'
os.environ.update({'HTTP_PROXY': PROXY_URL, 'HTTPS_PROXY': PROXY_URL})

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Explicitly fetch the tokenizer and the bare transformer weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Load the same checkpoint as a SentenceTransformer. This is a separate load,
# not a wrapper around `model`; `sentence_model` is the object used for
# encoding below.
sentence_model = SentenceTransformer(MODEL_NAME)
print("Model loaded successfully.")


Model loaded successfully.


In [7]:
# Use the processed text for LDA and CUR: pull the 'processed_document'
# column out of the loaded DataFrame as a plain Python list.
# NOTE(review): the functions above call `.split()` on each entry, so every
# value is presumably a string — confirm the column has no missing values.
documents = collection_data_df['processed_document'].tolist()


In [8]:
import numpy as np

In [9]:
# Generate document embeddings with the `sentence_model` loaded in an earlier
# cell (one embedding row per entry of `documents`).
document_embeddings = sentence_model.encode(documents)



In [10]:
# Show the embedding matrix followed by its shape, one per line.
print(document_embeddings, document_embeddings.shape, sep="\n")

[[ 0.00369316  0.12447743 -0.03030166 ... -0.0348379  -0.04860682
   0.0244041 ]
 [-0.04892228  0.07967986  0.00721633 ... -0.00510206 -0.04808084
   0.00623703]
 [-0.0628251   0.03445498 -0.0747632  ...  0.01157742 -0.0669866
   0.02865066]
 ...
 [ 0.06008795 -0.01425387 -0.02162606 ...  0.03436214  0.00333819
  -0.07693569]
 [ 0.01510265 -0.07874505  0.06587734 ...  0.06211259  0.07921816
  -0.09959285]
 [ 0.06404319  0.02151622 -0.02984285 ...  0.05427389  0.01936611
  -0.07033923]]
(20000, 384)


CUR:

In [11]:
# Factorize the document embeddings with CUR.
rank = 200  # number of sampled rows/columns; tune to the dataset size
C, U, R = cur_decomposition(document_embeddings, rank)

# Report the shape of each factor.
for label, matrix in (
    ("C matrix shape:", C),
    ("U matrix shape:", U),
    ("R matrix shape:", R),
):
    print(label, matrix.shape)


C matrix shape: (20000, 200)
U matrix shape: (200, 200)
R matrix shape: (200, 384)


LDA:

In [12]:
# Perform LDA using the Gibbs sampling function
num_topics = 5  # Number of topics to extract
num_iter = 1000  # Number of iterations for Gibbs sampling

doc_topic_dist, topic_word_dist, vocab = lda_gibbs_sampling(documents, num_topics, num_iter)

# Report the result shapes. These used to sit inside a bare triple-quoted
# string, which as the cell's last expression was echoed as the cell output.
print("Document-Topic Distribution Shape:", doc_topic_dist.shape)
print("Topic-Word Distribution Shape:", topic_word_dist.shape)
print("Vocabulary Size:", len(vocab))

KeyboardInterrupt: 

In [None]:
# Persist the CUR factors and the LDA distributions as .npy files in the
# working directory.
for filename, array in (
    ("C_matrix.npy", C),
    ("U_matrix.npy", U),
    ("R_matrix.npy", R),
    ("doc_topic_dist.npy", doc_topic_dist),
    ("topic_word_dist.npy", topic_word_dist),
):
    np.save(filename, array)
