In [None]:
# # from sentence_transformers import util

# # def similarity_search(query, embedding_model, index, k=10):
# #     query_embedding = embedding_model.encode([query], device=device)
# #     distances, indices = index.search(query_embedding, k)
# #     return distances, indices

# def similarity_search(query, embedding_model, index, k=10):
#     query_embedding = embedding_model.encode([query], device=device)

#     # Ensure correct dtype and shape
#     query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)

#     # Debugging prints
#     print("Query embedding shape:", query_embedding.shape)
#     print("Index dimension:", index.d)
#     print("Index size:", index.ntotal)

#     distances, indices = index.search(query_embedding, k)
#     return distances, indices

# def retrieve_answers(query, m_key):
#     index, embedding_model = results[m_key]
#     distances, indices = similarity_search(query, embedding_model, index)
#     doc_texts = [df.iloc[i]['review_text'] for i in indices[0]]
#     return doc_texts

# #test similarity search
# sample_query = df.iloc[0]['review_text']
# sample_index, sample_embedder = results['all-MiniLM-L6-v2']
# distances, indices = similarity_search(sample_query, sample_embedder, sample_index)
# print(distances, indices)

# #test answer query
# sample_query = 'What do people like about Spotify?'

# for m_key in models.keys():
#     answer = retrieve_answers(sample_query, m_key)
#     print(f"Model: {m_key}")
#     print(answer)
    

In [None]:
import pandas as pd
import cudf
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidfVectorizer
import cupyx.scipy.sparse as cpx   # GPU sparse operations
from cupyx.scipy.sparse import vstack   # To stack batches on GPU

def batch_vectorize_tfidf(df, text_column="review_text", sample_size=500000, batch_size=500000):
    """
    Fit a cuML TF-IDF vectorizer on the leading rows of the data and then
    transform the full dataset in batches.

    The vocabulary and IDF weights are learned from the first ``sample_size``
    rows only (a head slice, not a random sample); tokens that appear only in
    later rows are therefore absent from the resulting matrix.

    Args:
      df (pd.DataFrame): Input DataFrame.
      text_column (str): Column containing text.
      sample_size (int or None): Number of leading rows used to fit the
        vocabulary. ``None`` fits on every row.
      batch_size (int): Number of rows transformed per batch.

    Returns:
      (vectorizer, full_sparse_matrix): The fitted vectorizer and the sparse
      TF-IDF matrix obtained by stacking every batch, one row per row of df.

    Raises:
      KeyError: If ``text_column`` is not a column of ``df``.
      ValueError: If ``batch_size`` or ``sample_size`` is not positive, or if
        ``df`` has no rows.
    """
    # Validate before the expensive fit: a non-positive batch size would
    # otherwise surface as an opaque range()/vstack error only after the
    # vocabulary has already been learned.
    if batch_size is None or batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if sample_size is not None and sample_size <= 0:
        raise ValueError(f"sample_size must be a positive integer or None, got {sample_size!r}")

    texts = df[text_column]
    n = len(texts)
    if n == 0:
        raise ValueError(f"Column {text_column!r} has no rows to vectorize")

    # Fit the vectorizer on the leading rows to learn the vocabulary.
    sample_texts = texts.iloc[:sample_size]
    vectorizer = cuTfidfVectorizer(stop_words="english")
    vectorizer.fit(sample_texts)
    print("Vocabulary size:", len(vectorizer.vocabulary_))

    # Transform the full dataset in batches to bound peak memory per call.
    batches = []
    for start in range(0, n, batch_size):
        end = min(n, start + batch_size)
        batches.append(vectorizer.transform(texts.iloc[start:end]))
        print(f"Processed rows {start} to {end}")

    # Stack the batches together into one sparse matrix (row order preserved).
    full_sparse_matrix = vstack(batches)
    return vectorizer, full_sparse_matrix

# Vectorise every review in df: the vocabulary is learned from the first
# 500k rows, then the whole "review_text" column is transformed in 500k-row chunks.
vectorizer, tfidf_matrix = batch_vectorize_tfidf(
    df,
    text_column="review_text",
    sample_size=500000,
    batch_size=500000,
)

# Hold the stacked result as a CuPy CSR matrix.
# NOTE(review): batch_vectorize_tfidf stacks its batches with cupyx's vstack,
# so the input here is presumably already GPU-resident and this call mainly
# fixes the storage format rather than copying from the CPU — confirm.
tfidf_matrix_gpu = cpx.csr_matrix(tfidf_matrix)
print("TF-IDF matrix is ready for GPU processing.")

Vocabulary size: 66688
Processed rows 0 to 500000
Processed rows 500000 to 1000000
Processed rows 1000000 to 1500000
Processed rows 1500000 to 1862328
TF-IDF matrix is ready for GPU processing.


In [None]:
import cudf
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidfVectorizer
from tqdm import tqdm
import pandas as pd
import cupy as cp  # GPU-accelerated NumPy
import cupyx.scipy.sparse as cpx  # Sparse CuPy operations
from cuml.neighbors import NearestNeighbors  # RAPIDS GPU-optimized ANN

def vectorized_deduplicate_dataframe(tfidf_matrix_gpu, threshold=0.8, dataframe=None, n_neighbors=5):
    """
    Drop near-duplicate rows using cosine similarity between TF-IDF vectors.

    Every row's nearest neighbours are found with a brute-force cosine search.
    Rows are then visited in order: a row not already claimed by an earlier
    row is kept, and each of its neighbours that is at least ``threshold``
    similar is marked as a duplicate and skipped later. Only ``n_neighbors``
    candidates are inspected per row, so a cluster with more duplicates than
    that may keep more than one representative.

    Args:
      tfidf_matrix_gpu: Matrix with one row per row of ``dataframe``, in the
        same order, as accepted by cuML ``NearestNeighbors``.
      threshold (float): Minimum cosine similarity for two rows to count as
        duplicates. NearestNeighbors reports cosine *distance*
        (1 - similarity), so the comparison is made against ``1 - threshold``.
        (Comparing the distance to ``threshold`` directly, as this function
        previously did, flags anything with similarity above
        ``1 - threshold`` — i.e. above 0.2 for the default.)
      dataframe (pd.DataFrame, optional): Frame to deduplicate. Defaults to
        the notebook-level ``df`` so existing single-argument calls still work.
      n_neighbors (int): Neighbours inspected per row, the row itself
        included. Clamped to the number of rows.

    Returns:
      pd.DataFrame: The kept rows (first occurrence of each duplicate group)
      with a fresh 0..n-1 index.

    Raises:
      ValueError: If ``n_neighbors`` is not positive, or if ``dataframe`` and
        ``tfidf_matrix_gpu`` have different row counts (typically because the
        frame was reassigned after the matrix was built).
    """
    if dataframe is None:
        # Backward-compatible fallback to the notebook global.
        dataframe = df

    if n_neighbors < 1:
        raise ValueError(f"n_neighbors must be a positive integer, got {n_neighbors!r}")

    n_rows = tfidf_matrix_gpu.shape[0]
    if len(dataframe) != n_rows:
        raise ValueError(
            f"DataFrame has {len(dataframe)} rows but the TF-IDF matrix has {n_rows}; "
            "rebuild the matrix from the current DataFrame before deduplicating"
        )
    if n_rows == 0:
        # Nothing to compare; skip the GPU work entirely.
        return dataframe.iloc[[]].reset_index(drop=True)

    # A row cannot have more neighbours than there are rows.
    k = min(n_neighbors, n_rows)

    # Exact (brute-force) cosine nearest neighbours via RAPIDS cuML.
    nn = NearestNeighbors(n_neighbors=k, metric="cosine", algorithm="brute", output_type="cupy")
    nn.fit(tfidf_matrix_gpu)

    print("Fitted")

    distances, indices = nn.kneighbors(tfidf_matrix_gpu, n_neighbors=k)

    print("Computed Nearest Neighbors on GPU")

    # Copy the (n_rows, k) results to host memory once. Iterating a CuPy array
    # yields 0-d device arrays rather than Python ints, so they cannot serve as
    # members of `seen` that match the integer row positions tested below, and
    # indexing device arrays row by row would cost a GPU round trip per row.
    is_duplicate = cp.asnumpy(distances) <= (1.0 - threshold)
    neighbour_ids = cp.asnumpy(indices)

    unique_indexes = []
    seen = set()
    for i in tqdm(range(n_rows)):
        if i in seen:
            continue
        # Claim every sufficiently similar neighbour (usually including i itself).
        seen.update(neighbour_ids[i][is_duplicate[i]].tolist())
        unique_indexes.append(i)  # Keep only the first occurrence

    # Return deduplicated DataFrame
    return dataframe.iloc[unique_indexes].reset_index(drop=True)

# Deduplicate using the TF-IDF matrix built from the current df.
df_dedup1 = vectorized_deduplicate_dataframe(tfidf_matrix_gpu)

# Before/after comparison: deduplicated shape first, then the original shape.
print(df_dedup1.shape, df.shape, sep="\n")

# From here on df refers to the deduplicated frame.
# NOTE: this rebinding makes the cell non-idempotent — after it runs, df no
# longer lines up row-for-row with tfidf_matrix_gpu, so rebuild the matrix
# before running this cell a second time.
df = df_dedup1

In [None]:
# def get_closest(vector_store, embedder, query, k=3):
#     q_embed = embedder.embed_query(query)
#     return vector_store.similarity_search_by_vector(q_embed, k=k) #, distances=[], labels=[])

# query = 'What do people like about Spotify?'

# comparisons = {}
# for m_key, (vector_store, embedder) in results.items():
#     close = get_closest(vector_store, embedder, query)
#     comparisons[m_key] = close

# for m_key, close in comparisons.items():
#     print(f"Model: {m_key}")
#     for n in close:
#         print(n)
#     print("---------------------------------------- \n")

In [None]:
# import faiss
# import torch

# # Assume you have an index (e.g. a flat index on CPU)
# cpu_index = faiss.IndexFlatL2(128)

# if torch.cuda.is_available():
#     print("Using FAISS GPU")
#     # Create GPU resources and transfer the index to GPU 0
#     res = faiss.StandardGpuResources()
#     gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
#     index = gpu_index
# else:
#     print("Using FAISS CPU")
#     index = cpu_index

# # Now use `index` for your similarity search

In [None]:
# from rapidfuzz import process, fuzz

# # Function to remove duplicate reviews while keeping all columns
# def deduplicate_dataframe(df, text_column="review_text", threshold=80):
#     seen = {}  # Dictionary to track unique reviews with indexes
#     unique_indexes = []

#     for index, review in tqdm(df[text_column].items(), total=len(df), desc="Deduplicating reviews"):
#         # Find best match from seen reviews
#         match = process.extractOne(review, seen.keys(), scorer=fuzz.ratio)
        
#         # If no close match or similarity is below threshold, keep review
#         if not match or match[1] < threshold:
#             seen[review] = index
#             unique_indexes.append(index)

#     # Return deduplicated DataFrame
#     return df.loc[unique_indexes].reset_index(drop=True)

# # Apply deduplication while preserving all columns
# df_dedup = deduplicate_dataframe(df)


Deduplicating reviews: 100%|██████████| 3004/3004 [00:03<00:00, 828.49it/s] 


In [None]:
# print(f"Original DataFrame shape: {df.shape}")
# print(f"Deduplicated DataFrame shape: {df_dedup.shape}")

Original DataFrame shape: (3004, 3)
Deduplicated DataFrame shape: (2951, 3)
