In [None]:
# import faiss
# faiss.read_index("index.faiss")

RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char*) at /project/faiss/faiss/impl/io.cpp:67: Error: 'f' failed: could not open index.faiss for reading: No such file or directory

In [5]:
import pandas as pd
from rapidfuzz import process, fuzz

# Example dataset
df = pd.DataFrame({
    "review_text": [
        "Great app!", "Love this app", "Awesome!", "great app", 
        "Great app!!", "Very useful", "Very very useful", "Not bad", "Not bad at all"
    ],
    "rating": [5, 5, 4, 5, 5, 4, 4, 3, 3],  # Keeping an additional column
    "user_id": [101, 102, 103, 104, 105, 106, 107, 108, 109]  # Example user data
})

# Function to remove duplicate reviews while keeping all columns
def deduplicate_dataframe(df, text_column="review_text", threshold=80):
    seen = {}  # Dictionary to track unique reviews with indexes
    unique_indexes = []

    for index, review in df[text_column].items():
        # Find best match from seen reviews
        match = process.extractOne(review, seen.keys(), scorer=fuzz.ratio)
        
        # If no close match or similarity is below threshold, keep review
        if not match or match[1] < threshold:
            seen[review] = index
            unique_indexes.append(index)

    # Return deduplicated DataFrame
    return df.loc[unique_indexes].reset_index(drop=True)

# Apply deduplication while preserving all columns
df_deduplicated = deduplicate_dataframe(df)

print(df_deduplicated)


      review_text  rating  user_id
0      Great app!       5      101
1   Love this app       5      102
2        Awesome!       4      103
3     Very useful       4      106
4         Not bad       3      108
5  Not bad at all       3      109


In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def vectorized_deduplicate_dataframe(df, text_column="review_text", threshold=0.8):
    """
    Efficiently deduplicates a dataframe based on text similarity using TF-IDF and cosine similarity.
    
    Args:
    - df (pd.DataFrame): Input DataFrame containing text data.
    - text_column (str): Column name of text data.
    - threshold (float): Similarity threshold (0 to 1), where higher means stricter deduplication.
    
    Returns:
    - pd.DataFrame: Deduplicated DataFrame with all original columns preserved.
    """

    # Convert text data to lowercase and drop NaNs
    df = df.dropna(subset=[text_column]).copy()
    df[text_column] = df[text_column].astype(str).str.lower()

    # Compute TF-IDF embeddings
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(df[text_column])

    # Compute cosine similarity matrix
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Identify duplicates
    unique_indexes = []
    seen = set()

    for i in range(len(df)):
        if i in seen:
            continue
        # Find similar reviews
        similar_indexes = np.where(similarity_matrix[i] > threshold)[0]
        seen.update(similar_indexes)  # Mark them as seen
        unique_indexes.append(i)  # Keep only the first occurrence

    # Return deduplicated DataFrame
    return df.iloc[unique_indexes].reset_index(drop=True)

# Example dataset
df = pd.DataFrame({
    "review_text": [
        "Great app!", "Love this app", "Awesome!", "great app", 
        "Great app!!", "Very useful", "Very very useful", "Not bad", "Not bad at all"
    ],
    "rating": [5, 5, 4, 5, 5, 4, 4, 3, 3],  # Additional metadata
    "user_id": [101, 102, 103, 104, 105, 106, 107, 108, 109]  # Example user data
})

# Apply deduplication
df_deduplicated = vectorized_deduplicate_dataframe(df)

print(df_deduplicated)


     review_text  rating  user_id
0     great app!       5      101
1  love this app       5      102
2       awesome!       4      103
3    very useful       4      106
4        not bad       3      108
