In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Input data is resolved relative to the kernel's working directory:
# <cwd>/../data_pipeline/notebooks/data/reviews.csv
DATA_DIR = Path.cwd().parent.joinpath("data_pipeline", "notebooks", "data")
REVIEWS_DATA_FILE = DATA_DIR.joinpath("reviews.csv")

In [3]:
# Load the course-review CSV into a DataFrame with pandas' default parsing options.
reviews_df = pd.read_csv(REVIEWS_DATA_FILE)
# Bare last expression: the notebook renders the frame as this cell's output.
reviews_df

Unnamed: 0,CRN,Course Name,Instructor,Subject,Course Number,Question,Review
0,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,Timing and Canvas are Organized.
1,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,very responsive professor
2,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,"Passionate, knowledgeable, extremely accommoda..."
3,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,Scott was a pleasure to have as a professor! I...
4,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,"Assignments (homework, labs), and live lecture..."
...,...,...,...,...,...,...,...
12657,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,Studied more before class so that I could foll...
12658,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,watch vedios and books ahead may be better.
12659,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,I can utilize the additional resources provide...
12660,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,Spent more time outside of course hours review...


In [4]:
# Peek at the first row (by position) to see every column's value for one review.
reviews_df.iloc[0]

CRN                                                          35056
Course Name             Data Str, Algo App in CmpSys (Spring 2024)
Instructor                                         Valcourt, Scott
Subject                                                         CS
Course Number                                                 5008
Question         What were the strengths of this course and/or ...
Review                            Timing and Canvas are Organized.
Name: 0, dtype: object

In [5]:
def stringify_review_instance(row: pd.Series) -> str:
    """Render one review row as a single text block.

    The block has three sections -- metadata, question and review -- and every
    line after the first is indented by four spaces. The text ends with a
    newline followed by one indent.

    Args:
        row: A mapping-like row providing the keys 'CRN', 'Course Name',
            'Instructor', 'Subject', 'Course Number', 'Question' and 'Review'.

    Returns:
        The formatted multi-line string for this row.
    """
    indent = "    "
    metadata_line = (
        f"CRN: {row['CRN']}, Course Name: {row['Course Name']}, "
        f"Instructor: {row['Instructor']},"
    )
    # Subject and number are concatenated without a separator, e.g. "CS5008".
    course_code_line = f"Course Number: {row['Subject']}{row['Course Number']}"

    lines = [
        "Metadata:",
        indent + metadata_line,
        indent + course_code_line,
        "",
        indent + "Question:",
        f"{indent}{row['Question']}",
        "",
        indent + "Review:",
        f"{indent}{row['Review']}",
        indent,  # the text ends with a bare indent after the final newline
    ]
    return "\n".join(lines)

In [6]:
# axis=1 passes each row (as a Series) to stringify_review_instance,
# yielding one formatted string per review row.
stringified_reviews = reviews_df.apply(stringify_review_instance, axis=1)
# Bare last expression: display the resulting Series as the cell output.
stringified_reviews

0        Metadata:\n    CRN: 35056, Course Name: Data S...
1        Metadata:\n    CRN: 35056, Course Name: Data S...
2        Metadata:\n    CRN: 35056, Course Name: Data S...
3        Metadata:\n    CRN: 35056, Course Name: Data S...
4        Metadata:\n    CRN: 35056, Course Name: Data S...
                               ...                        
12657    Metadata:\n    CRN: 35062, Course Name: Object...
12658    Metadata:\n    CRN: 35062, Course Name: Object...
12659    Metadata:\n    CRN: 35062, Course Name: Object...
12660    Metadata:\n    CRN: 35062, Course Name: Object...
12661    Metadata:\n    CRN: 35062, Course Name: Object...
Length: 12662, dtype: object

In [7]:
# Show the full formatted text for the row labelled 0 as a sanity check.
stringified_reviews.loc[0]

'Metadata:\n    CRN: 35056, Course Name: Data Str, Algo App in CmpSys (Spring 2024), Instructor: Valcourt, Scott,\n    Course Number: CS5008\n\n    Question:\n    What were the strengths of this course and/or this instructor?\n\n    Review:\n    Timing and Canvas are Organized.\n    '

In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

In [9]:
# Step 1: Prepare Corpus
# Convert the Series to a plain list of strings for the embedding model.
stringified_reviews_list = stringified_reviews.tolist()

# Step 2: Embed Texts
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(stringified_reviews_list)

# Step 3: Compute Cosine Similarity Matrix
similarity_matrix = cosine_similarity(embeddings)

# Step 4: Identify Near-Duplicates
threshold = 0.9  # Similarity threshold for duplicates

# Vectorised replacement for the nested Python loop over every (i, j) pair,
# which is quadratic in the number of reviews and far too slow in pure Python.
# Keeping only the strict upper triangle (k=1) restricts the comparison to
# pairs with i < j, so column j is flagged when any earlier document exceeds
# the threshold -- i.e. the later document of each pair is marked.
near_duplicate_pairs = np.triu(similarity_matrix > threshold, k=1)
is_duplicate = near_duplicate_pairs.any(axis=0)
to_remove = set(np.flatnonzero(is_duplicate).tolist())

# Remove duplicates from the text list and the embeddings, preserving order.
filtered_reviews = [
    text for text, duplicate in zip(stringified_reviews_list, is_duplicate) if not duplicate
]
filtered_embeddings = [
    emb for emb, duplicate in zip(embeddings, is_duplicate) if not duplicate
]

# Step 5: Store in ChromaDB
# Initialize Chroma client
client = chromadb.Client()

# Create a collection (or reuse it if this client already has one by that name)
collection = client.get_or_create_collection("text_embeddings_rag")

# Add texts and embeddings to ChromaDB in batches rather than one call per
# document. Ids and metadata are the position within the filtered list, as
# before. The batch size is kept modest to stay under Chroma's per-call limit.
batch_size = 1000
for start in range(0, len(filtered_reviews), batch_size):
    stop = min(start + batch_size, len(filtered_reviews))
    collection.add(
        documents=filtered_reviews[start:stop],
        metadatas=[{"index": idx} for idx in range(start, stop)],
        ids=[str(idx) for idx in range(start, stop)],
        embeddings=[embedding.tolist() for embedding in filtered_embeddings[start:stop]],
    )

# Step 6: Verify the Collection
# Report corpus size before and after near-duplicate removal.
print(f"Original Texts: {len(stringified_reviews_list)}")
# The deduplicated list is named `filtered_reviews`; `filtered_texts` is never
# defined and would raise NameError.
print(f"Filtered Texts (After Deduplication): {len(filtered_reviews)}")


KeyboardInterrupt

