Merged

20 commits
0082e26
Added BERT embedding model and changed from Gemini to BERT
rishikamtamneu Oct 23, 2025
3323f63
Merge branch 'main' of https://github.com/GenerateNU/cortex-etl-sourc…
rishikamtamneu Oct 24, 2025
5fc3298
Merge branch 'main' of https://github.com/GenerateNU/cortex-etl-sourc…
rishikamtamneu Oct 26, 2025
42b0302
Ignore .DS_Store files
rishikamtamneu Oct 26, 2025
d0716d3
HDB Scan and LLM logic testing implemented
rishikamtamneu Oct 26, 2025
e1f1552
Create classification logic implemented for 3 or more files in a cluster
rishikamtamneu Oct 27, 2025
4407e41
Create Classifications finished
rishikamtamneu Oct 27, 2025
b70c61d
Finished creating classifications and classifying files
rishikamtamneu Oct 28, 2025
8bea2ee
Combined the two cluster loops into a single loop that does both naam…
eitan-berenfeld Nov 3, 2025
37ddde8
Removed the length limit on the text embedding. Removed an unused dic…
eitan-berenfeld Nov 3, 2025
21e41ad
Addressing some of the Classification Test pull request
eitan-berenfeld Nov 7, 2025
cc61bd7
More PR changes
eitan-berenfeld Nov 7, 2025
0e4f874
Supabase updates
eitan-berenfeld Nov 10, 2025
7c5083c
Merge branch 'main' into classification_test
Nov 10, 2025
c6baddb
fix classification route
Nov 11, 2025
d6eeb26
wip debugging frontend
bamarler Oct 29, 2025
503a768
Merge branch 'main' into classification_test
bamarler Nov 11, 2025
9155b77
backend linting
bamarler Nov 11, 2025
8dd0bd1
fixed buggy auth again ig?
bamarler Nov 11, 2025
a9292fa
added eslint disable for useeffect dependency array
bamarler Nov 11, 2025
Binary file added .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion .gitignore
@@ -32,4 +32,4 @@ docker-compose.override.yml

# Supabase
.env.supabase.local
kong.yml
kong.yml
22 changes: 21 additions & 1 deletion backend/Dockerfile.dev
@@ -1,7 +1,27 @@
FROM python:3.13.9-slim

WORKDIR /app

# Install system dependencies needed to build hdbscan, scipy, and support cosine distance
RUN apt-get update && apt-get install -y \
build-essential \
python3-dev \
gfortran \
libopenblas-dev \
liblapack-dev \
&& rm -rf /var/lib/apt/lists/*

# Upgrade pip
RUN pip install --upgrade pip

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the code
COPY . .

EXPOSE 8000

# Run the app with reload enabled
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
3 changes: 3 additions & 0 deletions backend/app/core/litellm.py
@@ -25,6 +25,9 @@ class EmbeddingModelType(Enum):
OPENAI_SMALL = "text-embedding-3-small" # 1536 dimensions
OPENAI_LARGE = "text-embedding-3-large" # 3072 dimensions

# BERT models
# MODERNBERT_EMBED_BASE = "modernbert-embed-base" # 768 dimensions


class LLMClient:
"""Simplified LLM client for agentic workflows."""
2 changes: 1 addition & 1 deletion backend/app/routes/classification_routes.py
@@ -91,7 +91,7 @@ async def create_classifications(
)

classification_names: list[str] = await create_classifications_helper(
tenant_id,
extracted_files,
[classification.name for classification in initial_classifications],
)

78 changes: 75 additions & 3 deletions backend/app/utils/classification/classify_files.py
@@ -1,3 +1,6 @@
import numpy as np

from app.core.litellm import LLMClient
from app.schemas.classification_schemas import Classification, ExtractedFile


@@ -6,8 +9,77 @@ async def classify_files(
classifications: list[Classification],
) -> list[ExtractedFile]:
"""
Anlyzes all of the extracted files and the classifications
to classify each file, returing the classified extracted files
Classifies extracted files by comparing their embeddings to
classification name embeddings.

Each file is assigned the classification with the highest cosine similarity.
"""
# TODO: Implement the logic that analyzes each file and assigns classifications, returning the updated list

if not extracted_files or not classifications:
return extracted_files

client = LLMClient()
classification_embeddings = {}

# Generate embeddings for each classification name
print(f"Generating embeddings for {len(classifications)} classifications...")
for classification in classifications:
try:
emb = await client.embed(classification.name)
classification_embeddings[classification.classification_id] = emb
except Exception as e:
print(f"Error generating embedding for '{classification.name}': {e}")
continue

if not classification_embeddings:
print("No classification embeddings generated, returning files unchanged")
return extracted_files

# Assign best matching classification to each file
print(f"Classifying {len(extracted_files)} files using embedding similarity...")
for file in extracted_files:
if file.embedding is None or len(file.embedding) == 0:
print(f"File {file.name} has no embedding, skipping")
continue

best_similarity = -1
best_classification = None

for class_id, class_emb in classification_embeddings.items():
sim = _cosine_similarity(file.embedding, class_emb)
if sim > best_similarity:
best_similarity = sim
best_classification = next(
(c for c in classifications if c.classification_id == class_id),
None,
)

if best_classification:
file.classification = best_classification
print(
f" {file.name} → {best_classification.name} (similarity: {best_similarity:.3f})"
)

return extracted_files


def _cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
"""
Calculate cosine similarity between two vectors.

Returns a value between -1 and 1:
- 1 means vectors point in same direction (very similar)
- 0 means vectors are orthogonal (unrelated)
- -1 means vectors point in opposite directions (very different)
"""
v1 = np.array(vec1)
v2 = np.array(vec2)

dot_product = np.dot(v1, v2)
norm1 = np.linalg.norm(v1)
norm2 = np.linalg.norm(v2)

if norm1 == 0 or norm2 == 0:
return 0.0

return float(dot_product / (norm1 * norm2))
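The loop above scores each file against every classification one pair at a time. For larger batches the same highest-cosine-similarity assignment can be vectorized; the helper below is an illustrative sketch whose name and array handling are assumptions, not code from this PR:

import numpy as np

def assign_by_cosine(file_embs: list[list[float]], class_embs: list[list[float]]) -> list[int]:
    """For each file embedding, return the index of the most similar classification embedding."""
    files = np.asarray(file_embs, dtype=float)      # shape (n_files, dim)
    classes = np.asarray(class_embs, dtype=float)   # shape (n_classes, dim)
    files /= np.clip(np.linalg.norm(files, axis=1, keepdims=True), 1e-12, None)
    classes /= np.clip(np.linalg.norm(classes, axis=1, keepdims=True), 1e-12, None)
    sims = files @ classes.T                        # cosine similarities, (n_files, n_classes)
    return sims.argmax(axis=1).tolist()

Because every row is unit-normalized first, the matrix product gives exactly the cosine similarity that _cosine_similarity computes pairwise above.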
137 changes: 136 additions & 1 deletion backend/app/utils/classification/create_classifications.py
@@ -1,3 +1,8 @@
import hdbscan
import numpy as np
from sklearn.preprocessing import normalize

from app.core.litellm import LLMClient
from app.schemas.classification_schemas import ExtractedFile


@@ -10,4 +15,134 @@ async def create_classifications(
to iteratively set new classifications returning the final result
"""
# TODO: Implement the logic that creates/edits classifications from the extracted files.
return ["Request For Quote", "Product Specification", "Purchase Order", "Quote"]

embeddings = []
valid_files = []

for file in extracted_files:
if file.embedding is not None and len(file.embedding) > 0:
embeddings.append(file.embedding)
valid_files.append(file)

if len(embeddings) < 3:
print(
f"Not enough files for clustering ({len(embeddings)}), returning initial classifications"
)
return initialClassifications

embeddings_array = np.array(embeddings)

# L2-normalize embeddings so euclidean distance is monotonic in cosine distance
normalized_embeddings = normalize(embeddings_array)

clusterer = hdbscan.HDBSCAN(
min_cluster_size=2,
min_samples=1,
metric="euclidean",
cluster_selection_method="eom",
)

cluster_labels = clusterer.fit_predict(normalized_embeddings)
# HDBSCAN marks outliers with -1
_outlier_indices = np.where(cluster_labels == -1)[0]

clusters = {}

for i, label in enumerate(cluster_labels):
if label not in clusters:
clusters[label] = []
clusters[label].append(valid_files[i])

outliers = clusters.pop(-1, []) # Remove -1 cluster if it exists
print(f"Found {len(clusters)} clusters, {len(outliers)} outliers")

client = LLMClient()
classification_names = []

for cluster_id, files_in_cluster in clusters.items():
print(f"Analyzing cluster {cluster_id} with {len(files_in_cluster)} files...")

# Get sample documents from cluster (up to 5 for context)
sample_texts = []
for file in files_in_cluster[:5]:
text = _extract_text_from_file(file)
sample_texts.append(text[:500]) # Limit text length

# Use LLM to name the cluster
prompt = f"""Analyze these similar documents and provide a single, concise classification name.

Sample documents from this cluster:

{chr(10).join(f"Document {i + 1}: {text}" for i, text in enumerate(sample_texts))}

What type of documents are these? Respond with ONLY the category name.
Do not include any explanation or punctuation."""

try:
response = await client.chat(prompt, temperature=0.3, max_tokens=50)
category_name = response.choices[0].message.content.strip()
if not category_name:
category_name = f"Document Type {cluster_id}"
except Exception as e:
print(f" → Error generating name: {e}")
category_name = f"Document Type {cluster_id}"

classification_names.append(category_name)
print(f" → Named: {category_name}")

# Handle outliers individually
for i, file in enumerate(outliers):
print(f"Analyzing outlier {i} (single file)...")
text = _extract_text_from_file(file)
prompt = f"""Analyze this document and provide a concise classification name.

Document:

{text}

Respond with ONLY the category name."""

try:
response = await client.chat(prompt, temperature=0.3, max_tokens=50)
category_name = response.choices[0].message.content.strip()
if category_name:
classification_names.append(category_name)
print(f" → Outlier named: {category_name}")
else:
fallback_name = f"Document Type Outlier {i}"
classification_names.append(fallback_name)
print(f" → Outlier named: {fallback_name}")
except Exception as e:
print(f" → Error naming outlier: {e}")
fallback_name = f"Document Type Outlier {i}"
classification_names.append(fallback_name)
print(f" → Outlier named: {fallback_name}")

all_classifications = classification_names + initialClassifications
final_classifications = list(set(all_classifications))

print(f"Final classifications: {final_classifications}")
return final_classifications


def _extract_text_from_file(file: ExtractedFile) -> str:
"""Convert extracted file to text representation for analysis."""
parts = []

# Add filename
if file.name:
parts.append(f"Filename: {file.name}")

# Add extracted content
if isinstance(file.extracted_data, dict):
for key, value in file.extracted_data.items():
if isinstance(value, dict | list):
continue
parts.append(f"{key}: {value}")
elif isinstance(file.extracted_data, list):
parts.append(
f"Items: {', '.join(str(item) for item in file.extracted_data[:5])}"
)
else:
parts.append(str(file.extracted_data))

return " ".join(parts)
3 changes: 3 additions & 0 deletions backend/requirements.txt
@@ -24,6 +24,9 @@ umap-learn==0.5.7
scikit-learn==1.5.2
numpy==2.1.3

# Classification
hdbscan>=0.8.33

# docling==2.55.1
# docling-core==2.48.4
# docling-ibm-models==3.9.1
3 changes: 3 additions & 0 deletions frontend/src/config/supabase.config.ts
@@ -4,6 +4,9 @@ import type { Database } from '../types/database.types'
const supabaseUrl = import.meta.env.VITE_SUPABASE_URL
const supabaseAnonKey = import.meta.env.VITE_SUPABASE_ANON_KEY

console.log('supabaseURL:', supabaseUrl)
console.log('supabaseAnonKey:', supabaseAnonKey)

if (!supabaseUrl || !supabaseAnonKey) {
throw new Error('Missing Supabase environment variables')
}
1 change: 1 addition & 0 deletions frontend/src/contexts/AuthContext.tsx
@@ -178,6 +178,7 @@ export function AuthProvider({ children }: { children: React.ReactNode }) {
subscription.unsubscribe()
}
}
/* eslint-disable-next-line react-hooks/exhaustive-deps */
}, [])

const value: AuthContextType = {
38 changes: 1 addition & 37 deletions frontend/src/types/database.types.ts
@@ -283,7 +283,7 @@ export type Database = {
}
l2_normalize: {
Args: { '': string } | { '': unknown } | { '': unknown }
Returns: unknown
Returns: string
}
match_documents: {
Args: {
@@ -299,46 +299,10 @@
source_file_id: string
}[]
}
sparsevec_out: {
Args: { '': unknown }
Returns: unknown
}
sparsevec_send: {
Args: { '': unknown }
Returns: string
}
sparsevec_typmod_in: {
Args: { '': unknown[] }
Returns: number
}
update_webhook_config: {
Args: { secret: string; url: string }
Returns: undefined
}
vector_avg: {
Args: { '': number[] }
Returns: string
}
vector_dims: {
Args: { '': string } | { '': unknown }
Returns: number
}
vector_norm: {
Args: { '': string }
Returns: number
}
vector_out: {
Args: { '': string }
Returns: unknown
}
vector_send: {
Args: { '': string }
Returns: string
}
vector_typmod_in: {
Args: { '': unknown[] }
Returns: number
}
}
Enums: {
extraction_status: 'queued' | 'processing' | 'completed' | 'failed'
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.